In [3]:
print('hi')

hi


# Hotel_Recommender_System/ streamlit_app

Project Type - Recommender System

Contribution - Individual

## Problem Statement

Build a recommendation model to provide hotel suggestions based on user preferences and historical data. Develop a Streamlit web application to display insights and visualizations derived from the deployed travel recommendation model, offering an interactive and user-friendly interface for data exploration.

## Github Link

https://github.com/ish-war/voyage_analysis-Integrating-MLOps-in-Travel

## General Guidelines : -

1. Well-structured, formatted, and commented code is required.

2. Exception Handling, Production Grade Code & Deployment Ready Code will be a plus. Those students will be awarded some additional credits.

3. The additional credits will have advantages over other students during Star Student selection.

    [ Note: - Deployment Ready Code is defined as, the whole .ipynb notebook should be executable in one go
              without a single error logged. ]
Each and every logic should have proper comments.

4. You may add as many number of charts you want. Make Sure for each and every chart the following format should be answered.

## Let's Begin !


### Import Libraries

In [4]:
# Import Libraries
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import streamlit as st
import random

import warnings
warnings.filterwarnings("ignore")

### Dataset Loading

In [5]:
flights_df = pd.read_csv('data/flights.csv')
hotel_df = pd.read_csv('data/hotels.csv')
users_df = pd.read_csv('data/users.csv')
sample_size = 5000

In [27]:
hotel_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [28]:
hotel_df.tail()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
40547,135938,1339,Hotel BP,Brasilia (DF),3,247.62,742.86,06/18/2020
40548,135939,1339,Hotel BP,Brasilia (DF),1,247.62,247.62,06/25/2020
40549,135940,1339,Hotel BW,Campo Grande (MS),3,60.39,181.17,07/02/2020
40550,135941,1339,Hotel BW,Campo Grande (MS),3,60.39,181.17,07/09/2020
40551,135942,1339,Hotel BD,Natal (RN),4,242.88,971.52,07/16/2020


In [29]:
hotel_df.isna().sum()

travelCode    0
userCode      0
name          0
place         0
days          0
price         0
total         0
date          0
dtype: int64

In [32]:
hotel_df.duplicated().sum()

np.int64(0)

In [33]:
hotel_df.info

<bound method DataFrame.info of        travelCode  userCode      name               place  days   price  \
0               0         0   Hotel A  Florianopolis (SC)     4  313.02   
1               2         0   Hotel K       Salvador (BH)     2  263.41   
2               7         0   Hotel K       Salvador (BH)     3  263.41   
3              11         0   Hotel K       Salvador (BH)     4  263.41   
4              13         0   Hotel A  Florianopolis (SC)     1  313.02   
...           ...       ...       ...                 ...   ...     ...   
40547      135938      1339  Hotel BP       Brasilia (DF)     3  247.62   
40548      135939      1339  Hotel BP       Brasilia (DF)     1  247.62   
40549      135940      1339  Hotel BW   Campo Grande (MS)     3   60.39   
40550      135941      1339  Hotel BW   Campo Grande (MS)     3   60.39   
40551      135942      1339  Hotel BD          Natal (RN)     4  242.88   

         total        date  
0      1252.08  09/26/2019  
1       5

In [34]:
hotel_df.describe()

Unnamed: 0,travelCode,userCode,days,price,total
count,40552.0,40552.0,40552.0,40552.0,40552.0
mean,67911.794461,666.963726,2.499679,214.439554,536.229513
std,39408.199333,391.136794,1.119326,76.742305,319.331482
min,0.0,0.0,1.0,60.39,60.39
25%,33696.75,323.0,1.0,165.99,247.62
50%,67831.0,658.0,2.0,242.88,495.24
75%,102211.25,1013.0,4.0,263.41,742.86
max,135942.0,1339.0,4.0,313.02,1252.08


In [6]:
# Set a random seed for reproducibility
random.seed(42)

In [7]:
users_with_enough_interactions_df = hotel_df.groupby(['userCode']).size().groupby('userCode').size()


In [8]:
users_interactions_count_df = hotel_df.groupby(['userCode','name']).size().groupby('userCode').size()
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 2].reset_index()[['userCode']]


In [9]:
interactions_from_selected_users_df = hotel_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'userCode',
               right_on = 'userCode')


In [10]:
# Encode userCode and hotel name to numeric values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
#df_hotel['userCode'] = label_encoder.fit_transform(df_hotel['userCode'])
interactions_from_selected_users_df['name_encoded'] = label_encoder.fit_transform(interactions_from_selected_users_df['name'])
  

In [11]:
import math
def smooth_user_preference(x):
    return math.log(1+x, 2)
     


In [12]:
interactions_full_df = interactions_from_selected_users_df.groupby(['name_encoded','userCode'])['price'].sum().reset_index()
 

In [25]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                                               stratify=interactions_full_df['userCode'],
                                   test_size=0.25,
                                   random_state=42)

x_test=set(interactions_test_df['userCode'])
x_train=set(interactions_train_df['userCode'])

only_in_set1 = x_train - x_test


#print("Elements in train but not in test:", only_in_set1)

only_in_set2 = x_test - x_train

#print("Elements in test but not in train:", only_in_set2)

In [14]:
#Creating a sparse pivot table with users in rows and items in columns
items_users_pivot_matrix_df = interactions_train_df.pivot(index='userCode',
                                                          columns='name_encoded',
                                                          values='price').fillna(0)


In [15]:
items_users_pivot_matrix = items_users_pivot_matrix_df.values
#items_users_pivot_matrix[:10]

user_ids = list(items_users_pivot_matrix_df.index)
#user_ids[:10]

In [16]:
#items_users_pivot_matrix.shape

# The number of factors to factor the item-user matrix.
NUMBER_OF_FACTORS_MF = 8

In [17]:
import scipy
from scipy.sparse.linalg import svds
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(items_users_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [18]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = items_users_pivot_matrix_df.columns,index=user_ids).transpose()


In [19]:
class CFRecommender:

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df , items_df):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        return self.MODEL_NAME
  

In [20]:
def recommend_items(self, user_id, items_to_ignore=[], topn=5, verbose=False):
        if user_id not in self.cf_predictions_df.columns:
            raise KeyError(f"User '{user_id}' not found in prediction data.")
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating content that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['name_encoded'].isin(items_to_ignore)].sort_values('recStrength', ascending=False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            # Merge recommendations_df with items_df
            recommendations_df = recommendations_df.merge(self.items_df, how='left',
                                                          left_on='name_encoded',
                                                          right_on='name_encoded')[['name_encoded','name','recStrength']]
            recommendations_df=pd.DataFrame(recommendations_df.groupby('name').max('recStrength').sort_values('recStrength', ascending=False))

        return recommendations_df
 

In [21]:
# Assuming cf_preds_df and interactions_from_selected_users_df are defined elsewhere
cf_recommender_model = CFRecommender(cf_preds_df, interactions_from_selected_users_df)


## Save Model

In [26]:
import pickle

# Save the CFRecommender model
with open("cf_recommender.pkl", "wb") as f:
    pickle.dump(cf_recommender_model, f)

print("Model saved successfully!")


Model saved successfully!
