In [2]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import ast
from surprise import Dataset, Reader, KNNBasic
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split as skl_train_test_split
from datetime import datetime
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
from sklearn.preprocessing import MinMaxScaler




nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#Data

df_business = pd.read_csv('yelp_academic_dataset_business.csv')
df_checkin = pd.read_csv('yelp_academic_dataset_checkin.csv')
df_review=pd.read_csv('yelp_academic_dataset_review.csv')
df_tip=pd.read_csv('yelp_academic_dataset_tip.csv')
df_user=pd.read_csv('yelp_academic_dataset_user.csv')

# print(df_business.columns)
# print(df_checkin.columns)
# print(df_review.columns)
# print(df_tip.columns)
# print(df_user.columns)

In [4]:
def map_rating(cosine_similarity):
    '''
    Maps a consine similarity score to a rating from 1 to 5
    '''
    return 1 + 4 * ((cosine_similarity + 1) / 2)

In [5]:
#Load data

def load_data():
    # df_business = pd.read_csv('yelp_academic_dataset_business.csv')
    # df_review=pd.read_csv('yelp_academic_dataset_review.csv')
    # df_user=pd.read_csv('yelp_academic_dataset_user.csv')
    
    df_business_filadelfia = df_business[(df_business['city'] == 'Philadelphia') & (df_business['categories'].str.contains('Restaurants', na=False)) & (df_business['is_open']==1)].reset_index(drop=True)
    df_business_filadelfia=df_business_filadelfia[['business_id', 'name', 'stars', 'review_count', 'attributes', 'categories', 'hours']].reset_index(drop=True)
    df_business_filadelfia = df_business_filadelfia.dropna().reset_index(drop=True)

    df_review_filadelfia = df_review[df_review['business_id'].isin(df_business_filadelfia['business_id'])]
    df_review_filadelfia=df_review_filadelfia[['review_id', 'user_id', 'business_id', 'stars', 'text', 'date']]
    df_review_filadelfia['liked'] = (df_review_filadelfia['stars'] > 3).astype(int)
    df_review_filadelfia_profiles = df_review_filadelfia[df_review_filadelfia['liked'] == 1].reset_index(drop=True)
    df_review_filadelfia = df_review_filadelfia.dropna().reset_index(drop=True)

    df_user_filadelfia = df_user[df_user['user_id'].isin(df_review_filadelfia['user_id'])]
    df_user_filadelfia=df_user_filadelfia[['user_id', 'name', 'review_count', 'yelping_since', 'elite', 'average_stars']].reset_index(drop=True)
    df_user_filadelfia = df_user_filadelfia.dropna().reset_index(drop=True)

    # counts
    user_counts = df_review_filadelfia['user_id'].value_counts()
    restaurant_counts = df_review_filadelfia['business_id'].value_counts()

    # creating filters for users and restaurants with 5+ reviews
    users_with_5_plus_reviews = user_counts[user_counts >= 5].index
    restaurants_with_5_plus_reviews = restaurant_counts[restaurant_counts >= 5].index

    return df_business_filadelfia,df_review_filadelfia,df_user_filadelfia, users_with_5_plus_reviews, restaurants_with_5_plus_reviews

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized_words)

In [7]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)


In [8]:
#Pre-processamento
def pre_processing(data_review,method):
    if method == 'with lemma':

        data_review['text'] = data_review['text'].apply(lemmatize_text)

    
    else: 
        data_review['text'] = data_review['text'].apply(stem_text)

    return data_review


In [9]:
#Feature Engineering
def feature_engineering(data_reviews, method, components=8):
    if method == 'bag of words':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        matrix = vectorizer.fit_transform(data_reviews['text'])
        matrix = matrix.toarray()
        components = matrix.shape[1]
        # feature_names = vectorizer.get_feature_names_out()
        # df = pd.DataFrame(matrix, columns=feature_names)
   
    elif method =='lda':
        count_vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        count_matrix = count_vectorizer.fit_transform(data_reviews['text'])
        lda_model = LatentDirichletAllocation(n_components=components, random_state=42)
        matrix = lda_model.fit_transform(count_matrix)
        # df = pd.DataFrame(matrix, columns=[f'Topic_{i}' for i in range(lda_model.n_components)])
    
    elif method =='lsa':
        vectorizer = CountVectorizer()
        count_matrix = vectorizer.fit_transform(data_reviews['text'])
        lsa_model = TruncatedSVD(n_components=components)
        matrix = lsa_model.fit_transform(count_matrix)

    elif method == 'doc2vec':
        # preproces the documents, and create TaggedDocuments
        tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                                    tags=[str(i)]) for i,
                    doc in enumerate(data_reviews['text'])]

        # Doc2vec model
        model = Doc2Vec(vector_size=components,
                        min_count=2, epochs=50)
        model.build_vocab(tagged_data)
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

        # document vectors
        matrix = [model.infer_vector(
            word_tokenize(doc.lower())) for doc in data_reviews['text']]

    return matrix, components


In [30]:
def features_business(df_business_filadelfia):
    df_business_filadelfia = df_business_filadelfia[['business_id', 'name', 'stars', 'review_count','attributes', 'categories', 'hours']]


    #variavel horario
    df_business_filadelfia['hours'] = df_business_filadelfia['hours'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else {})

    # Crie colunas separadas para cada dia da semana
    dias_da_semana = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    for dia in dias_da_semana:
        df_business_filadelfia[dia] = df_business_filadelfia['hours'].apply(lambda x: x.get(dia, None))

    # Remova a coluna original 'hours' se não for mais necessária
    df_business_filadelfia.drop(columns=['hours'], inplace=True)


    def categorize_hours(hours):

        if hours == None:
            return 0 #'Fechado'
        
        else:
            start_time, end_time = hours.split('-')
            start_hour = int(start_time.split(':')[0])
            end_hour = int(end_time.split(':')[0])
            
            if end_hour <= 12:
                return 1 #'Manhã'
            elif 12 < start_hour and end_hour<=15:
                return 2 #'Almoço'
            elif start_hour > 15 and end_hour < 19:
                return 3 #'Tarde'
            elif start_hour>=19:
                return 4 #'Noite'
            else:
                return 5 #'Dia todo'

    # Aplicar a função de categorização a cada coluna de dia da semana
    for day in ['Monday', 'Tuesday', 'Wednesday','Thursday', 'Friday', 'Saturday', 'Sunday']:
        df_business_filadelfia[day] = df_business_filadelfia[day].apply(categorize_hours)


    #variavel categoria
    df_business_filadelfia['Food'] = 0
    df_business_filadelfia['Nightlife'] = 0
    df_business_filadelfia['Bars'] = 0
    df_business_filadelfia['Sandwiches'] = 0
    df_business_filadelfia['American (New)'] = 0
    df_business_filadelfia['Pizza'] = 0
    df_business_filadelfia['Breakfast & Brunch'] = 0
    df_business_filadelfia['American (Traditional)'] = 0
    df_business_filadelfia['Coffee & Tea'] = 0

    for index, row in df_business_filadelfia.iterrows():
        categories = row['categories']

        if 'Food' in categories:
            df_business_filadelfia.at[index, 'Food'] = 1

        if 'Nightlife' in categories:
            df_business_filadelfia.at[index, 'Nightlife'] = 1

        if 'Bars' in categories:
            df_business_filadelfia.at[index, 'Bars'] = 1

        if 'Sandwiches' in categories:
            df_business_filadelfia.at[index, 'Sandwiches'] = 1

        if 'American (New)' in categories:
            df_business_filadelfia.at[index, 'American (New)'] = 1
        
        if 'Pizza' in categories:
            df_business_filadelfia.at[index, 'Pizza'] = 1

        if 'Breakfast & Brunch' in categories:
            df_business_filadelfia.at[index, 'Breakfast & Brunch'] = 1

        if 'American (Traditional)' in categories:
            df_business_filadelfia.at[index, 'American (Traditional)'] = 1

        if 'Coffee & Tea' in categories:
            df_business_filadelfia.at[index, 'Coffee & Tea'] = 1



    #variavel atributos

    df_business_filadelfia['RestaurantsTakeOut'] = 0
    df_business_filadelfia['BusinessAcceptsCreditCards'] = 0
    df_business_filadelfia['RestaurantsDelivery'] = 0
    df_business_filadelfia['RestaurantsAttire_casual'] = 0
    df_business_filadelfia['HasTV'] = 0
    df_business_filadelfia['RestaurantsGoodForGroups'] = 0
    df_business_filadelfia['BikeParking'] = 0
    df_business_filadelfia['BusinessParking_street'] = 0
    df_business_filadelfia['GoodForKids'] = 0


    df_business_filadelfia['attributes'] = df_business_filadelfia['attributes'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else {})

    df_atributos = pd.json_normalize(df_business_filadelfia['attributes'])
    # Normalizar a coluna 'BusinessParking' para expandir as sub-chaves
    if 'BusinessParking' in df_atributos:
        df_parking = pd.json_normalize(df_atributos['BusinessParking'].dropna().apply(ast.literal_eval), sep='_')
        df_parking.columns = ['BusinessParking_garage', 'BusinessParking_street', 'BusinessParking_validated', 'BusinessParking_lot', 'BusinessParking_valet']
        # Juntar o df_parking ao df_atributos
        df_atributos = df_atributos.drop(columns=['BusinessParking']).join(df_parking, how='left')

    if "Ambience" in df_atributos:
        df_ambience = pd.json_normalize(df_atributos['Ambience'].dropna().apply(ast.literal_eval), sep='_')
        df_ambience.columns = [f"Ambience_{col}" for col in df_ambience.columns]
        # Juntar o df_ambience ao df_atributos
        df_atributos = df_atributos.drop(columns=['Ambience']).join(df_ambience, how='left')

    if "GoodForMeal" in df_atributos:
        df_GoodForMeal = pd.json_normalize(df_atributos['GoodForMeal'].dropna().apply(ast.literal_eval), sep='_')
        df_GoodForMeal.columns = [f"GoodForMeal_{col}" for col in df_GoodForMeal.columns]
        # Juntar o df_GoodForMeal ao df_atributos
        df_atributos = df_atributos.drop(columns=['GoodForMeal']).join(df_GoodForMeal, how='left')

    if "Music" in df_atributos:
        df_Music = pd.json_normalize(df_atributos['Music'].dropna().apply(ast.literal_eval), sep='_')
        df_Music.columns = [f"Music_{col}" for col in df_Music.columns]
        # Juntar o df_Music ao df_atributos
        df_atributos = df_atributos.drop(columns=['Music']).join(df_Music, how='left')

    if "BestNights" in df_atributos:
        df_BestNights = pd.json_normalize(df_atributos['BestNights'].dropna().apply(ast.literal_eval), sep='_')
        df_BestNights.columns = [f"BestNights_{col}" for col in df_BestNights.columns]
        # Juntar o df_BestNights ao df_atributos
        df_atributos = df_atributos.drop(columns=['BestNights']).join(df_BestNights, how='left')

    if "DietaryRestrictions" in df_atributos:
        df_DietaryRestrictions = pd.json_normalize(df_atributos['DietaryRestrictions'].dropna().apply(ast.literal_eval), sep='_')
        df_DietaryRestrictions.columns = [f"DietaryRestrictions_{col}" for col in df_DietaryRestrictions.columns]
        # Juntar o df_DietaryRestrictions ao df_atributos
        df_atributos = df_atributos.drop(columns=['DietaryRestrictions']).join(df_DietaryRestrictions, how='left')

    
    df_atributos = df_atributos.applymap(lambda x: x.replace("u'", "").replace("'", "") if isinstance(x, str) else x)
    def create_price_range_columns(row):
        price_ranges = ['1', '2', '3', '4']
        for price in price_ranges:
            column_name = f'RestaurantsPriceRange2_{price}'
            if row['RestaurantsPriceRange2'] == price:
                row[column_name] = 'True'
            elif row['RestaurantsPriceRange2'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    # Aplicar a função linha por linha
    df_atributos = df_atributos.apply(create_price_range_columns, axis=1)
    df_atributos=df_atributos.drop('RestaurantsPriceRange2',axis=1)

    def alchool_columns(row):
        types = ['full_bar','beer_and_wine']
        for t in types:
            column_name = f'Alcohol_{t}'
            if row['Alcohol'] == t:
                row[column_name] = 'True'
            elif row['Alcohol'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    df_atributos = df_atributos.apply(alchool_columns, axis=1)
    df_atributos=df_atributos.drop('Alcohol',axis=1)


    def wifi_columns(row):
        types = ['free','paid']
        for t in types:
            column_name = f'WiFi_{t}'
            if row['WiFi'] == t:
                row[column_name] = 'True'
            elif row['WiFi'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    df_atributos = df_atributos.apply(wifi_columns, axis=1)
    df_atributos=df_atributos.drop('WiFi',axis=1)


    def attire_columns(row):
        types = ['casual', 'dressy', 'formal']
        for t in types:
            column_name = f'RestaurantsAttire_{t}'
            if row['RestaurantsAttire'] == t:
                row[column_name] = 'True'
            elif row['RestaurantsAttire'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    df_atributos = df_atributos.apply(attire_columns, axis=1)
    df_atributos=df_atributos.drop('RestaurantsAttire',axis=1)

    def noise_columns(row):
        types = ['average', 'quiet', 'loud', 'very_loud']
        for t in types:
            column_name = f'NoiseLevel_{t}'
            if row['NoiseLevel'] == t:
                row[column_name] = 'True'
            elif row['NoiseLevel'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    df_atributos = df_atributos.apply(noise_columns, axis=1)
    df_atributos=df_atributos.drop('NoiseLevel',axis=1)

    def ages_columns(row):
        types = ['21plus', 'allages']
        for t in types:
            column_name = f'AgesAllowed_{t}'
            if row['AgesAllowed'] == t:
                row[column_name] = 'True'
            elif row['AgesAllowed'] == 'False':
                row[column_name] = 'False'
            else:
                row[column_name] = 'False'
        return row

    df_atributos = df_atributos.apply(ages_columns, axis=1)
    df_atributos=df_atributos.drop('AgesAllowed',axis=1)

    df_atributos = df_atributos.replace('None','False')
    df_atributos = df_atributos.replace( np.nan,'False')
    df_atributos = df_atributos.replace('none','False')
    df_atributos = df_atributos.replace('no','False')
    df_atributos = df_atributos.replace('yes','True')
    df_atributos['Smoking'] = df_atributos['Smoking'].replace('outdoor','True')
    df_atributos['BYOBCorkage'] = df_atributos['BYOBCorkage'].replace('yes_free','True')
    df_atributos['BYOBCorkage'] = df_atributos['BYOBCorkage'].replace('yes_corkage','True')

    df_atributos['business_id'] = df_business_filadelfia['business_id']


    for index, row in df_atributos.iterrows():
        # attributes = row['attributes']
        if row['RestaurantsTakeOut']=='True':
            df_business_filadelfia.at[index, 'RestaurantsTakeOut'] = 1

        if row['BusinessAcceptsCreditCards']=='True':
            df_business_filadelfia.at[index, 'BusinessAcceptsCreditCards'] = 1

        if row['RestaurantsDelivery']=='True':
            df_business_filadelfia.at[index, 'RestaurantsDelivery'] = 1

        if row['RestaurantsAttire_casual']=='True':
            df_business_filadelfia.at[index, 'RestaurantsAttire_casual'] = 1

        if row['HasTV']=='True':
            df_business_filadelfia.at[index, 'HasTV'] = 1

        if row['RestaurantsGoodForGroups']=='True':
            df_business_filadelfia.at[index, 'RestaurantsGoodForGroups'] = 1

        if row['BikeParking']=='True':
            df_business_filadelfia.at[index, 'BikeParking'] = 1

        if row['BusinessParking_street']=='True':
            df_business_filadelfia.at[index, 'BusinessParking_street'] = 1

        if row['GoodForKids']=='True':
            df_business_filadelfia.at[index, 'GoodForKids'] = 1

    df_business_filadelfia=df_business_filadelfia.drop(['attributes', 'categories','name'],axis=1)

    columns_to_scale = [col for col in df_business_filadelfia.columns if col != 'business_id']

    print(columns_to_scale)


    scaler = MinMaxScaler()
    df_business_filadelfia[columns_to_scale] = scaler.fit_transform(df_business_filadelfia[columns_to_scale])

    return df_business_filadelfia

In [28]:
def features_user(df_user_filadelfia):


    yelping_since = pd.to_datetime(df_user_filadelfia['yelping_since'])

    # Calcule o número de anos no Yelp
    current_year = datetime.now().year
    df_user_filadelfia['yelping_since'] = current_year - yelping_since.dt.year


    df_user_filadelfia.columns = ['user_id', 'name', 'review_count_user', 'yelping_years', 'elite', 'average_stars']
    df_user_filadelfia=df_user_filadelfia.drop(['elite','name'],axis=1)
    # df_user_filadelfia=df_user_filadelfia.drop('yelping_since',axis=1)

    columns_to_scale = [col for col in df_user_filadelfia.columns if col != 'user_id']

    print(columns_to_scale)

    scaler = MinMaxScaler()
    df_user_filadelfia[columns_to_scale] = scaler.fit_transform(df_user_filadelfia[columns_to_scale])

    return df_user_filadelfia

In [12]:
def profiling(X_train, method, n_components, users_with_5_plus_reviews, restaurants_with_5_plus_reviews):
    
    # this outputs the topic matrix according to the method chosen (bag-of-word, lsa, lda and doc2vec) 
    topic_matrix, n_components = feature_engineering(X_train, method, n_components)

    # attach topic matrix to the dataset
    column_names = ['comp_{}'.format(i+1) for i in range(n_components)]
    topics = pd.DataFrame(topic_matrix, columns=column_names)

    # create profiles
    user_profile = pd.concat([X_train, topics], axis=1).drop(columns=['business_id', 'text']).groupby("user_id", as_index=False)[column_names].mean()
    restaurant_profile = pd.concat([X_train, topics], axis=1).drop(columns=['user_id', 'text']).groupby("business_id", as_index=False)[column_names].mean()

    # filtering for the ones with 5+ reviews (more representative)
    user_profile = user_profile[user_profile['user_id'].isin(users_with_5_plus_reviews)].reset_index(drop=True)
    restaurant_profile = restaurant_profile[restaurant_profile['business_id'].isin(restaurants_with_5_plus_reviews)].reset_index(drop=True)

    return user_profile, restaurant_profile

In [13]:
#Divisão teste e treino
#por enquanto está assim mas depois temos de definir como vamos querer dividir 
def split_data(final_data,business_data,users_data):

    general_trainset, general_testset = train_test_split(final_data, test_size=0.20, random_state=42)

    # Criar users_trainset e users_testset
    users_trainset = users_data[users_data['user_id'].isin(general_trainset['user_id'])]
    users_testset = users_data[users_data['user_id'].isin(general_testset['user_id'])]

    # Criar business_trainset e business_testset
    business_trainset = business_data[business_data['business_id'].isin(general_trainset['business_id'])]
    business_testset = business_data[business_data['business_id'].isin(general_testset['business_id'])]

    # Verificar os tamanhos dos conjuntos
    print("Tamanho do trainset:", len(general_trainset))
    print("Tamanho do testset:", len(general_testset))
    print("Tamanho do users_trainset:", len(users_trainset))
    print("Tamanho do users_testset:", len(users_testset))
    print("Tamanho do business_trainset:", len(business_trainset))
    print("Tamanho do business_testset:", len(business_testset))

    return general_trainset, general_testset,users_trainset, users_testset,business_trainset, business_testset

In [14]:
def train_test_split(df_review_filadelfia):
    # Desired dataset shape
    #data = df_review_filadelfia_profiles[['user_id', 'business_id', 'text', 'stars']] # only positive reviews
    data = df_review_filadelfia[['user_id', 'business_id', 'text', 'stars']]

    #data_sample = data.sample(100000, random_state=10).reset_index(drop=True)

    # train-test split
    X_train, X_test, y_train, y_test = skl_train_test_split(data[['user_id', 'business_id', 'text']], data['stars'], test_size=0.2, random_state=1)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    return X_train, X_test, y_train, y_test

In [15]:
def recommend_similar_restaurants(restaurant_id, philly_restaurants, similarity_matrix, n=5):
    # Obter o índice do restaurante
    idx = philly_restaurants.index[philly_restaurants['business_id'] == restaurant_id].tolist()[0]
    
    # Obter similaridade do restaurante com todos os outros
    similars_indices = similarity_matrix[idx].argsort()[::-1]  # Do mais similar para o menos similar
    
    # Excluir o próprio restaurante da recomendação
    similars_indices = similars_indices[similars_indices != idx]
    
    # Selecionar os n mais similares
    similars_restaurants = philly_restaurants.iloc[similars_indices[:n]]
    
    return similars_restaurants[['business_id', 'name', 'categories', 'stars']]

In [16]:
# Função para recomendar com base nos restaurantes que o usuário já avaliou bem
def recommend_for_user(user_id, philly_restaurants, algo, similarity_matrix, n=5):
    # Obter os restaurantes avaliados pelo usuário
    user_reviews = ratings[ratings['user_id'] == user_id]
    highly_rated = user_reviews[user_reviews['stars'] >= 4]['business_id']
    
    # Para cada restaurante que o usuário gostou, recomendar restaurantes similares
    recommendations = pd.DataFrame()
    # print('highly rated ',highly_rated)

    for restaurant_id in highly_rated:
        try:
            # Obter o índice do restaurante
            inner_id = algo.trainset.to_inner_iid(restaurant_id)
            
            # Obter os restaurantes mais similares usando o modelo treinado
            neighbors = algo.get_neighbors(inner_id, k=n)
            # print('neighbors ',neighbors)
            # Converter os índices internos para IDs de restaurantes
            similar_restaurant_ids_knn = [algo.trainset.to_raw_iid(inner_id) for inner_id in neighbors]
            
            # Obter os detalhes dos restaurantes similares usando o modelo treinado
            similar_restaurants_knn = philly_restaurants[philly_restaurants['business_id'].isin(similar_restaurant_ids_knn)]
            # print('similar_restaurants_knn ',similar_restaurants_knn)
        except ValueError:
            # Se o restaurante não estiver no conjunto de treino, retornar um DataFrame vazio
            similar_restaurants_knn = pd.DataFrame()
        
        # Obter os detalhes dos restaurantes similares usando a matriz de similaridade
        similar_restaurants_matrix = recommend_similar_restaurants(restaurant_id, philly_restaurants, similarity_matrix, n)
        # print('similar_restaurants_matrix ',similar_restaurants_matrix)
        # Combinar as recomendações de ambos os métodos
        combined_recommendations = pd.concat([similar_restaurants_knn, similar_restaurants_matrix]).drop_duplicates(subset='business_id')
        
        recommendations = pd.concat([recommendations, combined_recommendations])
        # print('recommendations ',recommendations)
    # Remover duplicatas e ordenar por popularidade (opcional: você pode melhorar o critério de ordenação)
    recommendations = recommendations.drop_duplicates(subset='name').sort_values(by='stars', ascending=False)
    # print('recommendations ',recommendations)
    return recommendations['name'].head(n)



In [17]:
def get_top_n_similar_users(user_id, n,similarity_matrix,user_trainset):
    if user_id in similarity_matrix.index:
        similar_users = similarity_matrix[user_id].sort_values(ascending=False).index[1:n+1]
    else:
        from sklearn.neighbors import NearestNeighbors
        knn = NearestNeighbors(n_neighbors=n, metric='cosine')
        knn.fit(user_trainset)
        distances, indices = knn.kneighbors(user_trainset.loc[user_id].values.reshape(1, -1), n_neighbors=n+1)
        similar_users = user_trainset.index[indices.flatten()][1:]
    return similar_users


In [18]:
def get_recommended_restaurants(user_id, similar_users, n,user_trainset):
    target_user_ratings = user_trainset.loc[user_id]
    target_user_visited = target_user_ratings[target_user_ratings > 0].index

    similar_users_ratings = user_trainset.loc[similar_users]
    similar_users_ratings = similar_users_ratings.drop(columns=target_user_visited, errors='ignore')

    top_rated_restaurants = similar_users_ratings.mean().sort_values(ascending=False).head(n)
    return top_rated_restaurants


In [19]:
# Recommendation system

def recommend(user_id, restaurant_id, df_review_filadelfia, user_profile, restaurant_profile, type='UIBH'):
    
    '''
    Esstimates rating a user gives to a restaurant

    Inputs:
    user_id - the user
    restaurant_id - the restaurant to be rated
    df_review_filadelfia - original df with no filtering regarding the review being positive or not
    user_profile - df with the profiles of the users (vectors from LSA/LDA/doc2vec)
    restaurant_profile - df with the profiles of the restaurants (vectors from LSA/LDA/doc2vec)
    type - type of recommendations (user_item, users or items) (default = "user_item")

    Outputs:
    Rating
    '''

    try:

        # Extracting the vector relative to the user and removing the user from the profiles
        usr_lst = user_profile[user_profile['user_id'] == user_id].drop(columns=['user_id']).fillna(0).values
        user_profile_function = user_profile[user_profile['user_id'] != user_id].reset_index(drop=True).fillna(0)
        
        # Extracting the vector relative to the restaurant and removing the restaurant from the profiles
        bus_lst = restaurant_profile[restaurant_profile['business_id'] == restaurant_id].drop(columns=['business_id']).fillna(0).values
        restaurant_profile_function = restaurant_profile[restaurant_profile['business_id'] != restaurant_id].reset_index(drop=True).fillna(0)


        if type == 'UIBH':
            # Measures the similarity between user and restaurant
            # Rating is a linear function of the similarity

            # Removing the added feature so that the vectors have the same dimensions
            usr_lst = usr_lst[:,:-3]
            bus_lst = bus_lst[:,:-29]

            similarity_score = cosine_similarity(usr_lst, bus_lst)
            rating = map_rating(similarity_score[0][0])
            
        elif type == 'UBH':
            # Measures the similarity between the user and the other users that rated the restaurant
            # Rating is a weighted average of the ratings given by the users (weighted by the similarity)

            # Getting the other users that rated the restaurant and removing user
            users = df_review_filadelfia[df_review_filadelfia['business_id'] == restaurant_id]['user_id']
            users = users[users != user_id]

            # Due to considering only the positive reviews for the profiles, some users don't have profile
            # This is to remove them from the users list
            users = users[users.isin(user_profile_function['user_id'])].unique() # and remove duplicates

            # Getting the ratings given by the users and averaging if there are more than one
            users_ratings = df_review_filadelfia[df_review_filadelfia['user_id'].isin(users)][['user_id', 'stars']]
            users_ratings = users_ratings.groupby('user_id').mean().reset_index()

            # Creating a matrix for the similar users
            user_matrix = user_profile_function[user_profile_function['user_id'].isin(users)].drop(columns=['user_id']).values

            # Similarities between user and users
            similarity_scores = cosine_similarity(usr_lst, user_matrix)

            # Transform similarities into weights
            # This assumes that there will be other similar users.
            # If all the other users are not similar we are giving high weights to "not similar" users due to this rescaling
            similarity_scores = (similarity_scores+1)/2
            weights = similarity_scores / np.sum(similarity_scores, axis=1)

            # Computing weight-averaged rating
            rating = np.dot(weights[0], users_ratings['stars'])

        elif type == 'IBH':
            # Measures the similarity between the restaurant and other 
            # Recommends the restaurants that are most similar to the ones the user liked before

            # Getting the other restaurants the user rated and removing the restaurant
            restaurants = df_review_filadelfia[df_review_filadelfia['user_id'] == user_id]['business_id']
            restaurants = restaurants[restaurants != restaurant_id]

            # Due to considering only the positive reviews for the profiles, some restaurants don't have profile
            # This is to remove them from the restaurants list
            restaurants = restaurants[restaurants.isin(restaurant_profile_function['business_id'])].unique() # and remove duplicates

            # Getting the ratings given by the user and averaging if there are more than one
            user_ratings = df_review_filadelfia[df_review_filadelfia['user_id'] == user_id][['business_id', 'stars']]
            user_ratings = user_ratings[user_ratings['business_id'].isin(restaurants)].reset_index(drop=True)
            user_ratings = user_ratings.groupby('business_id').mean().reset_index()

            # Creating a matrix for the similar restaurants
            restaurant_matrix = restaurant_profile_function[restaurant_profile_function['business_id'].isin(restaurants)].drop(columns=['business_id']).fillna(0).values

            # Similarities between the restaurant and the other restaurants
            similarity_scores = cosine_similarity(bus_lst, restaurant_matrix)

            # Transform similarities into weights
            # This assumes that there will be other similar restaurants.
            # If all the other restaurants are not similar we are giving high weights to "not similar" restaurants due to this rescaling
            similarity_scores = (similarity_scores+1)/2
            weights = similarity_scores / np.sum(similarity_scores, axis=1)

            # Computing weight-averaged rating
            rating = np.dot(weights[0], user_ratings['stars'])
        
        else:
            raise ValueError("Invalid type. Please choose 'UIBH', 'UBH', or 'IBH'.")
        
        return rating
    
    except:
        return np.nan

In [20]:
def precision_at_k(ground_truth, recommendations, k):

    #top k predicted restaurants
    top_k = recommendations[:k]
    
    # number of relevant items in the top-k predictions
    relevant = sum([1 for i in top_k if i in list(ground_truth)])

    return relevant / k


def recall_at_k(ground_truth, recommendations, k):

    #top k predicted restaurants
    top_k = recommendations[:k]
    
    # Count the number of relevant items in the top-k predictions
    relevant = sum([1 for i in top_k if i in list(ground_truth)])
    
    #number of relevant items
    relevant_total = len(ground_truth)
    
    return relevant / relevant_total if relevant_total > 0 else 0

def calculate_precision_recall(recommendations, ground_truth, k):
    precision_results = []
    recall_results = []

    grouped_recommendations = recommendations.groupby('user_id')
    grouped_ground_truth = ground_truth.groupby('user_id')
    
    for user_id, rec_group in grouped_recommendations:

        predicted_order = rec_group['business_id'].tolist()
        user_truth = grouped_ground_truth.get_group(user_id)['business_id']
        user_truth_stars = grouped_ground_truth.get_group(user_id)['stars']
        user_truth = pd.concat([user_truth, user_truth_stars], axis=1)
        user_truth = user_truth[user_truth['stars'] > 3].reset_index(drop=True)
        user_truth = user_truth['business_id'].tolist()

        precision_k = precision_at_k(user_truth, predicted_order, k)
        recall_k = recall_at_k(user_truth, predicted_order, k)
        
        precision_results.append(precision_k)
        recall_results.append(recall_k)    

    return np.mean(precision_results), np.mean(recall_results)

In [21]:
def test_evaluate(X_test, y_test, df_review_filadelfia, user_profile, restaurant_profile, method):
    # predicting on the test set using the recommendations function
    X_test['star_pred'] = X_test.apply(lambda row: recommend(row['user_id'], row['business_id'], df_review_filadelfia, user_profile, restaurant_profile, type=method), axis=1)
    y = pd.concat([X_test['user_id'], X_test['business_id'], X_test['star_pred'], y_test], axis=1).dropna()

    # Filtering the users with at least x reviews
    user_counts = y['user_id'].value_counts()

    # for metrics @3 or @5, users should have at least 10 reviews
    users_with_10_plus_reviews = user_counts[user_counts >= 10].index
    y_filtered = y[(y['user_id'].isin(users_with_10_plus_reviews))].reset_index(drop=True)

    recommendations_3_5 = y_filtered.sort_values(by=['user_id', 'star_pred'], ascending=[True, False]).reset_index(drop=True)
    ground_truth_3_5 = y_filtered.sort_values(by=['user_id', 'stars'], ascending=[True, False]).reset_index(drop=True)
    # ground_truth_3_5 = ground_truth_3_5[ground_truth_3_5['stars'] > 3].reset_index(drop=True)

    # for metrics @10, users should have at least 20 reviews
    users_with_20_plus_reviews = user_counts[user_counts >= 20].index
    y_filtered_2 = y[(y['user_id'].isin(users_with_20_plus_reviews))].reset_index(drop=True)

    recommendations_10 = y_filtered_2.sort_values(by=['user_id', 'star_pred'], ascending=[True, False]).reset_index(drop=True)
    ground_truth_10 = y_filtered_2.sort_values(by=['user_id', 'stars'], ascending=[True, False]).reset_index(drop=True)
    # ground_truth_10 = ground_truth_10[ground_truth_10['stars'] > 3].reset_index(drop=True)

    rmse = np.sqrt(mean_squared_error(y['star_pred'], y['stars']))
    precision_3, recall_3 = calculate_precision_recall(recommendations_3_5, ground_truth_3_5, 3)
    precision_5, recall_5 = calculate_precision_recall(recommendations_3_5, ground_truth_3_5, 5)
    precision_10, recall_10 = calculate_precision_recall(recommendations_10, ground_truth_10, 10)

    return rmse, precision_3, recall_3, precision_5, recall_5, precision_10, recall_10

In [22]:
def normalize(column):
    return (column - column.min()) / (column.max() - column.min())

206 >= 20 reviews e 753 >= 10 reviews

In [23]:
methods_pre_processing = ['with lemma', 'with stemma']
# methods_feature_engineering = ['bag of words', 'word embeddings', 'lda']
methods_feature_engineering = ['lda', 'lsa', 'doc2vec']
# add_features_decision = ['yes','no']
algorithms_cf = ['CF-UB','CF-IB'] #CF-IB(Colaborative Filtering Item Based),CF-UB(Colaborative Filtering User Based)
algorithms_content = ['UBH','IBH','UIBH'] #UBH(User Based Hybrid), IBH(Item Based Hybrid), UIBH(User Item Based Hybrid)

In [25]:
#Main J

def main():

    metrics = pd.DataFrame(columns=['Pre-processing', 'Feature Engineering', 'Algorithm', 'RMSE', 'Precision@3', 'Recall@3', 'Precision@5', 'Recall@5', 'Precision@10', 'Recall@10'])

    # Load the dataset
    df_business_filadelfia,df_review_filadelfia,df_user_filadelfia, users_with_5_plus_reviews, restaurants_with_5_plus_reviews = load_data()

    df_review_filadelfia = df_review_filadelfia.sample(100000, random_state=10).reset_index(drop=True) # TO TEST AND DELETE AFTER


    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(df_review_filadelfia)

    users_train = X_train['user_id']

    businesses_train = X_train['business_id']

    df_user_filadelfia_treino = df_user_filadelfia[df_user_filadelfia['user_id'].isin(users_train)]

    df_business_filadelfia_treino = df_business_filadelfia[df_business_filadelfia['business_id'].isin(businesses_train)]

    df_features_user = features_user(df_user_filadelfia_treino)
    df_features_business = features_business(df_business_filadelfia_treino)

    # Collaborative Filtering (ACHO QUE O SPLIT NÃO VAI FUNCIONAR PARA AQUI - ACHO QUE VALE A PENA TER UMA MAIN SÓ PARA O CF)
    # for a in algorithms_cf:
    #     if a == 'CF-UB':
    #         algo = KNNBasic(sim_options={'user_based': True})
    #         algo.fit(user_trainset)
    #         predictions = algo.test(user_testset)

    #     else:

    #         algo = KNNBasic(sim_options={'user_based': False})
    #         algo.fit(business_trainset)
    #         predictions = algo.test(business_testset)  

    # Content-based / Hybrid
    for a in methods_pre_processing:
        X_train_pre_processed = pre_processing(X_train, a)

        for b in methods_feature_engineering:

            if b == 'doc2vec':
                n_components = 100
            else:
                n_components = 8

            user_profile, restaurant_profile = profiling(X_train_pre_processed, b, n_components, users_with_5_plus_reviews, restaurants_with_5_plus_reviews)

            user_profile=user_profile.merge(df_features_user,on='user_id')
            restaurant_profile=restaurant_profile.merge(df_features_business,on='business_id')

            print(X_test)

            for c in algorithms_content:
                print(a,b,c)
                # return(X_test, y_test, user_profile, restaurant_profile, df_review_filadelfia)
                rmse, precision_3, recall_3, precision_5, recall_5, precision_10, recall_10 = test_evaluate(X_test, y_test, df_review_filadelfia, user_profile, restaurant_profile, c)
                # print(f'{a} - {b} - {c} - {rmse} - {precision_3} - {recall_3} - {precision_5} - {recall_5} - {precision_10} - {recall_10}')
                metrics = metrics.append({'Pre-processing': a, 'Feature Engineering': b, 'Algorithm': c, 'RMSE': rmse, 'Precision@3':precision_3, 'Recall@3':recall_3, 'Precision@5':precision_5, 'Recall@5':recall_5, 'Precision@10':precision_10, 'Recall@10':recall_10}, ignore_index=True)
                metrics.to_csv('metrics.csv')

    return metrics

In [31]:
metrics_final = main()
metrics_final.to_csv('metrics_final.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_filadelfia['yelping_since'] = current_year - yelping_since.dt.year


['review_count_user', 'yelping_years', 'average_stars']
['stars', 'review_count', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Restaurants', 'Food', 'Nightlife', 'Bars', 'Sandwiches', 'American (New)', 'Pizza', 'Breakfast & Brunch', 'American (Traditional)', 'Coffee & Tea', 'Restaurantes', 'RestaurantsTakeOut', 'BusinessAcceptsCreditCards', 'RestaurantsDelivery', 'RestaurantsAttire_casual', 'HasTV', 'RestaurantsGoodForGroups', 'BikeParking', 'BusinessParking_street', 'GoodForKids']
                      user_id  review_count_user  yelping_years  average_stars
4      NIhcRW6DWvk1JQhDhXwgOQ           0.130646       0.941176       0.488281
6      AkBtT43dYcttxQ3qOzPBAg           0.058854       0.823529       0.546875
7      RDTVzWPoCeGaUujrHIWRBQ           0.008530       0.647059       0.652344
8      IpLRJY4CP3fXtlEd8Y4GFQ           0.029312       0.705882       0.199219
9      RgDVC3ZUBqpEe6Y1kPhIpw           0.073052       0.705882       0.675781
...  

KeyboardInterrupt: 