In [22]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from surprise import Dataset, Reader, KNNBasic
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from surprise.model_selection import train_test_split


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\helen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#Data

df_business = pd.read_csv('yelp_academic_dataset_business.csv')
df_checkin = pd.read_csv('yelp_academic_dataset_checkin.csv')
df_review=pd.read_csv('yelp_academic_dataset_review.csv')
df_tip=pd.read_csv('yelp_academic_dataset_tip.csv')
df_user=pd.read_csv('yelp_academic_dataset_user.csv')

print(df_business.columns)
print(df_checkin.columns)
print(df_review.columns)
print(df_tip.columns)
print(df_user.columns)

  df_user=pd.read_csv('yelp_academic_dataset_user.csv')


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')
Index(['business_id', 'date'], dtype='object')
Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
Index(['user_id', 'business_id', 'text', 'date', 'compliment_count'], dtype='object')
Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')


In [None]:
#Subset de dados

df_business_filadelfia = df_business[(df_business['city'] == 'Philadelphia') & (df_business['categories'].str.contains('Restaurants', na=False)) & (df_business['is_open']==1)].reset_index(drop=True)
df_business_filadelfia=df_business_filadelfia[['business_id', 'name', 'stars', 'review_count', 'attributes', 'categories', 'hours']]

df_review_filadelfia = df_review[df_review['business_id'].isin(df_business_filadelfia['business_id'])]
df_review_filadelfia=df_review_filadelfia[['review_id', 'user_id', 'business_id', 'stars', 'text', 'date']]

df_user_filadelfia = df_user[df_user['user_id'].isin(df_review_filadelfia['user_id'])]
df_user_filadelfia=df_user_filadelfia[['user_id', 'name', 'review_count', 'yelping_since', 'elite', 'average_stars']]

In [16]:
#Load data

def load_data():
    # df_business = pd.read_csv('yelp_academic_dataset_business.csv')
    # df_review=pd.read_csv('yelp_academic_dataset_review.csv')
    # df_user=pd.read_csv('yelp_academic_dataset_user.csv')
    
    df_business_filadelfia = df_business[(df_business['city'] == 'Philadelphia') & (df_business['categories'].str.contains('Restaurants', na=False)) & (df_business['is_open']==1)].reset_index(drop=True)
    df_business_filadelfia=df_business_filadelfia[['business_id', 'name', 'stars', 'review_count', 'attributes', 'categories', 'hours']]

    df_review_filadelfia = df_review[df_review['business_id'].isin(df_business_filadelfia['business_id'])]
    df_review_filadelfia=df_review_filadelfia[['review_id', 'user_id', 'business_id', 'stars', 'text', 'date']]

    df_user_filadelfia = df_user[df_user['user_id'].isin(df_review_filadelfia['user_id'])]
    df_user_filadelfia=df_user_filadelfia[['user_id', 'name', 'review_count', 'yelping_since', 'elite', 'average_stars']]

    return df_business_filadelfia,df_review_filadelfia,df_user_filadelfia

In [27]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemmatized_words)

In [28]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed_words)


In [29]:
#Pre-processamento
def pre_processing(data_review,method):
    if method == 'with lemma':

        data_review['text'] = data_review['text'].apply(lemmatize_text)

    
    else: 
        data_review['text'] = data_review['text'].apply(stem_text)

    return data_review


In [54]:
#Feature Engineering
def feature_engineering(data_reviews, method):
    if method == 'bag of words':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        matrix = vectorizer.fit_transform(data_reviews['text'])
        matrix = matrix.toarray()
        feature_names = vectorizer.get_feature_names_out()
        df = pd.DataFrame(matrix, columns=feature_names)

    # elif method=='word embeddings':
    
    elif method =='lda':
        count_vectorizer = CountVectorizer(max_df=0.95, min_df=2)
        count_matrix = count_vectorizer.fit_transform(data_reviews['text'])

        lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
        matrix = lda_model.fit_transform(count_matrix)
        df = pd.DataFrame(matrix, columns=[f'Topic_{i}' for i in range(lda_model.n_components)])

    return df


In [None]:
def add_features(data_business,data_user,data_review):
    return business_data,users_data

In [11]:
#Divisão teste e treino
#por enquanto está assim mas depois temos de definir como vamos querer dividir 
def split_data(final_data,business_data,users_data):

    general_trainset, general_testset = train_test_split(final_data, test_size=0.20, random_state=42)

    # Criar users_trainset e users_testset
    users_trainset = users_data[users_data['user_id'].isin(general_trainset['user_id'])]
    users_testset = users_data[users_data['user_id'].isin(general_testset['user_id'])]

    # Criar business_trainset e business_testset
    business_trainset = business_data[business_data['business_id'].isin(general_trainset['business_id'])]
    business_testset = business_data[business_data['business_id'].isin(general_testset['business_id'])]

    # Verificar os tamanhos dos conjuntos
    print("Tamanho do trainset:", len(general_trainset))
    print("Tamanho do testset:", len(general_testset))
    print("Tamanho do users_trainset:", len(users_trainset))
    print("Tamanho do users_testset:", len(users_testset))
    print("Tamanho do business_trainset:", len(business_trainset))
    print("Tamanho do business_testset:", len(business_testset))

    return general_trainset, general_testset,users_trainset, users_testset,business_trainset, business_testset

In [None]:
def recommend_similar_restaurants(restaurant_id, philly_restaurants, similarity_matrix, n=5):
    # Obter o índice do restaurante
    idx = philly_restaurants.index[philly_restaurants['business_id'] == restaurant_id].tolist()[0]
    
    # Obter similaridade do restaurante com todos os outros
    similars_indices = similarity_matrix[idx].argsort()[::-1]  # Do mais similar para o menos similar
    
    # Excluir o próprio restaurante da recomendação
    similars_indices = similars_indices[similars_indices != idx]
    
    # Selecionar os n mais similares
    similars_restaurants = philly_restaurants.iloc[similars_indices[:n]]
    
    return similars_restaurants[['business_id', 'name', 'categories', 'stars']]

In [None]:
# Função para recomendar com base nos restaurantes que o usuário já avaliou bem
def recommend_for_user(user_id, philly_restaurants, algo, similarity_matrix, n=5):
    # Obter os restaurantes avaliados pelo usuário
    user_reviews = ratings[ratings['user_id'] == user_id]
    highly_rated = user_reviews[user_reviews['stars'] >= 4]['business_id']
    
    # Para cada restaurante que o usuário gostou, recomendar restaurantes similares
    recommendations = pd.DataFrame()
    # print('highly rated ',highly_rated)

    for restaurant_id in highly_rated:
        try:
            # Obter o índice do restaurante
            inner_id = algo.trainset.to_inner_iid(restaurant_id)
            
            # Obter os restaurantes mais similares usando o modelo treinado
            neighbors = algo.get_neighbors(inner_id, k=n)
            # print('neighbors ',neighbors)
            # Converter os índices internos para IDs de restaurantes
            similar_restaurant_ids_knn = [algo.trainset.to_raw_iid(inner_id) for inner_id in neighbors]
            
            # Obter os detalhes dos restaurantes similares usando o modelo treinado
            similar_restaurants_knn = philly_restaurants[philly_restaurants['business_id'].isin(similar_restaurant_ids_knn)]
            # print('similar_restaurants_knn ',similar_restaurants_knn)
        except ValueError:
            # Se o restaurante não estiver no conjunto de treino, retornar um DataFrame vazio
            similar_restaurants_knn = pd.DataFrame()
        
        # Obter os detalhes dos restaurantes similares usando a matriz de similaridade
        similar_restaurants_matrix = recommend_similar_restaurants(restaurant_id, philly_restaurants, similarity_matrix, n)
        # print('similar_restaurants_matrix ',similar_restaurants_matrix)
        # Combinar as recomendações de ambos os métodos
        combined_recommendations = pd.concat([similar_restaurants_knn, similar_restaurants_matrix]).drop_duplicates(subset='business_id')
        
        recommendations = pd.concat([recommendations, combined_recommendations])
        # print('recommendations ',recommendations)
    # Remover duplicatas e ordenar por popularidade (opcional: você pode melhorar o critério de ordenação)
    recommendations = recommendations.drop_duplicates(subset='name').sort_values(by='stars', ascending=False)
    # print('recommendations ',recommendations)
    return recommendations['name'].head(n)



In [12]:
def get_top_n_similar_users(user_id, n,similarity_matrix,user_trainset):
    if user_id in similarity_matrix.index:
        similar_users = similarity_matrix[user_id].sort_values(ascending=False).index[1:n+1]
    else:
        from sklearn.neighbors import NearestNeighbors
        knn = NearestNeighbors(n_neighbors=n, metric='cosine')
        knn.fit(user_trainset)
        distances, indices = knn.kneighbors(user_trainset.loc[user_id].values.reshape(1, -1), n_neighbors=n+1)
        similar_users = user_trainset.index[indices.flatten()][1:]
    return similar_users


In [13]:
def get_recommended_restaurants(user_id, similar_users, n,user_trainset):
    target_user_ratings = user_trainset.loc[user_id]
    target_user_visited = target_user_ratings[target_user_ratings > 0].index

    similar_users_ratings = user_trainset.loc[similar_users]
    similar_users_ratings = similar_users_ratings.drop(columns=target_user_visited, errors='ignore')

    top_rated_restaurants = similar_users_ratings.mean().sort_values(ascending=False).head(n)
    return top_rated_restaurants


In [None]:
#Avaliação

In [43]:
methods_pre_processing = ['with lemma','with stemma']
# methods_feature_engineering = ['bag of words', 'word embeddings', 'lda']
methods_feature_engineering = ['bag of words','lda']
# add_features_decision = ['yes','no']
add_features_decision = ['no'] #enquanto não criar a função
algorithms = ['CF-UB','CF-IB','UBH','IBH','UIBH'] #CF-IB(Colaborative Filtering Item Based),CF-UB(Colaborative Filtering User Based), UBH(User Based Hybrid), IBH(Item Based Hybrid), UIBH(User Item Based Hybrid)

In [55]:
#Main
feature_engineering_method = []
algoritmo=[]
accuracy = []
precision = []
f1_score = []
recall = []

def main():
    df_business_filadelfia,df_review_filadelfia,df_user_filadelfia = load_data()

    df_business_filadelfia=df_business_filadelfia.sample(1000)
    df_review_filadelfia=df_review_filadelfia.sample(1000)
    df_user_filadelfia=df_user_filadelfia.sample(1000)

    for a in methods_pre_processing:
        df_review_filadelfia = pre_processing(df_review_filadelfia,a)
        # print(df_business_filadelfia)
        # print(df_review_filadelfia)
        # print(df_user_filadelfia)

        for b in methods_feature_engineering:
            feature_engineering_method.append(b)
            final_data = feature_engineering(df_review_filadelfia,b)
            print(final_data)
            for c in add_features_decision:
                print(a,b,c)
                if c == 'yes':
                    business_data,users_data = add_features(final_data) #adicionamos as repetivas features a cada matriz
                
                else:
                    business_data=final_data
                    users_data=final_data
                
                general_trainset, general_testset,user_trainset, user_testset,business_trainset, business_testset = split_data(final_data,business_data,users_data)

                print('general_trainset')
                print(general_trainset)

                print('general_testset')
                print(general_testset)

                print('user_trainset')
                print(user_trainset)

                print('user_testset')
                print(user_testset)

                print('business_trainset')
                print(business_trainset)

                print('business_testset')
                print(business_testset)

    #             for d in algorithms:
    #                 algoritmo.append(d)

    #                 if d == 'CF-UB':
    #                     algo = KNNBasic(sim_options={'user_based': True})
    #                     algo.fit(user_trainset)
    #                     predictions = algo.test(user_testset)

    #                 elif d == 'CF-IB':
    #                     algo = KNNBasic(sim_options={'user_based': False})
    #                     algo.fit(business_trainset)
    #                     predictions = algo.test(business_testset)

    #                 elif d == 'UBH':
    #                     similarity_matrix = cosine_similarity(user_trainset)
    #                     similarity_matrix = pd.DataFrame(similarity_matrix, index=user_trainset.index, columns=user_trainset.index)

    #                     for user_id in user_testset['user_id']:
    #                         similar_users = get_top_n_similar_users(user_id, n=5)
    #                         recommended_restaurants = get_recommended_restaurants(user_id, similar_users, n_restaurants=5)
    #                         best_restaurant = recommended_restaurants.idxmax()


    #                 elif d == 'IBH':
    #                     for business_id in business_testset['business_id']:
    #                         similarity_matrix = cosine_similarity(business_trainset)
    #                         similarity_matrix = pd.DataFrame(similarity_matrix, index=business_trainset.index, columns=business_trainset.index)

    #                         recommendations = recommend_for_user(user_id, philly_restaurants, algo, similarity_matrix, n=5)


    #                 elif d == 'UIBH':
    #                     similarity_matrix = cosine_similarity(user_trainset,business_trainset)
    #                     similarity_matrix = pd.DataFrame(similarity_matrix, index=user_trainset.index, columns=business_trainset.index)


In [56]:
main()

     00  10  100  10am  10pm  11  12  13  14  15  ...  your  yourself  \
0     0   0    0     0     0   0   0   0   0   0  ...     0         0   
1     0   0    0     0     0   0   0   0   0   0  ...     0         0   
2     0   0    0     0     0   0   0   0   0   0  ...     0         0   
3     0   0    0     0     0   0   0   0   0   0  ...     0         0   
4     0   0    0     0     0   0   0   0   0   0  ...     0         0   
..   ..  ..  ...   ...   ...  ..  ..  ..  ..  ..  ...   ...       ...   
995   0   0    0     0     0   0   0   0   0   0  ...     0         0   
996   0   0    0     0     0   0   0   0   0   0  ...     0         0   
997   0   0    0     0     0   0   0   0   0   0  ...     0         0   
998   0   0    0     0     0   0   0   0   0   0  ...     0         0   
999   0   1    0     0     0   0   0   0   0   0  ...     0         0   

     youtube  yuck  yum  yummm  yummy  yuppie  zahav  zero  
0          0     0    0      0      0       0      0     0  
1

AttributeError: 'DataFrame' object has no attribute 'raw_ratings'