In [172]:
import string

import pandas as pd
import numpy as np
from nltk import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

### Tratamento dos dados

In [173]:
# Load the movie catalogue from the project's data folder and preview the first rows.
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [174]:
# Load the movie overview texts and preview the first rows.
overview_df = pd.read_csv('../data/overviews.csv')
overview_df.head()

Unnamed: 0,movieId,tmdbId,overview
0,1,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,8844,When siblings Judy and Peter discover an encha...
2,3,15602,A family wedding reignites the ancient feud be...
3,4,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,11862,Just when George Banks has recovered from his ...


In [175]:
# Load the user ratings without the `timestamp` column, then preview the first rows.
ratings_df = pd.read_csv('../data/ratings.csv').drop(columns=['timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [176]:
ratings_df.shape

(100836, 3)

In [177]:
movies_df.shape

(9742, 3)

In [178]:
overview_df.shape

(9742, 3)

In [179]:
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [180]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [181]:
overview_df.isna().sum()

movieId       0
tmdbId        0
overview    117
dtype: int64

In [182]:
# Ids of the movies whose overview is missing, collected before those rows are dropped.
list_movies_missing_overview = overview_df.loc[overview_df['overview'].isna(), 'movieId'].tolist()

In [183]:
# Drop, in place, every row that contains a missing value, then confirm no nulls remain.
# NOTE: this mutates `overview_df`, so any cell that inspects its missing rows must run before this one.
overview_df.dropna(inplace=True)
overview_df.isna().sum()

movieId     0
tmdbId      0
overview    0
dtype: int64

In [184]:
# Remove the ratings of the movies that were dropped for lacking an overview.
# `reset_index(drop=True)` renumbers the rows from 0 without first materialising the
# old index as an `index` column that then has to be deleted again.

ratings_df = ratings_df[~ratings_df.movieId.isin(list_movies_missing_overview)].reset_index(drop=True)

In [185]:
# Remove from the movie table the movies that were dropped for lacking an overview

movies_df = movies_df.loc[~movies_df['movieId'].isin(list_movies_missing_overview)]

In [186]:
movies_df.shape

(9625, 3)

In [187]:
overview_df.shape

(9625, 3)

In [188]:
ratings_df.shape

(100525, 3)

In [189]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100520,610,166534,4.0
100521,610,168248,5.0
100522,610,168250,5.0
100523,610,168252,5.0


In [190]:
# Attach the overview columns to the movie table, matching rows on `movieId`.
# The left join keeps every row of `movies_df`; the result is indexed by `movieId`.
experiment_df = movies_df.set_index('movieId').join(overview_df.set_index('movieId'), how='left')

In [191]:
# Turn `movieId` back into a regular column (in place) and preview the joined table.
experiment_df.reset_index(inplace=True)
experiment_df.head()

Unnamed: 0,movieId,title,genres,tmdbId,overview
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men (1995),Comedy|Romance,15602,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II (1995),Comedy,11862,Just when George Banks has recovered from his ...


In [192]:
# Replace cells whose value is exactly '(no genres listed)' with an empty string, in place.
# The replacement is applied to the whole frame, not only to the `genres` column.
experiment_df.replace('(no genres listed)', '', inplace=True)

In [193]:
# Turn the pipe-separated genre string into a list of lower-case genre names
experiment_df['genres'] = experiment_df['genres'].str.lower().str.split('|')

In [194]:
# Discard the `tmdbId` column and index the frame by `movieId`, both in place.
# NOTE: re-running this cell fails, because `tmdbId` and the `movieId` column no longer exist afterwards.
experiment_df.drop(columns=['tmdbId'], inplace=True)
experiment_df.set_index('movieId', inplace=True)

In [195]:
experiment_df.head()

Unnamed: 0_level_0,title,genres,overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]","Led by Woody, Andy's toys live happily in his ..."
2,Jumanji (1995),"[adventure, children, fantasy]",When siblings Judy and Peter discover an encha...
3,Grumpier Old Men (1995),"[comedy, romance]",A family wedding reignites the ancient feud be...
4,Waiting to Exhale (1995),"[comedy, drama, romance]","Cheated on, mistreated and stepped on, the wom..."
5,Father of the Bride Part II (1995),[comedy],Just when George Banks has recovered from his ...


### Iniciando o processo de recomendação

#### Definindo funções

In [196]:
def nlp_pre_process(corpus, stop_words_remv = True, lemmatization = True, stemmization = True):
    """Normalise a raw text into a space-separated string of cleaned tokens.

    The text is transliterated to ASCII with ``unidecode``, lower-cased and
    stripped of punctuation tokens. The optional stages then run in a fixed
    order: stop-word removal, lemmatization, stemming. The text is
    re-tokenized with ``word_tokenize`` before each optional stage.

    Parameters
    ----------
    corpus : str
        Text to normalise.
    stop_words_remv : bool, default True
        Drop the tokens found in NLTK's English stop-word list.
    lemmatization : bool, default True
        Lemmatize every token with ``WordNetLemmatizer``.
    stemmization : bool, default True
        Stem every token with ``PorterStemmer``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Lower-case AFTER transliterating: unidecode may emit capital letters for
    # some scripts, which lower-casing beforehand would leave untouched.
    corpus = unidecode(corpus).lower()

    punctuation_chars = set(string.punctuation)

    # wordpunct_tokenize keeps a run of consecutive punctuation as ONE token
    # ("...", "),", "--"), so a token is discarded when all of its characters
    # are punctuation rather than only when it is a single punctuation mark.
    corpus = " ".join(
        token
        for token in wordpunct_tokenize(corpus)
        if not all(char in punctuation_chars for char in token)
    )

    if stop_words_remv:
        # A set gives constant-time membership tests instead of a list scan per token.
        stopset = set(stopwords.words('english'))
        corpus = " ".join(token for token in word_tokenize(corpus) if token not in stopset)

    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        corpus = " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(corpus))

    if stemmization:
        stemmer = PorterStemmer()
        corpus = " ".join(stemmer.stem(token) for token in word_tokenize(corpus))

    return corpus

In [197]:
def create_bag_of_words(df):
    """Collapse every non-title column into a single ``bag_of_words`` text column.

    Each cell of the feature columns is expected to be a list of words. For
    every row the words of all feature columns are joined with single spaces,
    in column order.

    The frame is modified in place (the feature columns are dropped and
    ``bag_of_words`` is added) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns other than ``title`` hold lists of words.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, reduced to ``title`` (when present) and ``bag_of_words``.
    """
    feature_cols = [col for col in df.columns if col not in ('title', 'bag_of_words')]
    column_values = [df[col].tolist() for col in feature_cols]

    # Build the column explicitly and assign it to the frame: writing to the
    # row objects yielded by DataFrame.iterrows() is not guaranteed to reach
    # the frame, which can leave every bag empty.
    df['bag_of_words'] = [
        ' '.join(' '.join(values[position]) for values in column_values)
        for position in range(len(df))
    ]

    df.drop(columns=feature_cols, inplace=True)

    return df

In [198]:
def pre_process(movies_df, stopwords_removal=True, lemmatization=True, stemmization=True):
    """Build the text features used by the recommender from a movie frame.

    Works on a copy of ``movies_df``: each ``overview`` is normalised with
    ``nlp_pre_process`` using the given switches and split into a list of
    words, after which ``create_bag_of_words`` merges the non-title columns.

    Returns the frame produced by ``create_bag_of_words``.
    """
    def to_word_list(text):
        # Clean the text, then break it on whitespace.
        cleaned = nlp_pre_process(text, stopwords_removal, lemmatization, stemmization)
        return cleaned.split()

    prepared = movies_df.copy()
    prepared['overview'] = prepared['overview'].apply(to_word_list)

    return create_bag_of_words(prepared)

In [200]:
def recommender(movies_interacteds, movies_not_interacteds, movie_id_list, cosine_similarity, top_n=20):
    """Rank catalogue movies by their average similarity to a user's profile.

    Parameters
    ----------
    movies_interacteds : list
        Movie ids that make up the user profile. Every id must appear in
        ``movie_id_list``.
    movies_not_interacteds : list
        Not used by the computation; kept so existing calls keep working.
    movie_id_list : list
        Movie ids in the same order as the rows/columns of ``cosine_similarity``.
    cosine_similarity : 2-D array-like
        Square item-to-item similarity matrix. Inside this function the name
        refers to this argument, not to the scikit-learn function.
    top_n : int, default 20
        Maximum number of ids to return. The default keeps the list length
        this function has always produced.

    Returns
    -------
    list
        Up to ``top_n`` movie ids outside the profile, most similar first.
        Empty when the profile is empty.

    Raises
    ------
    KeyError
        If a profile id is missing from ``movie_id_list``.
    """
    # Without a profile there is nothing to compare against (and averaging
    # zero rows would only yield NaN scores).
    if len(movies_interacteds) == 0:
        return []

    movie_to_index = {movie_id: index for index, movie_id in enumerate(movie_id_list)}
    profile_rows = [movie_to_index[movie] for movie in movies_interacteds]

    # Mean similarity of every catalogue movie to the profile movies.
    average_similarity = pd.Series(
        np.asarray(cosine_similarity)[profile_rows].mean(axis=0),
        index=movie_id_list,
    )

    # Exclude the profile movies outright; merely zeroing their score would
    # still let them be returned when few candidates score above zero.
    candidates = average_similarity[~average_similarity.index.isin(movies_interacteds)]

    # A stable sort keeps the result deterministic when scores tie.
    ranked = candidates.sort_values(ascending=False, kind='mergesort')

    return ranked.index[:top_n].tolist()

In [201]:
def generate_similarity_matrix(df):
    """Return the pairwise cosine similarity of the rows of ``df``.

    The ``bag_of_words`` column is vectorised with a default
    ``TfidfVectorizer`` and every row is compared against every other row.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(df['bag_of_words'])

    # Build the cosine similarity matrix (rows and columns follow the order of df)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [202]:
def get_recomendations(movies_df, ratings_df, user_id, cosine_sim, rating_threshold, frac, seed):
    """Recommend movies to one user and score them against held-out ratings.

    The user's ratings at or above ``rating_threshold`` are split at random:
    a fraction ``frac`` forms the profile passed to ``recommender`` and the
    rest is held out for evaluation.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movie frame indexed by movie id, in the row order of ``cosine_sim``.
    ratings_df : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    user_id
        User to build recommendations for.
    cosine_sim
        Item-to-item similarity matrix.
    rating_threshold : float
        Minimum rating for a movie to count as liked.
    frac : float
        Fraction of the liked movies used as the profile.
    seed : int
        Random state of the profile/held-out split.

    Returns
    -------
    tuple
        ``(movies_recomended, precision)`` where ``precision`` is the share
        of recommended ids that appear among the held-out movies (``0.0``
        when nothing was recommended).
    """
    liked = ratings_df[(ratings_df.userId == user_id) & (ratings_df.rating >= rating_threshold)]

    train_items = liked.sample(frac=frac, random_state=seed)
    test_items = liked[~liked.movieId.isin(train_items.movieId)]

    user_interact_ids = list(train_items.movieId)
    user_not_interact_ids = list(movies_df.index[~movies_df.index.isin(train_items.movieId)])
    movie_id_list = list(movies_df.index)

    movies_recomended = recommender(user_interact_ids, user_not_interact_ids, movie_id_list, cosine_sim)

    # Compare against the held-out movie ids themselves: `x in DataFrame`
    # tests the column labels, so it would never match a movie id.
    held_out_ids = set(test_items.movieId)
    hits = sum(1 for movie in movies_recomended if movie in held_out_ids)

    # Divide by the number of ids actually returned rather than a fixed constant.
    precision = hits / len(movies_recomended) if len(movies_recomended) else 0.0

    return movies_recomended, precision

In [203]:
def evaluation_recommendation(movies_df, ratings_df, pre_process_tec, count=1, user_ids_sample=None, rating_threshold=4.0, frac=0.7, seed=5):
    """Run the recommender for a set of users and append the results to CSV.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movie frame handed to ``pre_process``.
    ratings_df : pandas.DataFrame
        Ratings with a ``userId`` column, handed to ``get_recomendations``.
    pre_process_tec : tuple of bool
        ``(stop-word removal, lemmatization, stemming)`` switches.
    count : int, default 1
        Suffix of the output files.
    user_ids_sample : iterable, optional
        Users to evaluate. When ``None`` or empty, every user found in
        ``ratings_df`` is evaluated, in ascending id order.
    rating_threshold, frac, seed
        Forwarded to ``get_recomendations``.

    Notes
    -----
    One line per user (``user_id,"recommended ids",precision``) is appended
    to ``../result/recomendations_{count}.csv``. Files are opened in append
    mode, so running the same ``count`` again adds further lines. If the
    write fails, the line is appended to ``../result/fails_{count}.csv``.
    """
    import os

    # Local names that do not shadow the imported NLTK `stopwords` corpus.
    remove_stopwords, lemma, stemm = pre_process_tec

    movies_df = pre_process(movies_df, stopwords_removal=remove_stopwords, lemmatization=lemma, stemmization=stemm)

    cosine_sim = generate_similarity_matrix(movies_df)

    # Materialise the sample first: truth-testing a NumPy array or a Series
    # directly raises "truth value is ambiguous".
    user_ids = list(user_ids_sample) if user_ids_sample is not None else []
    if not user_ids:
        # Sorted unique ids give the same processing order on every run.
        user_ids = sorted(ratings_df.userId.unique().tolist())

    # A missing output folder would make both the main and the fallback write fail.
    os.makedirs('../result', exist_ok=True)

    for user_id in user_ids:

        movies_recomended, precision = get_recomendations(movies_df, ratings_df, user_id, cosine_sim, rating_threshold, frac, seed)

        record = f'{user_id},"{str(movies_recomended)[1:-1]}",{precision}\n'

        try:
            with open(f'../result/recomendations_{count}.csv', 'a') as recomendations:
                recomendations.write(record)
        except OSError:
            print('Falha ao gravar as recomendações do id {}'.format(user_id))
            with open(f'../result/fails_{count}.csv', 'a') as fails:
                fails.write(record)
        else:
            print('Recomendações do id {} gravado.'.format(user_id))

In [2099]:
# Every on/off combination of the three pre-processing switches, numbered from 1.
# Each entry is (run number, (stop-word removal, lemmatization, stemming)); the last
# switch varies fastest, so run 1 disables everything and run 8 enables everything.
combination_pre_process_techniques = list(enumerate(
    (
        (stop_words, lemma, stem)
        for stop_words in (False, True)
        for lemma in (False, True)
        for stem in (False, True)
    ),
    start=1,
))

In [None]:
# Evaluate the recommender once per pre-processing combination; `count` selects the output file.
for count, technique in combination_pre_process_techniques:
    evaluation_recommendation(
        experiment_df,
        ratings_df,
        pre_process_tec=technique,
        count=count,
    )