In [1]:
import string

import pandas as pd
import numpy as np
from nltk import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

### Tratamento dos dados

In [2]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
overview_df = pd.read_csv('../data/overviews.csv')
overview_df.head()

Unnamed: 0,movieId,tmdbId,overview
0,1,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,8844,When siblings Judy and Peter discover an encha...
2,3,15602,A family wedding reignites the ancient feud be...
3,4,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,11862,Just when George Banks has recovered from his ...


In [4]:
# Load the ratings and discard the timestamp column in one expression.
ratings_df = pd.read_csv('../data/ratings.csv').drop(columns=['timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
ratings_df.shape

(100836, 3)

In [6]:
movies_df.shape

(9742, 3)

In [7]:
overview_df.shape

(9742, 3)

In [8]:
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [9]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
overview_df.isna().sum()

movieId       0
tmdbId        0
overview    117
dtype: int64

In [11]:
list_movies_missing_overview = list(overview_df[overview_df.overview.isna()]['movieId'])

In [12]:
overview_df.dropna(inplace=True)
overview_df.isna().sum()

movieId     0
tmdbId      0
overview    0
dtype: int64

In [13]:
# Remove the ratings of the movies that were dropped for having no overview.
# reset_index(drop=True) renumbers the rows without first materialising the
# old index as an 'index' column, so the filtered frame is never mutated in
# place (which can raise SettingWithCopyWarning).
ratings_df = ratings_df[~ratings_df.movieId.isin(list_movies_missing_overview)].reset_index(drop=True)

In [14]:
# remove the dropped movies (those whose overview was missing) from the movies DataFrame

movies_df = movies_df[~movies_df.movieId.isin(list_movies_missing_overview)]

In [15]:
movies_df.shape

(9625, 3)

In [16]:
overview_df.shape

(9625, 3)

In [17]:
ratings_df.shape

(100525, 3)

In [18]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100520,610,166534,4.0
100521,610,168248,5.0
100522,610,168250,5.0
100523,610,168252,5.0


In [19]:
experiment_df = movies_df.set_index('movieId').join(overview_df.set_index('movieId'), how='left')

In [20]:
experiment_df.reset_index(inplace=True)
experiment_df.head()

Unnamed: 0,movieId,title,genres,tmdbId,overview
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men (1995),Comedy|Romance,15602,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II (1995),Comedy,11862,Just when George Banks has recovered from his ...


In [21]:
experiment_df.replace('(no genres listed)', '', inplace=True)

In [22]:
# turn each pipe-separated genres string into a list of lower-case genre names
experiment_df['genres'] = experiment_df['genres'].map(lambda x: x.lower().split('|'))

In [23]:
experiment_df.drop(columns=['tmdbId'], inplace=True)
experiment_df.set_index('movieId', inplace=True)

In [24]:
experiment_df.head()

Unnamed: 0_level_0,title,genres,overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]","Led by Woody, Andy's toys live happily in his ..."
2,Jumanji (1995),"[adventure, children, fantasy]",When siblings Judy and Peter discover an encha...
3,Grumpier Old Men (1995),"[comedy, romance]",A family wedding reignites the ancient feud be...
4,Waiting to Exhale (1995),"[comedy, drama, romance]","Cheated on, mistreated and stepped on, the wom..."
5,Father of the Bride Part II (1995),[comedy],Just when George Banks has recovered from his ...


In [25]:
# keep only the ratings greater than or equal to 4.0
ratings_df = ratings_df[ratings_df.rating >= 4.0]
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100519,610,166528,4.0
100520,610,166534,4.0
100521,610,168248,5.0
100522,610,168250,5.0


In [26]:
# Drop the users that have fewer than 20 ratings in ratings_df.
# A single value_counts() pass replaces re-scanning the whole frame once per user.
ratings_per_user = ratings_df.userId.value_counts()

users = set(ratings_per_user.index.to_list())
users_to_remove = ratings_per_user[ratings_per_user < 20].index.to_list()

ratings_df = ratings_df[~ratings_df.userId.isin(users_to_remove)]

In [27]:
users = set(ratings_df.userId.to_list())
len(users)

465

In [28]:
experiment_df

Unnamed: 0_level_0,title,genres,overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]","Led by Woody, Andy's toys live happily in his ..."
2,Jumanji (1995),"[adventure, children, fantasy]",When siblings Judy and Peter discover an encha...
3,Grumpier Old Men (1995),"[comedy, romance]",A family wedding reignites the ancient feud be...
4,Waiting to Exhale (1995),"[comedy, drama, romance]","Cheated on, mistreated and stepped on, the wom..."
5,Father of the Bride Part II (1995),[comedy],Just when George Banks has recovered from his ...
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),"[action, animation, comedy, fantasy]","Ciel learns of a Aurora Society, that is rumor..."
193583,No Game No Life: Zero (2017),"[animation, comedy, fantasy]","In ancient Disboard, Riku is an angry, young w..."
193585,Flint (2017),[drama],A woman deals with the toxic water scandal in ...
193587,Bungo Stray Dogs: Dead Apple (2018),"[action, animation]",A large scale catastrophe is occurring across ...


In [508]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100519,610,166528,4.0
100520,610,166534,4.0
100521,610,168248,5.0
100522,610,168250,5.0


### Iniciando o processo de recomendação

#### Definindo funções

In [509]:
def nlp_pre_process(corpus, stop_words_remv = True, lemmatization = True, stemmization = True):
    """Normalise a text into a space-separated string of cleaned tokens.

    The text is lower-cased, passed through ``unidecode`` and stripped of
    punctuation tokens. Stop-word removal, lemmatization and stemming are then
    applied, in that order, for each flag that is enabled. The text is
    re-tokenized with ``word_tokenize`` before each optional step.

    Parameters
    ----------
    corpus : str
        Raw text to clean.
    stop_words_remv : bool, default True
        Drop tokens present in NLTK's English stop-word list.
    lemmatization : bool, default True
        Lemmatize every token with ``WordNetLemmatizer``.
    stemmization : bool, default True
        Stem every token with ``PorterStemmer``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    corpus = corpus.lower()
    corpus = unidecode(corpus)  # transliterate non-ASCII characters to ASCII

    # Sets give O(1) membership tests; the tokens kept are exactly the same
    # as with the equivalent lists.
    punctuations = set(string.punctuation)
    punctuations.add('...')

    corpus = " ".join([token for token in wordpunct_tokenize(corpus) if token not in punctuations])

    if stop_words_remv:
        stopset = set(stopwords.words('english'))
        corpus = " ".join([token for token in word_tokenize(corpus) if token not in stopset])

    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        corpus = " ".join([lemmatizer.lemmatize(token) for token in word_tokenize(corpus)])

    if stemmization:
        stemmer = PorterStemmer()
        corpus = " ".join([stemmer.stem(token) for token in word_tokenize(corpus)])

    return corpus

In [510]:
def create_bag_of_words(df):
    """Collapse every non-title column into a single ``bag_of_words`` string.

    Each column other than ``title`` must hold, per row, an iterable of string
    tokens (e.g. a list of genres or of overview words). The tokens of all
    those columns are joined with single spaces, in column order.

    The previous implementation assigned to the ``row`` yielded by
    ``DataFrame.iterrows()``. pandas documents that such a row may be a copy,
    in which case the write is lost and ``bag_of_words`` stays empty. The
    column is therefore built explicitly and assigned in one step.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column plus the token columns. It is modified
        in place: ``bag_of_words`` is added and the token columns are dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, left with only ``title`` and ``bag_of_words``.
    """
    # A pre-existing 'bag_of_words' column is overwritten, never used as input.
    feature_columns = [col for col in df.columns if col not in ('title', 'bag_of_words')]

    # Plain lists keep the row-wise join positional, so duplicate or unordered
    # index labels cannot misalign the result.
    token_columns = [df[col].tolist() for col in feature_columns]

    df['bag_of_words'] = [
        ' '.join(' '.join(column[position]) for column in token_columns)
        for position in range(len(df))
    ]

    df.drop(columns=feature_columns, inplace=True)

    return df

In [511]:
def pre_process(movies_df, stopwords_removal=True, lemmatization=True, stemmization=True):
    """Return a copy of ``movies_df`` reduced to ``title`` and ``bag_of_words``.

    Every overview is cleaned with ``nlp_pre_process`` using the given flags
    and split into a list of tokens; the resulting frame is then handed to
    ``create_bag_of_words``. The input frame itself is not modified.
    """
    prepared = movies_df.copy()

    def _overview_tokens(text):
        # Clean the raw overview, then break it on whitespace.
        cleaned = nlp_pre_process(text, stopwords_removal, lemmatization, stemmization)
        return cleaned.split()

    prepared['overview'] = prepared['overview'].apply(_overview_tokens)

    return create_bag_of_words(prepared)

In [512]:
def recommender(movies_interacteds, movies_to_recomend, cosine_similarity, top_n=10):
    """Rank candidate movies by mean similarity to the movies a user interacted with.

    Parameters
    ----------
    movies_interacteds : list
        Ids of the movies that describe the user's taste. Only the ones found
        in the index of ``cosine_similarity`` contribute to the average.
    movies_to_recomend : list
        Ids of the candidate movies; they must be column labels of
        ``cosine_similarity``. Repeated ids are considered only once, so a
        movie can never fill more than one slot of the result.
    cosine_similarity : pandas.DataFrame
        Square similarity table whose index and columns are movie ids.
    top_n : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    list
        Up to ``top_n`` candidate ids, ordered from most to least similar.
    """
    # dict.fromkeys removes repeated ids while keeping first-seen order.
    candidates = list(dict.fromkeys(movies_to_recomend))

    interacted_rows = cosine_similarity.index.isin(movies_interacteds)
    similaritys = cosine_similarity.loc[interacted_rows, candidates]

    # Column-wise mean: one average similarity per candidate movie.
    average_similarity = similaritys.mean()

    return average_similarity.sort_values(ascending=False).iloc[:top_n].index.to_list()

In [513]:
def generate_similarity_matrix(df):
    """Return the pairwise cosine similarity of the rows' ``bag_of_words`` texts.

    The texts in ``df['bag_of_words']`` are vectorised with a default
    ``TfidfVectorizer`` and the resulting matrix is compared against itself.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(df['bag_of_words'])

    # build the cosine similarity matrix (every document against every document)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [514]:
def get_rr_from_list(relevance_array):
    """Return the reciprocal rank of the first truthy entry of ``relevance_array``.

    Ranks are 1-based, so a hit at position 0 yields 1.0. An empty sequence,
    or one without any truthy entry, yields 0.0.
    """
    first_hit = next(
        (position for position in range(len(relevance_array)) if relevance_array[position]),
        None,
    )
    if first_hit is None:
        return 0.0
    return 1 / (first_hit + 1)

In [515]:
def get_ap_from_list(relevance_array):
    """Return the average precision of a ranked relevance list.

    Average precision is the mean of precision@k taken only at the ranks k
    that hold a relevant item. The previous implementation accumulated
    precision@k at every rank and divided by the list length, which
    under-scores lists whose hits sit at the top (``[True, False, False]``
    gave about 0.61 instead of 1.0).

    Parameters
    ----------
    relevance_array : sequence
        Truthy/falsy relevance flags, ordered from best- to worst-ranked.

    Returns
    -------
    float
        Average precision in [0, 1]; 0.0 for an empty list or one with no hit.
    """
    hits = 0
    precision_sum = 0.0
    for i in range(len(relevance_array)):
        if relevance_array[i]:
            hits += 1
            # precision at this rank: hits so far over items seen so far
            precision_sum += hits / (i + 1)

    if hits == 0:
        return 0.0
    return precision_sum / hits

In [516]:
def get_recomendations(movies_df, profile, cosine_sim, frac, seed,
                       n_interacted=15, n_test=5, n_random=20):
    """Recommend movies for one user and flag which recommendations are relevant.

    The user's ratings are split into train and test items. Some train items
    stand for the user's taste, and the candidates are some test items mixed
    with random movies the user has not rated. The candidates are ranked by
    ``recommender``, and each returned movie is marked relevant when it
    belongs to the test items.

    Every random draw uses ``seed``, so the same arguments always give the
    same result. Reusing one seed for every user also draws the distractors
    the same way each time; vary ``seed`` per call if that is undesirable.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Movie catalogue indexed by movie id.
    profile : pandas.DataFrame
        Ratings of a single user; must have a ``movieId`` column.
    cosine_sim : pandas.DataFrame
        Similarity table with movie ids as index and columns.
    frac : float
        Fraction of ``profile`` used as train items.
    seed : int
        Random state for the split and for every sample.
    n_interacted : int, default 15
        Train items used to represent the user (capped at what is available).
    n_test : int, default 5
        Test items placed among the candidates (capped at what is available).
    n_random : int, default 20
        Unrated movies added to the candidates as distractors (capped).

    Returns
    -------
    list of bool
        One flag per recommended movie, in ranking order.
    """
    train_items = profile.sample(frac=frac, random_state=seed)
    test_items = profile[~profile.movieId.isin(train_items.movieId)]

    movies_interacteds = (
        train_items
        .sample(min(n_interacted, len(train_items)), random_state=seed)
        .movieId.to_list()
    )

    # Distractors are drawn only from movies outside the user's profile, so a
    # train item cannot be recommended back to the user and a test item cannot
    # appear twice among the candidates.
    unseen_movies = movies_df[~movies_df.index.isin(profile.movieId)]

    movies_to_recomend = (
        test_items.sample(min(n_test, len(test_items)), random_state=seed).movieId.to_list()
        + unseen_movies.sample(min(n_random, len(unseen_movies)), random_state=seed).index.to_list()
    )

    movies_recomended = recommender(movies_interacteds, movies_to_recomend, cosine_sim)

    # Built once; the set makes each membership check O(1).
    relevant_movies = set(test_items.movieId)

    return [movie in relevant_movies for movie in movies_recomended]

In [517]:
def evaluation_recommendation(movies_df, ratings_df, pre_process_tec, count=1, rating_threshold=4.0, frac=0.75, seed=15):
    """Evaluate the recommender for every user and append the metrics to a CSV.

    ``pre_process_tec`` is a 3-tuple of flags (stop-word removal,
    lemmatization, stemming) forwarded to ``pre_process``. For each user id in
    ``ratings_df`` one line is appended to
    ``../result/recomendations_{count}.csv`` holding the user id followed by
    precision, average precision and reciprocal rank, each at cut-offs 10, 5
    and 3. If that write raises, the same line is appended to
    ``../result/fails_{count}.csv`` instead.

    ``rating_threshold`` is accepted but not used by this function.
    """
    remove_stopwords, lemma, stemm = pre_process_tec

    movies_df = pre_process(movies_df, stopwords_removal=remove_stopwords, lemmatization=lemma, stemmization=stemm)

    similarity_values = generate_similarity_matrix(movies_df)
    cosine_sim = pd.DataFrame(similarity_values, columns=movies_df.index.to_list(), index=movies_df.index.to_list())

    for user_id in set(list(ratings_df.userId)):
        profile = ratings_df[ratings_df.userId == user_id]

        relevance = get_recomendations(movies_df, profile, cosine_sim, frac, seed)
        relevance_5 = relevance[:5]
        relevance_3 = relevance[:3]

        # Precision: the @10 value counts hits over the whole returned list.
        prc_10 = relevance.count(True) / 10
        prc_5 = relevance_5.count(True) / 5
        prc_3 = relevance_3.count(True) / 3

        # Average precision and reciprocal rank at the same three cut-offs.
        ap_10, ap_5, ap_3 = [get_ap_from_list(cut) for cut in (relevance, relevance_5, relevance_3)]
        rr_10, rr_5, rr_3 = [get_rr_from_list(cut) for cut in (relevance, relevance_5, relevance_3)]

        # The same line goes to the results file or, on failure, to the fails file.
        row = f'{user_id},"{prc_10}",{prc_5},{prc_3},{ap_10},{ap_5},{ap_3},{rr_10},{rr_5},{rr_3}\n'

        try:
            with open(f'../result/recomendations_{count}.csv', 'a') as recomendations:
                recomendations.write(row)
        except Exception:
            print('Falha ao gravar as recomendações do id {}'.format(user_id))
            with open(f'../result/fails_{count}.csv', 'a') as fails:
                fails.write(row)
        else:
            print('Recomendações do id {} gravado.'.format(user_id))

In [518]:
# Every on/off combination of (stop-word removal, lemmatization, stemming),
# numbered from 1 with the last flag varying fastest.
combination_pre_process_techniques = [
    (number, flags)
    for number, flags in enumerate(
        (
            (stop_words, lemma, stem)
            for stop_words in (False, True)
            for lemma in (False, True)
            for stem in (False, True)
        ),
        start=1,
    )
]

In [519]:
# Run the full evaluation once per pre-processing combination; `count` is the
# numeric suffix of the result files written by evaluation_recommendation.
for count, technique in combination_pre_process_techniques:
    evaluation_recommendation(
        movies_df=experiment_df,
        ratings_df=ratings_df,
        pre_process_tec=technique,
        count=count,
    )

Recomendações do id 1 gravado.
Recomendações do id 4 gravado.
Recomendações do id 5 gravado.
Recomendações do id 6 gravado.
Recomendações do id 7 gravado.
Recomendações do id 8 gravado.
Recomendações do id 9 gravado.
Recomendações do id 10 gravado.
Recomendações do id 11 gravado.
Recomendações do id 12 gravado.
Recomendações do id 14 gravado.
Recomendações do id 15 gravado.
Recomendações do id 16 gravado.
Recomendações do id 17 gravado.
Recomendações do id 18 gravado.
Recomendações do id 19 gravado.
Recomendações do id 20 gravado.
Recomendações do id 21 gravado.
Recomendações do id 22 gravado.
Recomendações do id 23 gravado.
Recomendações do id 24 gravado.
Recomendações do id 25 gravado.
Recomendações do id 27 gravado.
Recomendações do id 28 gravado.
Recomendações do id 29 gravado.
Recomendações do id 30 gravado.
Recomendações do id 31 gravado.
Recomendações do id 32 gravado.
Recomendações do id 33 gravado.
Recomendações do id 34 gravado.
Recomendações do id 38 gravado.
Recomendações d