In [2]:
import string

import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

### Tratamento dos dados

In [3]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
overview_df = pd.read_csv('../data/overviews.csv')
overview_df.head()

Unnamed: 0,movieId,tmdbId,overview
0,1,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,8844,When siblings Judy and Peter discover an encha...
2,3,15602,A family wedding reignites the ancient feud be...
3,4,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,11862,Just when George Banks has recovered from his ...


In [5]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df.drop(columns=['timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
ratings_df.shape

(100836, 3)

In [7]:
movies_df.shape

(9742, 3)

In [8]:
overview_df.shape

(9742, 3)

In [9]:
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [10]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [11]:
overview_df.isna().sum()

movieId       0
tmdbId        0
overview    117
dtype: int64

In [12]:
list_movies_missing_overview = list(overview_df[overview_df.overview.isna()]['movieId'])

In [13]:
overview_df.dropna(inplace=True)
overview_df.isna().sum()

movieId     0
tmdbId      0
overview    0
dtype: int64

In [14]:
# remove ratings aos filmes 'dropados'

ratings_df = ratings_df[~ratings_df.movieId.isin(list_movies_missing_overview)]

In [15]:
#remove filmes 'dropados' do df de filmes

movies_df = movies_df[~movies_df.movieId.isin(list_movies_missing_overview)]

In [16]:
movies_df.shape

(9625, 3)

In [17]:
overview_df.shape

(9625, 3)

In [18]:
ratings_df.shape

(100525, 3)

In [19]:
experiment_df = overview_df.set_index('movieId').join(movies_df.set_index('movieId'), how='left')

In [20]:
experiment_df.reset_index(inplace=True)
experiment_df.head()

Unnamed: 0,movieId,tmdbId,overview,title,genres
0,1,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,8844,When siblings Judy and Peter discover an encha...,Jumanji (1995),Adventure|Children|Fantasy
2,3,15602,A family wedding reignites the ancient feud be...,Grumpier Old Men (1995),Comedy|Romance
3,4,31357,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,11862,Just when George Banks has recovered from his ...,Father of the Bride Part II (1995),Comedy


In [21]:
experiment_df.replace('(no genres listed)', '', inplace=True)

In [22]:
# colocando os gêneros em uma lista de palavras
experiment_df['genres'] = experiment_df['genres'].map(lambda x: x.lower().split('|'))

In [23]:
experiment_df.drop(columns=['tmdbId'], inplace=True)
experiment_df.set_index('movieId', inplace=True)

In [24]:
experiment_df.head()

Unnamed: 0_level_0,overview,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Led by Woody, Andy's toys live happily in his ...",Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
2,When siblings Judy and Peter discover an encha...,Jumanji (1995),"[adventure, children, fantasy]"
3,A family wedding reignites the ancient feud be...,Grumpier Old Men (1995),"[comedy, romance]"
4,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale (1995),"[comedy, drama, romance]"
5,Just when George Banks has recovered from his ...,Father of the Bride Part II (1995),[comedy]


### Iniciando o processo de recomendação

#### Definindo funções

In [25]:
def nlp_pre_process(corpus, stop_words_remv = True, lemmatizetion = True, stemmization = True):
    corpus = corpus.lower()
    stopset = None
    if (stop_words_remv):
        stopset = stopwords.words('english')
        stopset += list(string.punctuation)

    if (lemmatizetion):
        lemmatizer = WordNetLemmatizer()
        corpus = " ".join([lemmatizer.lemmatize(i) for i in word_tokenize(corpus) if i not in stopset])
    
    if (stemmization):
        stemmer = PorterStemmer()
        corpus = " ".join([stemmer.stem(i) for i in word_tokenize(corpus)])
    
    # remove non-ascii characters
    corpus = unidecode(corpus)
    
    return corpus

In [26]:
def apply_rake(df, col_overview_name, col_key_words):
    for index, row in df.iterrows():
        overview = row[col_overview_name]

        # instanciando Rake, por padrão usa palavras irrelevantes em inglês do NLTK 
        # e descarta todos os caracteres de pontuação
        r = Rake()

        # extraindo as palavras passando o texto
        r.extract_keywords_from_text(overview)

        # obtendo o dicionário com palavras-chave e suas pontuações
        key_words_dict_scores = r.get_word_degrees()

        # atribuindo as palavras-chave à nova coluna
        row['key_words'] = list(key_words_dict_scores.keys())

    return df

In [27]:
def create_bag_of_words(df):
    df['bag_of_words'] = ''
    columns = df.columns

    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if col != 'title':
                words = words + ' '.join(row[col])+ ' '
        row['bag_of_words'] = words
        
    df.drop(columns = [col for col in df.columns if col != 'bag_of_words' and col != 'title'], inplace = True)

    return df

In [28]:
def pre_process(df, col_overview_name, stopwords_removal, lemmatization, stemmization):
    df[col_overview_name].apply(nlp_pre_process, args=(stopwords_removal, lemmatization, stemmization))

    df['key_words'] = ''

    df = apply_rake(df, col_overview_name, 'key_words')

    df.drop(columns=[col_overview_name], inplace = True)

    df = create_bag_of_words(df)

    return df

In [72]:
def get_recommended(movies_interacteds, movies_not_interacteds, movie_id_list, cosine_similarity):

    movie_to_index = {movie_id:index for index, movie_id in enumerate(movie_id_list)}
    index_to_movie = {index:movie_id for index, movie_id in enumerate(movie_id_list)}

    similaritys = []

    # pegando lista de similaridades dos itens não interagidos
    for movie in movies_interacteds:
        similaritys.append([sim for i, sim in enumerate(cosine_similarity[movie_to_index[movie]]) if index_to_movie[i] in movies_not_interacteds])

    # obtendo a similaridade média
    num_of_items = len(movies_not_interacteds)
    sum_similarity = np.array([0.0] * num_of_items)
    for similarity in similaritys:
        sum_similarity += np.array(similarity)

    average_similarity = sum_similarity / num_of_items

    items_more_similar = pd.Series(average_similarity).sort_values(ascending = False)

    # pegando o top 10 indices de filmes mais similares
    top_10_similarity = list(items_more_similar.iloc[0:10].index)

    movies_to_recommend = pd.Series(movies_not_interacteds)

    top_10_movies = list(movies_to_recommend[movies_to_recommend.index.isin(top_10_similarity)])

    return top_10_movies

In [30]:
experiment_pos_processing = pre_process(experiment_df, 'overview', stopwords_removal=True, lemmatization=True, stemmization=True)

In [31]:
experiment_pos_processing.head()

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),adventure animation children comedy fantasy le...
2,Jumanji (1995),adventure children fantasy siblings judy peter...
3,Grumpier Old Men (1995),comedy romance family wedding reignites ancien...
4,Waiting to Exhale (1995),comedy drama romance cheated mistreated steppe...
5,Father of the Bride Part II (1995),comedy george banks recovered daughter wedding...


In [32]:
# instanciando e gerando a matriz de contagem
count = CountVectorizer()
count_matrix = count.fit_transform(experiment_pos_processing['bag_of_words'])

In [33]:
count_matrix.shape

(9625, 29102)

In [34]:
# gerando a matriz de similaridade de cosseno
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.11470787, 0.02988072, ..., 0.        , 0.04950738,
        0.05330018],
       [0.11470787, 1.        , 0.05484085, ..., 0.        , 0.02271554,
        0.0244558 ],
       [0.02988072, 0.05484085, 1.        , ..., 0.05345225, 0.        ,
        0.07644708],
       ...,
       [0.        , 0.        , 0.05345225, ..., 1.        , 0.        ,
        0.        ],
       [0.04950738, 0.02271554, 0.        , ..., 0.        , 1.        ,
        0.02111002],
       [0.05330018, 0.0244558 , 0.07644708, ..., 0.        , 0.02111002,
        1.        ]])

In [35]:
cosine_sim.shape

(9625, 9625)

In [36]:
movies_list = list(experiment_pos_processing.index)

In [61]:
user_preferences = experiment_pos_processing.sample(100)
user_preferences

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3951,Two Family House (2000),drama buddy visalo michael rispoli factory wor...
3899,Circus (2000),crime drama thriller bruno sadistic criminal w...
2740,"Kindred, The (1986)",horror sci-fi amanda deathbed request son john...
747,"Stupids, The (1996)",comedy incredibly dull witted family unknowing...
8711,Mr. Blandings Builds His Dream House (1948),comedy advertising executive dreams getting ci...
...,...,...
64285,Wallace and Gromit in 'A Matter of Loaf and De...,animation comedy wallace gromit open bakery ac...
126420,American Heist (2015),action two brothers troubled paths find middle...
3111,Places in the Heart (1984),drama 1930s texas widow family fight save home...
155659,Florence Foster Jenkins (2016),comedy drama story florence foster jenkins new...


In [62]:
user_not_interact = experiment_pos_processing[~experiment_pos_processing.index.isin(user_preferences.index)]
user_not_interact

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),adventure animation children comedy fantasy le...
2,Jumanji (1995),adventure children fantasy siblings judy peter...
3,Grumpier Old Men (1995),comedy romance family wedding reignites ancien...
4,Waiting to Exhale (1995),comedy drama romance cheated mistreated steppe...
5,Father of the Bride Part II (1995),comedy george banks recovered daughter wedding...
...,...,...
193579,Jon Stewart Has Left the Building (2015),documentary celebrating jon stewart decade hal...
193583,No Game No Life: Zero (2017),animation comedy fantasy ancient disboard riku...
193585,Flint (2017),drama woman deals toxic water scandal flint mi...
193587,Bungo Stray Dogs: Dead Apple (2018),action animation large scale catastrophe occur...


In [64]:
user_preferences_ids = list(user_preferences.index)
user_preferences_ids

[3951,
 3899,
 2740,
 747,
 8711,
 440,
 47122,
 33672,
 8640,
 166024,
 3316,
 83796,
 58404,
 91337,
 6063,
 4489,
 26303,
 7714,
 387,
 26082,
 1228,
 1242,
 166534,
 116411,
 1722,
 93840,
 38159,
 513,
 2451,
 6294,
 27074,
 1137,
 87028,
 151695,
 5706,
 4506,
 5338,
 5093,
 175,
 4970,
 8492,
 6598,
 111622,
 103341,
 5094,
 45672,
 142509,
 26622,
 4265,
 2469,
 71899,
 102165,
 93326,
 2174,
 184791,
 89,
 52287,
 89837,
 71640,
 126921,
 2848,
 99149,
 85397,
 852,
 44931,
 33903,
 175707,
 6978,
 2382,
 1173,
 3706,
 7163,
 2070,
 66297,
 106785,
 6339,
 376,
 6281,
 95690,
 7369,
 145150,
 72701,
 692,
 5503,
 193581,
 62834,
 4756,
 322,
 6965,
 107447,
 5786,
 116823,
 8533,
 1907,
 8153,
 64285,
 126420,
 3111,
 155659,
 55]

In [65]:
user_not_interact_ids = list(user_not_interact.index)
len(user_not_interact_ids)

9525

In [74]:
movies_recomended = get_recommended(user_preferences_ids, user_not_interact_ids, movie_id_list, cosine_sim)
movies_recomended

[20, 892, 3325, 4644, 4719, 6596, 6902, 89898, 103027, 148775]