In [1]:
import string

import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

In [2]:
def pre_process(corpus, stop_words_remv = True, lemmatizetion = True, stemmization = True):
    corpus = corpus.lower()
    stopset = None
    if (stop_words_remv):
        stopset = stopwords.words('english')
        stopset += list(string.punctuation)

    if (lemmatizetion):
        lemmatizer = WordNetLemmatizer()
        corpus = " ".join([lemmatizer.lemmatize(i) for i in word_tokenize(corpus) if i not in stopset])
    
    if (stemmization):
        stemmer = PorterStemmer()
        corpus = " ".join([stemmer.stem(i) for i in word_tokenize(corpus)])
    
    # remove non-ascii characters
    corpus = unidecode(corpus)
    
    return corpus

In [3]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
overview_df = pd.read_csv('../data/overviews.csv')
overview_df.head()

Unnamed: 0,tmdbId,movieId,imdbId,overview
0,862,1,114709,"Led by Woody, Andy's toys live happily in his ..."
1,8844,2,113497,When siblings Judy and Peter discover an encha...
2,15602,3,113228,A family wedding reignites the ancient feud be...
3,31357,4,114885,"Cheated on, mistreated and stepped on, the wom..."
4,11862,5,113041,Just when George Banks has recovered from his ...


In [5]:
movies_df.shape

(27278, 3)

In [6]:
overview_df.shape

(26657, 4)

In [60]:
experiment_df = overview_df.set_index('movieId').join(movies_df.set_index('movieId'), how='left')

In [61]:
experiment_df.reset_index(inplace=True)
experiment_df.head()

Unnamed: 0,movieId,tmdbId,imdbId,overview,title,genres
0,1,862,114709,"Led by Woody, Andy's toys live happily in his ...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,8844,113497,When siblings Judy and Peter discover an encha...,Jumanji (1995),Adventure|Children|Fantasy
2,3,15602,113228,A family wedding reignites the ancient feud be...,Grumpier Old Men (1995),Comedy|Romance
3,4,31357,114885,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,11862,113041,Just when George Banks has recovered from his ...,Father of the Bride Part II (1995),Comedy


In [62]:
for index, row in experiment_df.iterrows():
    if row['genres'] == '(no genres listed)':
        experiment_df['genres'].iloc[index] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  experiment_df['genres'].iloc[index] = ''


In [63]:
# colocando os gêneros em uma lista de palavras
experiment_df['genres'] = experiment_df['genres'].map(lambda x: x.lower().split('|'))

In [64]:
experiment_df.drop(columns=['tmdbId', 'imdbId'], inplace=True)
experiment_df.set_index('movieId', inplace=True)
experiment_df.head()

Unnamed: 0_level_0,overview,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"Led by Woody, Andy's toys live happily in his ...",Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
2,When siblings Judy and Peter discover an encha...,Jumanji (1995),"[adventure, children, fantasy]"
3,A family wedding reignites the ancient feud be...,Grumpier Old Men (1995),"[comedy, romance]"
4,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale (1995),"[comedy, drama, romance]"
5,Just when George Banks has recovered from his ...,Father of the Bride Part II (1995),[comedy]


In [66]:
# aplicando o pré-processamento
for index, row in experiment_df.iterrows():
    overview = row['overview']
    row['overview'] = pre_process(overview)

In [67]:
experiment_df.head()

Unnamed: 0_level_0,overview,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,led woodi andi 's toy live happili room andi '...,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
2,sibl judi peter discov enchant board game open...,Jumanji (1995),"[adventure, children, fantasy]"
3,famili wed reignit ancient feud next-door neig...,Grumpier Old Men (1995),"[comedy, romance]"
4,cheat mistreat step woman hold breath wait elu...,Waiting to Exhale (1995),"[comedy, drama, romance]"
5,georg bank recov daughter 's wed receiv news '...,Father of the Bride Part II (1995),[comedy]


In [68]:
#iniciando uma nova coluna
experiment_df['key_words'] = ''

In [69]:
for index, row in experiment_df.iterrows():
    overview = row['overview']

    # instanciando Rake, por padrão usa palavras irrelevantes em inglês do NLTK 
    # e descarta todos os caracteres de pontuação
    r = Rake()

    # extraindo as palavras passando o texto
    r.extract_keywords_from_text(overview)

    # obtendo o dicionário com palavras-chave e suas pontuações
    key_words_dict_scores = r.get_word_degrees()

    # atribuindo as palavras-chave à nova coluna
    row['key_words'] = list(key_words_dict_scores.keys())

In [70]:
#descartando a coluna overview
experiment_df.drop(columns=['overview'], inplace = True)
experiment_df.head()

Unnamed: 0_level_0,title,genres,key_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]","[led, woodi, andi, toy, live, happili, room, b..."
2,Jumanji (1995),"[adventure, children, fantasy]","[sibl, judi, peter, discov, enchant, board, ga..."
3,Grumpier Old Men (1995),"[comedy, romance]","[famili, wed, reignit, ancient, feud, next, do..."
4,Waiting to Exhale (1995),"[comedy, drama, romance]","[cheat, mistreat, step, woman, hold, breath, w..."
5,Father of the Bride Part II (1995),[comedy],"[georg, bank, recov, daughter, wed, receiv, ne..."


In [71]:
experiment_df['bag_of_words'] = ''
columns = experiment_df.columns

for index, row in experiment_df.iterrows():
    words = ''
    for col in columns:
        if col != 'title':
            words = words + ' '.join(row[col])+ ' '
    row['bag_of_words'] = words
    
experiment_df.drop(columns = [col for col in experiment_df.columns if col != 'bag_of_words' and col != 'title'], inplace = True)

experiment_df.head()

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),adventure animation children comedy fantasy le...
2,Jumanji (1995),adventure children fantasy sibl judi peter dis...
3,Grumpier Old Men (1995),comedy romance famili wed reignit ancient feud...
4,Waiting to Exhale (1995),comedy drama romance cheat mistreat step woman...
5,Father of the Bride Part II (1995),comedy georg bank recov daughter wed receiv ne...


In [73]:
# instanciando e gerando a matriz de contagem
count = CountVectorizer()
count_matrix = count.fit_transform(experiment_df['bag_of_words'])

In [74]:
count_matrix

<26657x37936 sparse matrix of type '<class 'numpy.int64'>'
	with 770757 stored elements in Compressed Sparse Row format>

In [87]:

# criando uma Série para os títulos dos filmes para que sejam associados a uma 
# lista numérica ordenada que usarei posteriormente para corresponder aos índices
indexes = pd.Series(experiment_df.index)
indexes

0             1
1             2
2             3
3             4
4             5
          ...  
26652    131254
26653    131256
26654    131258
26655    131260
26656    131262
Name: movieId, Length: 26657, dtype: int64

In [80]:
# gerando a matriz de similaridade de cosseno
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.14338483, 0.03077287, ..., 0.0521286 , 0.        ,
        0.10825318],
       [0.14338483, 1.        , 0.05647825, ..., 0.04783649, 0.        ,
        0.09933993],
       [0.03077287, 0.05647825, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0521286 , 0.04783649, 0.        , ..., 1.        , 0.        ,
        0.03009646],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.10825318, 0.09933993, 0.        , ..., 0.03009646, 0.        ,
        1.        ]])

In [85]:
def get_similarity(movies_interacteds, movies_not_interacteds, movies_ids, cosine_similarity):
        
    # obtendo os índices dos filmes interagidos
    indexes_interact = []
    for movie_id in movies_interacteds:
        indexes_interact.append(movies_ids[movies_ids == movie_id].index[0])

    # obtendo os índices dos filmes não interagidos
    indexes_not_interact = [idx for idx in list(movies_ids.index) if idx not in indexes_interact]

    similarity_non_interacted_items = {}

    # pegando lista de similaridades dos itens não interagidos
    for idx in indexes_not_interact:
        similarity_non_interacted_items[idx] = cosine_similarity[idx]

    # zerando a similaridade dos itens interagidos nas listas de similaridade dos itens não interagidos
    for key, similarity_list in similarity_non_interacted_items.items():
        similarity_list[key] = 0.0
        for index in indexes_interact:
            similarity_list[index] = 0.0

    #cria dicionário para armazenar as médias
    average_similarity = {}
    for idx in indexes_not_interact:
        average_similarity[idx] = 0.0

    # obtendo a similaridade média
    num_of_items = len(movies_not_interacteds)
    sum_similarity = np.array([0.0] * num_of_items)
    for similarity_item in similarity_non_interacted_items:
        sum_similarity += np.array(similarity_item)

    average_similarity = sum_similarity / num_of_items

    items_more_similar = pd.Series(average_similarity).sort_values(ascending = False)

    # pegando o top 10 indices de filmes mais similares
    top_10_indexes = list(items_more_similar.iloc[0:10].index)


In [132]:
def get_id(item, ids_series):
    return ids_series[ids_series == item].index[0]


In [187]:
def get_similarity(movies_interacteds, movies_not_interacteds, movies_ids, cosine_similarity):

    similarity_non_interacted_items = {}

    # pegando lista de similaridades dos itens não interagidos
    for movie_id in movies_not_interacteds:
        similarity_non_interacted_items[movie_id] = cosine_similarity[get_id(movie_id, movies_ids)]

    # zerando a similaridade dos itens interagidos nas listas de similaridade dos itens não interagidos
    for movie, similarity_list in similarity_non_interacted_items.items():
        similarity_list[get_id(movie, movies_ids)] = 0.0
        for movie_id in movies_interacteds:
            similarity_list[get_id(movie_id, movies_ids)] = 0.0

    # obtendo a similaridade média
    num_of_items = len(movies_ids)
    sum_similarity = np.array([0.0] * num_of_items)
    for similarity_item in similarity_non_interacted_items.values():
        sum_similarity += np.array(similarity_item)

    average_similarity = sum_similarity / num_of_items

    items_more_similar = pd.Series(average_similarity).sort_values(ascending = False)

    # # pegando o top 10 indices de filmes mais similares
    # top_10_indexes = list(items_more_similar.iloc[0:10].index)

    # top_10_movies = {}
    # for i, index in enumerate(top_10_indexes):
    #     top_10_movies[index] = items_more_similar[i]

    # return top_10_movies

    return items_more_similar


In [130]:
len(indexes)

26657

In [139]:
# pegando filmes interagidos pelo usuário

movies_interacteds = list(indexes.sample(10))
len(movies_interacteds)

10

In [140]:
# pegando filmes não interagidos pelo usuário

movies_not_interacteds = [index for index in list(indexes) if index not in movies_interacteds]
len(movies_not_interacteds)

26647

In [188]:

movies_recomended = get_similarity(movies_interacteds, movies_not_interacteds, indexes, cosine_sim)
movies_recomended

23894    0.108799
14679    0.096813
3194     0.090869
13762    0.090373
21022    0.089102
           ...   
6745     0.000000
17192    0.000000
10688    0.000000
2906     0.000000
795      0.000000
Length: 26657, dtype: float64

In [189]:
list(movies_recomended.iloc[0:10])

[0.10879864358920825,
 0.09681327600388623,
 0.09086915332844082,
 0.09037304804638421,
 0.08910175373936494,
 0.08847638043154062,
 0.08806593959015885,
 0.08663895954974045,
 0.08641274150220904,
 0.0857020969395022]

In [183]:
experiment_df[experiment_df.index.isin(movies_interacteds)]

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
825,Regular Guys (Echte Kerle) (1996),comedy romance christoph cop self confid macho...
3036,"Quest for Fire (Guerre du feu, La) (1981)",adventure drama coloss adventur odyssey turn b...
3509,Black and White (1999),drama rich bower com star hip hop world everyo...
6910,Kronos (a.k.a. Captain Kronos: Vampire Hunter)...,horror sever young girl found dead left hideou...
43333,Water (2005),drama romance year 1938 mahatma gandhi groundb...
84366,I Knew It Was You: Rediscovering John Cazale (...,documentary john cazal five film godfath conve...
87890,"Carey Treatment, The (1972)",mystery dr peter carey pathologist boston hosp...
96456,ATM (2012),horror thriller leav compani christma parti to...
97946,"Day, The (2011)",drama sci-fi thriller open war human rage five...
104066,Alcan Highway (Alaska Highway) (2013),documentary alcan highway film diari one dream...


In [184]:
experiment_df[experiment_df.index.isin(movies_recomended)]

Unnamed: 0_level_0,title,bag_of_words
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3325,"Next Best Thing, The (2000)",comedy drama comedy drama best friend one stra...
52189,Dark Horse (Voksne mennesker) (2005),comedy drama romance young man spur romanc hel...
69729,I Have Found It (Kandukondain Kandukondain) (2...,comedy drama musical romance two young woman d...
71413,Angel (1937),comedy drama romance woman husband take separ ...
73101,Looking for Eric (2009),comedy drama fantasy man tri put life back tra...
73145,Under the Bridges (Unter den Brücken) (1945),comedy drama romance two barg skipper fall lov...
74327,"First Day of the Rest of Your Life, The (Le pr...",comedy drama sprawl drama center five key day ...
88591,Souls for Sale (1923),comedy drama romance young woman hit hollywood...
103920,"All Together, The (2007)",comedy drama romance overview found
115724,Separation City (2009),comedy drama romance comedy drama follow colla...


In [3]:
import pandas as pd

movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies_df[movies_df['movieId'] == 700]

Unnamed: 0,movieId,title,genres
688,700,Angus (1995),Comedy
