In [5]:
# Importando as bibliotecas que vamos usar

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [6]:
# Load the movie metadata file and preview its first rows.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
movies_csv_path = "movies_metadata.csv"
df_movies = pd.read_csv(movies_csv_path, low_memory=False)

df_movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# Load the ratings file and preview its first rows.
ratings_csv_path = "ratings.csv"
df_ratings = pd.read_csv(ratings_csv_path)

df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [8]:
# Keep only the movie columns that the rest of the notebook uses.
movie_columns = ["id", "original_title", "original_language", "vote_count"]
df_movies = df_movies[movie_columns]

df_movies.head()

Unnamed: 0,id,original_title,original_language,vote_count
0,862,Toy Story,en,5415.0
1,8844,Jumanji,en,2413.0
2,15602,Grumpier Old Men,en,92.0
3,31357,Waiting to Exhale,en,34.0
4,11862,Father of the Bride Part II,en,173.0


In [9]:
# Keep only the rating columns that the rest of the notebook uses
# (the timestamp column is dropped here).
rating_columns = ["userId", "movieId", "rating"]
df_ratings = df_ratings[rating_columns]

df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [10]:
# Count the missing values in each column of the movies DataFrame.

df_movies.isna().sum()

id                    0
original_title        0
original_language    11
vote_count            6
dtype: int64

In [11]:
# Only a handful of rows have missing values, so drop them instead of imputing.
# The frame is rebound rather than modified with dropna(inplace=True): df_movies
# was produced by a column selection, and mutating such a derived frame in place
# is what triggers pandas' SettingWithCopyWarning.
df_movies = df_movies.dropna()

# Re-count the missing values to confirm that none are left.
df_movies.isna().sum()

id                   0
original_title       0
original_language    0
vote_count           0
dtype: int64

In [12]:
# Count the missing values in each column of the ratings DataFrame.

df_ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [13]:
# Count how many rows (ratings) each userId has in df_ratings.

df_ratings["userId"].value_counts()

userId
45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
30155         1
9641          1
164717        1
243426        1
234625        1
Name: count, Length: 270896, dtype: int64

In [14]:
# Find the users who have more than 999 ratings.
# count_ratings is a boolean Series indexed by userId (True when the user has
# more than 999 ratings); y holds the userIds whose flag is True.
ratings_per_user = df_ratings["userId"].value_counts()
count_ratings = ratings_per_user.gt(999)

y = count_ratings.loc[count_ratings].index

y

Index([ 45811,   8659, 270123, 179792, 228291, 243443,  98415, 229879,  98787,
       172224,
       ...
       227649, 244253, 257117,  30733, 196384,  53075, 220764, 214328,  14354,
       182812],
      dtype='int64', name='userId', length=2509)

In [15]:
# Keep only the ratings made by the users collected in y (more than 999 ratings),
# then re-count the ratings per user to confirm the filter worked.
is_frequent_rater = df_ratings["userId"].isin(y)
df_ratings = df_ratings[is_frequent_rater]

df_ratings.value_counts("userId")

userId
45811     18276
8659       9279
270123     7638
179792     7515
228291     7410
          ...  
14354      1000
196384     1000
220764     1000
53075      1000
30733      1000
Name: count, Length: 2509, dtype: int64

In [16]:
# Keep only the movies whose vote_count is above 999, then list them from the
# fewest to the most votes to check the lower bound of the filter.
has_enough_votes = df_movies["vote_count"].gt(999)
df_movies = df_movies[has_enough_votes]

df_movies.sort_values("vote_count")

Unnamed: 0,id,original_title,original_language,vote_count
10776,9286,Final Destination 3,en,1000.0
26554,280092,Insidious: Chapter 3,en,1005.0
13952,20943,The Ugly Truth,en,1005.0
2831,658,Goldfinger,en,1005.0
24438,242512,Ouija,en,1006.0
...,...,...,...,...
26564,293660,Deadpool,en,11444.0
17818,24428,The Avengers,en,12000.0
14551,19995,Avatar,en,12114.0
12481,155,The Dark Knight,en,12269.0


In [17]:
# Count how many of the remaining movies there are per original language.

df_movies.value_counts("original_language")

original_language
en    1100
fr       5
ja       5
it       3
ko       2
cn       1
de       1
es       1
id       1
pt       1
sv       1
Name: count, dtype: int64

In [18]:
# Keep only the movies whose original language code is "en" (English),
# then re-count per language to confirm that only "en" is left.
is_english = df_movies["original_language"].eq("en")
df_movies = df_movies[is_english]

df_movies.value_counts("original_language")

original_language
en    1100
Name: count, dtype: int64

In [19]:
# Inspect the column dtypes of the movies DataFrame.

df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1100 non-null   object 
 1   original_title     1100 non-null   object 
 2   original_language  1100 non-null   object 
 3   vote_count         1100 non-null   float64
dtypes: float64(1), object(3)
memory usage: 43.0+ KB


In [20]:
# Inspect the column dtypes of the ratings DataFrame.

df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3844582 entries, 17291 to 26023521
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 117.3 MB


In [21]:
# The movies "id" column is stored as object while the ratings "movieId" column
# is an integer, so cast "id" to an integer before joining the two frames.
# assign() returns a new frame; the previous direct column assignment wrote into
# a frame obtained by boolean filtering, which triggers SettingWithCopyWarning.
df_movies = df_movies.assign(id=df_movies["id"].astype(int))

# Confirm the new dtype of "id".
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, 0 to 44842
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1100 non-null   int32  
 1   original_title     1100 non-null   object 
 2   original_language  1100 non-null   object 
 3   vote_count         1100 non-null   float64
dtypes: float64(1), int32(1), object(2)
memory usage: 38.7+ KB


In [22]:
# Inner-join the ratings with the movies so every rating row carries the movie
# title, and drop the "id" column, which duplicates "movieId" after the join.
# NOTE(review): movieId (ratings.csv) and id (movies_metadata.csv) may belong to
# different id systems (possibly MovieLens ids vs. TMDB ids, which would need a
# mapping table) - confirm that this join key really matches the same films.
ratings_and_movies = (
    df_ratings
    .merge(df_movies, how="inner", left_on="movieId", right_on="id")
    .drop(columns="id")
)
ratings_and_movies.head()

Unnamed: 0,userId,movieId,rating,original_title,original_language,vote_count
0,229,12,1.0,Finding Nemo,en,6292.0
1,229,70,3.0,Million Dollar Baby,en,2519.0
2,229,77,3.0,Memento,en,4168.0
3,229,85,3.0,Raiders of the Lost Ark,en,3949.0
4,229,106,4.0,Predator,en,2129.0


In [23]:
# Confirm that the merged DataFrame contains no missing values.

ratings_and_movies.isna().sum()

userId               0
movieId              0
rating               0
original_title       0
original_language    0
vote_count           0
dtype: int64

In [24]:
# Drop repeated (userId, movieId) pairs in case a user rated the same movie
# more than once; the first occurrence of each pair is kept.
rating_key = ["userId", "movieId"]
ratings_and_movies = ratings_and_movies.drop_duplicates(subset=rating_key)

In [25]:
# Remove the movieId column: from here on the movie is identified by its title.
ratings_and_movies = ratings_and_movies.drop(columns="movieId")

In [26]:
# Build the movie x user matrix: one row per movie title, one column per userId,
# each cell holding that user's rating (NaN when the user did not rate the movie).
# Because rows are keyed by title, several ratings that land in the same
# (title, user) cell are combined by pivot_table's default aggregation (mean).
pivot_layout = {
    "index": "original_title",
    "columns": "userId",
    "values": "rating",
}
movies_pivot = ratings_and_movies.pivot_table(**pivot_layout)

display(movies_pivot)

userId,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,,,,,,,,,,,...,,2.5,,3.0,3.0,,,,,
12 Angry Men,,,,,,,,,,,...,,,,,,,,,3.5,
127 Hours,,,,,,,,,,,...,,,,,,,,,,
1408,,,,,,,,,,,...,,,,,2.5,2.0,,,,
2 Fast 2 Furious,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wild Wild West,,,,,,,,,,,...,,,,,,,,,,
X-Men Origins: Wolverine,4.0,,4.0,3.5,,4.0,,,3.0,3.0,...,,,,,4.0,4.0,,,3.0,
X-Men: Days of Future Past,,,,,,,,,,,...,,,,,,,,,,
Zodiac,5.0,,,,,,,,,,...,,,,,4.0,,,,,


In [27]:
# Replace the missing ratings with 0, so that "not rated" is stored as 0 in the
# matrix passed on to the next steps.
movies_pivot = movies_pivot.fillna(0)

display(movies_pivot)

userId,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wild Wild West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
X-Men Origins: Wolverine,4.0,0.0,4.0,3.5,0.0,4.0,0.0,0.0,3.0,3.0,...,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,3.0,0.0
X-Men: Days of Future Past,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zodiac,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Most cells of the pivot are 0, so store it as a CSR sparse matrix to keep the
# model training from being slowed down by the empty entries.
pivot_values = movies_pivot.values
movies_parse = csr_matrix(pivot_values)
type(movies_parse)

scipy.sparse._csr.csr_matrix

In [29]:
# Create the nearest-neighbours model and fit it on the sparse movie x user matrix.
# Only algorithm="brute" is set; every other option (including the distance
# metric and the number of neighbours) is left at scikit-learn's default.
model = NearestNeighbors(algorithm="brute").fit(movies_parse)
model

In [30]:
# Query the fitted model with the ratings row of a single film and keep the
# returned distances and neighbour positions.
# The positions refer to rows of movies_pivot, so indexing its index with them
# yields the titles. Neighbours are found from how the same users rated the films.
# The queried title itself is listed among the results.

# 127 Hours

query_row = movies_pivot.filter(items=["127 Hours"], axis=0)
distance, sugestions = model.kneighbors(query_row.values.reshape(1, -1))


for neighbour_positions in sugestions:
    print(movies_pivot.index[neighbour_positions])

Index(['127 Hours', 'American Hustle', 'The Expendables 2', 'Lord of War',
       'RED 2'],
      dtype='object', name='original_title')


In [31]:
# Titanic: nearest neighbours of the film's ratings row, printed as titles.

query_row = movies_pivot.filter(items=["Titanic"], axis=0)
distance, sugestions = model.kneighbors(query_row.values.reshape(1, -1))

for neighbour_positions in sugestions:
    print(movies_pivot.index[neighbour_positions])


Index(['Titanic', 'Big Fish', 'Psycho', 'Reservoir Dogs',
       'A Nightmare on Elm Street'],
      dtype='object', name='original_title')


In [32]:
# The Terminator: nearest neighbours of the film's ratings row, printed as titles.

query_row = movies_pivot.filter(items=["The Terminator"], axis=0)
distance, sugestions = model.kneighbors(query_row.values.reshape(1, -1))

for neighbour_positions in sugestions:
    print(movies_pivot.index[neighbour_positions])

Index(['The Terminator', 'Annie Hall', 'The Matrix',
       'The Shawshank Redemption', 'Good Will Hunting'],
      dtype='object', name='original_title')


##### 127 Hours - Aqui tivemos recomendações de alguns filmes de thriller, ação e aventura
##### Titanic - Conseguimos observar o interesse das pessoas em filmes clássicos lançados em períodos próximos
##### The Terminator - Podemos observar a recomendação de The Matrix, que é um filme semelhante de ação e ficção científica

Considerando os outros filmes não mencionados, a avaliação semelhante de um grande grupo de pessoas indica uma versatilidade de gêneros. Isso sugere que o sistema de recomendações é capaz de identificar não apenas filmes similares, mas também filmes de gêneros opostos, atendendo a uma ampla gama de preferências.