# 7DaysOfCode
## Desafio de Sistema de Recomendação

### Objetivo
O objetivo deste desafio é criar um sistema de recomendações utilizando o dataset MovieLens, um conjunto de dados clássico usado em problemas de sistemas de recomendação. A proposta é desenvolver um sistema que recomende 5 filmes para o usuário com base em comportamentos passados.


In [1]:

import pandas as pd
# Biblioteca para abrir o arquivo
from zipfile import ZipFile

# Arquivos estão aqui https://files.grouplens.org/datasets/movielens/ml-100k.zip
# Subindo os arquivos
files = ZipFile('../data/ml-100k.zip')
files

<zipfile.ZipFile filename='../data/ml-100k.zip' mode='r'>

In [2]:
#subindo os dataset filmes e notas com as variáveis que irei utilizar
df_filmes = pd.read_csv(files.open('ml-100k/u.item'), sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=['movie_id', 'title'])
df_notas = pd.read_csv(files.open('ml-100k/u.data'), sep='\t', header=None,usecols=[0, 1, 2], names=['user_id', 'movie_id', 'rating'])
print(df_filmes)
print(df_notas)


      movie_id                                      title
0            1                           Toy Story (1995)
1            2                           GoldenEye (1995)
2            3                          Four Rooms (1995)
3            4                          Get Shorty (1995)
4            5                             Copycat (1995)
...        ...                                        ...
1677      1678                          Mat' i syn (1997)
1678      1679                           B. Monkey (1998)
1679      1680                       Sliding Doors (1998)
1680      1681                        You So Crazy (1994)
1681      1682  Scream of Stone (Schrei aus Stein) (1991)

[1682 rows x 2 columns]
       user_id  movie_id  rating
0          196       242       3
1          186       302       3
2           22       377       1
3          244        51       2
4          166       346       1
...        ...       ...     ...
99995      880       476       3
99996      716 

In [3]:
# Informações sobre os dados
print(df_filmes.info())
print(df_notas.info())
# Verificar valores ausentes
print(df_filmes.isnull().sum())
print(df_notas.isnull().sum())

# Verificar dados duplicados
print(df_filmes.duplicated().sum())
print(df_notas.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  1682 non-null   int64 
 1   title     1682 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   100000 non-null  int64
 1   movie_id  100000 non-null  int64
 2   rating    100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB
None
movie_id    0
title       0
dtype: int64
user_id     0
movie_id    0
rating      0
dtype: int64
0
0


In [4]:
df = pd.merge(df_notas, df_filmes, on='movie_id', how='left')
df = df[['user_id', 'movie_id', 'title', 'rating']]


In [5]:
# Primeira Heurística: baseado nas infos de outros users vou indicar os 5 primeiros filmes com maior popularidade
#Fazendo modelo simples de recomendação 
# Calcula o total de ratings por movie_id
total_rating = df.groupby('movie_id')['rating'].count().reset_index()
total_rating.columns = ['movie_id', 'total_rating']
# Agora faz a junção com o DataFrame original df para adicionar a coluna total_rating
df = pd.merge(df, total_rating, on='movie_id', how='left')
df

Unnamed: 0,user_id,movie_id,title,rating,total_rating
0,196,242,Kolya (1996),3,117
1,186,302,L.A. Confidential (1997),3,297
2,22,377,Heavyweights (1994),1,13
3,244,51,Legends of the Fall (1994),2,81
4,166,346,Jackie Brown (1997),1,126
...,...,...,...,...,...
99995,880,476,"First Wives Club, The (1996)",3,160
99996,716,204,Back to the Future (1985),5,350
99997,276,1090,Sliver (1993),1,37
99998,13,225,101 Dalmatians (1996),2,109


In [6]:
# Exibe os filmes mais populares
top_movies = df.sort_values(by='total_rating', ascending=False)[['movie_id', 'title', 'total_rating']].drop_duplicates().head(5)
print(top_movies)

       movie_id                      title  total_rating
24954        50           Star Wars (1977)           583
99847       258             Contact (1997)           509
93021       100               Fargo (1996)           508
77101       181  Return of the Jedi (1983)           507
61770       294           Liar Liar (1997)           485


In [7]:
# Segunda Heurística: baseado nas infos de outros users vou indicar os 15 primeiros filmes com maior nota média
average_ratings = df.groupby('movie_id')['rating'].mean().reset_index()
average_ratings.columns = ['movie_id', 'average_rating']
df = pd.merge(df, average_ratings, on='movie_id', how='left')
df

Unnamed: 0,user_id,movie_id,title,rating,total_rating,average_rating
0,196,242,Kolya (1996),3,117,3.991453
1,186,302,L.A. Confidential (1997),3,297,4.161616
2,22,377,Heavyweights (1994),1,13,2.153846
3,244,51,Legends of the Fall (1994),2,81,3.456790
4,166,346,Jackie Brown (1997),1,126,3.642857
...,...,...,...,...,...,...
99995,880,476,"First Wives Club, The (1996)",3,160,3.018750
99996,716,204,Back to the Future (1985),5,350,3.834286
99997,276,1090,Sliver (1993),1,37,2.405405
99998,13,225,101 Dalmatians (1996),2,109,2.908257


In [8]:
# Exibe os filmes com maiores notas médias desde que tenham mais de 300 avaliações pois pode ficar muito nichado
top_average = df.query('total_rating >= 300').sort_values(by='average_rating', ascending=False).drop_duplicates(subset='movie_id').head(15)
top_average

Unnamed: 0,user_id,movie_id,title,rating,total_rating,average_rating
34016,498,50,Star Wars (1977),4,583,4.358491
6820,56,98,"Silence of the Lambs, The (1991)",4,390,4.289744
49654,744,127,"Godfather, The (1972)",5,413,4.283293
28131,342,174,Raiders of the Lost Ark (1981),2,420,4.252381
25981,461,313,Titanic (1997),4,350,4.245714
7708,345,172,"Empire Strikes Back, The (1980)",4,367,4.20436
82453,786,173,"Princess Bride, The (1987)",4,324,4.17284
2695,150,100,Fargo (1996),2,508,4.155512
85268,394,168,Monty Python and the Holy Grail (1974),5,316,4.066456
92239,927,56,Pulp Fiction (1994),4,394,4.060914


In [9]:
import re
def remove_year(title):
    # Remove o ano e os parênteses ao final do título
    return re.sub(r'\(\d{4}\)$', '', title).strip()
# Aplica a função para remover o ano do título
df['title'] = df['title'].apply(remove_year)

df.head()
# Transformei em json para poder subir no firebase

df.to_json('filmes_notas.json')

In [10]:
#vamos criar os métodos de recomendação 
tabela_filmes = pd.pivot_table(df, index='title', columns='user_id', values='rating').fillna(0)
tabela_filmes



user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Guns II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
"Young Poisoner's Handbook, The",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zeus and Roxanne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unknown,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.neighbors import NearestNeighbors


# Treinar o modelo KNN com a métrica de cosseno
modelo_knn = NearestNeighbors(metric='cosine', algorithm='brute')
modelo_knn.fit(tabela_filmes)

# Função para recomendar filmes usando KNN
def recomendar_filmes_knn(filme, num_recomendacoes=5):
    if filme not in tabela_filmes.index:
        print(f"O filme '{filme}' não está presente na tabela de treino.")
        return []

    # Encontrar o índice do filme na tabela de treino
    indice_film = tabela_filmes.index.get_loc(filme)
    
    # Encontrar os vizinhos mais próximos do filme
    distancia, vizinhos = modelo_knn.kneighbors(
        tabela_filmes.iloc[indice_film].values.reshape(1, -1), 
        n_neighbors=num_recomendacoes + 1
    )
    
    # Obter os títulos dos filmes recomendados (remover o próprio filme)
    recomendacoes = [tabela_filmes.index[i] for i in vizinhos.flatten()]
    recomendacoes.remove(filme)  # Remover o próprio filme da lista
    return recomendacoes[:num_recomendacoes]

# Função para testar recomendações para um filme específico
def testar_recomendacoes(filme, num_recomendacoes=5):
    recomendacoes = recomendar_filmes_knn(filme, num_recomendacoes)
    if recomendacoes:
        print(f"Recomendações para o filme '{filme}':")
        for i, rec in enumerate(recomendacoes, 1):
            print(f"{i}: {rec}")
    else:
        print(f"Filme '{filme}' não encontrado no conjunto de dados.")

# Testar a função de recomendações
testar_recomendacoes('101 Dalmatians')



Recomendações para o filme '101 Dalmatians':
1: Jack
2: Twister
3: Willy Wonka and the Chocolate Factory
4: Independence Day (ID4)
5: Toy Story


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Calcular a similaridade de cosseno no conjunto de treino
modelo_cosine = cosine_similarity(tabela_filmes)
modelo_cosine = pd.DataFrame(tabela_filmes, columns=tabela_filmes.index, index=tabela_filmes.index)


In [13]:
def recomendar_filmes(filme, num_recomendacoes=5):
    # Selecionar as recomendações para um filme específico
    cossine_df = pd.DataFrame(modelo_cosine[filme].sort_values(ascending=False))
    cossine_df.columns = ['Recomendações']
    return cossine_df.head(num_recomendacoes + 1)[1:].index.tolist()

In [14]:
# Função para testar recomendações para um filme específico
def testar_recomendacoes(filme, num_recomendacoes=5):
    recomendacoes = recomendar_filmes(filme, num_recomendacoes)
    if recomendacoes:
        print(f"Recomendações para o filme '{filme}':")
        for i, rec in enumerate(recomendacoes, 1):
            print(f"{i}: {rec}")
    else:
        print(f"Filme '{filme}' não encontrado no conjunto de dados.")

# Testar a função de recomendações
filme_teste = '101 Dalmatians'
testar_recomendacoes(filme_teste)

Recomendações para o filme '101 Dalmatians':
1: 1-900
2: 101 Dalmatians
3: 12 Angry Men
4: 187
5: 2 Days in the Valley


In [15]:
#exportando o modelo  com pickle

import pickle
filename = 'modelo.pk1'
pickle.dump(modelo_knn, open(filename, 'wb'))

In [16]:
#exportando o modelo  com jolib (Pelo que entendi é o mais recomendado)
import joblib

joblib.dump(modelo_knn, 'knn.sav')

['knn.sav']