# Recomendação de filmes utilizando o dataset do MovieLens
## Tratamento dos dados, implementação e comparação entre o método Baseline e o modelo Apriori

### Importar Bibliotecas

In [1]:
!pip install numpy pandas mlxtend wget

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd

### Dados crus do dataset movieLens

In [3]:
import wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
# Botar referência e créditos ao Marcelo Manzato
!tar -xvzf ml-20m-compact.tar.gz
# Aprox 400 filmes e 11k usuarios

100% [....................................................] 65019041 / 65019041
Saved under ml-20m-compact.tar (7).gz
dataset/
dataset/tags_sample.csv
dataset/._.DS_Store
dataset/.DS_Store
dataset/movies_sample.csv
dataset/._genome-tags.csv
dataset/genome-tags.csv
dataset/._ml-youtube.csv
dataset/ml-youtube.csv
dataset/._genome-scores.csv
dataset/genome-scores.csv
dataset/ratings_sample.csv


In [4]:
# Explorar os dados
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']])
# Mapeamento em idx
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

map_title = {}
for _, row in df.iterrows():
    map_title[row.movieId] = row.title

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
1,31,Dangerous Minds (1995),Drama
2,37,Across the Sea of Time (1995),Documentary|IMAX
3,161,Crimson Tide (1995),Drama|Thriller|War
4,193,Showgirls (1995),Drama


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,11,7481,5.0,1230788624
1,11,1046,4.5,1251144145
2,11,616,4.0,1230782542
3,11,3535,2.0,1230784884
4,11,5669,5.0,1230853788


In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,0,0,5.0,Enemy Mine (1985)
1,1,0,4.0,Enemy Mine (1985)
2,2,0,3.0,Enemy Mine (1985)
3,3,0,3.0,Enemy Mine (1985)
4,4,0,3.0,Enemy Mine (1985)


### Divisão da base em treino e teste

In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=2)

### Funções para obter informações específicas do DataFrame

In [9]:
# Obter a nota que um usuário deu para um item.
def get_rating(userId,movieId):
    if len(df[(df['userId']==userId)&(df['movieId']==movieId)]) == 0:
        return 0
    return (df.loc[(df.userId==userId) & (df.movieId == movieId),'rating'].iloc[0])

get_rating(6102, 413)

3.5

In [10]:
# Obter a lista de todos os filmes que um usuário avaliou.
def get_movie_ids(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'movieId'].tolist())

get_movie_ids(0)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [11]:
# Obter o título do item dado o seu id.
def get_movie_title(movieId):
    if movieId not in df['movieId'].values:
        return ''
    return (df.loc[(df.movieId == movieId),'title'].iloc[0])

get_movie_title(0)

'Enemy Mine (1985)'

In [12]:
# Obter a lista de ratings de um usuário.
def get_user_ratings(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'rating'].tolist())

get_user_ratings(0)

[5.0, 4.5, 4.0, 2.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 3.5, 5.0]

In [13]:
# Obter a média de ratings de um usuário
def get_user_mean(userId):
    return np.mean(get_user_ratings(userId))

get_user_mean(1)

3.38

In [14]:
# Obter a lista de todos os usuários que avaliaram o filme
def get_user_ids(movieId):
    if movieId not in df['movieId'].values:
        return []
    return (df.loc[(df.movieId==movieId),'userId'].tolist())



In [15]:
# Obter todas as notas do filme
def get_movie_ratings(movieId):
    if movieId not in df['movieId'].values:
        return []
    return (df.loc[(df.movieId==movieId),'rating'].tolist())


In [16]:
# Obter a média de notas do filme
def get_movie_mean(movieId):
    return np.mean(get_movie_ratings(movieId))

get_movie_mean(0)

3.4496732026143793

# Método Baseline
### Método simples para predição de avaliações baseado em tendências de cada usuário e item

> Recomeda filmes considerando o contexto e os dados tanto dos filmes quanto dos usuários, e a associação entre os filmes e os usuários

In [17]:
# Compute the global mean, movie bias and user bias
c = 1
global_mean = df['rating'].mean()
movie_list = df['movieId'].unique()
movie_bias = {}
for i in movie_list:
    users = get_user_ids(i)
    movie_bias[i] = sum((get_rating(u, i)-global_mean) for u in users) / (len(users) + c)

user_list = df['userId'].unique()
user_bias = {}
for u in user_list:
    items = get_movie_ids(u)
    user_bias[u] = sum((get_rating(u, i)-global_mean-movie_bias[i]) for i in items) / (len(items) + c)

print(user_bias)
print(movie_bias)

pred = global_mean + user_bias[3] + movie_bias[1]
pred

{0: 0.9591448165754742, 1: -0.04869727647102709, 2: -0.564784684687678, 3: -0.22195443098479137, 4: 0.12998848795779563, 5: -0.056108067733634866, 6: 0.18304527377715707, 7: -0.4811877759425817, 8: -0.5245319081526818, 9: -0.32186096131863956, 10: -0.045975197312582844, 11: -0.12273996608529551, 12: -0.5694747105118395, 13: 0.3625331276706996, 14: 0.2911454074577962, 15: -0.6912442063291419, 16: -0.007081374528962683, 17: 0.7757812547498685, 18: 0.18606010816996008, 19: 0.31638587085475744, 20: 0.3658605317796833, 21: -0.06471141741088646, 22: 0.016013654059483105, 23: 0.41291764036439116, 24: -0.6260452013207763, 25: -0.18314854259600688, 26: 0.16205030356365327, 27: -0.20106221606561445, 28: 0.340775101254133, 29: -0.07040215684299775, 30: 0.784051275227572, 31: 0.398622620265576, 32: 0.49325755227192775, 33: 0.5363116119298528, 34: -0.2435998389799411, 35: -0.3280983833995616, 36: -0.1183245883859704, 37: -0.4053958263635231, 38: 0.3012311690523328, 39: -0.6668458104345114, 40: 0.12

3.5601633660387946

In [18]:
# Recomend movies that the user didnt watched yet
def RecommendMovies(userId, k = 5):
    movie_list = df['movieId'].unique()
    watched = get_movie_ids(userId)
    recommend = []
    for i in movie_list:
        if(not i in watched):
             # Calculate the recommendation score for each movie
            recommendation_score = global_mean + user_bias[userId] + movie_bias[i]
            # Append movie ID, calculated value pair to the recommend list
            recommend.append((i, recommendation_score))
      # Sort the recommendations in decreasing order by the calculated value
    recommend.sort(key=lambda x: x[1], reverse=True)
    return recommend[:k]

RecommendMovies(0)

[(33, 5.158610062571328),
 (167, 5.024112458170631),
 (268, 5.021000365894105),
 (19, 5.020394702080607),
 (80, 5.0111744544026875)]

In [19]:
print(get_movie_ids(1))
print(RecommendMovies(1))

[0, 3, 5, 6, 12, 14, 16, 18, 19, 21, 23, 29, 30, 34, 39, 54, 57, 72, 74, 84, 85, 87, 105, 106, 107]
[(33, 4.150767969524827), (167, 4.01627036512413), (268, 4.013158272847603), (80, 4.003332361356186), (40, 3.9922465170350643)]


### TODO: Avaliação do desempenho do algoritmo baseline

### Método que para cada usuário pega no conjunto de teste os filmes que o usuário avaliou bem (rating > 3 por exemplo) e verifica se este filme foi de fato recomendado pelo método Baseline na função RecommendMovies
### Usar plots?

# TODO
# Modelo Apriori

> Recomenda filmes considerando principalmente o contexto dos filmes e a relação (associação) entre eles

### Pré-processamento e criação da tabela de filmes assistidos

In [20]:
df_pivot = df.pivot(index='userId', columns='title', values='rating').fillna(0)

In [21]:
df_pivot = df_pivot.astype('int64')

In [22]:
df_pivot = df_pivot.applymap(lambda x: 1 if x > 0 else 0)

  df_pivot = df_pivot.applymap(lambda x: 1 if x > 0 else 0)


In [23]:
df_pivot.head()


title,100 Rifles (1969),21 (2008),2AM: The Smiling Man (2013),3 Simoa (2012),97 Percent True (2008),A Rumor Of War (1980),A Walk in the Woods (2015),A mí las mujeres ni fu ni fa (1971),About Cherry (2012),"Ace in the Hole (Big Carnival, The) (1951)",...,While You Were Sleeping (1995),White Nights (1985),Wild Target (2010),"Winning Team, The (1952)",Wish I Was Here (2014),"Woman, a Gun and a Noodle Shop, A (San qiang pai an jing qi) (2009)",Wonderful Days (a.k.a. Sky Blue) (2003),Yolanda and the Thief (1945),Zero Effect (1998),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Treinando o modelo

In [24]:
from mlxtend.frequent_patterns import apriori

frequent_itemset = apriori(df_pivot, min_support=0.07, use_colnames=True)



In [25]:
frequent_itemset.head()


Unnamed: 0,support,itemsets
0,0.148963,(Along Came a Spider (2001))
1,0.401894,(American Psycho (2000))
2,0.13697,(Apocalypto (2006))
3,0.194049,"(Aristocats, The (1970))"
4,0.107574,(Bachelor Party (1984))


In [26]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemset, metric="lift", min_threshold=1)


In [27]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Reservoir Dogs (1992)),(Along Came a Spider (2001)),0.719477,0.148963,0.111001,0.15428,1.035693,0.003825,1.006287,0.122853
1,(Along Came a Spider (2001)),(Reservoir Dogs (1992)),0.148963,0.719477,0.111001,0.745157,1.035693,0.003825,1.10077,0.040495
2,(Star Wars: Episode V - The Empire Strikes Bac...,(Along Came a Spider (2001)),0.855185,0.148963,0.128855,0.150675,1.011491,0.001464,1.002015,0.078451
3,(Along Came a Spider (2001)),(Star Wars: Episode V - The Empire Strikes Bac...,0.148963,0.855185,0.128855,0.865012,1.011491,0.001464,1.072801,0.013349
4,(American Psycho (2000)),(Bowling for Columbine (2002)),0.401894,0.454013,0.22633,0.563159,1.240404,0.043865,1.249854,0.324041


### Resultados


In [28]:
df_res = rules.sort_values(by=['lift'], ascending=False)
df_res.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
9523,"(Reservoir Dogs (1992), I, Robot (2004))",(Star Wars: Episode V - The Empire Strikes Bac...,0.290352,0.165014,0.106312,0.366149,2.218903,0.0584,1.317323,0.774083
9522,(Star Wars: Episode V - The Empire Strikes Bac...,"(Reservoir Dogs (1992), I, Robot (2004))",0.165014,0.290352,0.106312,0.644262,2.218903,0.0584,1.994864,0.657887
7237,"(Collateral (2004), Lara Croft: Tomb Raider (2...","(I, Robot (2004), Reservoir Dogs (1992))",0.167538,0.290352,0.106673,0.636706,2.192879,0.058028,1.953373,0.653458
7232,"(I, Robot (2004), Reservoir Dogs (1992))","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.290352,0.167538,0.106673,0.367391,2.192879,0.058028,1.315919,0.766547
7110,(Star Wars: Episode V - The Empire Strikes Bac...,"(Crimson Tide (1995), I, Robot (2004))",0.318485,0.150586,0.103968,0.326444,2.167822,0.056008,1.261089,0.790456


### Testando o modelo

In [29]:
movie_test = 'I, Robot (2004)'

In [30]:
df_test = df_res[df_res['antecedents'].apply(lambda x: len(x) == 1 and next(iter(x)) == movie_test)]

In [31]:
df_test = df_test[df_test['lift'] > 1.5]

In [32]:
df_test.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
7253,"(I, Robot (2004))",(Star Wars: Episode V - The Empire Strikes Bac...,0.39486,0.151398,0.119748,0.303266,2.003106,0.059967,1.217971,0.827537
2171,"(I, Robot (2004))","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.39486,0.167538,0.131109,0.332039,1.981871,0.064955,1.246273,0.818697
7239,"(I, Robot (2004))","(Reservoir Dogs (1992), Collateral (2004), Lar...",0.39486,0.136339,0.106673,0.270153,1.981479,0.052838,1.183345,0.818532
2194,"(I, Robot (2004))","(Rush Hour 2 (2001), Collateral (2004))",0.39486,0.137872,0.107484,0.272208,1.974356,0.053044,1.184581,0.815523
7226,"(I, Robot (2004))","(Karate Kid, The (1984), Collateral (2004), St...",0.39486,0.151758,0.109378,0.277004,1.825296,0.049454,1.173231,0.747172


In [33]:
movies = df_test['consequents'].values

movie_list = []
for movie in movies:
    for title in movie:
        if title not in movie_list:
            movie_list.append(title)

In [34]:
movie_list[0:10]


['Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Collateral (2004)',
 'Lara Croft: Tomb Raider (2001)',
 'Reservoir Dogs (1992)',
 'Rush Hour 2 (2001)',
 'Karate Kid, The (1984)',
 'Conspiracy Theory (1997)',
 'American Psycho (2000)',
 'Dark City (1998)',
 'Crimson Tide (1995)']

# TODO
# Modelo K-Nearest-Neighbors

> Recomenda filmes com base nas preferências de usuários semelhantes

In [35]:
!pip install scikit-surprise

from surprise import Dataset, Reader, KNNBasic

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

Defaulting to user installation because normal site-packages is not writeable
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f610c438130>

In [None]:
# Essa função Percorre todos os dados de treinamento disponíveis e extrai informações necessárias para construir o conjunto
# de treinamento. Isso inclui o conjunto completo de usuários, itens e avaliações.
trainset = data.build_full_trainset()

# Aqui estabelecemos as opções de similaridade do KNN usando cosseno (por conta de sua invariância a escala) e explicitando
# que as recomendações são baseadas nos usuários
sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

In [36]:
user_id_to_predict = 1  # Aqui é estabelecido o usuário em questão
items_to_ignore = df[df['userId'] == user_id_to_predict]['movieId'].tolist() # Aqui guardamos os filmes já avaliados pelo usuário

# Obtém IDs de filmes ainda não avaliados pelo usuário
all_movie_ids = df['movieId'].unique()
movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in items_to_ignore]

# Gera previsões para os filmes não avaliados
predictions = [model.predict(user_id_to_predict, movie_id) for movie_id in movies_to_predict]

# Organiza as previsões em ordem decrescente de estimativa de classificação
predictions.sort(key=lambda x: x.est, reverse=True)

# Obtém os IDs dos filmes recomendados
recommended_movie_ids = [prediction.iid for prediction in predictions]

# Mapeia os IDs dos filmes recomendados para os títulos reais.
recommended_movies = df[df['movieId'].isin(recommended_movie_ids)][['movieId', 'title']].drop_duplicates()

recommended_movies[:10]

Unnamed: 0,movieId,title
30572,11,Masters of the Universe (1987)
41054,13,Sliding Doors (1998)
48984,16,Rush Hour 2 (2001)
71009,20,While You Were Sleeping (1995)
89893,24,"Deep End of the Ocean, The (1999)"
96517,28,My Best Friend's Wedding (1997)
100764,29,"Man in the Iron Mask, The (1998)"
110977,33,North by Northwest (1959)
115786,34,Roman Holiday (1953)
118657,36,Cloudy with a Chance of Meatballs (2009)
