# Recomendação de filmes utilizando o dataset do MovieLens
## Tratamento dos dados, implementação e comparação entre o método Baseline e o modelo Apriori

### Importar Bibliotecas

In [None]:
!pip install numpy pandas mlxtend wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=c42ea54899615eebca0ce11dbd95455c585c32888754be03aea80e8871d87405
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import numpy as np
import pandas as pd

### Dados crus do dataset movieLens

In [None]:
import wget
!python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
# Botar referência e créditos ao Marcelo Manzato
!tar -xvzf ml-20m-compact.tar.gz
# Aprox 400 filmes e 11k usuarios


Saved under ml-20m-compact.tar.gz
dataset/
dataset/tags_sample.csv
dataset/._.DS_Store
dataset/.DS_Store
dataset/movies_sample.csv
dataset/._genome-tags.csv
dataset/genome-tags.csv
dataset/._ml-youtube.csv
dataset/ml-youtube.csv
dataset/._genome-scores.csv
dataset/genome-scores.csv
dataset/ratings_sample.csv


In [None]:
# Explorar os dados
movies = pd.read_csv('./dataset/movies_sample.csv')
ratings = pd.read_csv('./dataset/ratings_sample.csv')
df = ratings[['userId', 'movieId', 'rating']]
df = df.merge(movies[['movieId', 'title']])
# Mapeamento em idx
map_users = {user: idx for idx, user in enumerate(df.userId.unique())}
map_items = {item: idx for idx, item in enumerate(df.movieId.unique())}
df['userId'] = df['userId'].map(map_users)
df['movieId'] = df['movieId'].map(map_items)

map_title = {}
for _, row in df.iterrows():
    map_title[row.movieId] = row.title

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Crime|Drama
1,31,Dangerous Minds (1995),Drama
2,37,Across the Sea of Time (1995),Documentary|IMAX
3,161,Crimson Tide (1995),Drama|Thriller|War
4,193,Showgirls (1995),Drama


In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,11,7481,5.0,1230788624
1,11,1046,4.5,1251144145
2,11,616,4.0,1230782542
3,11,3535,2.0,1230784884
4,11,5669,5.0,1230853788


In [None]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,0,0,5.0,Enemy Mine (1985)
1,1,0,4.0,Enemy Mine (1985)
2,2,0,3.0,Enemy Mine (1985)
3,3,0,3.0,Enemy Mine (1985)
4,4,0,3.0,Enemy Mine (1985)


### Divisão da base em treino e teste

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=2)

### Funções para obter informações específicas do DataFrame

In [None]:
# Obter a nota que um usuário deu para um item.
def get_rating(userId,movieId):
    if len(df[(df['userId']==userId)&(df['movieId']==movieId)]) == 0:
        return 0
    return (df.loc[(df.userId==userId) & (df.movieId == movieId),'rating'].iloc[0])

get_rating(6102, 413)

3.5

In [None]:
# Obter a lista de todos os filmes que um usuário avaliou.
def get_movie_ids(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'movieId'].tolist())

get_movie_ids(0)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [None]:
# Obter o título do item dado o seu id.
def get_movie_title(movieId):
    if movieId not in df['movieId'].values:
        return ''
    return (df.loc[(df.movieId == movieId),'title'].iloc[0])

get_movie_title(0)

'Enemy Mine (1985)'

In [None]:
# Obter a lista de ratings de um usuário.
def get_user_ratings(userId):
    if userId not in df['userId'].values:
        return []
    return (df.loc[(df.userId==userId),'rating'].tolist())

get_user_ratings(0)

[5.0, 4.5, 4.0, 2.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 3.5, 5.0]

In [None]:
# Obter a média de ratings de um usuário
def get_user_mean(userId):
    return np.mean(get_user_ratings(userId))

get_user_mean(1)

3.38

In [None]:
# Obter a lista de todos os usuários que avaliaram o filme
def get_user_ids(movieId):
    if movieId not in df['movieId'].values:
        return []
    return (df.loc[(df.movieId==movieId),'userId'].tolist())



In [None]:
# Obter todas as notas do filme
def get_movie_ratings(movieId):
    if movieId not in df['movieId'].values:
        return []
    return (df.loc[(df.movieId==movieId),'rating'].tolist())


In [None]:
# Obter a média de notas do filme
def get_movie_mean(movieId):
    return np.mean(get_movie_ratings(movieId))

get_movie_mean(0)

3.4496732026143793

# Método Baseline
### Método simples para predição de avaliações baseado em tendências de cada usuário e item

> Recomeda filmes considerando o contexto e os dados tanto dos filmes quanto dos usuários, e a associação entre os filmes e os usuários

In [None]:
# Compute the global mean, movie bias and user bias
c = 1
global_mean = df['rating'].mean()
movie_list = df['movieId'].unique()
movie_bias = {}
for i in movie_list:
    users = get_user_ids(i)
    movie_bias[i] = sum((get_rating(u, i)-global_mean) for u in users) / (len(users) + c)

user_list = df['userId'].unique()
user_bias = {}
for u in user_list:
    items = get_movie_ids(u)
    user_bias[u] = sum((get_rating(u, i)-global_mean-movie_bias[i]) for i in items) / (len(items) + c)

print(user_bias)
print(movie_bias)

pred = global_mean + user_bias[3] + movie_bias[1]
pred

In [None]:
# Recomend movies that the user didnt watched yet
def RecommendMovies(userId, k = 5):
    movie_list = df['movieId'].unique()
    watched = get_movie_ids(userId)
    recommend = []
    for i in movie_list:
        if(not i in watched):
             # Calculate the recommendation score for each movie
            recommendation_score = global_mean + user_bias[userId] + movie_bias[i]
            # Append movie ID, calculated value pair to the recommend list
            recommend.append((i, recommendation_score))
      # Sort the recommendations in decreasing order by the calculated value
    recommend.sort(key=lambda x: x[1], reverse=True)
    return recommend[:k]

RecommendMovies(0)

[(33, 5.158610062571328),
 (167, 5.024112458170631),
 (268, 5.021000365894105),
 (19, 5.020394702080607),
 (80, 5.0111744544026875)]

In [None]:
print(get_movie_ids(1))
print(RecommendMovies(1))

[0, 3, 5, 6, 12, 14, 16, 18, 19, 21, 23, 29, 30, 34, 39, 54, 57, 72, 74, 84, 85, 87, 105, 106, 107]
[(33, 4.150767969524827), (167, 4.01627036512413), (268, 4.013158272847603), (80, 4.003332361356186), (40, 3.9922465170350643)]


### TODO: Avaliação do desempenho do algoritmo baseline

### Método que para cada usuário pega no conjunto de teste os filmes que o usuário avaliou bem (rating > 3 por exemplo) e verifica se este filme foi de fato recomendado pelo método Baseline na função RecommendMovies
### Usar plots?

# TODO
# Modelo Apriori

> Recomenda filmes considerando principalmente o contexto dos filmes e a relação (associação) entre eles


### Pré-processamento e criação da tabela de filmes assistidos

In [None]:
df_pivot = df.pivot(index='userId', columns='title', values='rating').fillna(0)

In [None]:
df_pivot = df_pivot.astype('int64')


In [None]:
def encode_ratings(x):
    if x<=0:
        return 0
    if x>=1:
        return 1

df_pivot = df_pivot.applymap(encode_ratings)

In [35]:
df_pivot.head()


  and should_run_async(code)


title,100 Rifles (1969),21 (2008),2AM: The Smiling Man (2013),3 Simoa (2012),97 Percent True (2008),A Rumor Of War (1980),A Walk in the Woods (2015),A mí las mujeres ni fu ni fa (1971),About Cherry (2012),"Ace in the Hole (Big Carnival, The) (1951)",...,While You Were Sleeping (1995),White Nights (1985),Wild Target (2010),"Winning Team, The (1952)",Wish I Was Here (2014),"Woman, a Gun and a Noodle Shop, A (San qiang pai an jing qi) (2009)",Wonderful Days (a.k.a. Sky Blue) (2003),Yolanda and the Thief (1945),Zero Effect (1998),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Treinando o modelo

In [43]:
from mlxtend.frequent_patterns import apriori

frequent_itemset = apriori(df_pivot, min_support=0.07, use_colnames=True)


  and should_run_async(code)


In [44]:
frequent_itemset.head()


  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.091794,(21 (2008))
1,0.148963,(Along Came a Spider (2001))
2,0.401894,(American Psycho (2000))
3,0.13697,(Apocalypto (2006))
4,0.194049,"(Aristocats, The (1970))"


In [45]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemset, metric="lift", min_threshold=1)


  and should_run_async(code)


In [46]:
rules.head()

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Along Came a Spider (2001)),(American Psycho (2000)),0.148963,0.401894,0.086655,0.581719,1.447446,0.026787,1.429916,0.363237
1,(American Psycho (2000)),(Along Came a Spider (2001)),0.401894,0.148963,0.086655,0.215616,1.447446,0.026787,1.084975,0.516844
2,(Along Came a Spider (2001)),(Bowling for Columbine (2002)),0.148963,0.454013,0.07367,0.494552,1.089291,0.006039,1.080205,0.09632
3,(Bowling for Columbine (2002)),(Along Came a Spider (2001)),0.454013,0.148963,0.07367,0.162264,1.089291,0.006039,1.015877,0.150135
4,(Along Came a Spider (2001)),(Chasing Amy (1997)),0.148963,0.496303,0.083499,0.560533,1.129416,0.009568,1.146154,0.134644


### Resultados


In [80]:
df_res = rules.sort_values(by=['lift'], ascending=False)
df_res.head()


  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42672,"(Collateral (2004), Lara Croft: Tomb Raider (2...","(I, Robot (2004), Rush Hour 2 (2001), Star War...",0.167538,0.138774,0.070334,0.419806,3.025115,0.047084,1.484377,0.804162
42653,"(I, Robot (2004), Rush Hour 2 (2001), Star War...","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.138774,0.167538,0.070334,0.506823,3.025115,0.047084,1.687956,0.777303
42662,"(Lara Croft: Tomb Raider (2001), Collateral (2...","(I, Robot (2004), Rush Hour 2 (2001))",0.151398,0.154914,0.070334,0.464562,2.998833,0.04688,1.578308,0.785453
42663,"(I, Robot (2004), Rush Hour 2 (2001))","(Lara Croft: Tomb Raider (2001), Collateral (2...",0.154914,0.151398,0.070334,0.454016,2.998833,0.04688,1.554263,0.788721
21145,"(I, Robot (2004), Rush Hour 2 (2001))","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.154914,0.167538,0.076105,0.491269,2.932278,0.050151,1.636349,0.779765


### Testando o modelo

In [81]:
movie_test = 'I, Robot (2004)'

  and should_run_async(code)


In [82]:
df_test = df_res[df_res['antecedents'].apply(lambda x: len(x) ==1 and next(iter(x)) == movie_test)]

  and should_run_async(code)


In [83]:
df_test = df_test[df_test['lift'] > 1.5]

  and should_run_async(code)


In [84]:
df_test.head()


  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42673,"(I, Robot (2004))","(Collateral (2004), Rush Hour 2 (2001), Lara C...",0.39486,0.082777,0.070334,0.178123,2.151833,0.037648,1.11601,0.884556
21149,"(I, Robot (2004))","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.39486,0.089901,0.076105,0.192738,2.143897,0.040606,1.12739,0.881713
7368,"(I, Robot (2004))","(National Treasure: Book of Secrets (2007), St...",0.39486,0.086023,0.070784,0.179265,2.083905,0.036817,1.113607,0.859523
31945,"(I, Robot (2004))","(Lara Croft: Tomb Raider (2001), Collateral (2...",0.39486,0.088729,0.072047,0.182462,2.056403,0.037011,1.114653,0.848918
20280,"(I, Robot (2004))","(Collateral (2004), Lara Croft: Tomb Raider (2...",0.39486,0.090081,0.072858,0.184517,2.048342,0.037289,1.115804,0.845755


In [85]:
movies = df_test['consequents'].values

movie_list = []
for movie in movies:
    for title in movie:
        if title not in movie_list:
            movie_list.append(title)

  and should_run_async(code)


In [86]:
movie_list[0:10]


  and should_run_async(code)


['Collateral (2004)',
 'Rush Hour 2 (2001)',
 'Lara Croft: Tomb Raider (2001)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'National Treasure: Book of Secrets (2007)',
 'American Psycho (2000)',
 'Conspiracy Theory (1997)',
 'Karate Kid, The (1984)',
 'Dark City (1998)',
 'Reservoir Dogs (1992)']