In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# pip install mlxtend

In [3]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


In [4]:
rating_data = pd.read_csv('ratings.csv')
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
rating_data.shape

(100836, 4)

In [6]:
print(rating_data.userId.nunique())
print(rating_data.movieId.nunique())

610
9724


In [7]:
movies = pd.read_csv('movies.csv')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
data_full=rating_data.merge(movies, on= "movieId")


In [9]:
data_full.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [10]:
ratings_merge = data_full.groupby(by="title")['rating'].count().reset_index().rename(columns={'rating':'totalRatings'})[['title','totalRatings']]
ratings_merge.head()

Unnamed: 0,title,totalRatings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [11]:
ratings_merge.shape

(9719, 2)

In [12]:
len(ratings_merge['title'].unique())

9719

In [13]:
ratings_total = pd.merge(data_full,ratings_merge,on='title',how='left')

In [14]:
ratings_total.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatings
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
6,19,1,4.0,965705637,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
7,21,1,3.5,1407618878,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
8,27,1,3.0,962685262,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
9,31,1,5.0,850466616,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [15]:
ratings_total.drop(['timestamp','genres'],axis=1,inplace=True)

In [16]:
ratings_total.head(5)

Unnamed: 0,userId,movieId,rating,title,totalRatings
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [17]:
ratings_total['totalRatings'].describe()

count    100836.000000
mean         58.758777
std          61.965384
min           1.000000
25%          13.000000
50%          39.000000
75%          84.000000
max         329.000000
Name: totalRatings, dtype: float64

In [18]:
ratings_total['totalRatings'].quantile(np.arange(0,1.01,0.1))

0.0      1.0
0.1      4.0
0.2     10.0
0.3     17.0
0.4     27.0
0.5     39.0
0.6     52.0
0.7     69.0
0.8    100.0
0.9    143.0
1.0    329.0
Name: totalRatings, dtype: float64

In [19]:
#Nos quedamos con las películas que tengan mas de 10 calificaciones
votes_count_threshold = 10

In [20]:
ratings_total['totalRatings'].quantile(np.arange(0,1.01,0.1))

0.0      1.0
0.1      4.0
0.2     10.0
0.3     17.0
0.4     27.0
0.5     39.0
0.6     52.0
0.7     69.0
0.8    100.0
0.9    143.0
1.0    329.0
Name: totalRatings, dtype: float64

In [21]:
ratings_top = ratings_total.query('totalRatings > @votes_count_threshold')

In [22]:
ratings_top['totalRatings'].quantile(np.arange(0,1.01,0.1))

0.0     11.0
0.1     16.0
0.2     23.0
0.3     32.0
0.4     42.0
0.5     52.0
0.6     65.0
0.7     88.0
0.8    115.0
0.9    165.0
1.0    329.0
Name: totalRatings, dtype: float64

In [23]:
ratings_top.head(5)

Unnamed: 0,userId,movieId,rating,title,totalRatings
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [24]:
ratings_top.shape

(79640, 5)

In [25]:
if not ratings_top[ratings_top.duplicated(['userId','title'])].empty:
    ratings_top = ratings_top.drop_duplicates(['userId','title'])

In [26]:
ratings_top.shape

(79636, 5)

In [27]:
film_user= ratings_top.pivot(index='title',columns='userId',values='rating').fillna(0)

In [28]:
film_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC (2008)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
film_user.shape

(2121, 610)

In [30]:
film_user_sparse = csr_matrix(film_user.values)

# Recommendations using Association Rules

In [31]:
def encode_units(x):
    if x<=0:
        return 0
    if x>=1:
        return 1

In [32]:
data_association = film_user.T.applymap(encode_units)

In [33]:
data_association.shape

(610, 2121)

In [34]:
data_association.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,1
2,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,1,0.0,0,0,0.0,0
3,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
4,0,0.0,0,0.0,0.0,0,0,1.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
5,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
6,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
7,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
8,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0
9,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,1.0,0
10,0,0.0,0,0.0,0.0,0,0,0.0,0,0,...,0,0,0,0.0,0,0.0,0,0,0.0,0


In [35]:
data_association.fillna(value=0, inplace=True)

In [36]:
#min_support=0.05 que la película analizada esté en almentos en el 5% de las "transacciones" de los usuarios
#max_len= no se define cuál es el tamaño maximo de combinaciones de items
frequent_itemsets = apriori(data_association, min_support=0.05, use_colnames=True, max_len=2)
# %time frequent_itemsets = apriori(df_for_ar, min_support=0.07, use_colnames=True)

In [37]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.shape
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,((500) Days of Summer (2009)),(Forrest Gump (1994)),0.067213,0.537705,0.054098,0.804878,1.496877,0.017958,2.369262
1,(Forrest Gump (1994)),((500) Days of Summer (2009)),0.537705,0.067213,0.054098,0.10061,1.496877,0.017958,1.037133
2,(Shrek (2001)),((500) Days of Summer (2009)),0.277049,0.067213,0.052459,0.189349,2.817145,0.033838,1.150664
3,((500) Days of Summer (2009)),(Shrek (2001)),0.067213,0.277049,0.052459,0.780488,2.817145,0.033838,3.293443
4,(10 Things I Hate About You (1999)),(American Beauty (1999)),0.086885,0.332787,0.060656,0.698113,2.097779,0.031741,2.210143


In [38]:
all_antecedents = [list(x) for x in rules['antecedents'].values]


In [59]:
movie_selected="Terminator 2: Judgment Day (1991)"
#"Fargo (1996)"



In [60]:
desired_indices = [i for i in range(len(all_antecedents)) if len(all_antecedents[i])==1 and all_antecedents[i][0]==movie_selected]


In [61]:
apriori_recommendations=rules.iloc[desired_indices,].sort_values(by=['lift'],ascending=False)
apriori_recommendations.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
67647,(Terminator 2: Judgment Day (1991)),(Predator (1987)),0.362295,0.1,0.088525,0.244344,2.443439,0.052295,1.191018
74413,(Terminator 2: Judgment Day (1991)),"(Terminator, The (1984))",0.362295,0.211475,0.186885,0.515837,2.43923,0.110269,1.628635
47847,(Terminator 2: Judgment Day (1991)),(Gremlins (1984)),0.362295,0.067213,0.059016,0.162896,2.423574,0.034665,1.114302
69633,(Terminator 2: Judgment Day (1991)),(RoboCop (1987)),0.362295,0.113115,0.098361,0.271493,2.400157,0.05738,1.217401
60565,(Terminator 2: Judgment Day (1991)),(Mad Max (1979)),0.362295,0.062295,0.054098,0.149321,2.396999,0.031529,1.102302
72855,(Terminator 2: Judgment Day (1991)),(Species (1995)),0.362295,0.070492,0.060656,0.167421,2.375039,0.035117,1.11642
74415,(Terminator 2: Judgment Day (1991)),(The Devil's Advocate (1997)),0.362295,0.083607,0.068852,0.190045,2.27309,0.038562,1.131413
69835,(Terminator 2: Judgment Day (1991)),(Rocky (1976)),0.362295,0.101639,0.083607,0.230769,2.270471,0.046783,1.167869
37051,(Terminator 2: Judgment Day (1991)),(Escape from New York (1981)),0.362295,0.063934,0.052459,0.144796,2.264764,0.029296,1.094553
74411,(Terminator 2: Judgment Day (1991)),(Terminator 3: Rise of the Machines (2003)),0.362295,0.072131,0.059016,0.162896,2.25833,0.032884,1.108427


In [62]:
apriori_list = [list(x) for x in apriori_recommendations['consequents'].values]
print("Apriori Recommendations for movie selected")
for i in range(10):
    print("{0}: {1} with lift of {2}".format(i+1,apriori_list[i],apriori_recommendations.iloc[i,6]))

Apriori Recommendations for movie selected
1: ['Predator (1987)'] with lift of 2.443438914027149
2: ['Terminator, The (1984)'] with lift of 2.4392297169314956
3: ['Gremlins (1984)'] with lift of 2.4235735570025385
4: ['RoboCop (1987)'] with lift of 2.4001573873696636
5: ['Mad Max (1979)'] with lift of 2.396999285544177
6: ['Species (1995)'] with lift of 2.375039461222772
7: ["The Devil's Advocate (1997)"] with lift of 2.2730902315677404
8: ['Rocky (1976)'] with lift of 2.270471464019851
9: ['Escape from New York (1981)'] with lift of 2.26476389372317
10: ['Terminator 3: Rise of the Machines (2003)'] with lift of 2.2583299053887287
