# Recommendation Systems

In [1]:
import pandas as pd
import scipy
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.drop('timestamp', axis = 1, inplace = True)
movies.drop('genres', axis = 1, inplace = True)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
df = pd.merge(ratings, movies, on = 'movieId')

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [50]:
#item based
item_df = pd.pivot_table(df, index='title', columns = 'userId', values = 'rating')
user_df = pd.pivot_table(df, index = 'userId', columns = 'title', values = 'rating')

In [10]:
item_df.fillna(0).head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
item_df.shape

(9719, 610)

In [51]:
sparse_items = scipy.sparse.csr_matrix(item_df.fillna(0))
sparse_users = scipy.sparse.csr_matrix(user_df.fillna(0))

In [13]:
print(sparse_items)

  (0, 609)	4.0
  (1, 331)	4.0
  (2, 331)	3.5
  (2, 376)	3.5
  (3, 344)	5.0
  (4, 112)	3.0
  (4, 344)	5.0
  (5, 20)	1.5
  (6, 11)	5.0
  (6, 18)	2.0
  (6, 90)	2.0
  (6, 94)	3.0
  (6, 171)	4.0
  (6, 216)	4.0
  (6, 287)	3.0
  (6, 293)	1.0
  (6, 306)	3.5
  (6, 376)	3.5
  (6, 413)	3.0
  (6, 473)	1.0
  (6, 476)	3.5
  (6, 519)	4.0
  (6, 554)	5.0
  (6, 560)	4.5
  (6, 598)	2.0
  :	:
  (9717, 26)	5.0
  (9717, 41)	5.0
  (9717, 56)	2.0
  (9717, 67)	4.0
  (9717, 87)	3.5
  (9717, 140)	3.5
  (9717, 197)	2.0
  (9717, 214)	2.5
  (9717, 216)	2.0
  (9717, 220)	3.5
  (9717, 238)	3.0
  (9717, 281)	4.0
  (9717, 293)	4.0
  (9717, 306)	2.5
  (9717, 312)	1.0
  (9717, 413)	3.0
  (9717, 420)	3.0
  (9717, 447)	3.0
  (9717, 473)	3.0
  (9717, 476)	3.5
  (9717, 554)	3.0
  (9717, 560)	4.0
  (9717, 596)	3.0
  (9717, 598)	2.5
  (9718, 526)	1.0


In [52]:
recommender = pairwise_distances(sparse_items)#, metric = 'cosine')
u_recommender = pairwise_distances(sparse_users)

In [35]:
recommender

array([[ 0.        ,  5.65685425,  6.36396103, ...,  5.        ,
        17.1391365 ,  4.12310563],
       [ 5.65685425,  0.        ,  3.53553391, ...,  6.08276253,
        17.1391365 ,  4.12310563],
       [ 6.36396103,  3.53553391,  0.        , ...,  6.74536878,
        17.38533865,  5.04975247],
       ...,
       [ 5.        ,  6.08276253,  6.74536878, ...,  0.        ,
        17.28438602,  4.69041576],
       [17.1391365 , 17.1391365 , 17.38533865, ..., 17.28438602,
         0.        , 16.69580786],
       [ 4.12310563,  4.12310563,  5.04975247, ...,  4.69041576,
        16.69580786,  0.        ]])

In [53]:
rdf = pd.DataFrame(recommender, columns = item_df.index, index = item_df.index)
udf = pd.DataFrame(u_recommender, columns = user_df.index, index = user_df.index)

In [54]:
udf.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,70.436141,69.336138,78.797208,68.985506,86.752522,74.161985,68.88396,70.192592,78.574805,...,77.993589,73.082146,118.460964,74.141756,77.286804,131.552271,74.020267,99.698295,68.702256,143.376253
2,70.436141,0.0,29.457597,59.692964,32.806249,66.777616,47.662879,32.927952,32.128648,45.241021,...,45.565886,46.067885,115.192231,41.668333,53.80985,125.760685,57.395557,96.997423,29.141894,136.139634
3,69.336138,29.457597,0.0,59.114296,31.882597,66.682082,47.439435,32.19472,30.975797,45.765708,...,48.862051,45.513734,114.640743,40.786027,52.93156,125.825276,56.661274,97.194393,28.293109,137.452719
4,78.797208,59.692964,59.114296,0.0,58.034473,80.826976,66.355105,59.732738,60.282667,68.234888,...,68.359345,64.791975,109.863552,64.482556,71.156518,125.785532,71.916618,103.134621,58.591808,141.893446
5,68.985506,32.806249,31.882597,58.034473,0.0,61.049161,47.370877,26.907248,34.438351,47.518417,...,49.709154,38.052595,113.393121,37.815341,51.800097,124.34227,55.407581,95.429293,27.658633,137.403603


In [37]:
rdf.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,5.656854,6.363961,6.403124,7.071068,4.272002,14.56022,5.0,24.46426,9.643651,...,4.272002,10.98863,5.43139,3.162278,5.0,18.920888,14.343988,5.0,17.139137,4.123106
'Hellboy': The Seeds of Creation (2004),5.656854,0.0,3.535534,6.403124,7.071068,4.272002,14.56022,5.0,25.029982,9.643651,...,4.272002,12.359207,7.582875,5.830952,5.0,18.920888,14.891273,6.082763,17.139137,4.123106
'Round Midnight (1986),6.363961,3.535534,0.0,7.035624,7.648529,5.17204,14.0,5.787918,25.199206,10.074721,...,5.17204,12.698425,8.124038,6.519202,5.787918,19.14419,15.173991,6.745369,17.385339,5.049752
'Salem's Lot (2004),6.403124,6.403124,7.035624,0.0,3.0,5.220153,14.866069,5.830952,25.209125,10.099505,...,5.220153,12.718097,8.154753,6.557439,5.830952,19.157244,15.190458,6.78233,17.399713,5.09902
'Til There Was You (1997),7.071068,7.071068,7.648529,3.0,0.0,6.020797,15.165751,6.557439,25.387005,10.535654,...,6.020797,13.067134,8.689074,7.211103,6.557439,19.390719,15.483863,7.416198,17.656444,5.91608


In [38]:
search = 'Die Hard'

In [39]:
movies.loc[movies['title'].str.contains(search), 'title']

138     Die Hard: With a Vengeance (1995)
793                       Die Hard (1988)
1053                    Die Hard 2 (1990)
6518         Live Free or Die Hard (2007)
8103       Good Day to Die Hard, A (2013)
Name: title, dtype: object

In [40]:
item_df.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,


In [41]:
item_df.loc['Live Free or Die Hard (2007)', :].mean()

3.40625

In [42]:
item_df.T['Live Free or Die Hard (2007)'].count()

32

In [43]:
def recommender(title, n):
    return pd.DataFrame(rdf[title].sort_values()[1:n])

In [44]:
recommender('Live Free or Die Hard (2007)', 5)

Unnamed: 0_level_0,Live Free or Die Hard (2007)
title,Unnamed: 1_level_1
Resident Evil: Afterlife (2010),17.146428
Disturbia (2007),17.269916
Salt (2010),17.31329
Transformers: Revenge of the Fallen (2009),17.414075


In [45]:
import numpy as np

In [46]:
recommender(rdf.columns[np.random.randint(0, rdf.shape[1])], 7)

Unnamed: 0_level_0,Traitor (2008)
title,Unnamed: 1_level_1
Out of Time (2003),4.272002
Pride and Glory (2008),4.66369
Curse of the Golden Flower (Man cheng jin dai huang jin jia) (2006),4.898979
Two Mules for Sister Sara (1970),5.024938
Quigley Down Under (1990),5.049752
"Big Bounce, The (2004)",5.267827


In [47]:
rdf.loc[:, 'Die Hard: With a Vengeance (1995)'].nsmallest(10)

title
Die Hard: With a Vengeance (1995)          0.000000
Cliffhanger (1993)                        34.781461
Ace Ventura: Pet Detective (1994)         37.212229
Clear and Present Danger (1994)           37.312866
Outbreak (1995)                           37.403208
Batman Forever (1995)                     37.446629
GoldenEye (1995)                          37.519995
Net, The (1995)                           37.950626
Dumb & Dumber (Dumb and Dumber) (1994)    38.065733
Waterworld (1995)                         38.144462
Name: Die Hard: With a Vengeance (1995), dtype: float64

In [48]:
def recommender_getter(title, n = 5):
        avg = item_df.loc[title, :].mean()
        num_ratings = item_df.T[title].count()
        recs = rdf[title].nsmallest(n)[1:n]
        print(f'{title} had {num_ratings} ratings \nwith an average rating of {avg: .4f}')
        return recs

In [49]:
recommender_getter('Grumpier Old Men (1995)', 10)

Grumpier Old Men (1995) had 52 ratings 
with an average rating of  3.2596


title
Son in Law (1993)                                          22.688103
Striptease (1996)                                          22.852790
Juror, The (1996)                                          22.940139
Flipper (1996)                                             22.967368
Beverly Hillbillies, The (1993)                            23.086793
Bio-Dome (1996)                                            23.130067
Kazaam (1996)                                              23.232520
Angus (1995)                                               23.237900
Tales from the Crypt Presents: Bordello of Blood (1996)    23.270153
Name: Grumpier Old Men (1995), dtype: float64

In [68]:
from surprise import SVD, KNNBasic

In [56]:
from surprise import Dataset

In [57]:
data = Dataset.load_builtin('ml-100k')

In [61]:
algo = SVD()

In [63]:
trainset = data.build_full_trainset()

In [64]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f88921c9390>

In [66]:
algo.predict(196, 302, r_ui = 4)

Prediction(uid=196, iid=302, r_ui=4, est=3.52986, details={'was_impossible': False})

In [69]:
knn = KNNBasic()

In [70]:
knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f88b1681c90>

In [71]:
knn.predict(196, 302, r_ui = 4)

Prediction(uid=196, iid=302, r_ui=4, est=3.52986, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [None]:
#user based


In [None]:
#make sparse matrix


In [None]:
#compute pairwise


##### Topic Models

In [None]:
from sklearn.decomposition import LatentDirichletAllocation