In [3]:
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

from surprise import Reader, Dataset
from surprise import KNNWithMeans, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy


### Popularity based Recommendation system

In [4]:
df = pd.read_csv("book.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
0,0,276725,034545104X,0,Flesh Tones: A Novel,60,"tyler, texas, usa"
1,1,2313,034545104X,5,Flesh Tones: A Novel,60,"cincinnati, ohio, usa"


In [5]:
df.drop(columns="Unnamed: 0", inplace=True)
df.head(2)

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,60,"cincinnati, ohio, usa"


In [6]:
df_booktile = df.groupby(["bookTitle"]).agg({"bookRating": "mean", "totalRatingCount": "count"})
df_booktile.head(5)

Unnamed: 0_level_0,bookRating,totalRatingCount
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
1st to Die: A Novel,3.415301,366
"A Child Called \It\"": One Child's Courage to Survive""",4.139738,229
A Cold Heart: An Alex Delaware Novel,3.112903,62
A Kiss of Shadows (Meredith Gentry Novels (Paperback)),4.013333,75
A Man in Full,2.111111,90


In [7]:
df_booktile.loc[(df_booktile["totalRatingCount"] > 100)].sort_values(by="bookRating", ascending=False).head(10)

Unnamed: 0_level_0,bookRating,totalRatingCount
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
Harry Potter and the Chamber of Secrets (Book 2),6.720588,136
Harry Potter and the Order of the Phoenix (Book 5),5.565693,274
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)),4.850598,502
To Kill a Mockingbird,4.761329,331
The Da Vinci Code,4.699329,745
The Lovely Bones: A Novel,4.622624,1052
Fahrenheit 451,4.61512,291
A Wrinkle In Time,4.569444,144
Girl with a Pearl Earring,4.319648,341
The Notebook,4.307692,104


### Content Based Recommendation system

In [8]:
df_movie = pd.read_csv("movie_metadata.csv")
df_movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [9]:
genres = df_movie.genres.str.strip().str.split("|", expand=True)[[0,1,2]]
genres.columns=['genre1','genre2','genre3']
genres

Unnamed: 0,genre1,genre2,genre3
0,Action,Adventure,Fantasy
1,Action,Adventure,Fantasy
2,Action,Adventure,Thriller
3,Action,Thriller,
4,Documentary,,
...,...,...,...
5038,Comedy,Drama,
5039,Crime,Drama,Mystery
5040,Drama,Horror,Thriller
5041,Comedy,Drama,Romance


In [10]:
movie_feat = ['movie_title','genre1','genre2','genre3','content_rating','imdb_score']

In [11]:
data1 = pd.concat([df_movie, genres], axis=1)
data1.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,genre1,genre2,genre3
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,Action,Adventure,Fantasy
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,Action,Adventure,Fantasy
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,Action,Adventure,Thriller
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,Action,Thriller,
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,12.0,7.1,,0,Documentary,,


In [12]:
data2 = data1[movie_feat].copy()
data2.set_index("movie_title", inplace=True)
data2.fillna('NA', inplace=True)
data2.head()

Unnamed: 0_level_0,genre1,genre2,genre3,content_rating,imdb_score
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Action,Adventure,Fantasy,PG-13,7.9
Pirates of the Caribbean: At World's End,Action,Adventure,Fantasy,PG-13,7.1
Spectre,Action,Adventure,Thriller,PG-13,6.8
The Dark Knight Rises,Action,Thriller,,PG-13,8.5
Star Wars: Episode VII - The Force Awakens,Documentary,,,,7.1


In [13]:
df_enc = pd.get_dummies(data2, dtype=int)
df_enc.head()

Unnamed: 0_level_0,imdb_score,genre1_Action,genre1_Adventure,genre1_Animation,genre1_Biography,genre1_Comedy,genre1_Crime,genre1_Documentary,genre1_Drama,genre1_Family,...,content_rating_Passed,content_rating_R,content_rating_TV-14,content_rating_TV-G,content_rating_TV-MA,content_rating_TV-PG,content_rating_TV-Y,content_rating_TV-Y7,content_rating_Unrated,content_rating_X
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,7.9,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,7.1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spectre,6.8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Dark Knight Rises,8.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Star Wars: Episode VII - The Force Awakens,7.1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_enc.index = df_enc.index.str.strip()
nn = NearestNeighbors(n_neighbors=5, metric="cosine")
nn.fit(df_enc)

In [15]:
df_enc.loc["Avatar"]

imdb_score                7.9
genre1_Action             1.0
genre1_Adventure          0.0
genre1_Animation          0.0
genre1_Biography          0.0
                         ... 
content_rating_TV-PG      0.0
content_rating_TV-Y       0.0
content_rating_TV-Y7      0.0
content_rating_Unrated    0.0
content_rating_X          0.0
Name: Avatar, Length: 90, dtype: float64

In [16]:
dist, ind = nn.kneighbors(df_enc.loc["Avatar"].to_frame().T, n_neighbors=5)

In [17]:
df_enc.iloc[ind[0]].index

Index(['Avatar', 'X-Men: Days of Future Past',
       'Pirates of the Caribbean: The Curse of the Black Pearl',
       'Star Wars: Episode III - Revenge of the Sith',
       'Indiana Jones and the Last Crusade'],
      dtype='object', name='movie_title')

### Collaborative recommendation system

In [18]:
df = pd.read_csv("ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [19]:
df.drop(columns="timestamp", inplace=True)

In [20]:
df.rating.value_counts().sort_index(ascending=False)

rating
5.0    15095
4.5     7723
4.0    28750
3.5    10538
3.0    20064
2.5     4449
2.0     7271
1.5     1687
1.0     3326
0.5     1101
Name: count, dtype: int64

In [21]:
reader = Reader(rating_scale=(0.5, 5))

rating_data = Dataset.load_from_df(df, reader)

In [22]:
trainfullset = rating_data.build_full_trainset()

In [23]:
trainfullset.n_items, trainfullset.n_users

(9066, 671)

In [24]:
svd = SVD(n_factors=50, n_epochs=200)
svd.fit(trainfullset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e1a3713b50>

In [25]:
svd.predict(uid=1, iid=30)

Prediction(uid=1, iid=30, r_ui=None, est=2.984878885130138, details={'was_impossible': False})

In [26]:
trainset, testset = train_test_split(rating_data, test_size=0.3)

In [27]:
pred = svd.test(testset)
pred[1:5]

[Prediction(uid=17, iid=2916, r_ui=4.0, est=4.024927777534383, details={'was_impossible': False}),
 Prediction(uid=547, iid=8763, r_ui=1.5, est=1.6992424840238378, details={'was_impossible': False}),
 Prediction(uid=311, iid=531, r_ui=2.0, est=2.21143367260458, details={'was_impossible': False}),
 Prediction(uid=380, iid=45928, r_ui=3.0, est=3.0833786978266855, details={'was_impossible': False})]

In [28]:
accuracy.rmse(pred)

RMSE: 0.2248


0.22481971867273945

In [29]:
uid1_not_watched = list(set(df.movieId.unique()) - set(df[df.userId == 1].movieId))

In [30]:
uid1_rating = []
for mid in uid1_not_watched:
    pred = svd.predict(uid=1, iid=mid)
    uid1_rating.append(pred.est)

In [31]:
pd.DataFrame({"movieId": uid1_not_watched, "rating": uid1_rating}).sort_values(by="rating", ascending=False)

Unnamed: 0,movieId,rating
154,162,4.521244
1985,2278,4.127964
3501,4011,4.111532
3193,3676,4.083425
2164,2467,4.060807
...,...,...
2235,2548,1.018745
1650,1917,0.990782
3490,3997,0.990240
6432,8985,0.977785


In [32]:
# User Based

In [33]:
knn = KNNWithMeans(K = 40, sim_options={'name': 'pearson_baseline', 'user_based': True})
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1e1a89bfc10>

In [35]:
knn_pred = knn.test(testset)

In [36]:
accuracy.rmse(knn_pred)

RMSE: 0.9304


0.9304464715003747

: 