In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m940.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-macosx_11_0_arm64.whl size=1103979 sha256=0f12ba1699228a50f808f3dafe56ebc210dfc9b76c2b4ef99938e24f82b30bb5
  Stored in directory: /Users/jenniferwang/Library/Caches/pip/wheels/df/e4/a6/7ad72453dd693f420b0c639bedeec34641738d11b55d8d9b84
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import numpy as np
import pandas as pd

## SVD Matrix Factorization

In [3]:
movies_data = pd.read_csv('movies.csv')
ratings_data = pd.read_csv('ratings.csv')

ratings = ratings_data.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)

ratings = ratings.values
user_ratings_mean = np.mean(ratings, axis = 1)
ratings = ratings - user_ratings_mean.reshape(-1, 1)

ratings_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [None]:
algo = SVD()
data = Dataset.load_builtin('ml-100k')

min_rating = ratings_data.rating.min()
max_rating = ratings_data.rating.max()

reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)
out = cross_validate(algo, data, ["rmse", "mae"], cv=5, verbose=True)

In [None]:
trainset, testset = train_test_split(data, test_size=.20)

SVDmodel = SVD(n_factors=20, n_epochs=20)
SVDmodel.fit(trainset)
predictions = SVDmodel.test(testset)
rmse = accuracy.rmse(predictions)

RMSE: 0.8684


In [10]:
data = Dataset.load_builtin('ml-100k')

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])


0.9630490924445843
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [19]:
trainset, testset = train_test_split(data, test_size=0.25)
SVDmodel = SVD(n_epochs = 10, lr_all = 0.005, reg_all=0.4)
SVDmodel.fit(trainset)
predictions = SVDmodel.test(testset)
rmse = accuracy.rmse(predictions)
SVDmodel.pu.shape
SVDmodel.qi.shape

RMSE: 0.9650


(1648, 100)

In [21]:
from google.colab import files

pd.DataFrame(SVDmodel.qi).to_csv("item_factors.csv")
pd.DataFrame(SVDmodel.pu).to_csv("user_factors.csv")

files.download("item_factors.csv")
files.download("user_factors.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def generate_recommendation(model, user_id, ratings_df, movies_df, n_items):

   movie_ids = ratings_df["movieId"].unique()
   movies_seen = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]
   movies_unseen = list(set(movies_df['movieId']) - set(movies_seen))
   test_set = [[user_id, movie_id, 0] for movie_id in movies_unseen]

   movie_scores = []
   for movie_id in movies_unseen:
      prediction = model.predict(user_id, movie_id)
      movie_scores.append((movie_id, prediction.est))

   movie_scores.sort(key=lambda x: x[1], reverse=True)
   top_n_movies = movie_scores[:n_items]
   print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
   for movie_id, score in top_n_movies:
     print(movie_id,
          movies_df[movies_df["movieId"]==movie_id]["title"].values[0],
          movies_df[movies_df["movieId"]==movie_id]["genres"].values[0],
          score)

userID = 23
n_items = 10
generate_recommendation(SVDmodel,userID,ratings_data,movies_data,n_items)

Top 10 item recommendations for user 23:
1204 Lawrence of Arabia (1962) Adventure|Drama|War 4.163547515174557
56782 There Will Be Blood (2007) Drama|Western 4.088105660666564
318 Shawshank Redemption, The (1994) Crime|Drama 4.086882033042829
3451 Guess Who's Coming to Dinner (1967) Drama 4.085854254317727
1276 Cool Hand Luke (1967) Drama 4.065976272445917
1197 Princess Bride, The (1987) Action|Adventure|Comedy|Fantasy|Romance 4.056841847391797
898 Philadelphia Story, The (1940) Comedy|Drama|Romance 4.036842041357257
2571 Matrix, The (1999) Action|Sci-Fi|Thriller 4.032126824199478
1242 Glory (1989) Drama|War 4.030745682581733
3275 Boondock Saints, The (2000) Action|Crime|Drama|Thriller 4.021043788231223


In [None]:
merged = movies_data.merge(ratings_data, on='movieId')
merged = merged.sort_values(by='rating', ascending=False)
merged[merged['userId'] == 23]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
29142,1284,"Big Sleep, The (1946)",Crime|Film-Noir|Mystery,23,5.0,1107162632
75320,7587,"Samouraï, Le (Godson, The) (1967)",Crime|Drama|Thriller,23,5.0,1107162995
14613,541,Blade Runner (1982),Action|Sci-Fi|Thriller,23,5.0,1107164433
71087,6440,Barton Fink (1991),Drama|Thriller,23,5.0,1107341765
26075,1212,"Third Man, The (1949)",Film-Noir|Mystery|Thriller,23,5.0,1107162640
...,...,...,...,...,...,...
65365,5013,Gosford Park (2001),Comedy|Drama|Mystery,23,3.0,1107164209
20033,866,Bound (1996),Crime|Drama|Romance|Thriller,23,3.0,1107164206
44833,2542,"Lock, Stock & Two Smoking Barrels (1998)",Comedy|Crime|Thriller,23,3.0,1107162552
12467,454,"Firm, The (1993)",Drama|Thriller,23,2.5,1107342160


# KNN

In [None]:
from surprise import KNNBasic

In [None]:
trainset, testset = train_test_split(data, test_size=.20)
KNNmodel = KNNBasic(k=10, sim_options={'name': 'cosine', 'user_based': True})
KNNmodel.fit(trainset)
predictions = KNNmodel.test(testset)
rmse = accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9959


In [None]:
userID = 23
n_items = 10
generate_recommendation(KNNmodel, userID, ratings_data, movies_data, n_items)

Top 10 item recommendations for user 23:
131098 Saving Santa (2013) Animation|Children|Comedy 5.0
53 Lamerica (1994) Adventure|Drama 5.0
99 Heidi Fleiss: Hollywood Madam (1995) Documentary 5.0
32892 Ivan's Childhood (a.k.a. My Name is Ivan) (Ivanovo detstvo) (1962) Drama|War 5.0
148 Awfully Big Adventure, An (1995) Drama 5.0
33138 Palindromes (2004) Adventure|Comedy|Drama 5.0
467 Live Nude Girls (1995) Comedy 5.0
633 Denise Calls Up (1995) Comedy 5.0
668 Song of the Little Road (Pather Panchali) (1955) Drama 5.0
779 'Til There Was You (1997) Drama|Romance 5.0


In [None]:
user_neighbors = KNNmodel.get_neighbors(userID, k=5)
user_neighbors

[10, 24, 54, 60, 80]

In [None]:
merged[merged['userId'] == 54][:5]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
8681,318,"Shawshank Redemption, The (1994)",Crime|Drama,54,4.0,830247358
10050,356,Forrest Gump (1994),Comedy|Drama|Romance|War,54,4.0,839921027
16253,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,54,4.0,839920981
18,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,3.0,830247330
3406,110,Braveheart (1995),Action|Drama|War,54,3.0,830247832


# SVD Gradient Descent

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.2/439.2 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━