# Contents

[Imports](#Imports)<br>
[Model with raw data](#Model_raw)<br>
[Model with pre-processed data](#Model_pre)<br>
[Item-based model](#item-based)

## General concept
Central Idea: a user is likely to have the same preferences as a user with the same rating behaviour as them. <br>
Compare sparse tabel based on preprocessed data vs. from df_ratings: count ratings, compare missing value ration

# Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn.metrics.pairwise as dist

In [2]:
df_pre = pd.read_csv('../data/processed/preprocessed_data_movielens.csv')
df_pre.head()

Unnamed: 0,movieId,title,genres,relevance,tag,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","['toys', 'computer animation', 'pixar animatio...",74244,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","['toys', 'computer animation', 'pixar animatio...",54322,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","['toys', 'computer animation', 'pixar animatio...",106130,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","['toys', 'computer animation', 'pixar animatio...",43484,3.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","['toys', 'computer animation', 'pixar animatio...",16874,4.0


In [3]:
df_raw = pd.read_csv('../data/raw/ml-25m/ratings.csv')
df_raw.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
print('df_pre:')
df_pre.info()
display(df_pre.describe())
print()
print('df_raw:')
df_raw.info()
display(df_raw.describe())

# => df_raw has about 5 times as many entries as df_pre, the mean rating in df_raw is 0.06 higher, the std is almost the same

df_pre:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5273559 entries, 0 to 5273558
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   relevance  object 
 4   tag        object 
 5   userId     int64  
 6   rating     float64
dtypes: float64(1), int64(2), object(4)
memory usage: 281.6+ MB


Unnamed: 0,movieId,userId,rating
count,5273559.0,5273559.0,5273559.0
mean,21258.21,81239.99,3.474195
std,37386.09,46779.61,1.057685
min,1.0,1.0,0.5
25%,1286.0,40631.0,3.0
50%,3087.0,80978.0,3.5
75%,8873.0,121509.0,4.0
max,195159.0,162541.0,5.0



df_raw:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


# Model_raw (does not work currently)

In [5]:
df_raw.duplicated(subset=['userId','movieId']).sum()
# => no user rated a movie twice

0

In [7]:
n_users = len(df_raw.userId.unique())
n_movies = len(df_raw.movieId.unique())
print('Number of users:', n_users, 'Number of movies:', n_movies)

mat_ratings = df_raw.pivot_table(index='userId', columns='movieId', values='rating') # values='rating' seems to be optional
mat_ratings.head(10)

Number of users: 162541 Number of movies: 59047


  num_cells = num_rows * num_columns


IndexError: index 1007637055 is out of bounds for axis 0 with size 1007623835

# Model_pre

In [9]:
n_users = len(df_pre.userId.unique())
n_movies = len(df_pre.movieId.unique())
print('Number of users:', n_users, 'Number of movies:', n_movies)

mat_ratings = df_pre.pivot_table(index='userId', columns='movieId', values='rating') # values='rating' seems to be optional
display(mat_ratings.head())

print('Number of non-NaN cells:', mat_ratings.count().sum())
print('Percentage of non-NaN cells:', np.round(100*mat_ratings.count().sum()/(mat_ratings.shape[0]*mat_ratings.shape[1]),2),'%')

Number of users: 161393 Number of movies: 2428


movieId,1,2,3,4,5,6,7,9,10,11,...,177765,179819,180031,182715,183897,185029,187541,187593,192803,195159
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,3.5,,,,,,4.5,,
4,,,,,,,,,,,...,,2.5,,4.5,,,,,,5.0
5,,,,,,,,,,,...,,,,,,,,,,


Number of non-NaN cells: 5273559
Percentage of non-NaN cells: 1.35 %


In [10]:
mat_ratings.min().min()
# => 0.5 is the minimum value, we can replace NaNs with 0 without loosing information

0.5

In [11]:
mat_ratings.fillna(0, inplace=True)
mat_ratings.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,177765,179819,180031,182715,183897,185029,187541,187593,192803,195159
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.5,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,4.5,0.0,0.0,0.0,0.0,0.0,5.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# transformation in Compressed Sparse Row (CSR) format for reducing memory usage
sparse_ratings = csr_matrix(mat_ratings)

# Extract user IDs and book titles from the ratings matrix.
user_ids = mat_ratings.index.tolist()  
movie_ids = mat_ratings.columns.tolist()  

print(sparse_ratings)
# (0,166) 3.5 means in row 0 (equal to usreId 1) and column 166 there ist a rating of 3.5

  (0, 166)	3.5
  (0, 521)	3.5
  (0, 551)	4.0
  (0, 561)	3.5
  (0, 1449)	5.0
  (0, 1541)	4.5
  (0, 1566)	4.0
  (0, 1627)	4.0
  (0, 1648)	5.0
  (0, 1758)	5.0
  (0, 1774)	3.5
  (0, 1808)	3.0
  (0, 1834)	3.0
  (1, 45)	0.5
  (1, 180)	5.0
  (1, 291)	2.0
  (1, 320)	3.0
  (1, 414)	4.5
  (1, 576)	4.5
  (1, 584)	4.0
  (1, 622)	5.0
  (1, 716)	4.5
  (1, 768)	4.0
  (1, 859)	4.5
  (1, 890)	4.5
  :	:
  (161392, 882)	2.5
  (161392, 902)	5.0
  (161392, 930)	2.5
  (161392, 1107)	4.5
  (161392, 1118)	3.5
  (161392, 1145)	1.5
  (161392, 1202)	3.0
  (161392, 1203)	2.5
  (161392, 1211)	1.0
  (161392, 1217)	4.0
  (161392, 1316)	2.0
  (161392, 1352)	4.0
  (161392, 1356)	1.5
  (161392, 1361)	2.5
  (161392, 1491)	3.0
  (161392, 1541)	4.5
  (161392, 1557)	1.5
  (161392, 1651)	5.0
  (161392, 1657)	3.0
  (161392, 1670)	4.0
  (161392, 1684)	3.0
  (161392, 1827)	4.5
  (161392, 1862)	4.0
  (161392, 2006)	2.5
  (161392, 2071)	5.0


In [None]:
# Idea: convert datatype to int for reducing memory usage, therefore first multiply by 10
print(sparse_ratings * 10)
sparse_ratings

  (0, 166)	35.0
  (0, 521)	35.0
  (0, 551)	40.0
  (0, 561)	35.0
  (0, 1449)	50.0
  (0, 1541)	45.0
  (0, 1566)	40.0
  (0, 1627)	40.0
  (0, 1648)	50.0
  (0, 1758)	50.0
  (0, 1774)	35.0
  (0, 1808)	30.0
  (0, 1834)	30.0
  (1, 45)	5.0
  (1, 180)	50.0
  (1, 291)	20.0
  (1, 320)	30.0
  (1, 414)	45.0
  (1, 576)	45.0
  (1, 584)	40.0
  (1, 622)	50.0
  (1, 716)	45.0
  (1, 768)	40.0
  (1, 859)	45.0
  (1, 890)	45.0
  :	:
  (161392, 882)	25.0
  (161392, 902)	50.0
  (161392, 930)	25.0
  (161392, 1107)	45.0
  (161392, 1118)	35.0
  (161392, 1145)	15.0
  (161392, 1202)	30.0
  (161392, 1203)	25.0
  (161392, 1211)	10.0
  (161392, 1217)	40.0
  (161392, 1316)	20.0
  (161392, 1352)	40.0
  (161392, 1356)	15.0
  (161392, 1361)	25.0
  (161392, 1491)	30.0
  (161392, 1541)	45.0
  (161392, 1557)	15.0
  (161392, 1651)	50.0
  (161392, 1657)	30.0
  (161392, 1670)	40.0
  (161392, 1684)	30.0
  (161392, 1827)	45.0
  (161392, 1862)	40.0
  (161392, 2006)	25.0
  (161392, 2071)	50.0


<161393x2428 sparse matrix of type '<class 'numpy.float64'>'
	with 5273559 stored elements in Compressed Sparse Row format>

## Model_pre: user-based
Does not work currently due to memory needed.

In [None]:
# Calculate the cosine similarity between users.
user_similarity = dist.cosine_similarity(sparse_ratings) 

# Creation of a pandas DataFrame from the similarity matrix between users.
# The indexes and columns of the DataFrame are the user identifiers.
user_similarity = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

MemoryError: Unable to allocate 35.8 GiB for an array with shape (4808469397,) and data type float64

In [None]:
# Define a function to predict based on user_similarity,, for a given user, the ratings for all movies they have not rated.
def pred_user(mat_ratings, user_similarity, k, user_id):

    # Select in mat_ratings the books that have not yet been read by the user
    to_predict = mat_ratings.loc[user_id][mat_ratings.loc[user_id]==0]
    # mat_ratings.loc[user_id] returns row/column for user with id user_id
    # [mat_ratings.loc[user_id]==0] selects only entries with 0

    # Select the k most similar users excluding the user itself
    similar_users = user_similarity.loc[user_id].sort_values(ascending=False)[1:k+1]
    # user_similarity.loc[user_id] returns row/column for user with id user_id => has similarity to all other users

    # Calculation of the denominator (=Nenner)
    norm = np.sum(np.abs(similar_users))

    for i in to_predict.index:

        # Retrieve similar user ratings associated with the movie i
        ratings = mat_ratings[i].loc[similar_users.index]
        
        # Calculate the dot product between ratings and similar_users
        scalar_prod = np.dot(ratings, similar_users)
        
        # Calculate predicted rating for movie i
        pred = scalar_prod / norm

        # Replace with prediction
        to_predict[i] = pred

    return to_predict

## Model_pre: item-based

In [13]:
item_similarity = dist.cosine_similarity(sparse_ratings.T)
item_similarity = pd.DataFrame(item_similarity, index=movie_ids, columns=movie_ids)

In [14]:
mat_ratings.loc[1][mat_ratings.loc[1]==0].info()

<class 'pandas.core.series.Series'>
Index: 2415 entries, 1 to 195159
Series name: 1
Non-Null Count  Dtype  
--------------  -----  
2415 non-null   float64
dtypes: float64(1)
memory usage: 37.7 KB


In [15]:
# Define a function to predict based on item_similarity, for a given user, the ratings for all movies they have not rated.
def pred_item(mat_ratings, item_similarity, k, user_id):

    # Select in mat_ratings the books that have not yet been read by the user
    to_predict = mat_ratings.loc[user_id][mat_ratings.loc[user_id]==0]
    
    # Iterate over all these books
    for i in to_predict.index:

        #Find the k most similar books excluding the book itself
        similar_items = item_similarity.loc[i].sort_values(ascending=False)[1:k+1]

        # Calculation of the norm of the similar_items vector
        norm = np.sum(np.abs(similar_items))

        # Retrieve the ratings given by the user to the k nearest neighbors
        ratings = mat_ratings[similar_items.index].loc[user_id]


        # Calculate the dot product between ratings and similar_items
        scalar_prod = np.dot(ratings, similar_items)
        
        #Calculate predicted rating for movie i
        pred = scalar_prod / norm

        # Replace with prediction
        to_predict[i] = pred


    return to_predict

In [16]:
# Example: top 10 predictions for userId 1
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 1).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

In [17]:
df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 1,Title
0,2360,2.964203,"Celebration, The (Festen) (1998)"
1,7981,2.870635,Infernal Affairs (Mou gaan dou) (2002)
2,4235,2.550774,Amores Perros (Love's a Bitch) (2000)
3,3030,2.423851,Yojimbo (1961)
4,123,2.235471,Chungking Express (Chung Hing sam lam) (1994)
5,1237,1.666619,"Seventh Seal, The (Sjunde inseglet, Det) (1957)"
6,44694,1.545655,Volver (2006)
7,3083,1.497551,All About My Mother (Todo sobre mi madre) (1999)
8,5291,1.496149,Rashomon (Rashômon) (1950)
9,4914,1.402356,Breathless (À bout de souffle) (1960)


## Top users (1000+ ratings)

In [18]:
# Example: top 10 predictions for userId 72315
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 72315).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 72315,Title
0,3095,4.493769,"Grapes of Wrath, The (1940)"
1,3168,4.474725,Easy Rider (1969)
2,1103,4.430279,Rebel Without a Cause (1955)
3,922,4.347191,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
4,1217,4.342622,Ran (1985)
5,300,4.341523,Quiz Show (1994)
6,1104,4.338985,"Streetcar Named Desire, A (1951)"
7,2313,4.337724,"Elephant Man, The (1980)"
8,1289,4.336347,Koyaanisqatsi (a.k.a. Koyaanisqatsi: Life Out ...
9,923,4.334833,Citizen Kane (1941)


In [19]:
# Example: top 10 predictions for userId 80974
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 80974).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 80974,Title
0,1041,4.33206,Secrets & Lies (1996)
1,63876,4.177129,Milk (2008)
2,6787,4.165896,All the President's Men (1976)
3,55052,4.162786,Atonement (2007)
4,5060,4.0,M*A*S*H (a.k.a. MASH) (1970)
5,1218,4.0,"Killer, The (Die xue shuang xiong) (1989)"
6,2336,4.0,Elizabeth (1998)
7,5291,4.0,Rashomon (Rashômon) (1950)
8,2359,4.0,Waking Ned Devine (a.k.a. Waking Ned) (1998)
9,2360,4.0,"Celebration, The (Festen) (1998)"


In [20]:
# Example: top 10 predictions for userId 137293
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 137293).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 137293,Title
0,1237,4.645451,"Seventh Seal, The (Sjunde inseglet, Det) (1957)"
1,3788,4.382949,Blow-Up (Blowup) (1966)
2,97,4.159661,"Hate (Haine, La) (1995)"
3,39183,4.017036,Brokeback Mountain (2005)
4,8014,4.001628,"Spring, Summer, Fall, Winter... and Spring (Bo..."
5,1945,4.0,On the Waterfront (1954)
6,971,4.0,Cat on a Hot Tin Roof (1958)
7,34437,3.992734,Broken Flowers (2005)
8,5878,3.960748,Talk to Her (Hable con Ella) (2002)
9,1281,3.849943,"Great Dictator, The (1940)"


## Users with only 20 ratings

### Users with 20 ratings and low average (0.5*)

In [21]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 63044).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 63044,Title
0,733,0.16664,"Rock, The (1996)"
1,1196,0.165754,Star Wars: Episode V - The Empire Strikes Back...
2,62,0.158078,Mr. Holland's Opus (1995)
3,1,0.0,Toy Story (1995)
4,5872,0.0,Die Another Day (2002)
5,5782,0.0,"Professional, The (Le professionnel) (1981)"
6,5785,0.0,Jackass: The Movie (2002)
7,5791,0.0,Frida (2002)
8,5810,0.0,8 Mile (2002)
9,5812,0.0,Far from Heaven (2002)


In [22]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 38998).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 38998,Title
0,3701,0.176769,Alien Nation (1988)
1,2863,0.173201,"Hard Day's Night, A (1964)"
2,919,0.171713,"Wizard of Oz, The (1939)"
3,3175,0.169195,Galaxy Quest (1999)
4,1030,0.167796,Pete's Dragon (1977)
5,3471,0.1663,Close Encounters of the Third Kind (1977)
6,63,0.162983,Don't Be a Menace to South Central While Drink...
7,1135,0.162667,Private Benjamin (1980)
8,2142,0.160038,"American Tail: Fievel Goes West, An (1991)"
9,2139,0.159288,"Secret of NIMH, The (1982)"


## Users with average rating of 5.0 stars

### 5* users with many ratings

In [23]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 75309).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 75309,Title
0,79357,5.0,Mr. Nobody (2009)
1,5265,5.0,Death to Smoochy (2002)
2,3979,5.0,Little Nicky (2000)
3,7439,5.0,"Punisher, The (2004)"
4,4890,5.0,Shallow Hal (2001)
5,71745,5.0,Where the Wild Things Are (2009)
6,7090,5.0,Hero (Ying xiong) (2002)
7,52245,5.0,Blades of Glory (2007)
8,69757,5.0,(500) Days of Summer (2009)
9,3972,3.732316,"Legend of Drunken Master, The (Jui kuen II) (1..."


In [24]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 12002).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 12002,Title
0,112552,5.0,Whiplash (2014)
1,81591,5.0,Black Swan (2010)
2,82459,5.0,True Grit (2010)
3,164909,5.0,La La Land (2016)
4,174055,5.0,Dunkirk (2017)
5,152077,5.0,10 Cloverfield Lane (2016)
6,2648,3.854613,Frankenstein (1931)
7,1279,3.539312,Night on Earth (1991)
8,103688,3.512931,"Conjuring, The (2013)"
9,123,3.495122,Chungking Express (Chung Hing sam lam) (1994)


### 5* users with 20 ratings

In [25]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 36868).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 36868,Title
0,1,0.0,Toy Story (1995)
1,5620,0.0,Sweet Home Alabama (2002)
2,5630,0.0,Red Dragon (2002)
3,5669,0.0,Bowling for Columbine (2002)
4,5673,0.0,Punch-Drunk Love (2002)
5,5679,0.0,"Ring, The (2002)"
6,5690,0.0,Grave of the Fireflies (Hotaru no haka) (1988)
7,5693,0.0,Saturday Night Fever (1977)
8,5782,0.0,"Professional, The (Le professionnel) (1981)"
9,5785,0.0,Jackass: The Movie (2002)


In [26]:
preds = pred_item(mat_ratings, item_similarity, 3, user_id = 31747).sort_values(ascending=False).head(10)

# Series has userId as name; rename for better display:
preds.name = 'predicted rating for user ' + str(preds.name)

df_pred = preds.to_frame().reset_index().rename(columns={'title':'Title'})
# df to link movieId to title 
df_mov = df_pre[['movieId','title']].rename(columns={'title':'Title'}).drop_duplicates()
df_pred = df_pred.merge(right=df_mov, on='movieId', how='left')
#df_pred = df_pred[['Title',1]]
df_pred

Unnamed: 0,movieId,predicted rating for user 31747,Title
0,1379,1.892806,Young Guns II (1990)
1,1968,1.805313,"Breakfast Club, The (1985)"
2,1215,1.759249,Army of Darkness (1993)
3,4105,1.670292,"Evil Dead, The (1981)"
4,3273,1.662333,Scream 3 (2000)
5,1407,1.647415,Scream (1996)
6,2144,1.625387,Sixteen Candles (1984)
7,1101,1.593618,Top Gun (1986)
8,480,1.567363,Jurassic Park (1993)
9,1358,1.419986,Sling Blade (1996)
