<a href="https://colab.research.google.com/github/harbidel/Movie-Recommendation-Model/blob/main/Movie_Recommendation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# library imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse

In [3]:
data = pd.read_csv("ratings.csv")
data.shape
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
data = data.drop(['timestamp'], axis=1)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
Counter(data.rating)

Counter({4.0: 26818,
         5.0: 13211,
         3.0: 20047,
         2.0: 7551,
         1.0: 2811,
         4.5: 8551,
         3.5: 13136,
         2.5: 5550,
         0.5: 1370,
         1.5: 1791})

In [6]:
Counter(data.groupby(['userId']).count()['movieId'])

Counter({232: 2,
         29: 8,
         39: 4,
         216: 2,
         44: 3,
         314: 1,
         152: 4,
         47: 4,
         46: 5,
         140: 4,
         64: 5,
         32: 7,
         31: 7,
         48: 7,
         135: 3,
         98: 3,
         105: 1,
         502: 1,
         703: 1,
         242: 1,
         443: 2,
         119: 2,
         121: 2,
         110: 2,
         26: 13,
         21: 15,
         570: 1,
         81: 2,
         34: 9,
         50: 7,
         102: 2,
         156: 1,
         86: 2,
         23: 13,
         60: 2,
         78: 3,
         100: 3,
         103: 3,
         217: 1,
         440: 1,
         114: 2,
         399: 1,
         42: 5,
         33: 10,
         310: 1,
         359: 1,
         130: 1,
         20: 14,
         25: 9,
         476: 1,
         112: 4,
         107: 2,
         22: 14,
         366: 1,
         271: 1,
         517: 1,
         345: 1,
         36: 9,
         1260: 1,
         62: 2,

In [7]:
#Average number of ratings per user
np.mean(data.groupby(['userId']).count()['movieId'])

165.30491803278687

In [8]:
train_df, valid_df = train_test_split(data, test_size=0.2)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['userId', 'movieId', 'rating']]
valid_df = valid_df.reset_index()[['userId', 'movieId', 'rating']]

In [9]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [10]:
def encode_df(data):
    """Encodes rating data with continuous user and anime ids"""
    
    movie_ids, data['movieId'], num_movie = encode_column(data['movieId'])
    user_ids, data['userId'], num_users = encode_column(data['userId'])
    return data, num_users, num_movie, user_ids, movie_ids

In [11]:
data, num_users, num_movie, user_ids, movie_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of anime :", num_movie)
data.head()

Number of users : 610
Number of anime : 8905


Unnamed: 0,userId,movieId,rating
0,0,0,1.5
1,1,1,3.0
2,2,2,3.5
3,3,3,4.0
4,4,4,2.0


In [12]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

In [13]:
def create_sparse_matrix(df, rows, cols, column_name="rating"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['userId'].values, df['movieId'].values)),shape=(rows, cols))

In [14]:
data, num_users, num_movie, user_ids, movie_ids = encode_df(train_df)
Y = create_sparse_matrix(data, num_users, num_movie)

In [15]:
# to view matrix
Y.todense()

matrix([[1.5, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 3. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 3.5, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [16]:
def predict(df, emb_user, emb_movie):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_movie[df['movieId']],emb_user[df['userId']]), axis=1)
    return df

In [17]:
lmbda = 0.0001

In [18]:
def cost(df, emb_user, emb_movie):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_movie), emb_user.shape[0], emb_movie.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

In [19]:
def gradient(df, emb_user, emb_movie):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_movie), emb_user.shape[0], emb_movie.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_movie) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_movie
    return grad_user, grad_anime

In [20]:
def gradient_descent(df, emb_user, emb_movie, iterations=2000, learning_rate=0.005, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_anime: the trained anime embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_movie.shape[0])
    beta = 0.9
    grad_user, grad_movie = gradient(df, emb_user, emb_movie)
    v_user = grad_user
    v_movie = grad_movie
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, emb_user, emb_movie)
        v_user = beta*v_user + (1-beta)*grad_user
        v_movie = beta*v_movie + (1-beta)*grad_movie
        emb_user = emb_user - learning_rate*v_user
        emb_movie = emb_movie - learning_rate*v_movie
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_movie))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_movie))
    return emb_user, emb_movie

In [21]:
emb_user = create_embeddings(num_users, 3)
emb_movie = create_embeddings(num_movie, 3)
emb_user, emb_movie = gradient_descent(data, emb_user, emb_movie, iterations=800, learning_rate=0.9)


iteration 50 :
train mse: 5.422533363840953

iteration 100 :
train mse: 3.517158773641032

iteration 150 :
train mse: 2.7925039890175847

iteration 200 :
train mse: 2.3798441118371887

iteration 250 :
train mse: 2.1093490010283027

iteration 300 :
train mse: 1.916496603071896

iteration 350 :
train mse: 1.7710838844513568

iteration 400 :
train mse: 1.6570094243165727

iteration 450 :
train mse: 1.564854972564694

iteration 500 :
train mse: 1.4887081975550591

iteration 550 :
train mse: 1.4246454834607534

iteration 600 :
train mse: 1.3699451205784574

iteration 650 :
train mse: 1.3226511377329377

iteration 700 :
train mse: 1.2813174297070298

iteration 750 :
train mse: 1.2448500688409971

iteration 800 :
train mse: 1.2124058538460947


In [22]:
def encode_new_data(valid_df, user_ids, movie_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['movieId'].isin(movie_ids.keys()) & valid_df['userId'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['movieId'] =  np.array([movie_ids[x] for x in valid_df['movieId']])
    valid_df['userId'] = np.array([user_ids[x] for x in valid_df['userId']])
    return valid_df

In [23]:
print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, user_ids, movie_ids)
print("after encoding:", valid_df.shape)

before encoding: (20168, 3)
after encoding: (15272, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [24]:
train_mse = cost(train_df, emb_user, emb_movie)
val_mse = cost(valid_df, emb_user, emb_movie)
print(train_mse, val_mse)

1.2124058538460947 3.141428684601969


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [25]:
#looking at the predictions
valid_df[70:80].head()

Unnamed: 0,userId,movieId,rating,prediction
91,458,329,4.0,3.360084
92,584,420,4.0,3.858602
93,50,1252,4.0,3.357062
94,292,1544,4.0,5.012688
95,477,2115,4.0,1.465559
