# Movie Recommendation System

### Importing necessary modules

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sortedcontainers import SortedList
from multiprocessing import Pool, Manager
import multiprocessing

### Reading the CSV file

In [2]:
os.chdir('../movielens-20m-dataset')

In [3]:
df = pd.read_csv('rating.csv')

### Exploring the DataFrame

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
df.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [6]:
n_users = df.userId.nunique()

In [7]:
n_movies = df.movieId.nunique()

In [8]:
n_users*n_movies

3703856792

### Data Preprocessing

In [9]:
# Making sure that userId indexes start at 1
# The maximum userId Value from the describe function equals the unique number of IDs, so they are sequential
df.userId = df.userId - 1

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,2,3.5,2005-04-02 23:53:47
1,0,29,3.5,2005-04-02 23:31:16
2,0,32,3.5,2005-04-02 23:33:39
3,0,47,3.5,2005-04-02 23:32:07
4,0,50,3.5,2005-04-02 23:29:40


In [11]:
# Making sure that movieId are sequential
# The maximum movieId value from the descrive function does'nt match the number of unique IDs, so they are NOT seq.

# First step: Getting the unique values from movieIds
uniq_mov_id = set(df.movieId)

# Second step: Creating a dictionary that matches old Id values with sequential values.
mov_dic = {}
count = 0
for mov_id in uniq_mov_id:
    mov_dic[mov_id] = count
    count+=1

# Third step: Mapping the old values to the new sequential values
new_values = []
for old_val in df.movieId.values:
    new_values.append(mov_dic[old_val])

df['seq_movieId'] = new_values

In [12]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,seq_movieId
0,0,2,3.5,2005-04-02 23:53:47,2
1,0,29,3.5,2005-04-02 23:31:16,29
2,0,32,3.5,2005-04-02 23:33:39,32
3,0,47,3.5,2005-04-02 23:32:07,47
4,0,50,3.5,2005-04-02 23:29:40,50


### Dropping the irrelevant (in this experiment) timestamp column

In [13]:
df.drop('timestamp',axis=1)

Unnamed: 0,userId,movieId,rating,seq_movieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50
...,...,...,...,...
20000258,138492,68954,4.5,13821
20000259,138492,69526,4.5,13929
20000260,138492,69644,3.0,13942
20000261,138492,70286,5.0,14060


### Saving the Preprocessed DataFrame into a CSV file

In [14]:
df.to_csv('edited_rating.csv') 

### Separating the Data into Training and Test sets

In [15]:
# Step 1: Shuffle the DataFrame
df = shuffle(df).reset_index(drop=True)

# Step 2: Determine the Train/Test split ratio
ratio = 0.80
cutoff = int(ratio*len(df))

# Step 3: Create the Train and Test DataFrames

df_train = df[:cutoff].reset_index(drop=True)
df_test = df[cutoff:].reset_index(drop=True)

### Creating dictionaries of Users that rated Movies and Movies that were rated by Users

In [16]:
def fill_train(split):
    user2movie = {}
    movie2user = {}
    usermovie2rating = {}
    df_s = df_train.iloc[split[0]: split[1] + 1,:]
    for index, row in df_s.iterrows():
        user = row.userId
        movie = row.seq_movieId
        rating = row.rating
        if user not in user2movie:
            user2movie[user] = [movie]
        else:
            user2movie[user].append(movie)

        if movie not in movie2user:
            movie2user[movie] = [user]
        else:
            movie2user[movie].append(user)
        
        usermovie2rating[(user,movie)] = rating
    return [user2movie, movie2user, usermovie2rating]

In [17]:
def fill_test(split):
    usermovie2rating_test = {}
    df_s = df_test.iloc[split[0]: split[1] + 1,:]
    for index, row in df_s.iterrows():
        user = row.userId
        movie = row.seq_movieId
        rating = row.rating
        
        usermovie2rating_test[(user,movie)] = rating
    return [usermovie2rating_test]

In [18]:
# As working with the full DataFrame is very slow, multi processing will be used
n_cpu = multiprocessing.cpu_count()

def get_splits(x,n):
    a= np.array_split(range(len(np.array(x))),n)
    return [[i[0],i[-1]] for i in a]

train_splits = get_splits(df_train,n_cpu)
test_splits = get_splits(df_test,n_cpu)

In [19]:
p = Pool(n_cpu)
dicts = [i for i in p.map(fill_train, train_splits)]
p.close()
p.join()

In [20]:
user2movie = dicts[0][0].copy()
for i in range(1,len(dicts)):
    for user,movies in dicts[i][0].items():
        if user not in user2movie:
            user2movie[user] = movies
        else:
            user2movie[user].extend(movies)
        
movie2user = dicts[0][1].copy()
for i in range(1,len(dicts)):
    for movie,users in dicts[i][1].items():
        if movie not in movie2user:
            movie2user[movie] = users
        else:
            movie2user[movie].extend(users)

usermovie2rating = dicts[0][2].copy()
for i in range(1,len(dicts)):
    for (user,movie),rating in dicts[i][2].items():
        if (user,movie) not in usermovie2rating:
            usermovie2rating[(user,movie)] = rating
        else:
            usermovie2rating[(user,movie)].append(rating)

dicts = []

In [21]:
p = Pool(n_cpu)
dicts = [i for i in p.map(fill_test, test_splits)]
p.close()
p.join()

usermovie2rating_test = dicts[0][0].copy()
for i in range(1,len(dicts)):
    for (user,movie),rating in dicts[i][0].items():
        if (user,movie) not in usermovie2rating_test:
            usermovie2rating_test[(user,movie)] = rating
        else:
            usermovie2rating_test[(user,movie)].append(rating)

dicts = []

In [22]:
N = len(set(df.userId.values)) # Number of unique users
M = len(set(df.seq_movieId.values)) # Number of unique movies

In [23]:
user2movierating = {}
for user, movies in user2movie.items():
    r = np.array([usermovie2rating[(user,movie)] for movie in movies])
    user2movierating[user] = (movies, r)
movie2userrating = {}
for movie, users in movie2user.items():
    r = np.array([usermovie2rating[(user,movie)] for user in users])
    movie2userrating[movie] = (users, r)
    
movie2userrating_test = {}
for (user,movie), r in usermovie2rating_test.items():
    if movie not in movie2userrating_test:
        movie2userrating_test[movie] = [[user], [r]]
    else:
        movie2userrating_test[movie][0].append(user)
        movie2userrating_test[movie][1].append(r)
for movie, (users, r) in movie2userrating_test.items():
    movie2userrating_test[movie][1] = np.array(r)

### Matrix Factorization and Model Creation

In [24]:
K = 10 # latent dimensionality
W = np.random.randn(N,K)
b = np.zeros(N) # user bias
U = np.random.randn(M,K)
c = np.zeros(M) # movie bias
mu = np.mean(list(usermovie2rating.values())) # global average


def loss_function(m2ur):
    N = 0
    sse = 0 # Sum of squared errors
    for movie, (u_ids, r) in m2ur.items():
        p = W[u_ids].dot(U[movie]) + b[u_ids] + c[movie] + mu
        delta = p - r
        sse += delta.dot(delta)
        N += len(r)
    return sse / N # Mean squared error

# Training
epochs = 25
reg = 0.1 # Regularization penalty
train_losses = []
test_losses = []
for epoch in range(epochs):
    print("Epoch", epoch)
    for user in range(N):
        try:
            m_ids, r = user2movierating[user]
            matrix = U[m_ids].T.dot(U[m_ids]) + np.eye(K) * reg
            vector = (r - b[user] - c[m_ids] - mu).dot(U[m_ids])
            b_u = (r - U[m_ids].dot(W[user]) - mu).sum()

            # Updating the variables
            W[user] = np.linalg.solve(matrix,vector)
            b[user] = b_u / (len(user2movie[user]) + reg)
        except:
            pass

    for movie in range(M):
        try:
            u_ids, r = movie2userrating[movie]
            matrix = W[u_ids].T.dot(W[u_ids]) + np.eye(K) * reg
            vector = (r - b[u_ids] - c[movie] - mu).dot(W[u_ids])
            c_m = (r - W[u_ids].dot(U[movie]) - b[u_ids] - mu).sum()
            
            # Updating the variables
            U[movie] = np.linalg.solve(matrix,vector)
            c[movie] = c_m / (len(movie2user[movie]) + reg)
        except:
            pass
        
    train_losses.append(loss_function(movie2userrating))
    test_losses.append(loss_function(movie2userrating_test))
    print(f"Train Loss: {train_losses[-1]:.3f}, Test Loss {test_losses[-1]:.3f}")
    

Epoch 0
Train Loss: 0.791, Test Loss 0.971
Epoch 1
Train Loss: 0.641, Test Loss 0.829
Epoch 2
Train Loss: 0.601, Test Loss 0.826
Epoch 3
Train Loss: 0.573, Test Loss 0.792
Epoch 4
Train Loss: 0.554, Test Loss 0.769
Epoch 5
Train Loss: 0.541, Test Loss 0.742
Epoch 6
Train Loss: 0.533, Test Loss 0.731
Epoch 7
Train Loss: 0.528, Test Loss 0.719
Epoch 8
Train Loss: 0.524, Test Loss 0.715
Epoch 9
Train Loss: 0.521, Test Loss 0.707
Epoch 10
Train Loss: 0.519, Test Loss 0.706
Epoch 11
Train Loss: 0.517, Test Loss 0.700
Epoch 12
Train Loss: 0.516, Test Loss 0.700
Epoch 13
Train Loss: 0.514, Test Loss 0.696
Epoch 14
Train Loss: 0.514, Test Loss 0.697
Epoch 15
Train Loss: 0.513, Test Loss 0.693
Epoch 16
Train Loss: 0.512, Test Loss 0.695
Epoch 17
Train Loss: 0.511, Test Loss 0.691
Epoch 18
Train Loss: 0.511, Test Loss 0.693
Epoch 19
Train Loss: 0.510, Test Loss 0.690
Epoch 20
Train Loss: 0.510, Test Loss 0.692
Epoch 21
Train Loss: 0.510, Test Loss 0.689
Epoch 22
Train Loss: 0.509, Test Loss 0.69