# Movie Recommendation System

### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sortedcontainers import SortedList

### Reading the CSV file

In [2]:
#os.chdir('../movielens-20m-dataset')

In [3]:
df = pd.read_csv('rating.csv')

### Exploring the DataFrame

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
df.describe()

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [6]:
n_users = df.userId.nunique()

In [7]:
n_movies = df.movieId.nunique()

In [8]:
n_users*n_movies

3703856792

### Data Preprocessing

In [9]:
# Making sure that userId indexes start at 1
# The maximum userId Value from the describe function equals the unique number of IDs, so they are sequential
df.userId = df.userId - 1

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,2,3.5,2005-04-02 23:53:47
1,0,29,3.5,2005-04-02 23:31:16
2,0,32,3.5,2005-04-02 23:33:39
3,0,47,3.5,2005-04-02 23:32:07
4,0,50,3.5,2005-04-02 23:29:40


In [11]:
# Making sure that movieId are sequential
# The maximum movieId value from the descrive function does'nt match the number of unique IDs, so they are NOT seq.

# First step: Getting the unique values from movieIds
uniq_mov_id = set(df.movieId)

# Second step: Creating a dictionary that matches old Id values with sequential values.
mov_dic = {}
count = 0
for mov_id in uniq_mov_id:
    mov_dic[mov_id] = count
    count+=1

# Third step: Mapping the old values to the new sequential values
new_values = []
for old_val in df.movieId.values:
    new_values.append(mov_dic[old_val])

df['seq_movieId'] = new_values

In [12]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,seq_movieId
0,0,2,3.5,2005-04-02 23:53:47,2
1,0,29,3.5,2005-04-02 23:31:16,29
2,0,32,3.5,2005-04-02 23:33:39,32
3,0,47,3.5,2005-04-02 23:32:07,47
4,0,50,3.5,2005-04-02 23:29:40,50


### Dropping the irrelevant (in this experiment) timestamp column

In [13]:
df.drop('timestamp',axis=1)

Unnamed: 0,userId,movieId,rating,seq_movieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50
...,...,...,...,...
20000258,138492,68954,4.5,13821
20000259,138492,69526,4.5,13929
20000260,138492,69644,3.0,13942
20000261,138492,70286,5.0,14060


### Saving the Preprocessed DataFrame into a CSV file

In [14]:
df.to_csv('edited_rating.csv')

### Shrinking the DataFrame into most active users and most rated movies

In [15]:
# 1000 most rated movies
movies_id = df.seq_movieId.value_counts()[:1000].index.tolist()

In [16]:
# 500 most active users
users_id = df.userId.value_counts()[:500].index.tolist()

In [17]:
small_df = df[df.userId.isin(users_id) & df.seq_movieId.isin(movies_id)].copy()

In [18]:
# Once again, remapping both users and movies to start at 0 and to be sequential

small_unique_users = set(small_df['userId'])
small_users_dic = {}
count = 0
for user in small_unique_users:
    small_users_dic[user] = count
    count+=1
    
new_users = []
for old_val in small_df.userId.values:
    new_users.append(small_users_dic[old_val])

small_df['userId'] = new_users
    
small_unique_movies = set(small_df['seq_movieId'])
small_movies_dic = {}
count = 0
for movies in small_unique_movies:
    small_movies_dic[movies] = count
    count+=1
    
new_movies = []
for old_val in small_df.seq_movieId.values:
    new_movies.append(small_movies_dic[old_val])

small_df['seq_movieId'] = new_movies

In [19]:
small_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,seq_movieId
19846,31,1,5.0,2002-11-19 20:54:26,0
19847,31,2,5.0,2002-12-26 21:20:49,1
19848,31,3,2.0,2002-12-02 03:53:45,2
19850,31,5,3.0,2002-12-26 23:16:23,4
19851,31,6,4.0,2002-11-20 19:55:17,5


### Saving the Shrinked DataFrame into a CSV File

In [20]:
small_df.to_csv('small_edited_ratings.csv')

### Separating the Data into Training and Test sets

In [21]:
# Step 1: Shuffle the DataFrame
small_df = shuffle(small_df)

# Step 2: Determine the Train/Test split ratio
ratio = 0.8
cutoff = int(ratio*len(small_df))

# Step 3: Create the Train and Test DataFrames

df_train = small_df[:cutoff]
df_test = small_df[cutoff:]

### Creating dictionaries of Users that rated Movies and Movies that were rated by Users

In [22]:
user2movie = {}
movie2user = {}
usermovie2rating = {}
usermovie2rating_test = {}

In [23]:
def fill_train(df):
    for index, row in df.iterrows():
        user = row.userId
        movie = row.seq_movieId
        rating = row.rating

        if user not in user2movie:
            user2movie[user] = [movie]
        else:
            user2movie[user].append(movie)

        if movie not in movie2user:
            movie2user[movie] = [user]
        else:
            movie2user[movie].append(user)

        usermovie2rating[(user,movie)] = rating

fill_train(df_train)

In [24]:
def fill_test(df):
    for index, row in df.iterrows():
        user = row.userId
        movie = row.seq_movieId
        rating = row.rating

        usermovie2rating_test[(user,movie)] = rating

fill_test(df_test)

In [25]:
N = len(set(small_df.userId.values)) # Number of unique users
M = len(set(small_df.seq_movieId.values)) # Number of unique movies

### Creating a User-User Collaborative Filtering

In [26]:
K = 25 # maximum number of neighbors to be considered
limit = 5 # minimum number of movies shared by both users
neighbors = []
averages = []
deviations = []

for user in range(N): # Loop through every user
    movies_u = user2movie[user] # List of every movie watched by given user
    movies_u_set = set(movies_u)
    ratings_u = {movie : usermovie2rating[(user,movie)] for movie in movies_u} # Ratings of movies rated by the user
    average_u = np.mean(list(ratings_u.values())) # User's average rating
    dev_u = {movie: ratings_u[movie]-average_u for movie in movies_u}
    dev_u_values = np.array(list(dev_u.values()))
    sigma_u = np.sqrt(dev_u_values.dot(dev_u_values))
    
    averages.append(average_u)
    deviations.append(dev_u)
    
    sl = SortedList()
    for user2 in range(N):
        if user != user2:
            movies_u2 = user2movie[user2]
            movies_u2_set = set(movies_u2)
            common_movies = (movies_u_set & movies_u2_set)
            if len(common_movies) > limit:
                ratings_u2 = {movie : usermovie2rating[(user2,movie)] for movie in movies_u2}
                average_u2 = np.mean(list(ratings_u2.values()))
                dev_u2 = {movie: ratings_u2[movie]-average_u2 for movie in movies_u2}
                dev_u2_values = np.array(list(dev_u2.values()))
                sigma_u2 = np.sqrt(dev_u2_values.dot(dev_u2_values))
            
            # Calculating correlation coefficient
                numerator = sum(dev_u[m]*dev_u2[m] for m in common_movies)
                w_ij = numerator / (sigma_u*sigma_u2)
            

                sl.add((-w_ij, user2))
                if len(sl) > K:
                    del sl[-1]
    neighbors.append(sl)

### Predicting Movie Ratings

In [27]:
def predict(user,movie):
    # Calculating the weighted sum of deviations
    numerator = 0
    denominator = 0
    for neg_w, user2 in neighbors[user]:
        try:
            numerator += -neg_w * deviations[user2][movie]
            denominator += abs(neg_w)
        except:
            # Cases where the user2 did not rate the movie user did
            pass
    
    if denominator == 0:
        # There is no way to actually predict anything, so just use the average
        prediction = averages[user]
    else: 
        prediction = numerator / denominator + averages[user]
    prediction = min(5, prediction)
    prediction = max(0.5, prediction)
    
    return prediction

In [28]:
train_predictions = []
train_targets = []
for (user,movie), target in usermovie2rating.items():
    prediction = predict(user,movie)
    
    train_predictions.append(prediction)
    train_targets.append(target)

test_predictions = []
test_targets = []
for (user,movie), target in usermovie2rating_test.items():
    prediction = predict(user,movie)
    
    test_predictions.append(prediction)
    test_targets.append(target)

### Calculating Accuracy

In [29]:
def mse(p,t): # Calculating the Mean Squared Error
    p = np.array(p)
    t = np.array(t)
    return np.mean((p-t)**2)

In [30]:
print(f'Training MSE: {mse(train_predictions, train_targets):.3f}')
print(f'Test MSE: {mse(test_predictions, test_targets):.3f}')

Training MSE: 0.586
Test MSE: 0.634


### Creating an Item-Item Collaborative Filtering

In [31]:
K = 25 # maximum number of neighbors to be considered
limit = 5 # minimum number of users shared by both movies
neighbors_m = []
averages_m = []
deviations_m = []

for movie in range(M): # Loop through every movie
    users_m = movie2user[movie] # List of every movie watched by given user
    users_m_set = set(users_m)
    ratings_m = {user : usermovie2rating[(user,movie)] for user in users_m} # Ratings of the movie
    average_m = np.mean(list(ratings_m.values())) # Movie's average rating
    dev_m = {user: ratings_m[user]-average_m for user in users_m}
    dev_m_values = np.array(list(dev_m.values()))
    sigma_m = np.sqrt(dev_m_values.dot(dev_m_values))
    
    averages_m.append(average_m)
    deviations_m.append(dev_m)
    
    sl = SortedList()
    for movie2 in range(M):
        if movie != movie2:
            users_m2 = movie2user[movie2]
            users_m2_set = set(users_m2)
            common_users = (users_m_set & users_m2_set)
            if len(common_users) > limit:
                ratings_m2 = {user : usermovie2rating[(user,movie2)] for user in users_m2}
                average_m2 = np.mean(list(ratings_m2.values()))
                dev_m2 = {user: ratings_m2[user]-average_m2 for user in users_m2}
                dev_m2_values = np.array(list(dev_m2.values()))
                sigma_m2 = np.sqrt(dev_m2_values.dot(dev_m2_values))
            
            # Calculating correlation coefficient
                numerator = sum(dev_m[u]*dev_m2[u] for u in common_users)
                w_ij = numerator / (sigma_m*sigma_m2)
            

                sl.add((-w_ij, movie2))
                if len(sl) > K:
                    del sl[-1]
    neighbors_m.append(sl)

### Predicting Movie Ratings

In [32]:
def predict_m(user,movie):
    # Calculating the weighted sum of deviations
    numerator = 0
    denominator = 0
    for neg_w, movie2 in neighbors_m[movie]:
        try:
            numerator += -neg_w * deviations_m[movie2][user]
            denominator += abs(neg_w)
        except:
            # Cases where the user2 did not rate the movie user did
            pass
    
    if denominator == 0:
        # There is no way to actually predict anything, so just use the average
        prediction = averages_m[movie]
    else: 
        prediction = numerator / denominator + averages_m[movie]
    prediction = min(5, prediction)
    prediction = max(0.5, prediction)
    
    return prediction

In [33]:
train_predictions_m = []
train_targets_m = []
for (user,movie), target in usermovie2rating.items():
    prediction = predict_m(user,movie)
    
    train_predictions_m.append(prediction)
    train_targets_m.append(target)

test_predictions_m = []
test_targets_m = []
for (user,movie), target in usermovie2rating_test.items():
    prediction = predict_m(user,movie)
    
    test_predictions_m.append(prediction)
    test_targets_m.append(target)

### Calculating Accuracy

In [34]:
print(f'Training MSE: {mse(train_predictions_m, train_targets_m):.3f}')
print(f'Test MSE: {mse(test_predictions_m, test_targets_m):.3f}')

Training MSE: 0.445
Test MSE: 0.563
