# Movie Recommendation System

## Importing Libraries & Datasets

In [1]:
from collections import defaultdict
import numpy as np
import scipy
import pandas as pd

In [2]:
# Load the unlabelled test pairs and the labelled training ratings.
test, train = (pd.read_csv(path) for path in ('test.csv', 'train.csv'))

## Data Pre-processing

In [3]:
# Each key has the form "<user>_<movie>"; split it into its two string parts.
toSplit = train.loc[:, 'userId_movieId']
pairs = [key.split('_') for key in toSplit]
userId = [pair[0] for pair in pairs]
movieId = [pair[1] for pair in pairs]

In [4]:
# Attach the parsed ids as new columns, then preview the first rows.
train["user_id"], train["movie_id"] = userId, movieId
train.head()

Unnamed: 0,userId_movieId,rating,user_id,movie_id
0,10_1358,0.4,10,1358
1,237_1544,0.7,237,1544
2,54_373,1.0,54,373
3,11_2053,0.8,11,2053
4,183_2524,0.6,183,2524


In [5]:
# Lookup tables filled in a single pass over the training ratings.
usersPerItem = defaultdict(set)     # movie id -> users who rated it
itemsPerUser = defaultdict(set)     # user id -> movies they rated
reviewsPerUser = defaultdict(list)  # user id -> that user's training rows
reviewsPerItem = defaultdict(list)  # movie id -> training rows for that movie
itemNames = {}
ratingDict = {}                     # (user id, movie id) -> rating

for _, row in train.iterrows():
    user = row['user_id']
    item = row['movie_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(row)
    reviewsPerItem[item].append(row)
    ratingDict[(user, item)] = row['rating']

## Model Training

In [6]:
# Mean rating given by each user.
userAverages = {}
for u, ratedItems in itemsPerUser.items():
    rs = [ratingDict[(u, i)] for i in ratedItems]
    userAverages[u] = sum(rs) / len(rs)

# Mean rating received by each movie.
itemAverages = {}
for i, raters in usersPerItem.items():
    rs = [ratingDict[(u, i)] for u in raters]
    itemAverages[i] = sum(rs) / len(rs)

# Global mean over every training rating.
allRatings = [row['rating'] for _, row in train.iterrows()]
ratingMean = sum(allRatings) / len(allRatings)

In [7]:
def Jaccard(s1, s2):
    """Return the Jaccard index |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Two empty sets have an empty union; in that case 0 is returned instead
    of dividing by zero.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0
    shared = len(s1.intersection(s2))
    return shared / union_size

def mostSimilar(i, N):
    """Return the N items most similar to item ``i``.

    Similarity is the Jaccard index between the set of users who rated ``i``
    and the set of users who rated each other item in ``usersPerItem``.

    Args:
        i: Id of the query item.
        N: Maximum number of neighbours to return.

    Returns:
        A list of at most N ``(similarity, item_id)`` tuples, sorted in
        descending order (ties are broken by descending item id because the
        tuples themselves are compared).
    """
    # .get() avoids inserting an empty entry into the defaultdict when the
    # query item has no recorded ratings.
    users = usersPerItem.get(i, set())
    similarities = [
        (Jaccard(users, usersPerItem[i2]), i2)
        for i2 in usersPerItem
        if i2 != i
    ]
    similarities.sort(reverse=True)
    # Honour the requested count; this was previously hard-coded to 10.
    return similarities[:N]


In [8]:
def predictRating(user, item):
    """Predict the rating ``user`` would give ``item``.

    The prediction is the item's average rating plus a similarity-weighted
    average of how far the user's other ratings deviate from the averages of
    the items they were given to. Item-to-item similarity is the Jaccard
    index of the sets of users who rated each item.

    Args:
        user: User id, as stored in the ``user_id`` column.
        item: Item id, as stored in the ``movie_id`` column.

    Returns:
        The predicted rating, or the global ``ratingMean`` when none of the
        user's rated items shares a rater with ``item`` (this includes users
        or items without any training ratings).
    """
    # Read with .get() so that unseen users/items do not get empty entries
    # inserted into the shared defaultdicts as a side effect of predicting.
    # The target item's user set is loop-invariant, so fetch it once.
    target_users = usersPerItem.get(item, set())

    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, ()):
        i2 = d['movie_id']
        if i2 == item:
            # Never use the user's rating of the target item itself.
            continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(target_users, usersPerItem[i2]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weighted = sum(x * y for x, y in zip(ratings, similarities))
        return itemAverages[item] + weighted / total_similarity

    # User hasn't rated any similar items
    return ratingMean

In [9]:
# Predict every training pair and collect the true ratings in the same order.
simPredictions0 = []
labels = []
for _, row in train.iterrows():
    simPredictions0.append(predictRating(row['user_id'], row['movie_id']))
    labels.append(row['rating'])

## Model Evaluation

In [11]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error on the training set. The square root is taken
# explicitly because the `squared=False` argument is deprecated in
# scikit-learn 1.4 and removed in 1.6; arguments follow the documented
# (y_true, y_pred) order.
np.sqrt(mean_squared_error(labels, simPredictions0))

0.16058704617132066

In [12]:
# Split the test keys ("<user>_<movie>") into their two string parts.
toSplit = test.loc[:, 'userId_movieId']
pairs = [key.split('_') for key in toSplit]
userId = [pair[0] for pair in pairs]
movieId = [pair[1] for pair in pairs]

# Attach the parsed ids as new columns, then preview the first rows.
test["user_id"], test["movie_id"] = userId, movieId
test.head()

Unnamed: 0,userId_movieId,user_id,movie_id
0,469_2124,469,2124
1,439_3753,439,3753
2,522_1682,522,1682
3,429_1217,429,1217
4,71_1210,71,1210


In [13]:
# Predict a rating for every (user, movie) pair in the test set.
simPredictions = [
    predictRating(row['user_id'], row['movie_id'])
    for _, row in test.iterrows()
]

In [14]:
# Store each prediction next to its user/movie pair and preview the result.
test['predicted_rating'] = simPredictions
test.head(n=5)

Unnamed: 0,userId_movieId,user_id,movie_id,predicted_rating
0,469_2124,469,2124,0.666547
1,439_3753,439,3753,0.726337
2,522_1682,522,1682,0.926164
3,429_1217,429,1217,0.967306
4,71_1210,71,1210,0.826258


In [None]:
import pickle

# Persist the list of test predictions. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(simPredictions, model_file)

#Store as .csv file
submission = test
submission.to_csv('submission.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b96c0c3a-77b4-4865-808b-c86298105c3b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>