# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Get the data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/emerging-technologies/Q2

/content/drive/MyDrive/emerging-technologies/Q2


In [4]:
df = pd.read_csv('MyRecommenderData.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,0,50,5,881250949,Star Wars (1977)
1,1,290,50,5,880473582,Star Wars (1977)
2,2,79,50,4,891271545,Star Wars (1977)
3,3,2,50,5,888552084,Star Wars (1977)
4,4,8,50,5,879362124,Star Wars (1977)


In [6]:
df = df.drop(["Unnamed: 0"], axis = 1)

In [7]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


## Find out the number of unique users and movies.

In [8]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 944
Num of Movies: 1682


## Perform Train Test Split

In [10]:
# Start your solution here
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2)

In [13]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
29973,724,347,4,883757670,Wag the Dog (1997)
12622,543,165,4,874863436,Jean de Florette (1986)
29136,684,202,4,878759384,Groundhog Day (1993)
37013,268,357,4,875309882,One Flew Over the Cuckoo's Nest (1975)
5929,104,246,3,888465319,Chasing Amy (1997)


In [15]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

In [16]:
train_data_matrix

array([[5., 3., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
train_data

Unnamed: 0,user_id,item_id,rating,timestamp,title
29973,724,347,4,883757670,Wag the Dog (1997)
12622,543,165,4,874863436,Jean de Florette (1986)
29136,684,202,4,878759384,Groundhog Day (1993)
37013,268,357,4,875309882,One Flew Over the Cuckoo's Nest (1975)
5929,104,246,3,888465319,Chasing Amy (1997)
...,...,...,...,...,...
38601,458,321,3,889323855,Mother (1996)
28542,63,748,4,875747010,"Saint, The (1997)"
9990,38,679,5,892432062,Conan the Barbarian (1981)
15851,535,382,5,879618058,"Adventures of Priscilla, Queen of the Desert, ..."


In [18]:
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [19]:
test_data_matrix

array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [21]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [22]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [24]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.096696004413641
Item-based CF RMSE: 3.4387457418554703
