In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Read data

In [7]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
data = pd.merge(ratings_df, movies_df, on=['movieId'])
data.drop(['timestamp'], axis=1, inplace=True)
print("data dimensions:", data.shape)
data.head()

data dimensions: (100836, 5)


Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [53]:
# change from movie's id to index in matrix
movie_id_to_index ={}

index = 0
for row in movies_df['movieId']:
    movie_id_to_index[row] = index
    index += 1

len(movie_id_to_index)


9742

In [54]:
num_users = data['userId'].nunique()
num_movies = movies_df['movieId'].nunique()
print("total number of users:", num_users)
print("total number of movies:", num_movies)

total number of users: 610
total number of movies: 9742


### Create training and testing sets

In [145]:
data_mat = np.zeros((num_users, num_movies))

for row in data.itertuples():
    data_mat[row[1] - 1, movie_id_to_index[row[2]]] = row[3]

print("data_mat dimensions:", data_mat.shape)
data_mat

data_mat dimensions: (610, 9742)


array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [146]:
from sklearn.model_selection import train_test_split
train_mat, test_mat = train_test_split(data_mat, test_size=0.15)

print("train dimensions:", train_mat.shape)
print("test dimensions:", test_mat.shape)
train_mat

train dimensions: (518, 9742)
test dimensions: (92, 9742)


array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

### Prediction methods

In [147]:
from numpy.linalg import norm

TOP_USERS = 10

# find top similar users
def top_similars(user_vector):
    cosine_mat = np.zeros(len(train_mat))

    for index, row in enumerate(train_mat):
        cosine = np.dot(user_vector, row) / (norm(user_vector) * norm(row))
        cosine_mat[index] = cosine
    
    top_indices = np.argpartition(cosine_mat, -TOP_USERS)[-TOP_USERS:]
    top_dists = [cosine_mat[index] for index in top_indices]
    return top_indices, top_dists

In [148]:
# predict a user's rating for all movies
def predict_movie(user_vector):
    top_indices, top_dists = top_similars(user_vector)
    res = []

    for movie_id in movie_id_to_index.keys():
        total_ratings = 0
        for i in range(len(top_indices)):
            total_ratings += (train_mat[top_indices[i], movie_id_to_index[movie_id]] * top_dists[i])
        res.append(total_ratings / np.sum(top_dists))

    return res

In [149]:
from sklearn import metrics
predictions = np.array([predict_movie(user_vector) for user_vector in test_mat])


In [150]:
print("predictions dimensions:", predictions.shape)
predictions

predictions dimensions: (92, 9742)


array([[3.54329187, 0.90501611, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [3.39958116, 1.0312652 , 1.35606564, ..., 0.        , 0.        ,
        0.        ],
       [4.03402206, 0.84441015, 1.11418339, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [4.24687962, 2.08378481, 1.25694337, ..., 0.        , 0.        ,
        0.        ],
       [1.80587672, 1.91175612, 0.61192086, ..., 0.        , 0.        ,
        0.        ],
       [2.56497782, 0.        , 0.2605385 , ..., 0.        , 0.        ,
        0.        ]])

### Calculate error

In [153]:
from sklearn import metrics
print('MAE:', metrics.mean_squared_error(test_mat, predictions))
print('MSE:', metrics.mean_squared_error(test_mat, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_mat, predictions)))

MAE: 0.17972872682187485
MSE: 0.17972872682187485
RMSE: 0.4239442496624702
