## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from math import *

## Merge datasets (compute average rating for each user and normalize)

In [3]:
# Load the ratings and the movie metadata from the local ml-latest-small folder.
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')

# Average rating given by each user (indexed by userId, single "rating" column).
mean_rating_user = ratings[["userId", "rating"]].groupby("userId").agg(np.average)

# Mean-centre every rating by subtracting its author's average rating.
# Series.map does the per-user lookup in one vectorized pass instead of a
# Python-level call per row (DataFrame.apply with axis=1).
ratings["nrating"] = ratings["rating"] - ratings["userId"].map(mean_rating_user["rating"])

# Attach titles to the ratings and keep only the columns used downstream.
df = pd.merge(movies, ratings, on='movieId')
df = df[['userId','nrating','movieId','title']]
df

Unnamed: 0,userId,nrating,movieId,title
0,1,-0.366379,1,Toy Story (1995)
1,5,0.363636,1,Toy Story (1995)
2,7,1.269737,1,Toy Story (1995)
3,15,-0.948148,1,Toy Story (1995)
4,17,0.290476,1,Toy Story (1995)
...,...,...,...,...
100831,184,0.294776,193581,Black Butler: Book of the Atlantic (2017)
100832,184,-0.205224,193583,No Game No Life: Zero (2017)
100833,184,-0.205224,193585,Flint (2017)
100834,184,-0.205224,193587,Bungo Stray Dogs: Dead Apple (2018)


## Sets of movie titles and user IDs

In [3]:
# Distinct movie titles and distinct user ids found in the merged frame.
# Despite the "_list" suffix, both are sets.
movies_list = set(df.title)
users_list = set(df.userId)

## Build the user–item crosstab matrix (rows: movies, columns: users)

In [4]:
# Matrix of normalized ratings: one row per movieId, one column per userId.
# The distance function below skips NaN entries, so (movie, user) pairs
# without a rating are expected to show up as NaN here.
# aggfunc is the string "sum" rather than the np.sum callable: passing the
# callable emits a FutureWarning on pandas >= 2.1 and yields the same result.
df2 = pd.crosstab(df.movieId, df.userId, df.nrating, aggfunc="sum")

## Define the distance function

In [5]:
# Distance returned when two vectors share no position where both are non-NaN.
MAX_DISTANCE = 5.01


def euclidean_distance(v1, v2):
    """Root-mean-square difference between two vectors over shared entries.

    Positions where either value is NaN are ignored. The squared differences
    of the remaining pairs are averaged and square-rooted, so the result is
    normalized by the number of shared positions rather than being a plain
    Euclidean norm.

    Parameters
    ----------
    v1, v2 : iterables of float
        Paired element-wise with ``zip`` (extra trailing items are ignored).

    Returns
    -------
    float
        The RMS difference, or ``MAX_DISTANCE`` when no position has a
        non-NaN value in both vectors.
    """
    squared_diffs = [
        (a - b) ** 2
        for a, b in zip(v1, v2)
        if not (isnan(a) or isnan(b))
    ]
    if not squared_diffs:
        return MAX_DISTANCE
    return sqrt(sum(squared_diffs) / len(squared_diffs))

## Create a dict for each user mapping the movies they watched to their ratings

In [6]:
# Map each userId to a {title: normalized rating} dict of the movies they rated.
# A single groupby pass replaces re-filtering the whole frame once per user.
# The inner dicts are keyed by title, so if a user has several rows with the
# same title, the last row's rating is the one kept.
data = {
    user_id: dict(zip(user_rows.title, user_rows.nrating))
    for user_id, user_rows in df.groupby('userId')
}

## Find the most similar users by distance (computed only over the movies that both users have rated)

In [7]:
# How many nearest neighbours simliar_users returns.
NUMNER_OF_SIMILAR_USERS = 10


def simliar_users(user):
    """Return the users closest to ``user`` in the rating matrix ``df2``.

    Every other id in ``users_list`` is scored by ``euclidean_distance``
    between its column of ``df2`` and the column of ``user``.

    Parameters
    ----------
    user : user id; must be a column label of ``df2``.

    Returns
    -------
    list of (user id, distance) tuples, ascending by distance, at most
    ``NUMNER_OF_SIMILAR_USERS`` long.
    """
    scored = [
        (other, euclidean_distance(df2[user], df2[other]))
        for other in users_list
        if other != user
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return ranked[:NUMNER_OF_SIMILAR_USERS]

## Recommend 3 movies to the input user
- find the top similar users
- find all the movies they watched
- sum their scores for each movie to create a final score per movie
- sort the movies by final score
- recommend the 3 best movies the input user has not seen

In [8]:
def recommend_user(user, top_n=3):
    """Recommend movies to ``user`` based on what similar users rated.

    The normalized ratings of the nearest neighbours (from ``simliar_users``)
    are summed per title; titles already present in ``data[user]`` are left
    out, and the highest-scoring titles are returned.

    Parameters
    ----------
    user : user id; must be a key of ``data``.
    top_n : int or None, default 3
        Maximum number of recommendations to return. ``None`` returns every
        scored title.

    Returns
    -------
    list of (title, score) tuples sorted by descending score.
    """
    neighbours = [neighbour for neighbour, _ in simliar_users(user)]
    seen = data[user]

    # Sum the neighbours' normalized ratings per title. Titles the user has
    # already rated are skipped here rather than filtered out afterwards;
    # the resulting ranking is the same.
    scores = {}
    for neighbour in neighbours:
        for title, nrating in data[neighbour].items():
            if title not in seen:
                scores[title] = scores.get(title, 0) + nrating

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [9]:
# Recommendations for userId 1, as (title, summed neighbour score) pairs.
recommend_user(1)

[('Dark Knight, The (2008)', 3.0330783036240088),
 ('Interstellar (2014)', 2.6499277130206926),
 ('Titanic (1997)', 1.7439024390243905)]

In [10]:
# Recommendations for userId 3, as (title, summed neighbour score) pairs.
recommend_user(3)

[('Matrix, The (1999)', 6.768384643577674),
 ('Godfather, The (1972)', 5.396258206796066),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)', 4.768384643577675)]

In [11]:
# Recommendations for userId 50, as (title, summed neighbour score) pairs.
recommend_user(50)

[('Braveheart (1995)', 3.155707038059979),
 ('Aladdin (1992)', 2.145446507515473),
 ('Seven (a.k.a. Se7en) (1995)', 2.027501909854851)]