## Example with your movie ratings

In [182]:
import numpy as np
import pandas  as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [183]:
# Load the raw ratings table from the web_app data folder.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, so the
# CSV presumably carries a saved index column - confirm and consider index_col=0.
df_ratings = pd.read_csv('web_app/data/ratings.csv')

In [184]:
# Preview the raw ratings: one row per (userId, movieId) rating with its timestamp.
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [185]:
# Pivot the long ratings table into a user x movie matrix: one row per userId,
# one column per movieId, each cell holding the rating (NaN where the user has
# not rated that movie).
df_ratings_pivot = df_ratings.pivot(
    index="userId", columns="movieId", values="rating")

In [186]:
# Preview the user x movie matrix; most cells are NaN because users rate few movies.
df_ratings_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


#### Eliminating columns based on their NaN percentage

In [187]:
# Delete movie columns whose share of NaN values is `perc` percent (90%) or more.
# `thresh` is the minimum number of non-NaN ratings a column needs to be kept,
# so a movie survives only if more than (100 - perc)% of the users rated it.
perc = 90.0
min_count =  int(((100-perc)/100)*df_ratings_pivot.shape[0] + 1)
df_ratings_pivot = df_ratings_pivot.dropna( axis=1, 
                thresh=min_count)

In [188]:
# Preview the matrix after the sparse movie columns have been removed.
df_ratings_pivot

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,4.0,4.0,,3.5,3.5,3.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,3.0,,...,,,,,,,,,,
5,4.0,,,,,,,,4.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,2.5,,4.0,2.0,,,...,,,,3.0,,,,,,
607,4.0,,,,3.0,,,,,3.0,...,,,,,,,,,,
608,2.5,2.0,,4.0,,4.5,,2.0,3.5,,...,,,,,,,,,,
609,3.0,,,4.0,,,,,,,...,,,,,,,,,,


In [189]:
# Impute the missing ratings with 0 so the matrix is fully numeric.
# From here on 0 doubles as the "not rated" marker: cosine_recom treats a 0 as
# an unseen movie and only counts ratings greater than 0.
R = df_ratings_pivot.fillna(value=0)
R

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.5,3.5,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,2.5,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,0.0,4.0,0.0,4.5,0.0,2.0,3.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### One-liner from  `sklearn`

In [190]:
def cal_cosine_simularity(R):
    """Return the pairwise cosine similarity between the rows of ``R``.

    Parameters
    ----------
    R : pandas.DataFrame
        Numeric matrix whose rows are the vectors to compare. In this
        notebook it is the zero-imputed user x movie ratings matrix, so the
        rows are users.

    Returns
    -------
    pandas.DataFrame
        Square table labelled with ``R.index`` on both axes; the cell at
        ``(a, b)`` is the cosine similarity between rows ``a`` and ``b``.
    """
    # cosine_similarity returns a bare numpy array; compute it exactly once
    # and re-attach the row labels so entries can be looked up by user id.
    similarity_matrix = cosine_similarity(R)
    return pd.DataFrame(similarity_matrix, index=R.index, columns=R.index)
    

In [191]:
def get_transpose(R):
    """Return ``R`` with its rows and columns swapped."""
    transposed = R.transpose()
    return transposed

In [192]:
# Use the transposed version of R (movies as rows, users as columns), which is
# the orientation cosine_recom indexes: R_t[user] for a user's ratings and
# R_t.loc[movie] for a movie's ratings.
R_t = get_transpose(R)
R_t

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
10,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0
11,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,2.5,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79132,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
89745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
91529,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
99114,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5


### Recommendation function (user-based collaborative filtering)

In [193]:
def cosine_recom(active_user, cos_sim_table, R_t, nn, best=5):
    """Recommend unseen movies to ``active_user`` from its most similar users.

    For every movie the active user has not rated, the predicted rating is the
    similarity-weighted average of the ratings given by the ``nn`` nearest
    neighbours who did rate it.

    Parameters
    ----------
    active_user : hashable
        Label of the user to recommend for; must be a column of ``R_t`` and a
        row/column label of ``cos_sim_table``.
    cos_sim_table : pandas.DataFrame
        Square user x user similarity table.
    R_t : pandas.DataFrame
        Ratings with movies as rows and users as columns, where 0 means
        "not rated".
    nn : int
        Number of nearest neighbours to use.
    best : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId`` with a single ``rating`` column holding the
        predicted rating, sorted from highest to lowest and limited to
        ``best`` rows. Movies that none of the neighbours rated (or whose
        raters all have zero similarity) are omitted.
    """
    # Movies the active user has not rated yet (0 marks a missing rating).
    unseen_movies = list(R_t.index[R_t[active_user] == 0])

    # Nearest neighbours: remove the active user by label before ranking.
    # Skipping "the first entry" positionally breaks when another user ties
    # with the active user's self-similarity and sorts ahead of it.
    similarities = cos_sim_table[active_user].drop(labels=active_user)
    neighbours = list(similarities.sort_values(ascending=False).index[:nn])

    predicted_ratings_movies = []
    for movie in unseen_movies:
        num = 0
        den = 0
        for user in neighbours:
            rating = R_t.loc[movie, user]
            # Only neighbours who actually rated the movie contribute.
            if rating > 0:
                similarity = cos_sim_table.loc[active_user, user]
                num = num + rating * similarity
                den = den + similarity
        # den == 0 means no usable neighbour rating, so no prediction is made.
        if den != 0:
            predicted_ratings_movies.append([num / den, movie])

    df_pred = pd.DataFrame(predicted_ratings_movies, columns=['rating', 'movieId'])
    df_pred = df_pred.set_index("movieId")
    df_pred = df_pred.sort_values(by=["rating"], ascending=False)
    return df_pred.head(best)
    

### TEST

In [194]:
# choose an active user
active_user = 601

In [195]:
# User x user cosine similarity table built from the zero-imputed ratings matrix R.
cos=cal_cosine_simularity(R)
cos

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.034452,0.082445,0.417822,0.218566,0.254383,0.292596,0.223718,0.157689,0.053687,...,0.158632,0.297767,0.571527,0.144809,0.276396,0.452416,0.440584,0.530452,0.181924,0.432055
2,0.034452,1.000000,0.000000,0.011389,0.032288,0.032149,0.021810,0.045237,0.000000,0.199927,...,0.359096,0.033609,0.047141,0.000000,0.000000,0.109560,0.030080,0.089031,0.054455,0.233672
3,0.082445,0.000000,1.000000,0.000000,0.171701,0.061546,0.000000,0.144338,0.000000,0.000000,...,0.098773,0.107236,0.045889,0.000000,0.000000,0.068320,0.095977,0.116652,0.000000,0.084291
4,0.417822,0.011389,0.000000,1.000000,0.148077,0.171956,0.243258,0.124479,0.033739,0.099853,...,0.150745,0.187041,0.502467,0.100198,0.230587,0.355508,0.299465,0.376160,0.050509,0.334883
5,0.218566,0.032288,0.171701,0.148077,1.000000,0.555143,0.145531,0.539275,0.000000,0.074936,...,0.103113,0.572996,0.224817,0.393029,0.289511,0.197075,0.268941,0.280410,0.390810,0.171607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.452416,0.109560,0.068320,0.355508,0.197075,0.233788,0.500911,0.229963,0.206794,0.247767,...,0.454154,0.270783,0.574980,0.169936,0.340187,1.000000,0.369823,0.674769,0.167375,0.654531
607,0.440584,0.030080,0.095977,0.299465,0.268941,0.351266,0.309764,0.278169,0.026734,0.030584,...,0.155470,0.340875,0.486813,0.219694,0.263388,0.369823,1.000000,0.488389,0.266813,0.383928
608,0.530452,0.089031,0.116652,0.376160,0.280410,0.375990,0.539685,0.330759,0.177506,0.226046,...,0.345278,0.386120,0.598346,0.286408,0.369477,0.674769,0.488389,1.000000,0.251663,0.675731
609,0.181924,0.054455,0.000000,0.050509,0.390810,0.488337,0.142370,0.541695,0.000000,0.054164,...,0.054917,0.514247,0.146173,0.386396,0.207266,0.167375,0.266813,0.251663,1.000000,0.152732


In [196]:
# Top 5 recommendations for the active user, predicted from its 5 nearest neighbours.
rec=cosine_recom(active_user,cos, R_t, nn=5, best=5)

In [197]:
# Show the recommended movie ids with their predicted ratings.
rec

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
2000,5.0
2985,5.0
1036,5.0
1208,5.0
1240,5.0


### New Functions and New User TEST

In [198]:
import random
def create_ramdom_user(num_ratings, df_ratings_pivot):
    """Create a random user as a ``{movie label: rating}`` dictionary.

    Parameters
    ----------
    num_ratings : int
        Number of movies the random user should rate.
    df_ratings_pivot : pandas.DataFrame
        Frame whose columns are the candidate movie labels.

    Returns
    -------
    dict
        Maps ``num_ratings`` distinct column labels to a random integer rating
        between 1 and 5 (inclusive). If more ratings are requested than there
        are columns, every column is rated once; a non-positive request yields
        an empty dictionary.
    """
    movie_labels = df_ratings_pivot.columns
    # Sample positions WITHOUT replacement: drawing them independently lets
    # two draws hit the same movie, which silently overwrites a dictionary key
    # and leaves the user with fewer ratings than requested.
    n_picks = max(0, min(num_ratings, len(movie_labels)))
    picked_positions = random.sample(range(len(movie_labels)), n_picks)
    return {movie_labels[pos]: random.randint(1, 5) for pos in picked_positions}

In [199]:
def get_user_array_0(user, df_ratings_pivot):
    """Turn a ``{movie label: rating}`` dict into a 1 x n_columns ratings row.

    The row has one slot per column of ``df_ratings_pivot``; slots for movies
    the user did not rate stay at 0.
    """
    movie_columns = df_ratings_pivot.columns
    ratings_row = np.zeros((1, len(movie_columns)))
    for movie_id, score in user.items():
        # translate the movie label into its column position
        position = movie_columns.get_loc(movie_id)
        ratings_row[0][position] = score
    return ratings_row

In [200]:
def add_user_to_R(new_user, R):
    """Return a copy of ``R`` with ``new_user`` appended as an extra row.

    Parameters
    ----------
    new_user : dict
        Maps movie labels (columns of ``R``) to ratings.
    R : pandas.DataFrame
        User x movie ratings matrix; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``R`` plus one row for the new user, labelled ``R.index.max() + 1``,
        with 0 for every movie the new user did not rate.
    """
    arr = get_user_array_0(new_user, R)
    # The new user gets the next id after the largest existing one.
    user_index = R.index.max() + 1
    df_new_user = pd.DataFrame(arr, index=[user_index], columns=R.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the new row under the existing ones.
    return pd.concat([R, df_new_user])

In [201]:
# Simulate a new user: request 7 random ratings over the movies kept in R.
r_user=create_ramdom_user(7, R)
r_user

{11: 1, 48774: 3, 2571: 1, 2542: 3, 3793: 1, 2683: 2, 7361: 5}

In [202]:
# Append the simulated user as a new row of the ratings matrix (R itself is left as is).
R_updated=add_user_to_R(r_user, R)


In [203]:
# Preview the ratings matrix that now includes the simulated user.
R_updated

movieId,1,2,6,10,11,16,17,19,21,25,...,68954,69122,70286,72998,74458,79132,89745,91529,99114,109487
1,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.5,3.5,3.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,0.0,4.0,0.0,4.5,0.0,2.0,3.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,5.0,0.0,5.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,3.5,4.0,4.5,4.5,4.5,4.0,5.0,4.5,4.5,3.5


In [204]:
# choose an active user
active_user = 611

In [205]:
# Recompute the similarity table on the extended matrix so it contains the new
# user; this overwrites the earlier `cos` table.
cos=cal_cosine_simularity(R_updated)
cos

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,602,603,604,605,606,607,608,609,610,611
1,1.000000,0.034452,0.082445,0.417822,0.218566,0.254383,0.292596,0.223718,0.157689,0.053687,...,0.297767,0.571527,0.144809,0.276396,0.452416,0.440584,0.530452,0.181924,0.432055,0.082445
2,0.034452,1.000000,0.000000,0.011389,0.032288,0.032149,0.021810,0.045237,0.000000,0.199927,...,0.033609,0.047141,0.000000,0.000000,0.109560,0.030080,0.089031,0.054455,0.233672,0.000000
3,0.082445,0.000000,1.000000,0.000000,0.171701,0.061546,0.000000,0.144338,0.000000,0.000000,...,0.107236,0.045889,0.000000,0.000000,0.068320,0.095977,0.116652,0.000000,0.084291,0.000000
4,0.417822,0.011389,0.000000,1.000000,0.148077,0.171956,0.243258,0.124479,0.033739,0.099853,...,0.187041,0.502467,0.100198,0.230587,0.355508,0.299465,0.376160,0.050509,0.334883,0.043605
5,0.218566,0.032288,0.171701,0.148077,1.000000,0.555143,0.145531,0.539275,0.000000,0.074936,...,0.572996,0.224817,0.393029,0.289511,0.197075,0.268941,0.280410,0.390810,0.171607,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,0.440584,0.030080,0.095977,0.299465,0.268941,0.351266,0.309764,0.278169,0.026734,0.030584,...,0.340875,0.486813,0.219694,0.263388,0.369823,1.000000,0.488389,0.266813,0.383928,0.042230
608,0.530452,0.089031,0.116652,0.376160,0.280410,0.375990,0.539685,0.330759,0.177506,0.226046,...,0.386120,0.598346,0.286408,0.369477,0.674769,0.488389,1.000000,0.251663,0.675731,0.128317
609,0.181924,0.054455,0.000000,0.050509,0.390810,0.488337,0.142370,0.541695,0.000000,0.054164,...,0.514247,0.146173,0.386396,0.207266,0.167375,0.266813,0.251663,1.000000,0.152732,0.000000
610,0.432055,0.233672,0.084291,0.334883,0.171607,0.195653,0.452998,0.188406,0.183358,0.329712,...,0.232175,0.479265,0.147712,0.331649,0.654531,0.383928,0.675731,0.152732,1.000000,0.116804


In [206]:
# Top 5 recommendations for the simulated user; cosine_recom expects movies as
# rows, hence the transpose of R_updated.
rec=cosine_recom(active_user,cos, R_updated.T, nn=5, best=5)

In [207]:
# Show the recommended movie ids with their predicted ratings for the new user.
rec

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1721,5.0
1259,5.0
1584,5.0
6711,5.0
541,5.0
