In [92]:
import pandas as pd
import numpy as np
import logging
import sys
from time import time
import pickle
import re
from pandas.io.json import json_normalize
import json

from sklearn.utils.extmath import randomized_svd
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from scipy import sparse
from scipy.sparse.linalg import svds

from collections import defaultdict
from collections import Counter
import math

In [2]:
from surprise import SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import NMF

In [3]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

def convert_to_float(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('float64')

def to_json(csv_entry):
    return json.loads(re.sub('\'', '"', csv_entry))

In [146]:
#movies_df has all the metadata of the movies and ratings_df has ratings given by user to movies

movies_df = pd.read_csv('the-movies-dataset/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'original_title', 'budget', 'genres','spoken_languages', 'title','vote_count','vote_average'])
movies_df.drop_duplicates(subset ="id", keep = 'first', inplace = True)


In [147]:
ratings_df = pd.read_csv('the-movies-dataset/ratings_small.csv')
#Uncomment with higher computational power
#ratings_df = pd.read_csv('the-movies-dataset/ratings.csv')

In [148]:
###May need Fuzzy matching, but for now:
# Do not know if this is actually required
movies_df = movies_df[movies_df.spoken_languages == """[{'iso_639_1': 'en', 'name': 'English'}]"""]

In [149]:
ratings_df.drop_duplicates(subset=['userId','movieId'],keep='last',inplace=True)

In [152]:
movies_df = movies_df.sort_values(by='id')
movies_df = movies_df.reset_index()
titles = movies_df['title']
indices = pd.Series(movies_df.index, index= movies_df['title'])

In [153]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
# testset = trainset.build_anti_testset()

In [154]:
algo = SVD(verbose=True)
algo.fit(trainset)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8876  0.9007  0.9004  0.8923  0.9023  0.8967  0.0057  
MAE (testset)     0.6817  0.6916  0.6940  0.6859  0.6987  0.6904  0.0060  
Fit time          5.28    5.35    5.10    5.39    4.47    5.12    0.34    
Test time         0.15    0.17    0.15    0.15    0.16    0.15    0.01    


{'fit_time': (5.284756183624268,
  5.345030307769775,
  5.095832347869873,
  5.392770051956177,
  4.466385364532471),
 'test_mae': array([0.68165163, 0.69159771, 0.69395321, 0.68594369, 0.69869296]),
 'test_rmse': array([0.88762335, 0.90065794, 0.90043447, 0.89231834, 0.90226313]),
 'test_time': (0.14821076393127441,
  0.1669445037841797,
  0.1478261947631836,
  0.14841079711914062,
  0.15827059745788574)}

In [155]:
users = algo.pu
items = algo.qi
user_bias = algo.bu
item_bias = algo.bi
titles = movies_df['title']


In [156]:
def get_viewed_movies(userId) :
    ''' 
    aggregate all movies that the user watched 
    input : userid
    output : a list of movie ids that the user has already rated
    '''
    
    users_viewed_movies = ratings_df[ratings_df['userId'] == userId].sort_values(['rating'], ascending=False)
    
    return users_viewed_movies['movieId'].tolist()

In [157]:
def estimate(users, items, user_bias, item_bias, u, i):
    '''
    gives the estimated ratings for user u 
    for the i movie
    '''
    u = trainset.to_inner_uid(u)
    i = trainset.to_inner_iid(i)
    est = trainset.global_mean
    est += user_bias[u]
    est += item_bias[i]
    est += np.dot(items[i], users[u])
    return est

In [225]:
def get_single_estimate(userId, itemId):
    
    if itemId in trainset._raw2inner_id_items:
        
        return estimate(users, items, user_bias, item_bias, userId, itemId)
    return 0

def recommend_movies(userID, movies_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    pred = []
    for i in movieids:
        if i in trainset._raw2inner_id_items:
            pred.append([i,estimate(users, items, user_bias, item_bias, userID, i)])
    pred_df = pd.DataFrame(pred,columns=['id','est'])
    #sorted_user_predictions = preds_df.iloc[user_row_number]
    print(pred_df.head())
    
    # Get the user's data and merge in the movie information.
    viewed_movieids = get_viewed_movies(userID)
               
    all_movieids = pred_df['id'].tolist()
    predicted_movieids = [value for value in all_movieids if value not in viewed_movieids]
    #print(type(predicted_movieids))
    recommendations = movies_df[movies_df['id'].isin(predicted_movieids)]
    recommendations = recommendations.merge(pred_df, left_on='id', right_on='id').sort_values(by='est',ascending=False)
    return predicted_movieids,recommendations[:num_recommendations]

In [159]:
pred_movie_id, rec = recommend_movies(321, movies_df, 10)
rec

     id       est
0   524  3.556158
1  4584  3.682077
2     5  2.932510
3  8012  3.475697
4   451  3.247257


Unnamed: 0,index,budget,genres,id,original_title,spoken_languages,title,vote_average,vote_count,est
234,534,21000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",858,Sleepless in Seattle,"[{'iso_639_1': 'en', 'name': 'English'}]",Sleepless in Seattle,6.5,630.0,4.742203
253,2649,4300000,"[{'id': 10749, 'name': 'Romance'}, {'id': 80, ...",912,The Thomas Crown Affair,"[{'iso_639_1': 'en', 'name': 'English'}]",The Thomas Crown Affair,6.9,95.0,4.535926
205,8546,0,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",750,Murder She Said,"[{'iso_639_1': 'en', 'name': 'English'}]",Murder She Said,7.0,31.0,4.438803
877,6141,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",6016,The Good Thief,"[{'iso_639_1': 'en', 'name': 'English'}]",The Good Thief,6.0,32.0,4.31454
100,415,50000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",306,Beverly Hills Cop III,"[{'iso_639_1': 'en', 'name': 'English'}]",Beverly Hills Cop III,5.5,445.0,4.300957
420,915,1020000,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na...",1939,Laura,"[{'iso_639_1': 'en', 'name': 'English'}]",Laura,7.6,173.0,4.292256
103,4020,8000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",318,The Million Dollar Hotel,"[{'iso_639_1': 'en', 'name': 'English'}]",The Million Dollar Hotel,5.9,76.0,4.279955
298,11566,0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",1252,Lonely Hearts,"[{'iso_639_1': 'en', 'name': 'English'}]",Lonely Hearts,6.0,88.0,4.268143
425,278,31000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",1945,Nell,"[{'iso_639_1': 'en', 'name': 'English'}]",Nell,6.1,128.0,4.266061
469,334,17000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",2064,While You Were Sleeping,"[{'iso_639_1': 'en', 'name': 'English'}]",While You Were Sleeping,6.5,340.0,4.256994


# Perform Hybrid recommendations

In [163]:
count_matrix = sparse.load_npz("notebook/countmatrix.npz")

In [164]:
vote_counts = movies_df[movies_df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies_df[movies_df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.158527322282293

In [165]:
sim_movie_list = {}

In [260]:
def cosine_sim(count_matrix, idx, title):
    if(title in sim_movie_list):
        #print('call from here')
        return sim_movie_list[title]
    else:
        cosine_sim = linear_kernel(count_matrix[idx], count_matrix)
        sim_movie_list[title]= cosine_sim[0]
        return cosine_sim[0]

def improved_recommendations(title, n):
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim(count_matrix, idx, title)))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
#     print(movie_indices)
    movies = movies_df.iloc[movie_indices][['id','title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.50)
    qualified = movies[(movies['vote_count'] >= m) & (
        movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(n)
    return qualified

def get_wr_for_df(df) :
    qualified = df[(df['vote_count'].notnull()) & (df['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    return qualified
    
def weighted_rating(x):
    m = vote_counts.quantile(0.60)
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [262]:
alpha = 0.5
beta = 1 - alpha

def get_hybrid_recommendations(userId, n_collaborative, n_content):
    pred_movie_id, rec = recommend_movies(userId, movies_df, n_collaborative)
    rec = rec.reset_index()
    retvalCollab = rec[['id','original_title','vote_count','vote_average','est']]
    retvalContent = pd.DataFrame(columns=['id','title','vote_count','vote_average','wr']) 
    retvalCollab.rename(columns = {'original_title':'title'}, inplace = True)
    for i in range(retvalCollab.shape[0]):
        retvalContent = pd.concat([retvalContent,improved_recommendations(rec.iloc[i]['original_title'],n_content)],ignore_index=True,sort=True)

    retvalContent['id'] = retvalContent['id'].astype(int)
    retvalContent['est'] = retvalContent['id'].apply(lambda x: get_single_estimate(userId,x))
    retvalCollab = get_wr_for_df(retvalCollab)
    
    retval = pd.concat([retvalCollab, retvalContent],ignore_index=True,sort=True)
    
    #Normalising wr values using (x-min / max-min) max = 10 min = 1
    retval['wr'] = retval['wr'].apply(lambda x : 5*((x-1)/(9)))
    retval['score'] = retval.apply(lambda x : alpha * x['est'] + beta * x['wr'],axis=1)
    return retval.sort_values('score',ascending=False)

In [263]:
get_hybrid_recommendations(321,5,10)

     id       est
0   524  3.556158
1  4584  3.682077
2     5  2.932510
3  8012  3.475697
4   451  3.247257


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be 

Unnamed: 0,est,id,title,vote_average,vote_count,wr,score
7,3.732487,901,City Lights,8,444,3.824108,3.778298
0,4.742203,858,Sleepless in Seattle,6,630,2.764092,3.753148
36,4.193539,994,Straw Dogs,7,230,3.25527,3.724405
2,4.438803,750,Murder She Said,7,31,2.944578,3.691691
1,4.535926,912,The Thomas Crown Affair,6,95,2.699864,3.617895
5,3.211456,13,Forrest Gump,8,8147,3.885216,3.548336
13,4.256994,2064,While You Were Sleeping,6,340,2.753036,3.505015
3,4.31454,6016,The Good Thief,6,32,2.603617,3.459078
46,3.371749,5236,Kiss Kiss Bang Bang,7,900,3.312182,3.341966
8,3.379414,401,Garden State,7,631,3.303429,3.341421


In [169]:
improved_recommendations('Inception')

[17160, 10929, 21, 119, 377, 2642, 16733, 12119, 208, 11168, 21656, 185, 477, 628, 638, 763, 1433, 1447, 1733, 3484, 10326, 10644, 12238, 13303, 15183]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,title,vote_count,vote_average,wr
17160,Interstellar,11187,8,7.995182
377,The Prestige,4510,8,7.988079
21,Memento,4168,8,7.987106
10929,The Dark Knight Rises,9263,7,6.996231
12119,Looper,4777,6,5.996666
628,X-Men Origins: Wolverine,4086,6,5.996105
208,The Matrix Reloaded,3500,6,5.995457
477,The Island,1813,6,5.991273
185,Starship Troopers,1584,6,5.990026
1733,Mad Max,1235,6,5.98725


In [232]:
movies_df[movies_df['id']==284]
ratings_df[ratings_df['movieId'] == 284]

Unnamed: 0,userId,movieId,rating,timestamp


# Approach to Folding in new user with some samples

In [12]:
#for testing we separate the first user as new user and perform the new method of folding in
new_ratings_df = ratings_df[ratings_df['userId'] == 1]
new_ratings_df['userId'] = new_ratings_df['userId'].apply(lambda x : 672)
ratings_df = pd.concat([ratings_df, new_ratings_df], ignore_index=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
users = algo.pu
items = algo.qi
user_bias = algo.bu
item_bias = algo.bi
(trainset.ur)[671] = (trainset.ur)[0]
trainset._raw2inner_id_users[672] = len(trainset._raw2inner_id_users)

In [14]:
def create_new_user(ruid,items,item_bias, n_epochs) :
    
    init_mean=0
    init_std_dev=.1
    lr_all=.005
    reg_all=.02
    n_factors = 100
    global_mean = trainset.global_mean
    bu = 0
    bi = item_bias
    pu = np.random.mtrand._rand.normal(init_mean, init_std_dev,
                    (n_factors))
    qi = items

    for current_epoch in range(n_epochs):
        
        #print("Processing epoch {}".format(current_epoch))
        u = trainset.to_inner_uid(ruid)
        for i, r in trainset.ur[trainset.to_inner_uid(ruid)]:

            # compute current error
            dot = 0  # <q_i, p_u>
            for f in range(n_factors):
                dot += qi[i, f] * pu[f]
            err = r - (global_mean + bu + bi[i] + dot)

           
            bu += lr_all * (err - reg_all * bu)
            #bi[i] += lr_bi * (err - reg_bi * bi[i])

            # update factors
            for f in range(n_factors):
                puf = pu[f]
                qif = qi[i, f]
                pu[f] += lr_all * (err * qif - reg_all * puf)
                #qi[i, f] += lr_qi * (err * puf - reg_qi * qif)

    return bu, pu

In [15]:
bias, puser = create_new_user(672,items,item_bias, 20)

In [16]:
#RMSE value for the new user
s = 0
count = 0
for i, r in trainset.ur[trainset.to_inner_uid(672)]:

    # compute current error
    dot = 0  # <q_i, p_u>
    #for f in range(100):
    dot += items[i].dot(puser)
    err = r - (trainset.global_mean + bias + item_bias[i] + dot)
    s += err*err
    count += 1
rmse = (s/count)**0.5
rmse

0.7589668401136558

In [17]:
#RMSE value for the first user predicted by the Surprise package
s = 0
count = 0
for i, r in trainset.ur[trainset.to_inner_uid(1)]:

    # compute current error
    dot = 0  # <q_i, p_u>
    #for f in range(100):
    dot += items[i].dot(users[0])
    err = r - (trainset.global_mean + user_bias[0] + item_bias[i] + dot)
    s += err*err
    count += 1
rmse = (s/count)**0.5
rmse

0.7065272690145884

In [18]:
#Comparing the 2 users 
# 1. predicted by us
# 2. predicted by Surprise package
sample = users[0]
s = 0 
for i in items :
    diff = (i.dot(sample) + user_bias[0]) - (i.dot(puser) + bias)
    s += diff*diff

rmse = (s/len(items))**0.5
rmse

0.13321195933660362

In [19]:
def estimate(users, items, user_bias, item_bias, u, i):
    '''
    gives the estimated ratings for user u 
    for the i movie
    '''
    u = trainset.to_inner_uid(u)
    i = trainset.to_inner_iid(i)
    est = trainset.global_mean
    est += user_bias[u]
    est += item_bias[i]
    est += np.dot(items[i], users[u])
    return est

In [20]:
for i in range(10) :
    dot = 0
    dot += items[trainset.to_inner_iid(i+1)].dot(puser)
    pred = trainset.global_mean + bias + item_bias[trainset.to_inner_iid(i+1)] + dot
    print(estimate(users, items, user_bias, item_bias, 1, i+1), pred)

2.735256393429975 2.760435458531395
2.560045883050796 2.40226111177123
2.3874639970039953 2.2954626184043856
1.999607929478677 1.8552588247205049
2.255968983170658 2.2224468352609543
3.0468318220329844 2.779659128988736
2.6213238809044133 2.415642871828312
2.575173928532757 2.427609594745811
2.15766890710044 2.219268849799967
2.668779217266296 2.555329122323936


In [34]:
predictions = algo.test(testset)


NameError: name 'predicitons' is not defined

In [36]:
predictions[:5]

[Prediction(uid=1, iid=10, r_ui=3.543608255669773, est=2.583825142683057, details={'was_impossible': False}),
 Prediction(uid=1, iid=17, r_ui=3.543608255669773, est=3.3001508729281652, details={'was_impossible': False}),
 Prediction(uid=1, iid=39, r_ui=3.543608255669773, est=2.6397472673265545, details={'was_impossible': False}),
 Prediction(uid=1, iid=47, r_ui=3.543608255669773, est=3.124481506629175, details={'was_impossible': False}),
 Prediction(uid=1, iid=50, r_ui=3.543608255669773, est=3.5812670200280556, details={'was_impossible': False})]

In [173]:
#TODO :
#add new ratings to rating_df
#implement the new item version
#create python scripts