In [32]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import string

import nltk 
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews, stopwords,wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from rake_nltk import Rake

import numpy as np
from numpy import arange,atleast_2d,argsort

from collections import defaultdict

from surprise import Reader, Dataset, SVD, accuracy,KNNBaseline
from surprise.model_selection import train_test_split

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from ast import literal_eval

In [2]:
# Minimum number of helpfulness votes before getHelpfulScore uses a review's vote ratio
helpfulCutOffRatings = 20
# Tab-separated review file loaded into `rdata` further down
reviewDataPath = 'amazon/vsmall.csv'
# Line-delimited JSON product metadata loaded into `moviedata` further down
metadataPath = 'amazon/vsmall_metadata.json'
# Number of recommendations kept per user (passed to getPredictions)
topN = 5

In [3]:
def removeUnusedReviewInfo(rdata):
    """Return the review frame without its timestamp and reviewer-name columns.

    The input frame is not modified; a new frame is returned.
    """
    unusedColumns = ['unixReviewTime', 'reviewTime', 'reviewerName']
    return rdata.drop(columns=unusedColumns)
    
def createReviewerInfo(data):
    """Return a two-column frame holding each review's reviewerID and reviewerName."""
    return data.loc[:, ['reviewerID', 'reviewerName']]

In [114]:
def createUserMovieMatrix(udata,mdata):
    """Build the user x movie rating matrix and the matching rated/unrated table.

    Reviews are outer-merged with the movie metadata on ``asin`` so that movies
    without any review still get a column.

    Returns:
        (ratings, rated): ``ratings`` is indexed by reviewerID with one column
        per asin holding the ``overall`` rating (0 where there is none);
        ``rated`` is the ``pd.crosstab`` count of reviews per (reviewerID, asin).
    """
    merged = udata.merge(mdata, on='asin', how='outer')

    ratings = (
        merged.pivot_table(index='reviewerID', columns='asin', values='overall', dropna=False)
        .reset_index()
        .rename_axis(None, axis=1)
        .set_index('reviewerID')
        .fillna(0)
    )
    rated = pd.crosstab(index=merged['reviewerID'], columns=merged['asin'], dropna=False)
    return ratings, rated


In [5]:
# Classify a star rating as positive, neutral or negative
def classifyBaseReviewRating(x):
    """Label a rating relative to the 2.5 midpoint.

    Returns "negative" below 2.5, "positive" above it and "neutral" at exactly
    2.5. A value that compares false to all three (e.g. NaN) yields None.
    """
    midpoint = 2.5
    if x < midpoint:
        return "negative"
    if x > midpoint:
        return "positive"
    if x == midpoint:
        return "neutral"
    return None

In [6]:
def cleanData(r):
    """Normalise one raw review field to a string.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty text cell) become ``''``; every other value is passed through ``str``.
    """
    # The missing-value test has to run before str(): once converted, NaN is
    # the literal text 'nan' and can no longer be told apart from real content.
    if r is None or (isinstance(r, float) and r != r):
        return ''
    return str(r)

In [7]:
def getWordnetType(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with J, V, N or R map to ADJ, VERB, NOUN and ADV
    respectively; any other tag falls back to NOUN.
    """
    prefixToKind = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, kind in prefixToKind:
        if tag.startswith(prefix):
            return kind
    return wordnet.NOUN

def cleanReview(r):
    """Normalise one review text for sentiment scoring.

    Steps: lower-case, split on whitespace, strip surrounding punctuation,
    drop tokens containing digits, drop English stop words and empty tokens,
    lemmatise each token using its POS tag, drop one-character tokens and
    re-join with single spaces.

    Missing values (``None`` or float NaN) and empty strings return ``''``.
    """
    # Test for missing values before str(): str(nan) is the text 'nan'.
    if r is None or (isinstance(r, float) and r != r):
        return ''
    r = str(r).lower()
    if len(r) == 0:
        return ''

    # Split on any whitespace so words separated by newlines or tabs are
    # tokenised too, then remove punctuation around each token.
    words = [eachWord.strip(string.punctuation) for eachWord in r.split()]

    # Remove tokens that contain a digit.
    words = [w for w in words if not any(ch.isdigit() for ch in w)]

    # Remove stop words and tokens left empty by the stripping above; a set
    # makes each membership test constant time.
    stop = set(stopwords.words('english'))
    words = [w for w in words if w and w not in stop]

    # Lemmatise with the POS tag of each token; one lemmatizer serves all tokens.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, getWordnetType(tag)) for word, tag in pos_tag(words)]

    # Single characters carry no signal.
    words = [w for w in words if len(w) > 1]

    return " ".join(words)

# get compound score from VADER and append it as review rating
def getSentimentScore(rdata):
    """Add a VADER-based ``reviewRating`` column and drop the text columns.

    The VADER compound score of ``reviewText`` lies in [-1, 1]; it is rescaled
    with ``1 + (compound + 1) * 2`` to the range [1, 5].

    The helpfulness weighting is currently disabled (the score was computed
    and discarded before), so the ``helpful`` column is simply dropped.

    Args:
        rdata: review frame with ``reviewText`` and ``helpful`` columns.

    Returns:
        A new frame with ``reviewRating`` appended and ``reviewText`` and
        ``helpful`` removed. The input frame is left unchanged.
    """
    sa = SentimentIntensityAnalyzer()
    compound = rdata["reviewText"].apply(lambda text: sa.polarity_scores(text)['compound'])
    # assign() builds a new frame, so the caller's frame gains no helper columns.
    scored = rdata.assign(reviewRating=1 + (compound + 1) * 2)
    return scored.drop(['reviewText', 'helpful'], axis=1)
    

In [8]:
def getHelpfulScore(x):
    """Turn a review's helpfulness votes into a weight in [0, 1].

    Args:
        x: the two vote counts, either as the string form of a pair
           (e.g. ``'[3, 5]'``) or as an already-parsed sequence. ``None`` and
           float NaN mean "no votes recorded".

    Returns:
        ``a / (a + b)`` when ``b`` is non-zero and ``a + b`` reaches
        ``helpfulCutOffRatings``; otherwise the neutral weight ``1.0``.
    """
    # NOTE(review): if the pair is [helpful_votes, total_votes] the ratio
    # should presumably be a / b rather than a / (a + b) — confirm against the
    # data format before enabling the helpfulness weighting.
    if x is None or (isinstance(x, float) and x != x):
        return 1.0
    # literal_eval only accepts strings; an already-parsed pair is used as is.
    votes = literal_eval(x) if isinstance(x, str) else x
    a = float(votes[0])
    b = float(votes[1])

    if b != 0 and a + b > helpfulCutOffRatings - 1:
        return a / (a + b)
    return 1.0

In [9]:
def generateSentimentScoreOnReviews(rdata):
    """Prepare the review frame and score each review's sentiment.

    Drops the unused columns, labels each ``overall`` rating via
    ``classifyBaseReviewRating``, merges ``summary`` into ``reviewText``,
    cleans the text with ``cleanReview`` and finally calls
    ``getSentimentScore``.

    Returns:
        The frame returned by ``getSentimentScore``.
    """
    rdata = removeUnusedReviewInfo(rdata)
    rdata["reviewType"] = rdata["overall"].apply(classifyBaseReviewRating)

    # Fill missing cells first so a NaN in one column cannot wipe out the
    # other, and join with a space so the last word of the review is not fused
    # with the first word of the summary.
    reviewText = rdata['reviewText'].fillna('').astype(str)
    summary = rdata['summary'].fillna('').astype(str)
    rdata['reviewText'] = (reviewText + ' ' + summary).str.strip()
    rdata = rdata.drop(['summary'], axis=1)

    rdata['reviewText'] = rdata['reviewText'].apply(cleanReview)
    return getSentimentScore(rdata)


In [10]:
def getMovieData(path):
    """Read line-delimited JSON metadata from ``path`` into a DataFrame."""
    frame = pd.read_json(path, typ='frame', lines=True)
    return frame

def preProcessMovieData(moviedata):
    """Return a sorted, de-duplicated, trimmed copy of the movie metadata.

    Rows are sorted by ``asin``; every row whose ``asin`` occurs more than once
    is removed (``keep=False`` keeps none of the duplicates); the index is
    reset; and the metadata columns this notebook does not use are dropped
    when present.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw metadata.
    """
    # NOTE(review): keep=False discards *all* copies of a duplicated asin,
    # not just the extras — confirm that this is intended.
    unusedColumns = ['rank', 'main_cat', 'image', 'also_buy', 'also_view',
                     'price', 'details', 'feature', 'date', 'tech1']
    return (
        moviedata.sort_values("asin")
        .drop_duplicates(subset='asin', keep=False)
        .reset_index(drop=True)
        .drop(unusedColumns, axis=1, errors='ignore')
    )



In [11]:
def getCosineSimilarity(moviedata):
    """Return the pairwise cosine similarity of the rows' ``cleanText``.

    ``cleanText`` is vectorised into word counts with ``CountVectorizer`` and
    the resulting matrix is compared against itself.
    """
    vectorizer = CountVectorizer()
    wordCounts = vectorizer.fit_transform(moviedata['cleanText'])
    return cosine_similarity(wordCounts, wordCounts)

def _movieTextParts(value, ignore=()):
    """Return the text fragments of one metadata cell as a list of strings."""
    # Missing cells arrive as None or as a float NaN.
    if value is None or isinstance(value, float):
        return []
    parts = [value] if isinstance(value, str) else list(value)
    return [str(part) for part in parts if part not in ignore]


def getMovieSimilarityFromPlot(moviedata):
    """Compute movie-to-movie similarity from description, category and brand.

    For every movie the description fragments, the category labels (minus a
    list of generic labels) and the brand are joined into one text, RAKE
    extracts its keywords, and the keyword strings of all movies are compared
    with ``getCosineSimilarity``.

    Args:
        moviedata: metadata frame with ``description``, ``category`` and
            ``brand`` columns.

    Returns:
        (cosine_sim, moviedata): the similarity matrix in row order of the
        input, and the input without the three text columns. The input frame
        itself is not modified.
    """
    # Category labels that apply to (nearly) every title and carry no signal.
    ignoreWords = ["Movies & TV", "Genre for Featured Categories", "Movies",
                   "Independently Distributed", "All Titles", "All", "4-for-3 DVD"]

    cleanText = []
    for _, movie in moviedata.iterrows():
        parts = (_movieTextParts(movie['description'])
                 + _movieTextParts(movie['category'], ignoreWords)
                 + _movieTextParts(movie['brand']))
        r = Rake()
        # Join with spaces so neighbouring fragments are not fused into one word.
        r.extract_keywords_from_text(" ".join(parts))
        # The keys of the word-degree dictionary are the extracted key words.
        cleanText.append(" ".join(map(str, r.get_word_degrees().keys())))

    # iterrows() yields copies, so the keyword text has to be attached as a
    # whole column rather than written row by row.
    cosine_sim = getCosineSimilarity(moviedata.assign(cleanText=cleanText))
    moviedata = moviedata.drop(['category', 'brand', 'description'], axis=1)
    return cosine_sim, moviedata


In [12]:
def getMovieAvgRating(moviedata,rdataf):
    """Summarise the ``overall`` rating per ``asin``.

    ``moviedata`` is accepted but not used. The result is indexed by asin with
    columns ``rating_mean``, ``rating_median`` and ``num_ratings``.
    """
    perMovie = rdataf.groupby('asin')['overall']
    stats = perMovie.agg(['mean', 'median', 'size'])
    return stats.rename(columns={'mean': 'rating_mean',
                                 'median': 'rating_median',
                                 'size': 'num_ratings'})


In [13]:
def get_top3_recommendations(predictions, topN = 3):
    """Return each user's ``topN`` items with the highest estimated rating.

    Args:
        predictions: iterable of 5-field records
            ``(uid, iid, true_r, est, details)``.
        topN: how many items to keep per user.

    Returns:
        A ``defaultdict(list)`` mapping uid to a list of ``(iid, est)`` pairs,
        best estimate first.
    """
    perUser = defaultdict(list)
    for prediction in predictions:
        uid, iid, _, est, _ = prediction
        perUser[uid].append((iid, est))

    for uid in perUser:
        ranked = sorted(perUser[uid], key=lambda pair: pair[1], reverse=True)
        perUser[uid] = ranked[:topN]

    return perUser

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k for every user.

    An item is "relevant" when its true rating is at least ``threshold`` and
    "recommended" when its estimate is at least ``threshold`` and it is among
    the user's ``k`` highest estimates.

    Returns:
        (precisions, recalls): two dicts keyed by uid. A ratio whose
        denominator is zero is reported as 1.
    """
    # Group (estimate, truth) pairs by user.
    perUser = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        perUser[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in perUser.items():
        # Highest estimate first.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        topK = pairs[:k]

        relevantTotal = sum(1 for _, actual in pairs if actual >= threshold)
        recommended = sum(1 for guess, _ in topK if guess >= threshold)
        hits = sum(1 for guess, actual in topK
                   if actual >= threshold and guess >= threshold)

        # Share of recommended items that are relevant.
        precisions[uid] = hits / recommended if recommended != 0 else 1
        # Share of relevant items that were recommended.
        recalls[uid] = hits / relevantTotal if relevantTotal != 0 else 1

    return precisions, recalls


In [40]:
def getPredictionFromMovieContent(moviedata, rdata, testSplit):
    """Produce content-based rating predictions from the movie metadata.

    The metadata is pre-processed, a movie-to-movie similarity matrix is built
    from it, the reviews are split into a train and a test part, and
    ``getUserRatingPredictionFromMovieContent`` predicts ratings from the
    train part.

    Args:
        moviedata: raw movie metadata frame.
        rdata: review frame (pandas DataFrame) with ``reviewerID``, ``asin``
            and ``overall`` columns.
        testSplit: float fraction of reviews to hold out, or an integer
            number of reviews.

    Returns:
        The prediction list of ``getUserRatingPredictionFromMovieContent``.
    """
    moviedata = preProcessMovieData(moviedata)
    # Create the cosine similarity matrix.
    cosine_sim, moviedata = getMovieSimilarityFromPlot(moviedata)

    # The train_test_split imported from surprise only works on a surprise
    # Dataset; rdata is a DataFrame, so hold out a random sample of rows.
    rdata = rdata.reset_index(drop=True)
    sampleArgs = {'frac': testSplit} if isinstance(testSplit, float) else {'n': int(testSplit)}
    testdata = rdata.sample(**sampleArgs)
    traindata = rdata.drop(testdata.index)

    traindata = traindata.merge(moviedata, on='asin', how='left')
    # getMovieAvgRating already names its columns rating_mean/median/num_ratings.
    movieAvgRating = getMovieAvgRating(moviedata, traindata)
    moviedata = moviedata.merge(movieAvgRating, on='asin', how='left')

    return getUserRatingPredictionFromMovieContent(cosine_sim, traindata, testdata, moviedata)

In [15]:
def _meltPredictedRatings(matrix):
    """Convert a user x movie matrix to long (reviewerID, asin, rating) rows, skipping 0 cells."""
    wide = matrix.reset_index().replace(0, np.nan)
    longForm = pd.melt(wide, id_vars='reviewerID',
                       value_vars=list(wide.columns[1:]),
                       var_name='asin',
                       value_name='rating')
    return longForm[longForm['rating'].notna()]


def getUserRatingPredictionFromMovieContent(cosine_sim, rdata, testdata, moviedata):
    """Predict ratings as a similarity-weighted average of a user's train ratings.

    For each (user, movie) cell the prediction is
    ``sum(rating * similarity) / sum(similarity)`` over the movies the user
    rated in ``rdata``. Predictions are kept only for movies the user did not
    rate in ``rdata`` and are reported in two groups: pairs that have a
    held-out rating in ``testdata`` and pairs with no known rating at all.

    Args:
        cosine_sim: movie-to-movie similarity matrix; its order has to match
            the movie columns produced by ``createUserMovieMatrix``.
        rdata: train reviews.
        testdata: held-out reviews.
        moviedata: movie metadata with an ``asin`` column.

    Returns:
        A list of ``(reviewerID, asin, predicted rating, actual rating)``
        tuples; the actual rating is 0 when the pair has no held-out review.
        Note this is a 4-field layout, not the 5-field
        ``(uid, iid, true_r, est, details)`` records unpacked elsewhere in
        this notebook.
    """
    # Weighted-average prediction for every cell, from the train data.
    userMovieMatrix, userMovieMatching = createUserMovieMatrix(rdata, moviedata)
    sumUserMovieWeight = np.dot(userMovieMatrix, cosine_sim)
    sumSimFoRM2M = np.dot(userMovieMatching, cosine_sim)
    userMoviePredict = sumUserMovieWeight / sumSimFoRM2M

    # Which (user, movie) pairs have a held-out rating.
    _, userMovieMatchingTest = createUserMovieMatrix(testdata, moviedata)

    # 1 where the user has NOT rated the movie in the train data.
    notRatedInTrain = 1 - userMovieMatching
    # 1 where the pair has no rating in either split.
    notRatedAnywhere = notRatedInTrain - userMovieMatchingTest

    # Blank out predictions for movies already rated in the train data.
    predictedMissing = userMoviePredict * notRatedInTrain
    # Predictions that can be checked against the test data ...
    heldOutPredicted = userMovieMatchingTest * predictedMissing
    # ... and predictions for genuinely new movies.
    newPredicted = predictedMissing * notRatedAnywhere

    heldOutLong = _meltPredictedRatings(heldOutPredicted)
    newLong = _meltPredictedRatings(newPredicted)

    # Attach the actual held-out rating to the checkable predictions.
    testdataF = testdata.merge(moviedata, on='asin', how='outer')
    testdataF = testdataF[testdataF['overall'].notna()]
    heldOutFinal = pd.merge(heldOutLong, testdataF, how='left', on=['reviewerID', 'asin'])
    heldOutFinal = heldOutFinal.drop(['title'], axis=1, errors='ignore')

    combinedPred = pd.concat([heldOutFinal, newLong], ignore_index=True)
    # New predictions have no actual rating; report it as 0.
    combinedPred = combinedPred.replace(np.nan, 0)

    return list(zip(combinedPred.reviewerID, combinedPred.asin,
                    combinedPred.rating, combinedPred.overall))

In [435]:
# Toy ratings file (space-separated) used to walk through the content-based prediction by hand
rsmall = pd.read_csv('vvsmall.csv', names=['reviewerID','asin','overall'], skiprows=1, sep=' ')
rsmall

Unnamed: 0,reviewerID,asin,overall
0,u1,m2,4
1,u2,m1,5
2,u2,m4,4


In [436]:
# Second toy ratings file, used below as the held-out (test) ratings
rsmallt = pd.read_csv('vvsmall_test.csv', names=['reviewerID','asin','overall'], skiprows=1, sep=' ')
rsmallt

Unnamed: 0,reviewerID,asin,overall
0,u1,m3,3
1,u2,m2,3


In [437]:
# Toy movie metadata (line-delimited JSON) that is merged with both rating files below
meta = pd.read_json('vvsmall_medata.json',lines=True,typ='frame')


In [438]:
# Outer merge keeps movies that have no rating in rsmall (reviewerID/overall become NaN)
df = rsmall.merge(meta, on='asin',how='outer')
df

Unnamed: 0,reviewerID,asin,overall,title
0,u1,m2,4.0,m2
1,u2,m1,5.0,m1
2,u2,m4,4.0,m4
3,,m3,,m3


In [439]:
# Same outer merge for the held-out ratings
dft = rsmallt.merge(meta, on='asin',how='outer')
dft

Unnamed: 0,reviewerID,asin,overall,title
0,u1,m3,3.0,m3
1,u2,m2,3.0,m2
2,,m4,,m4
3,,m1,,m1


In [440]:
# User x movie rating matrix from df: one row per reviewerID, one column per asin, 0 where unrated
# df.pivot_table(index='reviewerID', columns = 'asin',values = 'overall',dropna = False).reset_index()
tt = df.pivot_table( index='reviewerID',columns = 'asin',values = 'overall',dropna=False).reset_index().rename_axis(None, axis=1)
tt.set_index('reviewerID',inplace=True)

tt = tt.replace(np.nan,0)
# tt = tt.drop(['reviewerID'],axis = 1)

tt

Unnamed: 0_level_0,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,0.0,4.0,0.0,0.0
u2,5.0,0.0,0.0,4.0


In [441]:
# Same user x movie rating matrix, built from the held-out frame dft
# df.pivot_table(index='reviewerID', columns = 'asin',values = 'overall',dropna = False).reset_index()
ttt = dft.pivot_table( index='reviewerID',columns = 'asin',values = 'overall',dropna=False).reset_index().rename_axis(None, axis=1)
ttt.set_index('reviewerID',inplace=True)

ttt = ttt.replace(np.nan,0)
# tt = tt.drop(['reviewerID'],axis = 1)

ttt


Unnamed: 0_level_0,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,0.0,0.0,3.0,0.0
u2,0.0,3.0,0.0,0.0


In [442]:

# Count of ratings per (reviewerID, asin) in df; with one rating per pair this is a 0/1 "has rated" table
dfBin = pd.crosstab(index=df['reviewerID'], columns=df['asin'],dropna = False)
dfBin

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,0,1,0,0
u2,1,0,0,1


In [443]:
# Same "has rated" table for the held-out frame dft
dfBint = pd.crosstab(index=dft['reviewerID'], columns=dft['asin'],dropna = False)
dfBint

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,0,0,1,0
u2,0,1,0,0


In [444]:
# Hand-written symmetric 4x4 similarity matrix with 1 on the diagonal;
# presumably rows/columns follow the m1..m4 column order of tt — verify
cosTest = np.array([[1, 0.5, 0.25,0.25], [0.5, 1, 0.4,0.3],[0.25,0.4,1,0.6],[0.25,0.3,0.6,1]])

In [445]:
# Wrap the similarity array in a DataFrame for the matrix products below
m2mCosim = pd.DataFrame(cosTest)

In [446]:
# Per user and candidate movie: sum of (known rating * similarity to the candidate)
mul = np.dot(tt,m2mCosim)

In [447]:
# Per user and candidate movie: sum of similarities to the movies the user rated (the normaliser)
binMul = np.dot(dfBin,m2mCosim)

In [448]:
# Similarity-weighted average rating for every (user, movie) cell
finR = mul/binMul

In [449]:
# Show the predicted rating matrix (plain ndarray, rows = users, columns = movies)
finR

array([[4.        , 4.        , 4.        , 4.        ],
       [4.8       , 4.625     , 4.29411765, 4.2       ]])

In [450]:
# Invert the table: 1 now marks movies the user has NOT rated in df.
# Not idempotent — re-running this cell flips the table back.
dfBin = 1- dfBin
dfBin

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,1,0,1,1
u2,0,1,1,0


In [451]:
# Unrated-in-df minus rated-in-dft: 1 marks pairs with no rating in either frame
newPred = dfBin - dfBint
newPred

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,1,0,0,1
u2,0,0,1,0


In [452]:
# Keep predictions only where the user has no rating in df; rated cells become 0
predictedRating = dfBin*finR
predictedRating


asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,4.0,0.0,4.0,4.0
u2,0.0,4.625,4.294118,0.0


In [453]:
# Predictions restricted to pairs with no rating in either frame
newPred = newPred*predictedRating
newPred

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,4.0,0.0,0.0,4.0
u2,0.0,0.0,4.294118,0.0


In [454]:
# Predictions restricted to pairs that do have a held-out rating in dft
newPredAlready = dfBint*predictedRating
newPredAlready

asin,m1,m2,m3,m4
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u1,0.0,0.0,4.0,0.0
u2,0.0,4.625,0.0,0.0


In [455]:
# Move reviewerID from the index into a regular column (in place), ready for melting
predictedRating.reset_index(inplace=True)
predictedRating


asin,reviewerID,m1,m2,m3,m4
0,u1,4.0,0.0,4.0,4.0
1,u2,0.0,4.625,4.294118,0.0


In [456]:

# reviewerID becomes a column; 0 cells become NaN so they can be filtered out after melting
newPred.reset_index(inplace=True)
newPred = newPred.replace(0,np.nan)
newPred

asin,reviewerID,m1,m2,m3,m4
0,u1,4.0,,,4.0
1,u2,,,4.294118,


In [457]:

# Same preparation for the predictions that have a held-out rating
newPredAlready.reset_index(inplace=True)
newPredAlready = newPredAlready.replace(0,np.nan)
newPredAlready

asin,reviewerID,m1,m2,m3,m4
0,u1,,,4.0,
1,u2,,4.625,,


In [458]:
# Wide -> long: one (reviewerID, asin, rating) row per non-empty prediction
newPredAlreadyF = pd.melt(newPredAlready, id_vars='reviewerID', 
                                        value_vars=list(newPredAlready.columns[1:]),
                                        var_name='asin', 
                                        value_name='rating')
newPredAlreadyF = newPredAlreadyF[newPredAlreadyF['rating'].notna()]
# Keep only real held-out ratings, then attach them to the predictions as `overall`
dftf = dft[dft['overall'].notna()]
newPredAlreadyF = pd.merge(newPredAlreadyF, dftf, how='left',on=['reviewerID','asin'])
# title came along from the metadata merge and is not needed here
newPredAlreadyF.drop(['title'],axis =1,inplace=True)

In [459]:
# Predicted (`rating`) next to actual held-out (`overall`) ratings
newPredAlreadyF

Unnamed: 0,reviewerID,asin,rating,overall
0,u2,m2,4.625,3.0
1,u1,m3,4.0,3.0


In [460]:
# Wide -> long for the predictions without any known rating; empty cells are dropped
newPredF = pd.melt(newPred, id_vars='reviewerID', 
                                        value_vars=list(newPred.columns[1:]),
                                        var_name='asin', 
                                        value_name='rating')
newPredF = newPredF[newPredF['rating'].notna()]


In [461]:
# Long-format predictions for pairs with no known rating
newPredF

Unnamed: 0,reviewerID,asin,rating
0,u1,m1,4.0
5,u2,m3,4.294118
6,u1,m4,4.0


In [462]:

# Stack both prediction groups; rows from newPredF have no `overall` value (NaN)
comb = pd.concat([newPredAlreadyF, newPredF],ignore_index=True)
comb

Unnamed: 0,reviewerID,asin,rating,overall
0,u2,m2,4.625,3.0
1,u1,m3,4.0,3.0
2,u1,m1,4.0,
3,u2,m3,4.294118,
4,u1,m4,4.0,


In [463]:
# Missing actual ratings are represented as 0 from here on
comb = comb.replace(np.nan,0)

In [464]:

# Pack each row into a (reviewerID, asin, predicted rating, actual rating) tuple
comb['tup'] = tuple(zip(comb.reviewerID, comb.asin,comb.rating,comb.overall))

comb



Unnamed: 0,reviewerID,asin,rating,overall,tup
0,u2,m2,4.625,3.0,"(u2, m2, 4.625, 3.0)"
1,u1,m3,4.0,3.0,"(u1, m3, 4.0, 3.0)"
2,u1,m1,4.0,0.0,"(u1, m1, 4.0, 0.0)"
3,u2,m3,4.294118,0.0,"(u2, m3, 4.294117647058823, 0.0)"
4,u1,m4,4.0,0.0,"(u1, m4, 4.0, 0.0)"


In [465]:
# Final result of the walkthrough: the list of prediction tuples
predictions = list(comb.tup)

In [319]:
# Long format of the full predicted-rating table (0 cells included).
# NOTE(review): this cell carries an older execution count than its neighbours and
# rebinds predictedRating, so re-running it after the cells above changes their input.
predictedRating = pd.melt(predictedRating, id_vars='reviewerID', 
                                        value_vars=list(predictedRating.columns[1:]),
                                        var_name='movie', 
                                        value_name='rating')
predictedRating

Unnamed: 0,reviewerID,movie,rating
0,u1,m1,4.0
1,u2,m1,0.0
2,u1,m2,0.0
3,u2,m2,4.625
4,u1,m3,4.0
5,u2,m3,4.294118
6,u1,m4,4.0
7,u2,m4,0.0


In [None]:
#     userMovieMatching = 1 - userMovieMatching
#     userMoviePredictMissingMovies = userMoviePredict * userMovieMatching
#     userMoviePredictMissingMovies = userMoviePredictMissingMovies.reset_index()
#     userMoviePredictMissingMovies = userMoviePredictMissingMovies.replace(0,np.nan)
#     userMoviePredictMissingMovies = pd.melt(userMoviePredictMissingMovies, id_vars='reviewerID', 
#                                         value_vars=list(finRM.columns[1:]),
#                                         var_name='movie', 
#                                         value_name='rating')
#     userMoviePredictMissingMovies = userMoviePredictMissingMovies[userMoviePredictMissingMovies['rating'].notna()]
#     userMoviePredictMissingMovies = userMoviePredictMissingMovies.reset_index(drop=True)
#     userMoviePredictMissingMovies['value'] = tuple(zip(userMoviePredictMissingMovies.movie, 
#                                                   userMoviePredictMissingMovies.ratings))
#     predictions = pd.Series(userMoviePredictMissingMovies['value'].values,
#                             index=userMoviePredictMissingMovies.reviewerID).to_dict()

In [16]:
# Train a Surprise model on one rating column of the review data and predict its held-out split
def getSurpriseRatingPrediction(data, ratingCol, testSplit, algo, userBased):
    """Fit a Surprise algorithm on part of the ratings and predict the rest.

    Args:
        data: review frame with ``asin`` and ``reviewerID`` columns.
        ratingCol: name of the column in ``data`` to use as the rating.
        testSplit: ``test_size`` passed to Surprise's ``train_test_split``.
        algo: ``'svd'`` for ``SVD``, ``'knn'`` for a cosine ``KNNBaseline``,
            or an already constructed algorithm object.
        userBased: for ``'knn'`` only — user-user similarity when True,
            item-item when False.

    Returns:
        The predictions returned by the algorithm's ``test`` on the held-out
        ratings.

    Raises:
        ValueError: if ``algo`` is a string other than ``'svd'`` or ``'knn'``.
    """
    # Reject unknown names up front instead of failing later on str.fit().
    if isinstance(algo, str) and algo not in ('svd', 'knn'):
        raise ValueError("Unknown algo %r; expected 'svd' or 'knn'" % (algo,))

    df = pd.DataFrame({'itemID': list(data.asin),
                       'userID': list(data.reviewerID),
                       'rating': list(data[ratingCol])})

    # Only the rating scale is needed from the Reader; the columns must be
    # given in user id, item id, rating order.
    reader = Reader(rating_scale=(0.5, 5.0))
    dataset = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset, testset = train_test_split(dataset, test_size=testSplit)

    if algo == 'svd':
        algo = SVD()
    elif algo == 'knn':
        algo = KNNBaseline(sim_options={'name': 'cosine', 'user_based': userBased})

    algo.fit(trainset)
    print('mean',trainset.global_mean)
    return algo.test(testset)
    

In [17]:
def getPredictions(rdata, moviedata, testSplit, topN):
    """Run every recommender and merge their output into top-N lists.

    Runs, in this order: SVD, user-based KNN and item-based KNN on the
    ``overall`` ratings, item-based KNN on the sentiment-derived
    ``reviewRating``, and the content-based predictor. Each
    ``getSurpriseRatingPrediction`` call makes its own train/test split.

    Returns:
        Whatever ``getFinalRec`` returns for the five prediction sets.
    """
    def ratingModel(frame, column, algoName, userBased):
        # All rating-based models share the same split fraction.
        return getSurpriseRatingPrediction(frame, column, testSplit, algoName, userBased)

    svdPredictions = ratingModel(rdata, 'overall', 'svd', False)
    userBasedKNNPredictions = ratingModel(rdata, 'overall', 'knn', True)
    ratingBasedKNNPredictions = ratingModel(rdata, 'overall', 'knn', False)

    # From here on rdata carries the sentiment-based reviewRating column.
    rdata = generateSentimentScoreOnReviews(rdata)
    reviewBasedKNNPredictions = ratingModel(rdata, 'reviewRating', 'knn', False)

    contentBasedPredictions = getPredictionFromMovieContent(moviedata, rdata, testSplit)

    return getFinalRec(svdPredictions, userBasedKNNPredictions,
                       ratingBasedKNNPredictions, reviewBasedKNNPredictions,
                       contentBasedPredictions, topN)
    
    

In [156]:
def mergeDict(dictList):
    """Merge per-user movie ratings from several dicts and average them.

    Args:
        dictList: iterable of dicts mapping a user to either one
            ``(movie, rating)`` pair or a list of such pairs.

    Returns:
        A ``defaultdict(list)`` mapping each user to a dict
        ``{movie: mean rating}`` covering every movie seen for that user.
    """
    ratingsByUser = defaultdict(lambda: defaultdict(list))
    for dictEach in dictList:
        for user, val in dictEach.items():
            # A value is a single (movie, rating) pair unless its first
            # element is itself a pair, in which case it is a collection.
            isSinglePair = (isinstance(val, tuple) and len(val) == 2
                            and not isinstance(val[0], (tuple, list)))
            pairs = [val] if isSinglePair else val
            for movie, rating in pairs:
                ratingsByUser[user][movie].append(rating)

    newMergeDic = defaultdict(list)
    for user, movies in ratingsByUser.items():
        # Average every movie, not only the last one encountered.
        newMergeDic[user] = {movie: sum(values) / len(values)
                             for movie, values in movies.items()}
    return newMergeDic

In [18]:
# Combine the prediction scores of all methods by an equally weighted average
def combinePredictions(svdPredictions, userBasedKNNPredictions,
                                           ratingBasedKNNPredictions, reviewBasedKNNPredictions,
                                          contentBasedPredictions):
    """Average the estimated rating of every (user, item) pair across methods.

    Args:
        svdPredictions, userBasedKNNPredictions, ratingBasedKNNPredictions,
        reviewBasedKNNPredictions: iterables of 5-field records
            ``(uid, iid, true_r, est, details)``.
        contentBasedPredictions: either a mapping ``uid -> [(iid, est), ...]``
            or an iterable of ``(uid, iid, est, actual)`` tuples; ``None`` is
            treated as empty.

    Returns:
        A ``defaultdict(list)`` mapping uid to a list of
        ``(iid, mean estimate)`` pairs. Every method that scored a pair
        contributes with equal weight.
    """
    estimates = defaultdict(lambda: defaultdict(list))

    ratingBased = (svdPredictions, userBasedKNNPredictions,
                   ratingBasedKNNPredictions, reviewBasedKNNPredictions)
    for predictions in ratingBased:
        for uid, iid, true_r, est, _ in predictions:
            estimates[uid][iid].append(est)

    contentBasedPredictions = contentBasedPredictions or ()
    if hasattr(contentBasedPredictions, 'items'):
        for uid, pairs in contentBasedPredictions.items():
            for iid, est in pairs:
                estimates[uid][iid].append(est)
    else:
        for row in contentBasedPredictions:
            if len(row) == 4:
                # Content-based layout: (uid, iid, estimate, actual).
                uid, iid, est = row[0], row[1], row[2]
            else:
                # 5-field layout: (uid, iid, actual, estimate, details).
                uid, iid, est = row[0], row[1], row[3]
            estimates[uid][iid].append(est)

    top_recs = defaultdict(list)
    for uid, items in estimates.items():
        top_recs[uid] = [(iid, sum(values) / len(values)) for iid, values in items.items()]
    return top_recs
    

In [207]:
# Sample inputs for the merge experiment: each dict maps a user to one (movie, rating) pair
a = {'a':('m1',3), 'b':('m2',4),'d':('m2',4)}
b = {'c':('m1',3), 'b':('m2',3)}
c = {'c':('m1',3), 'b':('m1',5)}
d = {'a':('m1',3), 'c':('m2',2), 'b':('m2',4)}


In [208]:
# The list of per-method dicts that gets merged below
dicts = [a ,b ,c, d]
dicts

[{'a': ('m1', 3), 'b': ('m2', 4), 'd': ('m2', 4)},
 {'c': ('m1', 3), 'b': ('m2', 3)},
 {'c': ('m1', 3), 'b': ('m1', 5)},
 {'a': ('m1', 3), 'c': ('m2', 2), 'b': ('m2', 4)}]

In [209]:
# Collect every (movie, rating) pair per user across all dicts.
# NOTE(review): this rebinds the name `mergeDict`, which an earlier cell defines as a
# function; after running this cell the function is no longer reachable under that name.
mergeDict = defaultdict(list)
for dictEach in dicts:
    for key, val in dictEach.items(): 
        mergeDict[key].append(val)
mergeDict

defaultdict(list,
            {'a': [('m1', 3), ('m1', 3)],
             'b': [('m2', 4), ('m2', 3), ('m1', 5), ('m2', 4)],
             'd': [('m2', 4)],
             'c': [('m1', 3), ('m1', 3), ('m2', 2)]})

In [210]:
# Scratch list of (movie, rating) pairs; not referenced by the other cells shown in this notebook
t = [('m1', 3), ('m1', 4)]


In [212]:
# Regroup each user's (movie, rating) pairs into {movie: [ratings...]}
newMergeDic = defaultdict(list)
for user, movieList in mergeDict.items():
    movieDict = defaultdict(list)
    for movie, rating in movieList:
#         print(user,'   ',movie,'   ',rating)
        movieDict[movie].append(rating)
#         print(movieDict)
    # `movie` is whatever the inner loop saw last, so this picks only that movie's ratings
    movieL = movieDict[movie]
#     print(movieL)
#     movieDict[movie] = sum(movieL) / len(movieL)
    newMergeDic[user] = dict(movieDict)

In [214]:
# Show the regrouped ratings (lists per movie, not yet averaged)
dict(newMergeDic)

{'a': {'m1': [3, 3]},
 'b': {'m2': [4, 3, 4], 'm1': [5]},
 'd': {'m2': [4]},
 'c': {'m1': [3, 3], 'm2': [2]}}

In [201]:
# NOTE(review): this cell has an older execution count than the cell above; its saved output
# shows averages for only some movies, which does not match the current code
dict(newMergeDic)

{'a': {'m1': 3.0},
 'b': {'m2': 3.6666666666666665, 'm1': [5]},
 'd': {'m2': 4.0},
 'c': {'m1': [3, 3], 'm2': 2.0}}

In [139]:
# Group each user's (movie, rating) pairs by movie.
# .items() is required: iterating the dict itself yields only the keys, which
# cannot be unpacked into (user, userList).
for user, userList in mergeDict.items():
    eachUser = defaultdict(list)
    for movie, rating in userList:
        eachUser[movie].append(rating)

{'m1': 4}

In [21]:
def getFinalRec(svdPredictions, userBasedKNNPredictions,
                                           ratingBasedKNNPredictions, reviewBasedKNNPredictions,
                                          contentBasedPredictions, topN):
    """Combine all prediction sets and keep the ``topN`` best items per user.

    Returns:
        The mapping returned by ``combinePredictions`` (uid -> list of
        ``(iid, score)`` pairs), with each list sorted by score in descending
        order and cut to ``topN`` entries.
    """
    # Bind the combined result to the name the ranking loop works on.
    top_recs = combinePredictions(svdPredictions, userBasedKNNPredictions,
                                  ratingBasedKNNPredictions, reviewBasedKNNPredictions,
                                  contentBasedPredictions)

    for uid, user_ratings in top_recs.items():
        ranked = sorted(user_ratings, key=lambda x: x[1], reverse=True)
        top_recs[uid] = ranked[:topN]
    return top_recs
    

In [22]:
# The triple-quoted block below is a disabled snippet: as a bare string literal it is
# evaluated and discarded, so none of it runs.
'''
movies = pd.read_csv('movies.csv')
movies_dict = {'itemID': list(movies.movieId),
                'title': list(movies.title)}
df2 = pd.DataFrame(movies_dict)
'''
# svd
# RMSE per test-split size, appended to by the evaluation loop and plotted at the end
rmse = []
# The test-split sizes, in the same order as rmse (x axis of the plot)
size = []

In [28]:
# Load the tab-separated reviews (header row skipped, explicit column names) with a clean 0..n-1 index
rdata = pd.read_csv(
    reviewDataPath,
    sep='\t',
    skiprows=1,
    names=['reviewerID','asin','reviewerName','helpful','reviewText','overall','summary','unixReviewTime','reviewTime'],
).reset_index(drop=True)
# Load the line-delimited JSON movie metadata
moviedata = getMovieData(metadataPath)
# Default share of the reviews held out for testing
testSplit = 0.5

In [None]:
# Evaluate the three rating-based models over a range of test-split sizes.
# The result lists are reset here so that re-running the cell does not append
# to the results of an earlier run.
rmse, rmse1, rmse2, size = [], [], [], []

for i in arange(0.05, 1, 0.05):
    size.append(i)

    # SVD
    predictions = getSurpriseRatingPrediction(rdata, 'overall', i, 'svd', False)
    rmse.append(accuracy.rmse(predictions, verbose=True))

    # Item-item similarity (KNN, user_based=False)
    predictions1 = getSurpriseRatingPrediction(rdata, 'overall', i, 'knn', False)
    rmse1.append(accuracy.rmse(predictions1, verbose=True))

    # User-user similarity (KNN, user_based=True)
    predictions2 = getSurpriseRatingPrediction(rdata, 'overall', i, 'knn', True)
    rmse2.append(accuracy.rmse(predictions2, verbose=True))

    # Top recommendations of the last model evaluated, kept for inspection.
    top3_recommendations = get_top3_recommendations(predictions2)

# NOTE(review): the hybrid getPredictions(...) pipeline is not part of this RMSE
# evaluation; it returns top-N lists, not rating predictions.

In [None]:

# Newer matplotlib releases ship this style as 'seaborn-v0_8-whitegrid'; older
# ones as 'seaborn-whitegrid'. Use whichever is installed, else the default.
gridStyle = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(gridStyle)

# RMSE as a function of the test-split size
fig, ax = plt.subplots()
ax.plot(size, rmse)
ax.set_xlabel('test split size')
ax.set_ylabel('RMSE')
ax.set_title('RMSE vs. test split size')
plt.show()