In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import string
import random
import nltk
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import movie_reviews, stopwords,wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sys

from rake_nltk import Rake

import numpy as np
from numpy import arange,atleast_2d,argsort

from collections import defaultdict,Counter
from surprise import Prediction
from surprise import Reader, Dataset, SVD, accuracy,KNNBaseline,NMF
from surprise.model_selection import train_test_split,cross_validate

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from ast import literal_eval

import operator
from functools import reduce
import math

In [0]:
# pandarallel supplies the `.parallel_apply` calls used by the review
# pre-processing functions further down; initialise it once, before they run.
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [0]:
# Vote threshold read by getHelpfulScore: a helpfulness ratio is only used when
# the two vote counts add up to at least this many.
helpfulCutOffRatings = 20
# Review CSV path. NOTE(review): presumably a small development sample — confirm it is still needed.
reviewDataPath = 'amazon/vsmall.csv'
# Tab-separated train / test review files read by the loading cells below.
train_data_location = '/freespace/local/lp642/Final_20_Movies_and_TV_train.tsv.gz'
test_data_location = '/freespace/local/lp642/Final_20_Movies_and_TV_test.tsv.gz'
# Movie metadata file; presumably the input for getMovieData (line-delimited JSON) — TODO confirm.
metadataPath = '/freespace/local/lp642/meta_Movies_and_TV.json'
# Number of recommendations kept per user.
topN = 5

In [0]:
def removeUnusedReviewInfo(rdata):
    """Return a copy of ``rdata`` without the timestamp and reviewer-name columns.

    Columns that are not present are skipped silently (``errors='ignore'``);
    the input frame itself is left untouched.
    """
    unusedColumns = ['unixReviewTime', 'reviewTime', 'reviewerName']
    return rdata.drop(unusedColumns, axis=1, errors='ignore')
    
def createReviewerInfo(data):
    """Return only the ``reviewerID`` and ``reviewerName`` columns of ``data``.

    Raises KeyError if either column is missing.
    """
    reviewerColumns = ['reviewerID', 'reviewerName']
    return data[reviewerColumns]

In [0]:
def createUserMovieMatrix(udata,mdata):
    """Build reviewer x movie tables from review data and movie data.

    Args:
        udata: review frame; must provide ``asin``, ``reviewerID`` and ``overall``.
        mdata: movie frame; must provide ``asin``.

    Returns:
        (userMovieMatrix, userMovieMatching):
        userMovieMatrix   - ``overall`` values laid out as reviewerID x asin,
                            with missing cells replaced by 0.
        userMovieMatching - ``pd.crosstab`` counts of (reviewerID, asin) pairs.
    """
    # Outer merge: keeps reviews without a matching movie row and movies
    # without any review.
    fullData = udata.merge(mdata, on='asin',how='outer')
    # Pivot to reviewerID x asin; reset_index/rename_axis strip the 'asin'
    # column-axis name before reviewerID is put back as the index below.
    userMovieMatrix = fullData.pivot_table(index='reviewerID', columns = 'asin',values = 'overall',dropna=False).reset_index().rename_axis(None, axis=1)
    userMovieMatrix.set_index('reviewerID',inplace=True)
    # Occurrence counts per (reviewerID, asin); dropna=False keeps all-empty columns.
    userMovieMatching = pd.crosstab(index = fullData['reviewerID'], columns=fullData['asin'],dropna = False)
    # Unrated cells become 0 so the matrix can be used in dot products.
    userMovieMatrix = userMovieMatrix.replace(np.nan,0)
#     userMovieMatrix = userMovieMatrix.drop(['reviewerID'],axis = 1)
    return userMovieMatrix, userMovieMatching


In [0]:
# Map a numeric rating onto a coarse sentiment label.
def classifyBaseReviewRating(x):
    """Label a rating as "positive", "neutral" or "negative" around 2.5.

    Values that compare as neither above, equal to, nor below 2.5 (for
    example NaN) yield None.
    """
    if x > 2.5:
        return "positive"
    elif x == 2.5:
        return "neutral"
    return "negative" if x < 2.5 else None

In [0]:
def cleanData(r):
    """Coerce a raw review field to text, mapping missing values to ''.

    Args:
        r: the raw cell value; typically a string, or a float NaN / None
           when the field is missing.

    Returns:
        '' for None and for any float (pandas represents missing text as a
        float NaN), otherwise ``str(r)``.
    """
    # The missing-value check has to happen before str(): once converted, a
    # NaN is the ordinary string 'nan' and can no longer be told apart from
    # real text.
    if r is None or isinstance(r, float):
        return ''
    return str(r)

In [0]:
def getWordnetType(tag):
    """Translate a POS tag (as produced by ``pos_tag`` in cleanReview) into a WordNet POS constant.

    The first letter of ``tag`` decides the result: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Anything else falls back to noun.
    """
    prefixToPos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnetPos in prefixToPos:
        if tag.startswith(prefix):
            return wordnetPos
    # Unknown tag families are treated as nouns.
    return wordnet.NOUN

def cleanReview(r):
    """Normalise one review into a space-joined string of lemmas.

    Steps: lower-case, split on single spaces, strip punctuation from word
    edges, drop tokens containing digits, drop English stop words and empty
    tokens, POS-tag, lemmatize with the tag-derived WordNet type, and drop
    one-character results.

    Args:
        r: raw review text; None or a float (pandas' NaN for missing text)
           is treated as an empty review.

    Returns:
        The cleaned review as a single string ('' for missing/empty input).
    """
    # Detect missing values before str(): afterwards NaN would be the word 'nan'.
    if r is None or isinstance(r, float):
        return ''
    r = str(r)
    if len(r) == 0:
        return ''

    # remove punctuation from the edges of every space-separated token
    words = [eachWord.strip(string.punctuation) for eachWord in r.lower().split(" ")]

    # remove tokens that contain a digit
    words = [w for w in words if not any(ch.isdigit() for ch in w)]

    # remove stop words and empty tokens; a set makes each lookup O(1)
    stop = set(stopwords.words('english'))
    words = [w for w in words if len(w) > 0 and w not in stop]

    # add pos_tags
    pos_tags = pos_tag(words)

    # word lemmatize; one lemmatizer instance is enough for the whole review
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, getWordnetType(tag)) for word, tag in pos_tags]

    # remove one-character leftovers
    words = [w for w in words if len(w) > 1]

    # recreate review
    return " ".join(words)

# Derive a review-based rating from the VADER compound score.
def getSentimentScore(rdata):
    """Add a ``reviewRating`` column computed from the sentiment of ``reviewText``.

    Args:
        rdata: frame with a ``reviewText`` column of strings.

    Returns:
        A new frame with ``reviewRating`` added and the ``reviewText``,
        ``helpful`` and raw sentiment columns removed.

    Note: the first assignment adds a ``sentiments`` column to the frame that
    was passed in, so the caller's object is modified as a side effect.
    """
    sa = SentimentIntensityAnalyzer()
    # One dict of scores per review.
    rdata["sentiments"] = rdata["reviewText"].parallel_apply(lambda x: sa.polarity_scores(x))
    # Expand the score dicts into columns; the drop list below expects
    # 'compound', 'pos', 'neg' and 'neu' among them.
    rdata = pd.concat([rdata.drop(['sentiments'], axis=1), rdata['sentiments'].parallel_apply(pd.Series)], axis=1)
    # The helpfulness weighting is disabled because of data limitation: only a few
    # movies have helpfulness information available in the amazon data.
    # rdata['helpful'] = rdata['helpful'].parallel_apply(lambda x: getHelpfulScore(x)) 
    
    # Compound score from VADER is from -1 to +1; 1 + (compound + 1) * 2 rescales that to 1..5.
    rdata['reviewRating'] = (1+(rdata['compound']+1)*2)
    # Disabled variant that also multiplies by the helpfulness score:
    # rdata['reviewRating'] = (1+(rdata['compound']+1)*2) * rdata['helpful']
    rdata = rdata.drop(['compound','pos','neg','neu'],axis = 1,errors='ignore')
    rdata = rdata.drop(['reviewText','helpful'],axis=1,errors='ignore')
    return rdata
    

In [0]:
def getHelpfulScore(x):
    """Turn a two-element vote record into a helpfulness weight.

    Args:
        x: a pair ``[a, b]`` of vote counts, either already parsed or as its
           string literal (e.g. ``"[3, 5]"``). None and NaN mean "no data".

    Returns:
        ``a / (a + b)`` when ``b`` is non-zero and ``a + b`` reaches
        ``helpfulCutOffRatings``; otherwise the neutral weight 1.0.
    """
    # Missing data: None, or the float NaN pandas stores for empty cells
    # (NaN is the only float that differs from itself).
    if x is None or (isinstance(x, float) and x != x):
        return 1.0
    # Only text needs parsing; literal_eval rejects an already-parsed list.
    if isinstance(x, str):
        x = literal_eval(x)
    a = float(x[0])
    b = float(x[1])

    # Too few votes make the ratio unreliable, so fall back to 1.0.
    if b != 0 and a + b > helpfulCutOffRatings - 1:
        return a / (a + b)
    return 1.0

In [0]:
def generateSentimentScoreOnReviews(rdata):
    """Run the review-text pipeline and return the frame with a ``reviewRating`` column.

    Drops unused columns, labels each row by its ``overall`` rating
    (``reviewType``), merges ``summary`` into ``reviewText``, cleans the text
    with cleanReview and finally scores it with getSentimentScore.

    Args:
        rdata: review frame with ``overall``, ``reviewText`` and ``summary``.

    Returns:
        A new frame; the frame passed in is not modified.
    """
    rdata = removeUnusedReviewInfo(rdata)
    rdata["reviewType"] = rdata["overall"].parallel_apply(classifyBaseReviewRating)

    # Join review and summary with a space so the last word of one is not
    # fused with the first word of the other, and blank out missing values
    # first: NaN + str would otherwise turn the whole cell into NaN.
    reviewText = rdata['reviewText'].fillna('').astype(str)
    summaryText = rdata['summary'].fillna('').astype(str)
    rdata['reviewText'] = reviewText + ' ' + summaryText
    rdata = rdata.drop(['summary'], axis=1, errors='ignore')

    # Every cell is a plain string at this point, so no separate
    # missing-value pass is needed before cleaning.
    rdata['reviewText'] = rdata['reviewText'].parallel_apply(lambda x: cleanReview(x))
    rdata = getSentimentScore(rdata)
    return rdata


In [0]:
def getMovieData(path):
    """Read line-delimited JSON at ``path`` (one record per line) into a DataFrame."""
    movieFrame = pd.read_json(path, lines=True, typ='frame')
    return movieFrame

def preProcessMovieData(moviedata):
    """Return a sorted, de-duplicated, trimmed copy of the movie metadata.

    Rows are sorted by ``asin``; every ``asin`` that occurs more than once is
    removed entirely (``keep=False`` drops all copies, not just the extras);
    the index is renumbered and the columns not used downstream are dropped.

    Args:
        moviedata: metadata frame with an ``asin`` column.

    Returns:
        A new frame. The frame passed in is left untouched, so re-running the
        calling cell always starts from the same input.
    """
    unusedColumns = ['rank', 'main_cat', 'image', 'also_buy', 'also_view',
                     'price', 'details', 'feature', 'date', 'tech1']
    return (
        moviedata
        .sort_values("asin")
        .drop_duplicates(subset='asin', keep=False)
        .reset_index(drop=True)
        .drop(unusedColumns, axis=1, errors='ignore')
    )



In [0]:
def getCosineSimilarity(moviedata):
    """Return the pairwise cosine-similarity matrix of the ``cleanText`` column.

    Each row's text is turned into a term-count vector with CountVectorizer;
    the result has one row and one column per row of ``moviedata``.
    """
    termCounts = CountVectorizer().fit_transform(moviedata['cleanText'])
    # Compare every document vector against every other one.
    return cosine_similarity(termCounts, termCounts)

def getMovieSimilarityFromPlot(moviedata):
    """Compute movie-to-movie similarity from description, category and brand text.

    For every movie the three fields are flattened into one text, RAKE
    extracts its key words, and the key words (space-joined) become the
    document fed to getCosineSimilarity.

    Args:
        moviedata: metadata frame with ``description``, ``category`` and
            ``brand`` columns. Each cell may be missing (None / NaN), a
            string, or an iterable of pieces.

    Returns:
        (cosine_sim, moviedata): the similarity matrix and a copy of the
        frame without the text columns. The frame passed in is not modified.
    """
    # Category labels that say nothing about the individual movie.
    ignoreWords = ["Movies & TV", "Genre for Featured Categories", "Movies",
                   "Independently Distributed", "All Titles", "All", "4-for-3 DVD"]

    def _asText(value, skip=()):
        """Flatten one metadata cell into a single space-separated string."""
        if value is None or isinstance(value, float):
            return ''
        if isinstance(value, str):
            return '' if value in skip else value
        # str() guards against non-string pieces, which used to raise on '+'.
        return ' '.join(str(piece) for piece in value if piece not in skip)

    # iterrows() yields copies of the rows, so writing into `movie` would be
    # lost; collect the per-movie text here and attach it as a column afterwards.
    cleanTexts = []
    for _, movie in moviedata.iterrows():
        parts = [
            _asText(movie['description']),
            _asText(movie['category'], ignoreWords),
            _asText(movie['brand']),
        ]
        # Pieces are joined with spaces so neighbouring words are not fused.
        tot = ' '.join(part for part in parts if part)

        r = Rake()
        r.extract_keywords_from_text(tot)
        # dictionary with key words as keys and their degree scores as values
        key_words_dict_scores = r.get_word_degrees()
        cleanTexts.append(" ".join(map(str, key_words_dict_scores.keys())))

    trimmed = moviedata.drop(['category', 'brand', 'description', 'impWords'], axis=1, errors='ignore')
    cosine_sim = getCosineSimilarity(trimmed.assign(cleanText=cleanTexts))
    trimmed = trimmed.drop(['cleanText'], axis=1, errors='ignore')
    return cosine_sim, trimmed


In [0]:
def getMovieAvgRating(moviedata,rdataf):
    """Summarise the ``overall`` rating per movie.

    Args:
        moviedata: accepted but not used.
        rdataf: review frame with ``asin`` and ``overall`` columns.

    Returns:
        Frame indexed by ``asin`` with columns ``rating_mean``,
        ``rating_median`` and ``num_ratings``.
    """
    ratingsPerMovie = rdataf.groupby('asin')['overall']
    ratingStats = ratingsPerMovie.agg(['mean', 'median', 'size'])
    return ratingStats.rename(columns={
        'mean': 'rating_mean',
        'median': 'rating_median',
        'size': 'num_ratings',
    })


In [0]:
def get_top_recommendations(predictions, topN = 3):
    """Keep the ``topN`` highest-estimated items for every user.

    Args:
        predictions: iterable of 5-element records
            ``(uid, iid, true_r, est, details)``.
        topN: how many items to keep per user.

    Returns:
        defaultdict mapping uid -> list of ``(iid, est)`` sorted by ``est``
        in descending order (ties keep their input order).
    """
    perUser = defaultdict(list)
    for prediction in predictions:
        userId, itemId, _trueRating, estimate, _details = prediction
        perUser[userId].append((itemId, estimate))

    for userId in list(perUser):
        ranked = sorted(perUser[userId], key=lambda pair: pair[1], reverse=True)
        perUser[userId] = ranked[:topN]

    return perUser

def precision_recall_at_k(predictions, k=10, threshold=3.5, flag=True):
    '''Return per-user precision@k and recall@k dictionaries.

    predictions: records ``(uid, iid, true_r, est, details)``; when ``flag``
        is False the records are 4-element ``(uid, iid, true_r, est)``.
    k: number of top-estimated items considered per user.
    threshold: a rating (true or estimated) at or above this is "relevant".

    A user with no recommended items gets precision 1; a user with no
    relevant items gets recall 1.
    '''
    # Group (estimate, true rating) pairs by user.
    user_est_true = defaultdict(list)
    for prediction in predictions:
        if flag == False:
            uid, _, true_r, est = prediction
        else:
            uid, _, true_r, est, _ = prediction
        user_est_true[uid].append((est, true_r))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # Highest estimates first; only the first k count as recommended.
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items across everything the user rated.
        n_rel = sum((actual >= threshold) for (_, actual) in user_ratings)
        # Items in the top k whose estimate clears the threshold.
        n_rec_k = sum((estimate >= threshold) for (estimate, _) in top_k)
        # Items in the top k that are both recommended and relevant.
        n_rel_and_rec_k = sum(((actual >= threshold) and (estimate >= threshold))
                              for (estimate, actual) in top_k)

        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 1

        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 1

    return precisions, recalls


def getNDCGScore(tuples):
    """Return the mean NDCG over all users.

    Args:
        tuples: iterable of records whose element 0 is the user id,
            element 2 the actual rating and element 3 the predicted rating
            (e.g. ``(uid, iid, true_r, est)``).

    Returns:
        The average, over users, of DCG / IDCG where
        * DCG  discounts the *actual* ratings in the order given by the
          predicted ratings (highest prediction first), and
        * IDCG discounts the actual ratings in their best possible order.
        A user whose IDCG is 0 contributes 0; an empty input returns 0.0.
    """
    # (predicted, actual) pairs per user; the pairing must be kept so the
    # predicted order can be applied to the actual ratings.
    perUser = defaultdict(list)
    for ele in tuples:
        perUser[ele[0]].append((ele[3], ele[2]))

    if not perUser:
        return 0.0

    def _dcg(relevances):
        """Discounted cumulative gain with a log2(rank + 1) discount."""
        return sum(rel / math.log(rank + 1, 2)
                   for rank, rel in enumerate(relevances, start=1))

    total_ndcg = 0.0
    for ratings in perUser.values():
        byPrediction = [actual for _, actual in
                        sorted(ratings, key=lambda pair: pair[0], reverse=True)]
        idcg = _dcg(sorted(byPrediction, reverse=True))
        # All-zero actual ratings give IDCG 0; skip the division instead of crashing.
        if idcg != 0:
            total_ndcg += _dcg(byPrediction) / idcg
    return total_ndcg / len(perUser)

In [0]:
def getPredictionFromMovieContent(moviedata, rdata, testSplit):
    """Produce content-based rating predictions from movie metadata similarity.

    Args:
        moviedata: raw movie metadata frame.
        rdata: review frame used for training (or to be split, see below).
        testSplit: either
            * a DataFrame of held-out reviews (this is what getPredictions
              passes), used as the test set while all of ``rdata`` is the
              training set; or
            * a number: the fraction of ``rdata`` sampled (seed 101) as
              training rows, the rest becoming the test set.

    Returns:
        Whatever getUserRatingPredictionFromMovieContent returns for the
        resulting train / test split.
    """
    moviedata = preProcessMovieData(moviedata)
    # create cosine similarity matrix
    cosine_sim, moviedata = getMovieSimilarityFromPlot(moviedata)

    if isinstance(testSplit, pd.DataFrame):
        # A ready-made test set: DataFrame.sample(frac=...) cannot take a frame.
        testdata = testSplit
    else:
        train = rdata.sample(frac=testSplit, random_state=101)
        testdata = rdata.drop(train.index)
        rdata = train

    rdata = rdata.merge(moviedata, on='asin', how='left')
    # getMovieAvgRating already names its columns rating_mean / rating_median / num_ratings.
    movieAvgRating = getMovieAvgRating(moviedata, rdata)
    moviedata = moviedata.merge(movieAvgRating, on='asin', how='left')

    return getUserRatingPredictionFromMovieContent(cosine_sim, rdata, testdata, moviedata)

In [0]:
def getUserRatingPredictionFromMovieContent(cosine_sim, rdata, testdata, moviedata):
    """Predict user ratings as similarity-weighted averages of their train ratings.

    Args:
        cosine_sim: movie x movie similarity matrix.
        rdata: training reviews.
        testdata: held-out reviews (needs ``reviewerID``, ``asin``, ``overall``).
        moviedata: movie metadata (needs ``asin``).

    Returns:
        List of tuples ``(reviewerID, asin, rating, overall)`` where ``rating``
        is the prediction and ``overall`` the held-out rating (0 when there is
        none).
        NOTE(review): precision_recall_at_k(flag=False) unpacks 4-tuples as
        ``(uid, iid, true_r, est)`` — the opposite order of the last two
        fields; confirm which order consumers should rely on.
        NOTE(review): assumes the columns of the user x movie tables line up
        with the rows of ``cosine_sim`` — TODO confirm.
    """
    # Get user movie prediction based on train data
    userMovieMatrix, userMovieMatching = createUserMovieMatrix(rdata, moviedata)
    sumUserMovieWeight = np.dot(userMovieMatrix, cosine_sim)
    sumSimFoRM2M = np.dot(userMovieMatching, cosine_sim)
    userMoviePredict = sumUserMovieWeight/sumSimFoRM2M #Movie prediction for all the data

    #Get User movie matching for test data
    userMovieMatrixTest, userMovieMatchingTest = createUserMovieMatrix(testdata, moviedata)

    # Used for predicting not rated movies in train data
    userMovieMatching = 1 - userMovieMatching

    # get user movie matrix for users not rated all
    userMovieMatchingNewPred = userMovieMatching - userMovieMatchingTest

    # Predict all user-movie from train data
    userMoviePredictMissingMovies = userMoviePredict * userMovieMatching

    # Prediction for already rated movies
    userMovieAlreadyRatedPredicted = userMovieMatchingTest * userMoviePredictMissingMovies

    #Prediction for new movies
    userMovieNewPredictions = userMoviePredictMissingMovies * userMovieMatchingNewPred

    # reset_index has no `in_place` keyword; reassign the returned frame so
    # reviewerID becomes a regular column for the melt below.
    userMovieNewPredictions = userMovieNewPredictions.reset_index()
    userMovieNewPredictions = userMovieNewPredictions.replace(0,np.nan)

    userMovieAlreadyRatedPredicted = userMovieAlreadyRatedPredicted.reset_index()
    userMovieAlreadyRatedPredicted = userMovieAlreadyRatedPredicted.replace(0,np.nan)

    # Wide -> long: one row per (reviewerID, asin) with a prediction.
    userMovieAlreadyRatedPredicted = pd.melt(userMovieAlreadyRatedPredicted, id_vars='reviewerID',
                                        value_vars=list(userMovieAlreadyRatedPredicted.columns[1:]),
                                        var_name='asin',
                                        value_name='rating')
    userMovieAlreadyRatedPredicted = userMovieAlreadyRatedPredicted[userMovieAlreadyRatedPredicted['rating'].notna()]
    # The metadata frame is the `moviedata` argument (`metadata` was never defined).
    testdataF = testdata.merge(moviedata, on='asin',how='outer')

    testdataF = testdataF[testdataF['overall'].notna()]

    userMovieAlreadyRatedPredictedFinal = pd.merge(userMovieAlreadyRatedPredicted,
                                                   testdataF, how='left',on=['reviewerID','asin'])
    # Can drop if not needed. Few movies does not have meta data in amazon data. So removing it.
    # userMovieAlreadyRatedPredictedFinal.drop(['title'],axis =1,inplace=True)

    userMovieNewPredictions = pd.melt(userMovieNewPredictions, id_vars='reviewerID',
                                        value_vars=list(userMovieNewPredictions.columns[1:]),
                                        var_name='asin',
                                        value_name='rating')
    userMovieNewPredictions = userMovieNewPredictions[userMovieNewPredictions['rating'].notna()]

    combinedPred = pd.concat([userMovieAlreadyRatedPredictedFinal, userMovieNewPredictions],ignore_index=True)

    # Rows that only come from the "new" predictions have no `overall`; it becomes 0 here.
    combinedPred = combinedPred.replace(np.nan,0)
    combinedPred['tup'] = tuple(zip(combinedPred.reviewerID, combinedPred.asin,
                                    combinedPred.rating,combinedPred.overall))
    predictions = list(combinedPred['tup'])
    return predictions

In [0]:
def partition_input_data(input_location, test_fraction=0.2):
    """Split a tab-separated review file into per-user train and test sets.

    The file is read twice in chunks. The first pass counts the rows of each
    ``reviewerID``. In the second pass a per-reviewer counter runs down from
    that total; a row goes to the test set when
    ``test_fraction * total >= remaining`` and to the train set otherwise, so
    the last rows seen for each reviewer end up in the test set.

    Args:
        input_location: path of the tab-separated review file.
        test_fraction: share of each reviewer's rows sent to the test set
            (default 0.2, the previously hard-coded value).

    Returns:
        (train_data, test_data): two surprise ``Dataset`` objects built from
        the ``reviewerID``, ``asin`` and ``overall`` columns.
    """
    import pandas as pd
    from collections import Counter

    columnNames = ['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
                   'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote', 'image']

    # Pass 1: rows per reviewer. Counting the column directly avoids walking
    # every cell of every row (and Series.iteritems, removed in pandas 2.0).
    reviewsPerUser = Counter()
    for chunk in pd.read_csv(input_location, skiprows=1, sep='\t', chunksize=1000000):
        if 'reviewerID' in chunk.columns:
            reviewsPerUser.update(chunk['reviewerID'])
    remaining = Counter(reviewsPerUser)

    # Pass 2: route each row to train or test.
    train_rows = []
    test_rows = []
    for chunk in pd.read_csv(input_location, skiprows=1, sep='\t', chunksize=1000000):
        if 'reviewerID' not in chunk.columns:
            continue
        rows = chunk.itertuples(index=False, name=None)
        for reviewer, row in zip(chunk['reviewerID'], rows):
            if test_fraction * reviewsPerUser[reviewer] >= remaining[reviewer]:
                test_rows.append(list(row))
            else:
                train_rows.append(list(row))
            remaining[reviewer] = remaining[reviewer] - 1

    reader = Reader(rating_scale=(0.5, 5.0))

    train_frame = pd.DataFrame(train_rows, columns=columnNames)
    train_data = Dataset.load_from_df(train_frame[['reviewerID', 'asin', 'overall']], reader)
    # Persisting the split is currently disabled:
    # train_frame.to_csv(train_data_location, sep='\t', compression='infer')

    test_frame = pd.DataFrame(test_rows, columns=columnNames)
    test_data = Dataset.load_from_df(test_frame[['reviewerID', 'asin', 'overall']], reader)
    print('testframe',test_frame)
    print('testdata',test_data)

    # test_frame.to_csv(test_data_location, sep='\t', compression='infer')
    return (train_data, test_data)

In [0]:
# Fit one surprise algorithm on the train reviews and evaluate it on the test reviews.
def getSurpriseRatingPrediction(train_data, ratingCol, test_data, algo, userBased):
    """Train a collaborative-filtering model and return its test predictions.

    Args:
        train_data: review frame with ``reviewerID``, ``asin`` and ``ratingCol``.
        ratingCol: name of the column used as the rating.
        test_data: held-out review frame with the same columns.
        algo: 'svd', 'knn' or 'NMF' (matched case-insensitively), or an
            already constructed algorithm object with a ``fit`` method.
        userBased: passed as ``user_based`` to the KNN similarity options.

    Returns:
        The list returned by ``algo.test`` for the test set. RMSE, MAE,
        precision, recall and F-measure are printed along the way.

    Raises:
        ValueError: if ``algo`` is neither a known name nor a fit-able object.
    """
    reader = Reader(rating_scale=(0.5, 5.0))
    ratingColumns = ['reviewerID', 'asin', ratingCol]
    data = pd.concat([train_data,test_data])

    traindata = Dataset.load_from_df(train_data[ratingColumns], reader)
    testdata = Dataset.load_from_df(test_data[ratingColumns], reader)
    # Built from train + test together; both sets below are constructed through it.
    dataf = Dataset.load_from_df(data[ratingColumns], reader)

    trainset = dataf.construct_trainset(list(traindata.raw_ratings))
    testset = dataf.construct_testset(list(testdata.raw_ratings))

    algoName = algo
    algoKey = algo.lower() if isinstance(algo, str) else None
    if algoKey == 'svd':
        algo = SVD()
    elif algoKey == 'knn':
        sim_options = {'name': 'cosine',
               'user_based': userBased
               }
        algo = KNNBaseline(sim_options=sim_options)
    elif algoKey == 'nmf':
        algo = NMF()
    elif not hasattr(algo, 'fit'):
        # Fail here with a clear message instead of on `str.fit` below.
        raise ValueError("unknown algorithm %r; expected 'svd', 'knn' or 'NMF'" % (algoName,))

    algo.fit(trainset)
    print('mean for ',algoName, ' is ',trainset.global_mean)
    predictions = algo.test(testset)
    precisions, recalls =  precision_recall_at_k(predictions, k=10, threshold=3.5)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
    # Guard the averages and the harmonic mean against empty / all-zero results.
    p = sum(prec for prec in precisions.values()) / len(precisions) if precisions else 0.0
    r = sum(rec for rec in recalls.values()) / len(recalls) if recalls else 0.0
    f = 2*p*r/ (p+r) if (p + r) != 0 else 0.0
    print('precisions for ',algoName, ' is ', p)
    print('recall for ',algoName, ' is ', r)
    print('F measure for ',algoName, ' is ', f)
    print('MAE measure for ',algoName, ' is ', mae)

    return predictions
    

In [0]:
def getPredictions(rdata, moviedata, testSplit, topN):
    """Run every recommender and return the blended top-N recommendations.

    Args:
        rdata: training review frame.
        moviedata: movie metadata frame.
        testSplit: held-out review frame.
        topN: number of recommendations to keep per user.

    Returns:
        The per-user top-N mapping produced by getFinalRec.
    """
    #get NMF predictions
    NMFPredictions = getSurpriseRatingPrediction(rdata, 'overall', testSplit, 'NMF', False)

    # get SVD predictions
    svdPredictions = getSurpriseRatingPrediction(rdata, 'overall', testSplit, 'svd', False )

    #get Knn user based predictions based on ratings
    userBasedKNNPredictions = getSurpriseRatingPrediction(rdata, 'overall', testSplit, 'knn', True)

    #get Knn movie based predictions based on ratings
    ratingBasedKNNPredictions = getSurpriseRatingPrediction(rdata, 'overall', testSplit, 'knn', False)

    # Replace the review text of both frames by a sentiment-derived reviewRating.
    rdata = generateSentimentScoreOnReviews(rdata)
    testSplit = generateSentimentScoreOnReviews(testSplit)

    # Can use review ratings as actual ratings and predict the ratings. This can be used in case users have not provided ratings
    #reviewBasedKNNPredictions = getSurpriseRatingPrediction(rdata, 'reviewRating', testSplit, 'knn', False)

    # The sentiment-derived rating of each test review is used directly as its estimate.
    reviewBasedKNNPredictions = list(testSplit.parallel_apply(lambda x:
                                  Prediction(x.reviewerID,x.asin, x.overall, x.reviewRating, {}), axis = 1))

    #get content based predictions
    contentBasedPredictions = getPredictionFromMovieContent(moviedata, rdata, testSplit)
    print('content', contentBasedPredictions)
    # The comma after contentBasedPredictions was missing, which made the
    # whole cell a SyntaxError.
    topNRec = getFinalRec(svdPredictions,
                          userBasedKNNPredictions,
                          ratingBasedKNNPredictions,
                          reviewBasedKNNPredictions,
                          NMFPredictions, contentBasedPredictions,
                          topN)
    return topNRec
    
    

In [0]:
def mergeDict(dicts):
    """Average the estimates several recommenders gave for the same (user, movie).

    Args:
        dicts: iterable of mappings ``uid -> [(iid, true_r, est), ...]``,
            one mapping per recommender.

    Returns:
        (predictions, finalUserList):
        finalUserList - list of ``(uid, iid, true_r, avg_est)`` where
            ``true_r`` is taken from the first recommender that rated the
            pair and ``avg_est`` is the plain mean of all estimates for it.
        predictions - the same records as surprise ``Prediction`` objects
            with empty details.
    """
    # uid -> list of per-recommender rating lists
    perUserLists = defaultdict(list)
    for dictEach in dicts:
        for key, val in dictEach.items():
            perUserLists[key].append(val)

    finalUserList = []
    for user, movieLists in perUserLists.items():
        # iid -> [(true_r, est), ...] across all recommenders, in input order
        movieDict = defaultdict(list)
        for movieList in movieLists:
            for movie, true_r, rating in movieList:
                movieDict[movie].append((true_r, rating))
        # No lookup of movieDict outside the loop above: on a defaultdict it
        # would insert an empty entry (or fail on an unbound loop variable)
        # and break the average below.
        for movie, ratingList in movieDict.items():
            avgRating = sum(x[1] for x in ratingList)/float(len(ratingList))
            finalUserList.append((user, movie, ratingList[0][0], avgRating))

    predictions = [Prediction(uid, iid, r_ui_trans, est, {})
                   for (uid, iid, r_ui_trans, est) in finalUserList]
    return predictions, finalUserList

In [0]:
def getPrecessionRecall(predictions, algoName, flag =True, str1='',str2=''):
    """Print and return RMSE, precision@5, recall@5, F-measure and MAE.

    Args:
        predictions: prediction records accepted by precision_recall_at_k and
            by surprise's ``accuracy.rmse`` / ``accuracy.mae``.
        algoName: label used in the printed lines.
        flag: forwarded to precision_recall_at_k (record length selector);
            also selects the short (False) or long (True) print format.
        str1, str2: extra labels appended in the long print format.

    Returns:
        (rmse, precision, recall, f_measure, mae); precision and recall are
        averaged over users, and all three ratios are 0.0 instead of raising
        when there is nothing to average.
    """
    precisions, recalls =  precision_recall_at_k(predictions, k=5, threshold=3.5,flag = flag)
    rmse = accuracy.rmse(predictions, verbose=True)
    p = sum(prec for prec in precisions.values()) / len(precisions) if precisions else 0.0
    r = sum(rec for rec in recalls.values()) / len(recalls) if recalls else 0.0
    # Harmonic mean is undefined when both are 0; report 0 rather than crash.
    f = 2*p*r/ (p+r) if (p + r) != 0 else 0.0
    mae = accuracy.mae(predictions, verbose=True)
    if flag==False:
        print('precisions for ',algoName, ' is ', p)
        print('recall for ',algoName, ' is ', r)
        print('F measure for ',algoName, ' is ', f)
    else:
        print('precisions for ',algoName, ' is ', p,'for ',str1,' ',str2)
        print('recall for ',algoName, ' is ', r,'for ',str1,' ',str2)
        print('F measure for ',algoName, ' is ', f,'for ',str1,' ',str2)

    return rmse, p, r, f ,mae
    

In [0]:
# Combine the prediction scores of all recommenders into one estimate per (user, movie).
def combinePredictions(svdPredictions, userBasedKNNPredictions,
                       ratingBasedKNNPredictions, reviewBasedKNNPredictions, NMFPredictions,
                       contentBasedPredictions=None):
    """Blend the collaborative-filtering estimates, then add the review-based ones.

    Stage 1 averages SVD, user-KNN, item-KNN and NMF estimates per
    (user, movie). Stage 2 averages that result with the review-based
    estimate. Metrics are printed after each stage.

    Args:
        svdPredictions, userBasedKNNPredictions, ratingBasedKNNPredictions,
        reviewBasedKNNPredictions, NMFPredictions: iterables of
            ``(uid, iid, true_r, est, details)`` records.
        contentBasedPredictions: accepted so that getFinalRec's six-argument
            call works; the content-based estimates are not blended in.
            NOTE(review): confirm whether they should be.

    Returns:
        The final list of ``Prediction`` objects from stage 2.
    """
    # Per-recommender weights. NOTE(review): mergeDict divides by the number
    # of estimates, not by the sum of weights, so values other than 1 scale
    # the estimates rather than produce a true weighted average.
    svdW = 1
    rknnW = 1
    uknnW = 1
    reKnnW = 1
    nmfW = 1

    def _byUser(predictions, weight):
        """Group records as uid -> [(iid, true_r, weight * est), ...]."""
        grouped = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            grouped[uid].append((iid, true_r, weight * est))
        return grouped

    svdPredictionsNew = _byUser(svdPredictions, svdW)
    userBasedKNNPredictionsNew = _byUser(userBasedKNNPredictions, uknnW)
    ratingBasedKNNPredictionsNew = _byUser(ratingBasedKNNPredictions, rknnW)
    reviewBasedKNNPredictionsNew = _byUser(reviewBasedKNNPredictions, reKnnW)
    nmfPredictionsNew = _byUser(NMFPredictions, nmfW)

    # Stage 1: rating-based recommenders only.
    combinedPred = [svdPredictionsNew, userBasedKNNPredictionsNew,
                    ratingBasedKNNPredictionsNew,nmfPredictionsNew]

    ratingPredictions, ratingPredictionsList = mergeDict(combinedPred)
    ratingPredictionsListNew = defaultdict(list)
    for uid, iid, true_r, est in ratingPredictionsList:
        ratingPredictionsListNew[uid].append((iid,true_r, est))

    getPrecessionRecall(ratingPredictions, flag=True, algoName='before adding reviewRating')
    print('NDCG for CF', getNDCGScore(ratingPredictionsList))

    # Stage 2: blend the stage-1 result with the review-based estimates.
    combinedPred = [ratingPredictionsListNew, reviewBasedKNNPredictionsNew]
    finalPredictions, finalPredictionsL = mergeDict(combinedPred)
    getPrecessionRecall(finalPredictions, flag=True, algoName='after adding reviewRating')
    print('NDCG for CF+SA', getNDCGScore(finalPredictionsL))
    return finalPredictions
    

In [0]:
def getFinalRec(svdPredictions, userBasedKNNPredictions,
                ratingBasedKNNPredictions, reviewBasedKNNPredictions,
                NMFPredictions, contentBasedPredictions,
                topN):
    """Blend all recommenders and keep the ``topN`` best items per user.

    Args:
        svdPredictions, userBasedKNNPredictions, ratingBasedKNNPredictions,
        reviewBasedKNNPredictions, NMFPredictions, contentBasedPredictions:
            the per-recommender predictions, forwarded to combinePredictions.
        topN: number of recommendations to keep per user.

    Returns:
        Mapping uid -> list of ``(iid, est)`` sorted by ``est`` descending.
    """
    # The comma after contentBasedPredictions was missing in the signature,
    # which made this cell a SyntaxError.
    predictions = combinePredictions(svdPredictions,
                                     userBasedKNNPredictions,
                                     ratingBasedKNNPredictions,
                                     reviewBasedKNNPredictions,
                                     NMFPredictions,
                                     contentBasedPredictions)

    # Same grouping / sorting / truncation as get_top_recommendations, so reuse it.
    return get_top_recommendations(predictions, topN)
    

In [0]:

# Load the training reviews, then drop the columns no later step reads.
# NOTE(review): `skiprows=1` skips the first line and `header=0` then consumes
# the next line as the header being replaced by `names` — this looks like it
# discards the first data row unless the file has two leading non-data lines; confirm.
train_data = pd.read_csv(train_data_location,  header=0, sep='\t', quotechar='"',  names=[ 'overall','verified','reviewTime','reviewerID','asin', 'style', 'reviewerName','reviewText','summary','unixReviewTime','vote','image'], skiprows=1)
train_data = train_data.drop(['verified','vote','image','unixReviewTime','style','reviewTime'],axis =1,errors='ignore')
train_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,overall,reviewerID,asin,reviewerName,reviewText,summary
1,4,A3P98J5DZ00A75,5019281,Ken Roberts,Henry Winkler proves his acting ability in thi...,grey
2,5,A2U7DG83EXUSFP,5019281,Donald R. Brandeberry,A good movie with morals,Family movie
3,5,A1XQVED2NX33TN,5019281,In my opinion...,More of a 'modern' version of the classic. I ...,"Was happily surprised...great version, esp for..."
4,5,ARSGS4RQUVL1O,5019281,M. Smith,The Christmas Carol is my all time favorite st...,The Christmas Carol is my all time favorite st...
5,3,A101IGU6UDKW3X,5019281,DorothyZ,Well made Christmas movie. It's a little slow ...,It's a little slow but sweet natured.


In [0]:

# Load the held-out reviews with the same schema and column pruning as the training cell.
# NOTE(review): `skiprows=1` combined with `header=0` and `names` looks like it
# discards the first data row unless the file has two leading non-data lines; confirm.
test_data = pd.read_csv(test_data_location,  header=0, sep='\t', quotechar='"',  names=[ 'overall','verified','reviewTime','reviewerID','asin', 'style', 'reviewerName','reviewText','summary','unixReviewTime','vote','image'], skiprows=1)
test_data = test_data.drop(['verified','vote','image','unixReviewTime','style','reviewTime'],axis =1,errors='ignore')
test_data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,overall,reviewerID,asin,reviewerName,reviewText,summary
1,5,A16BJ43Z46QS3N,6304077955,C. L Wilson,I didn't really look forward to seeing this mo...,no title
2,4,A3AS6G2F9CDHV5,6304154178,Brian May,"This adventure, penned no less by Douglas Adam...","""They slammed him to the wall with good vibrat..."
3,2,A2JS2OU5SBH1XQ,6304233639,"Studebaker Hoch, billythemtn@geocities.com",One can only guess at what the producers of th...,Just what were they after?
4,4,A2JS2OU5SBH1XQ,6304258984,"Studebaker Hoch, billythemtn@geocities.com","Sure, the plot is so full of holes that you co...",One Helluva Ride!!!!!
5,1,A2JS2OU5SBH1XQ,6304298285,"Studebaker Hoch, billythemtn@geocities.com",Unbelieveably bad fare from the maker of such ...,Shame on you Francis


In [0]:
# The train and test files were created so that 80% of each user's movie ratings
# are in the train data and the remaining 20% are in the test data.
# `overalldata` is the two frames stacked back together (row indexes are not renumbered).
overalldata = pd.concat([train_data,test_data])


In [0]:
# `moveidata` was never defined: load the movie metadata from the configured
# path first, then pass it under its real name.
moviedata = getMovieData(metadataPath)
top5_recommendations = getPredictions(train_data, moviedata, test_data, topN)
