In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import math
import random
import nltk
import sklearn
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Load the pre-built inputs from CSV files in the current working directory.
# Book metadata: later cells read book_desc, title, tag_cloud, authors,
# average_rating and newbookid from this frame.
finalbooks = pd.read_csv('finalbook.csv')
# All ratings: used below (newbookid, rating) to define the popularity tail.
ratings = pd.read_csv('finalratings.csv')
# Train / test rating splits: later cells read newuser_id, newbookid and rating.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
def dcg_k(r,k):
    '''Discounted Cumulative Gain (DCG) over the first k results.
    r: True ratings in predicted rank order (1st element is the top recommendation)
    k: Number of results to consider
    Returns the sum of 2**rating / log2(rank + 1) over the first k ratings
    (0.0 when there is nothing to score).
    '''

    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same conversion on every NumPy version.
    gains = np.asarray(r, dtype=float)[:k]
    # Rank i (1-based) is discounted by log2(i + 1), hence the range 2..size+1.
    discounts = np.log2(np.arange(2, gains.size + 2))
    # Note: the gain used here is 2**rating (not 2**rating - 1).
    dcg = np.sum(2**gains / discounts)
    return dcg

def ndcg_k(r,k):
    '''Normalized Discounted Cumulative Gain (NDCG) over the first k results.
    r: True ratings in predicted rank order
    k: Number of results to consider
    Returns DCG of r divided by the DCG of the same ratings sorted from
    highest to lowest, or 0 when that ideal DCG is zero.
    '''

    ideal_dcg = dcg_k(sorted(r, reverse=True), k)
    if ideal_dcg:
        return dcg_k(r,k) / ideal_dcg
    # Nothing to normalise against (e.g. an empty rating list).
    return 0

def mean_ndcg(rs):
    '''Mean NDCG over all users.
    rs: Iterable with one entry per user: that user's true ratings in predicted rank order
    Each user's NDCG is computed over their full list (k = len of the list).
    '''

    per_user_scores = []
    for ranked_ratings in rs:
        per_user_scores.append(ndcg_k(ranked_ratings, len(ranked_ratings)))
    return np.mean(per_user_scores)

In [None]:
def rmse(y,h):
    '''Root Mean Squared Error (RMSE).
    y: real values
    h: predicted values
    Both arguments must support element-wise subtraction (arrays / Series).
    '''

    residuals = y - h
    mean_squared_error = sum(residuals**2) / len(residuals)
    return np.sqrt(mean_squared_error)

In [None]:
# Defining the tail
# Number of ratings per book, most-rated books first.
tailcomp = (
    ratings.groupby(by='newbookid', as_index=False)
    .agg({'rating': 'count'})
    .sort_values(by='rating', ascending=False)
)
tot = tailcomp['rating'].sum()
# Cumulative share of all ratings covered when walking down the popularity ranking.
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
# Books reached before the cumulative share hits 95% are "Head", the rest "Tail".
tailcomp['category'] = tailcomp['popshare'].map(lambda share: 'Head' if share < 0.95 else "Tail")
tail = tailcomp[tailcomp['popshare'] >= 0.95]
tail

In [None]:
# Fetch the NLTK stop-word corpus; create_dictionary() reads it through
# nltk.corpus.stopwords to filter the vocabulary.
nltk.download('stopwords')

In [None]:
def get_words(message):
    '''Return the normalized list of words of a message string.
    The message is split on single space characters and every piece is
    lower-cased. Consecutive spaces therefore yield empty-string tokens.
    '''

    return [token.lower() for token in message.split(" ")]

In [None]:
def create_dictionary(messages, min_count=25):
    '''Create a dictionary mapping words to integer indices.
    messages: Iterable of message strings
    min_count: Minimum number of messages a word must appear in to be kept
               (defaults to 25, the threshold this notebook has always used)
    A word is kept when it occurs in at least min_count messages, is not an
    English stop word and is longer than one character. Indices are assigned
    consecutively from 0 in the order the words were first encountered.
    '''

    # Document frequency: each word is counted at most once per message.
    word_counts = collections.defaultdict(int)

    for message in messages:
        for word in set(get_words(message)):
            word_counts[word] += 1

    # Build the stop-word set once. The original called stopwords.words()
    # and scanned the returned list for every single vocabulary word.
    english_stopwords = set(stopwords.words('english'))

    resulting_dictionary={}

    for word, count in word_counts.items():
        if count >= min_count and word not in english_stopwords and len(word) > 1:
            resulting_dictionary[word] = len(resulting_dictionary)

    return resulting_dictionary

In [None]:
def transform_text(messages, word_dictionary):
    '''Transform a sequence of text messages into a word-count matrix.
    messages: Sequence of message strings
    word_dictionary: Mapping word -> column index
    Returns a float array of shape (len(messages), len(word_dictionary)) whose
    entry [i, j] is how often word j occurs in message i. Words that are not
    in the dictionary are ignored.
    '''

    counts = np.zeros((len(messages), len(word_dictionary)))

    for row, text in enumerate(messages):
        for token in get_words(text):
            column = word_dictionary.get(token)
            if column is not None:
                counts[row, column] += 1

    return counts

In [None]:
def fit_naive_bayes_model(matrix, labels):
    '''Fit a two-class multinomial naive Bayes model.
    matrix: Word-count array, one row per example and one column per word
    labels: 0/1 label per row (list, array or Series)
    Returns a dict with the log class priors ('logphi_0', 'logphi_1') and the
    per-word log likelihoods ('logtheta_0', 'logtheta_1').
    '''

    # Coerce to arrays so that boolean masking works for plain lists too:
    # with a list, `labels == 0` is a single False instead of a row mask.
    matrix = np.asarray(matrix, dtype=float)
    labels = np.asarray(labels)

    # With no examples there is no observed class balance; fall back to 0.5
    # instead of dividing by zero.
    positive_rate = labels.mean() if labels.size else 0.5

    model = {}

    # Shrink the observed positive rate towards 0.5 (95% data, 5% uniform) so
    # that neither prior is exactly 0 when a user has only one class.
    phi = positive_rate*0.95+0.05*0.5
    model['logphi_0'] = np.log(1.-phi)
    model['logphi_1'] = np.log(phi)

    # Per-class word counts with add-one smoothing, normalised to sum to 1.
    counts_0 = matrix[labels == 0].sum(axis=0) + 1
    counts_1 = matrix[labels == 1].sum(axis=0) + 1
    model['logtheta_0'] = np.log(counts_0 / counts_0.sum())
    model['logtheta_1'] = np.log(counts_1 / counts_1.sum())

    return model

In [None]:
def predict_from_naive_bayes_model(model, matrix):
    '''Use a naive Bayes model to score every row of a word-count matrix.
    model: Dict as returned by fit_naive_bayes_model
    matrix: Word-count array, one row per example and one column per word
    Returns a 1-D array with the posterior probability of class 1 for each
    row, i.e. values in [0, 1] where larger means "more likely class 1".
    '''

    matrix = np.asarray(matrix)

    # Log joint probability of each row with each class:
    # sum_j count_j * log(theta_j) + log(prior).
    logprobs_0 = matrix @ model['logtheta_0'] + model['logphi_0']
    logprobs_1 = matrix @ model['logtheta_1'] + model['logphi_1']

    # Posterior P(class 1 | row) = exp(l1) / (exp(l0) + exp(l1)), evaluated in
    # log space so long documents do not underflow. The previous score,
    # l1 / (l1 + l0), divided negative log-probabilities and therefore got
    # *smaller* as class 1 became more likely, inverting the ranking.
    return np.exp(logprobs_1 - np.logaddexp(logprobs_0, logprobs_1))

In [None]:
def get_top_five_naive_bayes_words(model, dictionary):
    '''Return the five words with the smallest logtheta_0 - logtheta_1.
    model: Dict holding the 'logtheta_0' and 'logtheta_1' arrays
    dictionary: Mapping word -> column index
    These are the words whose likelihood under class 1 is highest relative to
    class 0, listed from the most to the less indicative.
    '''

    log_ratio = model['logtheta_0'] - model['logtheta_1']
    index_to_word = dict(zip(dictionary.values(), dictionary.keys()))
    return [index_to_word[index] for index in np.argsort(log_ratio)[:5]]

In [None]:
import collections
# Books without a description fall back to their title.
finalbooks['book_desc'] = finalbooks['book_desc'].fillna(finalbooks['title'])
# Strip punctuation. regex=True must be explicit: since pandas 2.0 the default
# is regex=False, which would look for the literal text "[^\w\s]" and leave
# all punctuation in place.
finalbooks['book_desc'] = finalbooks['book_desc'].str.replace(r'[^\w\s]', "", regex=True)
# Second fallback (only reached when description and title are both missing).
finalbooks['book_desc'] = finalbooks['book_desc'].fillna(finalbooks['tag_cloud'])
# Anything still missing becomes an empty string so tokenisation cannot fail
# on a NaN value.
finalbooks['book_desc'] = finalbooks['book_desc'].fillna("")
# Hyphens in the tag cloud become spaces (plain, non-regex replacement).
finalbooks['tag_cloud'] = finalbooks['tag_cloud'].str.replace('-', " ", regex=False)
finalbooks['words'] = finalbooks['book_desc'] +" "+finalbooks['tag_cloud']+" "+finalbooks['authors']
# The vocabulary is built from the cleaned descriptions only.
dico = create_dictionary(finalbooks['book_desc'])
len(dico)

In [None]:
# Word-count matrix: one row per book in finalbooks, one column per word in dico.
A = transform_text(finalbooks['book_desc'], dico)
# Binary "liked" flags: 1 when the rating is at least 4, otherwise 0.
finalbooks['binary'] = finalbooks['average_rating'].ge(4).astype(int)
ratings['binary'] = ratings['rating'].ge(4).astype(int)

In [None]:
# Fit one naive Bayes model per user on that user's training ratings and score
# every book with it.
allpreds = []                        # one (newbookid, newuser_id, pred) frame per user
topwords = []                        # top-5 "liked" words of every user
indicators = np.zeros(len(dico))     # running sum of logtheta_0 - logtheta_1 over users
for i in range(15000):
    # User ids are 1-based.
    User = train.loc[train.newuser_id == i+1].sort_values('newbookid')
    User = User.assign(binary=(User['rating'] >= 4).astype(int))
    # NOTE(review): row newbookid-1 of A is taken as the features of that book,
    # which assumes finalbooks is ordered by newbookid starting at 1 - confirm.
    model = fit_naive_bayes_model(A[User['newbookid'].to_numpy()-1,:], User['binary'].to_numpy())
    result = predict_from_naive_bayes_model(model, A)
    UserRes = finalbooks.filter(['newbookid'])
    UserRes['newuser_id'] = i+1
    UserRes['pred'] = result
    # Collected in a list here; the frames are concatenated once in a later cell.
    allpreds.append(UserRes)
    indicators = indicators + (model['logtheta_0'] - model['logtheta_1'])
    # Record this user's top words inside the loop. Previously this ran once
    # after the loop, so only the last user's words were ever stored.
    top5 = get_top_five_naive_bayes_words(model, dico)
    topwords.append(top5)
    if (i+1)%1000 == 0:
        print("done: ", i+1)

In [None]:
# Rescale the accumulated log-ratio (a later cell reads the rescaled values).
# Re-running this cell rescales again.
indicators = 15000 * indicators
# Indices of the five largest summed (logtheta_0 - logtheta_1) values, i.e. the
# words that are relatively most probable under class 0 across users.
ids = np.argsort(-indicators)[:5]
reverse_dictionary = dict(zip(dico.values(), dico.keys()))
[reverse_dictionary[i] for i in ids]

In [None]:
# Display the five smallest indicator values, scaled by 1e12.
np.sort(indicators*1000000000000)[:5]

In [None]:
# Flatten the per-user top-5 lists and print every word with the number of
# times it appears, most frequent first.
from collections import Counter
fivewords = np.concatenate(topwords, axis=0)
for key, value in Counter(fivewords).most_common():
    print(f"{key}: {value}")

In [None]:
# Stack the per-user prediction frames row-wise into a single plain array;
# the column labels are not carried over and are re-attached in the next cell.
predictions = np.concatenate(allpreds, axis=0 )

In [None]:
# Re-attach column names to the stacked predictions.
bayes = pd.DataFrame(predictions, columns=['newbookid','newuser_id', 'pred'])
# Stacking int id columns together with the float 'pred' column turns the ids
# into floats. Cast them back (one column at a time, to avoid copying the whole
# frame) so that merges and string keys see 1 rather than 1.0.
# NOTE(review): assumes the ids contain no missing values - confirm.
bayes['newbookid'] = bayes['newbookid'].astype('int64')
bayes['newuser_id'] = bayes['newuser_id'].astype('int64')
bayes

In [None]:
# Attach the model score to every test rating, then order by user and score,
# both descending, so each user's best-scored book comes first.
bayesrank = (
    test.merge(bayes, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
)
bayesrank.head()

In [None]:
# Map the score linearly (x -> 4x + 1) so that it can be compared with ratings.
# This overwrites 'pred'; running the cell twice applies the mapping twice.
bayesrank['pred'] = 1 + 4 * bayesrank['pred']
bayesrank.head()

In [None]:
# Sanity check: smallest rescaled prediction among the test pairs.
bayesrank['pred'].min()

In [None]:
# Build a "<user>_<book>" string key for every (user, book) pair.
# Ids are cast to int first: a float id would be rendered as "1.0" and never
# match the same id stored as an int in the other frame.
# The separator keeps keys unambiguous (without it user 1 / book 23 and
# user 12 / book 3 would both become "123").
train['conc'] = train['newuser_id'].astype('int64').astype(str) + '_' + train['newbookid'].astype('int64').astype(str)
bayes['conc'] = bayes['newuser_id'].astype('int64').astype(str) + '_' + bayes['newbookid'].astype('int64').astype(str)

In [None]:
# Keep only the predictions whose (user, book) key does not occur in train.
seen_in_train = bayes['conc'].isin(train['conc'])
bayesfin = bayes.loc[~seen_in_train]
bayesfin.describe()

In [None]:
# True test ratings of every user, in the row order of bayesrank (sorted by
# descending prediction in an earlier cell).
# A single groupby pass replaces 15000 separate full-frame boolean scans;
# sort=False and in-group order preservation keep each list in frame order.
ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in bayesrank.groupby('newuser_id', sort=False)['rating']
}
# One entry per user id 1..15000; users without test rows get an empty list,
# exactly as the per-user filter produced.
bayeslist = [ratings_by_user.get(i + 1, []) for i in range(15000)]

In [None]:
# NDCG of every user, computed over that user's full list of test ratings.
b = np.array([ndcg_k(r, len(r)) for r in bayeslist])

# Draw on the Axes object explicitly instead of the implicit pyplot state.
facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5) #, log = True)
axes.set_title('Distribution of NDCG among Users for the Bayes model')
axes.set_xlabel('NDCG')
axes.set_ylabel('Number of users')
plt.show()

In [None]:
# Users whose NDCG is exactly 1. Every kept element equals 1, so the sum is
# the number of such users; dividing by 15000 gives their share.
d = b[b == 1]
sum(d)/15000

In [None]:
# Summary of the test-set metrics, each rounded to three decimals.
for label, value in (
    ('(1) Bayes Model RMSE: ', rmse(bayesrank['pred'],bayesrank['rating'])),
    ('(2) Bayes Model NDCG: ', mean_ndcg(bayeslist)),
    ("(3) Median NDCG: ", np.median(b)),
    ("(4) Share of NDCG =1 among Users: ", sum(d)/15000),
):
    print(label, np.round(value, decimals=3))

In [None]:
# Same pipeline as for the test set, applied to the training ratings: attach
# the score, order by user and score (descending), then map the score
# linearly with x -> 4x + 1.
bayesranktrain = (
    train.merge(bayes, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
    .assign(pred=lambda frame: frame['pred']*4+1)
)

In [None]:
# True training ratings of every user, in the row order of bayesranktrain
# (sorted by descending prediction in the previous cell).
# A single groupby pass replaces 15000 separate full-frame boolean scans;
# sort=False and in-group order preservation keep each list in frame order.
train_ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in bayesranktrain.groupby('newuser_id', sort=False)['rating']
}
# One entry per user id 1..15000; users without rows get an empty list.
bayeslisttrain = [train_ratings_by_user.get(i + 1, []) for i in range(15000)]

In [None]:
# Summary of the training-set metrics, each rounded to three decimals.
for label, value in (
    ('(1) Bayes Model Train RMSE: ', rmse(bayesranktrain['pred'],bayesranktrain['rating'])),
    ('(2) Bayes Model Train NDCG: ', mean_ndcg(bayeslisttrain)),
):
    print(label, np.round(value, decimals=3))