In [None]:
import collections
import math
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [None]:
# Read the four CSV inputs (paths are relative to the working directory).
# finalbooks: book table; later cells read its title, book_desc, tag_cloud,
#             authors and newbookid columns.
# ratings:    rating table; later grouped by newbookid to measure popularity.
# train/test: rating splits; later cells read newuser_id, newbookid and rating.
finalbooks = pd.read_csv('finalbook.csv')
ratings = pd.read_csv('finalratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
def dcg_k(r,k):
    '''Discounted Cumulative Gain (DCG) of the first k results.

    r: True ratings in predicted rank order (1st element is the top recommendation)
    k: Number of results to consider

    Returns sum_i 2**r_i / log2(i + 1) over ranks i = 1..min(k, len(r));
    0.0 when r is empty.
    '''

    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is the equivalent.
    r = np.asarray(r, dtype=float)[:k]
    # Rank i (1-based) is discounted by log2(i + 1), hence arange starts at 2.
    dcg = np.sum(2**r / np.log2(np.arange(2, r.size + 2)))
    return dcg

def ndcg_k(r,k):
    '''Normalized Discounted Cumulative Gain (NDCG).

    r: True ratings in predicted rank order
    k: Number of results to consider

    Returns DCG of r divided by the DCG of r sorted best-first, or 0 when
    that ideal DCG is zero.
    '''

    ideal = dcg_k(sorted(r, reverse=True), k)
    return dcg_k(r, k) / ideal if ideal else 0

def mean_ndcg(rs):
    '''Mean NDCG over all users.

    rs: Iterable with one entry per user: that user's true ratings in
        predicted rank order. Each user's NDCG is taken over the full list.
    '''

    per_user = []
    for ranked in rs:
        per_user.append(ndcg_k(ranked, len(ranked)))
    return np.mean(per_user)

In [None]:
def rmse(y,h):
    '''Root Mean Squared Error (RMSE).

    y: observed values
    h: predicted values (must support element-wise y - h)
    '''

    residuals = y - h
    squared_total = sum(residuals**2)
    return np.sqrt(squared_total / len(residuals))

In [None]:
# Define the long tail: books ranked by number of ratings (most rated first),
# with the cumulative share of all ratings they account for.
tailcomp = (
    ratings
    .groupby(by='newbookid', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values(by='rating', ascending=False)
)
tot = sum(tailcomp['rating'])
# Cumulative share of ratings, accumulated from the most rated book down.
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
# Books below the 95% cumulative share are "Head", the rest are "Tail".
tailcomp['category'] = ['Head' if share < 0.95 else "Tail" for share in tailcomp['popshare']]
tail = tailcomp.loc[tailcomp.popshare >= 0.95]
tail

In [None]:
def get_words(message):
    '''Return the lower-cased tokens of a message string.

    The message is split on single space characters only, so consecutive
    spaces produce empty-string tokens and other whitespace is not a separator.
    '''

    return [token.lower() for token in message.split(" ")]

In [None]:
def create_dictionary(messages, min_count=25):
    '''Create a dictionary mapping words to integer indices.

    A word is kept when it appears in at least `min_count` messages (each
    message counts a word once), is not an NLTK English stopword and is
    longer than one character.

    messages: iterable of strings
    min_count: minimum number of messages a word must appear in (default 25)

    Returns a dict {word: index} with indices 0..len(dict)-1.
    '''

    word_counts = collections.defaultdict(int)

    for message in messages:
        # set() so a word repeated inside one message is counted once.
        for word in set(get_words(message)):
            word_counts[word] += 1

    # Load the stopword list once; previously it was re-read for every candidate word.
    english_stopwords = set(stopwords.words('english'))

    resulting_dictionary = {}

    # Sorted so the word -> index assignment does not depend on set iteration
    # order, which may differ from one kernel run to the next.
    for word in sorted(word_counts):
        if word_counts[word] >= min_count and word not in english_stopwords and len(word) > 1:
            resulting_dictionary[word] = len(resulting_dictionary)

    return resulting_dictionary

In [None]:
def transform_text(messages, word_dictionary):
    '''Turn text messages into a word-count matrix.

    messages: sized iterable of strings
    word_dictionary: dict mapping word -> column index

    Returns a float array of shape (len(messages), len(word_dictionary)) whose
    entry [i, j] is how often word j occurs in message i. Words missing from
    the dictionary are ignored.
    '''

    counts = np.zeros((len(messages), len(word_dictionary)))

    for row, message in enumerate(messages):
        for token in get_words(message):
            col = word_dictionary.get(token)
            if col is not None:
                counts[row, col] += 1

    return counts

In [None]:
# Build the text fields used for TF-IDF and the vocabulary.
# Missing descriptions fall back to the book title.
finalbooks['book_desc'] = finalbooks['book_desc'].fillna(finalbooks['title'])
# Strip punctuation. regex=True is required: since pandas 2.0, str.replace treats
# the pattern as a literal string by default, which would leave punctuation untouched.
finalbooks['book_desc'] = finalbooks['book_desc'].str.replace(r'[^\w\s]', "", regex=True)
# Anything still missing falls back to the tag cloud.
finalbooks['book_desc'] = finalbooks['book_desc'].fillna(finalbooks['tag_cloud'])
# Tags are hyphen-joined; turn hyphens into spaces so get_words can split them.
finalbooks['tag_cloud'] = finalbooks['tag_cloud'].str.replace('-', " ", regex=False)
finalbooks['words'] = finalbooks['book_desc'] +" "+finalbooks['tag_cloud']+" "+finalbooks['authors']
# NOTE(review): the vocabulary is built from book_desc only, while the count
# matrix is later built from 'words' — confirm this is intended.
dico = create_dictionary(finalbooks['book_desc'])

In [None]:
# Vocabulary size: number of words kept by create_dictionary.
len(dico)

In [None]:
# Word-count matrix: one row per entry of finalbooks['words'], one column per
# dictionary word.
A = transform_text(finalbooks['words'], dico)

In [None]:
# Number of rows of A.
np.size(A, 0)

In [None]:
# Smallest row total of A; a value of 0 would mean some row contains no
# dictionary word at all.
np.sum(A, axis=1).min()

In [None]:
# Document frequency: for each word, the number of rows of A in which it occurs.
A1 = np.sum((A>0), axis= 0)

# Inverse document frequency: log(number of rows / document frequency).
IDF = np.log(np.size(A, 0)/A1)
# NOTE(review): this bare expression is not the last one in the cell, so it
# displays nothing; only len(IDF) is shown.
IDF
len(IDF)

In [None]:
# Term frequency: each row of A divided by its total count.
row_totals = np.sum(A, axis=1, keepdims=True)
# Rows with a zero total keep TF = 0 instead of 0/0 = NaN, which would otherwise
# propagate into every similarity computed from that row.
TF = np.divide(A, row_totals, out=np.zeros_like(A), where=row_totals > 0)
np.shape(TF)

In [None]:
# Weight each term frequency by its word's IDF (IDF is broadcast across rows).
TFiDF= TF*IDF
np.shape(TFiDF)

In [None]:
# Scale each row by sqrt(sum of squares + 0.01). The 0.01 keeps the divisor
# non-zero for all-zero rows, so rows end up slightly shorter than unit length.
# NOTE(review): re-running this cell rescales the already-scaled matrix.
TFiDF = TFiDF / np.sqrt((np.sum(TFiDF**2, axis = 1, keepdims=True)+0.01))

In [None]:
# Pairwise dot products between the rows of TFiDF: a square, symmetric
# row-by-row similarity matrix.
SimC = np.dot(TFiDF, TFiDF.T)
SimC

In [None]:
# Largest row sum of the similarity matrix.
np.sum(SimC, axis = 1).max()

In [None]:
# Overwrite the diagonal in place so that every row has similarity 1 with itself.
np.fill_diagonal(SimC, 1)
SimC

In [None]:
# The similarity matrix as a DataFrame labelled by book title on both axes.
# NOTE(review): labels are only unambiguous if titles are unique — confirm.
BookSim =pd.DataFrame(SimC, columns=finalbooks.title, index=finalbooks.title)

In [None]:
# Titles at the six positions with the highest similarity to row 15
# (argsort is ascending, so the most similar comes last).
# NOTE(review): indexing a Series with an integer array is label-based; it
# matches positions only while finalbooks keeps its default 0..n-1 index.
finalbooks.title[np.argsort(SimC[15, :])[-6:]]

In [None]:
# Similarity of row 15 to six hard-coded rows.
# NOTE(review): presumably these indices were copied from the previous cell's
# output; they go stale if the data or vocabulary changes — confirm.
SimC[15, [15, 2252, 6977, 4642, 2796, 1700 ]]

In [None]:
# Rows of finalbooks whose newbookid equals (position + 1) for the six most
# similar positions. Assumes newbookid is the row position plus one — TODO confirm.
finalbooks [finalbooks.newbookid.isin(np.argsort(SimC[15, :])[-6:]+1)]

In [None]:
# Create the figure; sns.heatmap below draws on the current axes, which is `ax`.
f, ax = plt.subplots()

# Diverging colour map (hues 20 and 220) with 20000 colour steps.
cmap =sns.diverging_palette(20, 220, n=20000)

# Heatmap of the full title-by-title similarity matrix, colours centred on 0.
# NOTE(review): one cell is drawn per pair of books, each with a 0.5pt border;
# this may be very slow and hard to read for a large catalogue.
sns.heatmap(BookSim, cmap=cmap,center = 0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Predict, for every user id 1..15000, a score for every book as the
# similarity-weighted mean of the ratings that user gave in train.
allpreds = []
train = train.sort_values(by=['newbookid'])
for i in range(15000):
    # Select the user's training rows once, so book indices and ratings come
    # from the same rows and are guaranteed to line up.
    user_rows = train[train.newuser_id == i+1]
    # 0-based columns of SimC for the rated books. Assumes newbookid is 1-based
    # and follows the row order of finalbooks — TODO confirm.
    bi = user_rows.newbookid - 1
    Simi = SimC[:, bi]
    ri = np.array(user_rows.rating)
    predi = finalbooks.filter(['newbookid'])
    # The 0.01 keeps the denominator non-zero; a user with no training rows
    # therefore gets 0 for every book.
    predi['pred'] = np.sum(Simi*ri, axis=1)/(np.sum(Simi, axis=1)+0.01)
    predi['newuser_id'] = i+1
    allpreds.append(predi)
    if (i+1)%1000 == 0:
        print("done: ", i+1)

In [None]:
# Stack the per-user frames row-wise, in user order. np.concatenate turns the
# DataFrames into one array, so column names are dropped and all three columns
# share a single dtype.
predictions = np.concatenate(allpreds, axis=0 )

In [None]:
# Re-label the stacked predictions. `predictions` is a single-dtype array (the
# float `pred` column forces the id columns to float as well), so the ids are
# cast back to integers to compare and print like ordinary ids.
final = pd.DataFrame(predictions, columns=['newbookid', 'pred', 'newuser_id']).astype(
    {'newbookid': 'int64', 'newuser_id': 'int64'}
)

In [None]:
# Key every (user, book) pair so books a user already rated in train can be
# removed from that user's predictions.
# - Ids are cast to int before str(): `final` is built from a float array, so
#   its ids would otherwise render as "1.0" and never match train's "1".
# - The "_" separator keeps distinct pairs distinct (without it, user 1 / book 23
#   and user 12 / book 3 both give "123").
train['conc'] = train['newuser_id'].astype('int64').astype(str) + '_' + train['newbookid'].astype('int64').astype(str)
final['conc'] = final['newuser_id'].astype('int64').astype(str) + '_' + final['newbookid'].astype('int64').astype(str)
finalfin = final[~final.conc.isin(train.conc)]
finalfin.describe()

In [None]:
# Attach the predicted score to each test rating (merge defaults to an inner
# join, so test rows without a prediction are dropped).
finalrank = test.merge(final,on = ['newbookid', 'newuser_id'])
# Descending by user, then by predicted score: within each user the rows are in
# predicted rank order, best first.
finalrank = finalrank.sort_values(by=['newuser_id', 'pred'], ascending=False)
finalrank.head(5)

In [None]:
# For each user id 1..15000, collect the true test ratings in predicted rank
# order (an empty list when the user has no rows in finalrank).
# NOTE(review): every iteration scans the whole of finalrank.
finallist = []
for i in range(15000):
    a = finalrank.loc[finalrank.newuser_id == i+1]['rating'].tolist()
    finallist.append(a)
    # Progress message every 1000 users.
    if (i+1)%1000 == 0:
        print("done: ", i+1)

In [None]:
# Per-user NDCG, evaluated over each user's full list of test ratings.
b = np.array([ndcg_k(r, len(r)) for r in finallist])

# Histogram (200 bins) of the per-user NDCG values, drawn on the axes created here.
facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5)
axes.set_title('Distribution of NDCG among Users for the TFiDF model')
plt.show()

In [None]:
# Users whose NDCG is exactly 1. Every element of d equals 1, so sum(d) is
# their count and the result is their share among the 15000 users.
d = b[b == 1]
sum(d)/15000

In [None]:
# Rank all candidate predictions once, then take each user's 10 and 50 best.
ranked = finalfin.sort_values('pred',ascending = False)
top10 = ranked.groupby('newuser_id').head(10)
top50 = ranked.groupby('newuser_id').head(50)

print('(1) TF-iDF Model RMSE: ', np.round(rmse(finalrank['pred'],finalrank['rating']), decimals=3))
print('(2) TF-iDF Model NDCG: ', np.round(mean_ndcg(finallist), decimals=3))
print("(3) Median NDCG: ", np.round(np.median(b), decimals=3))
print("(4) Share of NDCG =1 among Users: ", np.round(sum(d)/15000, decimals=3))
# Div@k: share of the recommended books that belong to the long tail.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
print('(5) TF-iDF Model Div10 Score: ',np.round(sum(np.isin(top10.newbookid, tail.newbookid))/len(top10), decimals=3))
print('(6) TF-iDF Model Div50 Score: ',np.round(sum(np.isin(top50.newbookid, tail.newbookid))/len(top50), decimals=3))

In [None]:
# Same ranking table as for test, built on the training ratings: attach the
# predicted score (inner join), then sort descending by user and by score.
finalranktrain = train.merge(final,on = ['newbookid', 'newuser_id'])
finalranktrain = finalranktrain.sort_values(by=['newuser_id', 'pred'], ascending=False)

# NOTE(review): this displays the whole frame; .head() would be enough.
finalranktrain

In [None]:
# For each user id 1..15000, collect the true training ratings in predicted
# rank order (an empty list when the user has no rows in finalranktrain).
# NOTE(review): every iteration scans the whole of finalranktrain.
finallisttrain = []
for i in range(15000):
    a = finalranktrain.loc[finalranktrain.newuser_id == i+1]['rating'].tolist()
    finallisttrain.append(a)
    # Progress message every 1000 users.
    if (i+1)%1000 == 0:
        print("done: ", i+1)

In [None]:
# Training-set counterparts of the RMSE and mean NDCG reported for the test set.
print('(1) TF-iDF Train Model RMSE: ', np.round(rmse(finalranktrain['pred'],finalranktrain['rating']), decimals=3))
print('(2) TF-iDF Train Model NDCG: ', np.round(mean_ndcg(finallisttrain), decimals=3))