In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import math
import random
import nltk
import sklearn
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Load the two input tables from the working directory:
# book-level data and the user/book ratings.
finalbooks = pd.read_csv(filepath_or_buffer='finalbook.csv')
ratings = pd.read_csv(filepath_or_buffer='finalratings.csv')

In [None]:
# Hold out 20% of the ratings for testing, stratified on the user id column
# and seeded so the split is reproducible.
train, test = train_test_split(
    ratings,
    test_size=0.20,
    random_state=42,
    stratify=ratings['newuser_id'],
)

In [None]:
def dcg_k(r,k):
    '''Discounted Cumulative Gain (DCG) of the top-k results.

    r: True ratings in predicted rank order (1st element is top recommendation)
    k: Number of results to consider

    Returns the sum over the first k ratings of 2**rating / log2(position + 1),
    with positions counted from 1. An empty list scores 0.0.

    Note: the gain used here is 2**rating, not the more common 2**rating - 1,
    so a rating of 0 still contributes. It is kept unchanged so that existing
    scores stay comparable.
    '''

    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the drop-in replacement and works on every NumPy version.
    top_k = np.asarray(r, dtype=float)[:k]
    discounts = np.log2(np.arange(2, top_k.size + 2))
    return np.sum(2**top_k / discounts)

def ndcg_k(r,k):
    "Normalized DCG: DCG of r as ranked, divided by the DCG of r sorted best-first."

    ideal_order = sorted(r, reverse=True)
    best_possible = dcg_k(ideal_order, k)
    if best_possible:
        return dcg_k(r,k) / best_possible
    # Nothing to gain even in the ideal order (e.g. an empty list): score 0.
    return 0

def mean_ndcg(rs):
    '''Average NDCG across users.

    rs: Iterable with one entry per user: that user's true ratings in
        predicted rank order. Each list is scored over its full length.
    '''

    per_user_scores = []
    for user_ratings in rs:
        per_user_scores.append(ndcg_k(user_ratings, len(user_ratings)))
    return np.mean(per_user_scores)

In [None]:
def precision_k(r, k):
    '''Precision at k: share of the first k results that are relevant.

    r: Relevance flags in predicted rank order (1st element is top
       recommendation); any non-zero entry counts as relevant
    k: Cut-off, must be at least 1

    Raises ValueError when r holds fewer than k entries.
    '''

    assert k >= 1
    top_hits = np.asarray(r)[:k]
    # A short slice means the caller asked for more results than were ranked.
    if top_hits.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(top_hits != 0)

def average_precision(r):
    '''Average precision of a single ranked list.

    r: Relevance flags in predicted rank order (1st element is top
       recommendation); any non-zero entry counts as relevant

    Returns the mean of precision@k taken at every rank k that holds a
    relevant item, or 0 when the list is empty or has no relevant item.
    '''

    relevant = np.asarray(r) != 0
    hit_positions = np.flatnonzero(relevant)
    if hit_positions.size == 0:
        return 0
    # precision@(i+1) = (relevant items among the first i+1 results) / (i+1).
    # The running hit count yields every one of them in a single pass.
    hits_so_far = np.cumsum(relevant)[hit_positions]
    return np.mean(hits_so_far / (hit_positions + 1))

def mean_average_precision(rs):
    '''Mean of average_precision over an iterable of ranked relevance lists.'''

    per_list_scores = []
    for ranked_flags in rs:
        per_list_scores.append(average_precision(ranked_flags))
    return np.mean(per_list_scores)

In [None]:
def rmse(y,h):
    '''Root Mean Squared Error (RMSE).

    y: real y (numeric array-like: list, tuple, ndarray or Series)
    h: predicted y (same shape as y)

    Values are compared position by position. Raises ValueError when the
    two inputs differ in shape or are empty.
    '''

    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(h, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError('y and h must have the same shape')
    if actual.size == 0:
        raise ValueError('rmse is undefined for empty input')
    # np.mean keeps the reduction inside NumPy rather than iterating the
    # array element by element through the built-in sum().
    return np.sqrt(np.mean((actual - predicted) ** 2))

In [None]:
# Distribution of the no. of ratings per book
# value_counts() on the book id gives one number per book: how many rating
# rows it has. Drawing through the Axes object (instead of the implicit
# pyplot "current axes") and labelling both axes lets the figure stand alone.
facet, axes = plt.subplots(1,1,figsize=(20,6))
n, bins, patches = axes.hist(ratings['newbookid'].value_counts(), 200, facecolor='blue', alpha=0.5)
axes.set_title('Distribution of no. of reviews per book')
axes.set_xlabel('Number of ratings per book')
axes.set_ylabel('Number of books')
plt.show()

In [None]:
# Defining the tail
# Number of ratings per book, most-rated book first.
tailcomp = (
    ratings
    .groupby(by='newbookid', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values(by='rating', ascending=False)
)
tot = tailcomp['rating'].sum()
# Cumulative share of all ratings covered while walking down from the
# most-rated book.
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
# Books reached before the cumulative share hits 0.95 form the "Head";
# everything from that point on is the "Tail".
tailcomp['category'] = np.where(tailcomp['popshare'] < 0.95, 'Head', 'Tail')
tail = tailcomp.loc[tailcomp['popshare'] >= 0.95]
tail

## Baseline Model: Popularity Model

In [None]:
# Per-book popularity table, most-rated books first.
# The columns are selected with a list: the bare-tuple form
# groupby(...)['a', 'b'] was deprecated in pandas 1.0 and raises in pandas 2.0+.
# NOTE(review): .sum() also adds up average_rating, which is only a real
# average if finalbooks holds a single row per newbookid — confirm.
popular = (
    finalbooks
    .groupby('newbookid')[['ratings_count', 'average_rating']]
    .sum()
    .sort_values(by='ratings_count', ascending=False)
    .reset_index()
)
popular.head(10)

In [None]:
# Attach each book's popularity stats to the held-out ratings, then add:
#   pred     - the book's average rating rounded to a whole number
#   ratingYN - 1 when the user's true rating is 4 or higher, else 0
# Rows end up ordered by user id and, within a user, by ratings_count
# (both descending), i.e. most popular book first.
poprank = (
    test
    .merge(popular, on='newbookid')
    .assign(
        pred=lambda frame: np.round(frame['average_rating']),
        ratingYN=lambda frame: (frame['rating'] >= 4).astype('int64'),
    )
    .sort_values(by=['newuser_id', 'ratings_count'], ascending=False)
)
poprank.head(10)

In [None]:
# One list of true ratings per user id 1..15000, kept in poprank's row order
# (most popular book first within a user); ids with no rows in poprank get [].
# Grouping once replaces a boolean scan of the whole frame for every user,
# so the progress printing the old loop needed is gone.
# NOTE(review): the 1..15000 id range is carried over from the original loop —
# confirm it matches the number of users in the data.
ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in poprank.groupby('newuser_id', sort=False)['rating']
}
poplista = [ratings_by_user.get(user_id, []) for user_id in range(1, 15000 + 1)]

In [None]:
# poprank ordered by book popularity (descending) and grouped per user.
# The cell only shows the resulting GroupBy object.
top = (
    poprank
    .sort_values(by='ratings_count', ascending=False)
    .groupby(by='newuser_id')
)
top

In [None]:
# NDCG per user, each scored over that user's full list of held-out ratings.
b = np.array([ndcg_k(r, len(r)) for r in poplista])

# Draw through the Axes object and label both axes so the figure can be read
# without the surrounding code.
facet, axes = plt.subplots(1,1,figsize=(20,6))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5)
axes.set_title('Distribution of NDCG among users for the popularity model')
axes.set_xlabel('NDCG')
axes.set_ylabel('Number of users')
plt.show()

In [None]:
# Share of users whose NDCG is exactly 1.
# Every element of d equals 1, so counting them is the same as summing them.
# The denominator comes from b itself (one entry per user) instead of a
# hard-coded user count.
d = b[b == 1]
d.size / len(b)

In [None]:
# Headline numbers for the popularity baseline, each rounded to 3 decimals.
popularity_metrics = [
    ('(1) Popularity Model RMSE: ', rmse(poprank['rating'], poprank['average_rating'])),
    ('(2) Popularity Model NDCG: ', mean_ndcg(poplista)),
    ('(3) Median NDCG: ', np.median(b)),
    ('(4) Share of NDCG =1 among Users: ', sum(d)/15000),
]
for label, value in popularity_metrics:
    print(label, np.round(value, decimals=3))

In [None]:
# Popularity baseline evaluated on the training split.
# Two fixes relative to the earlier version of this cell:
#   * the frame is built from `train` (it was built from `test`);
#   * `pred` is derived from this frame's own average_rating column (it was
#     read from `poprank`, a different frame, and matched by row label).
popranktrain = train.merge(popular, on='newbookid')
popranktrain['pred'] = np.round(popranktrain['average_rating'])
# Put each user's rows in popularity order (most-rated book first), the same
# ordering used for `poprank`, so rating lists taken from this frame are in
# predicted rank order as the NDCG helpers expect.
popranktrain = popranktrain.sort_values(by=['newuser_id', 'ratings_count'], ascending=False)

In [None]:
# One list of true ratings per user id 1..15000, kept in popranktrain's row
# order; ids with no rows in popranktrain get [].
# Grouping once replaces a boolean scan of the whole frame for every user,
# so the progress printing the old loop needed is gone.
# NOTE(review): the 1..15000 id range is carried over from the original loop —
# confirm it matches the number of users in the data.
train_ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in popranktrain.groupby('newuser_id', sort=False)['rating']
}
poplisttrain = [train_ratings_by_user.get(user_id, []) for user_id in range(1, 15000 + 1)]

In [None]:
# Metrics computed from popranktrain / poplisttrain, rounded to 3 decimals.
train_metrics = [
    ('(1) Pop Train Model RMSE: ', rmse(popranktrain['pred'], popranktrain['rating'])),
    ('(2) Pop Train Model NDCG: ', mean_ndcg(poplisttrain)),
]
for label, value in train_metrics:
    print(label, np.round(value, decimals=3))

In [None]:
# Show the training part of the ratings split as the cell output.
train

In [None]:
# Show the held-out (test) part of the ratings split as the cell output.
test

In [None]:
# Persist both splits to the working directory, leaving out the row index.
train.to_csv(path_or_buf='train.csv', index=False)
test.to_csv(path_or_buf='test.csv', index=False)