In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import math
import random
import nltk
import sklearn
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [None]:
# Read the four CSV inputs from the working directory, one DataFrame per file,
# in the same order as the names on the left-hand side.
finalbooks, ratings, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('finalbook.csv', 'finalratings.csv', 'train.csv', 'test.csv')
)

In [None]:
def dcg_k(r,k):
    '''Discounted Cumulative Gain (DCG) of the first k results.

    r: True ratings in predicted rank order (1st element is the top recommendation)
    k: Number of results to consider

    The item at 1-based rank i contributes 2**rating / log2(i + 1).
    Returns 0.0 when there is nothing to score.
    '''

    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float
    # dtype performs the same conversion on every NumPy version.
    gains = np.asarray(r, dtype=float)[:k]
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(2**gains / discounts)

def ndcg_k(r,k):
    "Normalized DCG: the DCG of r divided by the DCG of r sorted best-first."

    ideal_order = sorted(r, reverse=True)
    best_possible = dcg_k(ideal_order, k)
    if best_possible:
        return dcg_k(r, k) / best_possible
    # A zero ideal DCG (e.g. nothing to rank) scores 0.
    return 0

def mean_ndcg(rs):
    '''Average NDCG over all users.

    rs: iterable with one entry per user, each entry holding that user's
        true ratings in predicted rank order
    '''

    per_user_scores = []
    for user_ratings in rs:
        # Every user is scored over their complete list.
        per_user_scores.append(ndcg_k(user_ratings, len(user_ratings)))
    return np.mean(per_user_scores)

In [None]:
def rmse(y,h):
    '''Root Mean Squared Error (RMSE).

    y: real y
    h: predicted y

    Both inputs are converted to float arrays and compared position by
    position, so lists, arrays and pandas Series are all accepted and a pair
    of Series is never re-aligned on index labels. The mean is taken along
    the first axis.

    Raises ValueError when there are no observations.
    '''

    residuals = np.atleast_1d(np.asarray(y, dtype=float) - np.asarray(h, dtype=float))
    if residuals.shape[0] == 0:
        raise ValueError("rmse() requires at least one observation")
    return np.sqrt(np.mean(residuals**2, axis=0))

In [None]:
# MATRIX FACTORIZATION
def new_R(data, U, B):
    '''Predict a rating for every (user, book) row of data.

    data: DataFrame with 1-based integer columns newuser_id and newbookid
    U: user factor matrix; row newuser_id - 1 holds a user's factors
    B: book factor matrix; column newbookid - 1 holds a book's factors

    Returns a 1-D array whose i-th entry is the dot product of the user and
    book factors for the i-th row of data (positional order).
    '''

    # Work on positional arrays so the result does not depend on data's index
    # labels (label lookups only work when the index is exactly 0..n-1).
    user_rows = data.newuser_id.to_numpy() - 1
    book_cols = data.newbookid.to_numpy() - 1
    # Row-wise dot products in one vectorised pass instead of a Python loop.
    return np.sum(U[user_rows, :] * B[:, book_cols].T, axis=1)

In [None]:
## Alternating Least Squares (ALS)

def ALS(train, k, lamu = 0.1, lamb = 0.1, stop = 0.0001, max_iter = 24):
    '''Factorise the ratings in train with Alternating Least Squares.

    train: DataFrame with 1-based newuser_id / newbookid columns and a rating column
    k: number of latent factors
    lamu: ridge penalty added to the normal equations of each user
    lamb: ridge penalty added to the normal equations of each book
    stop: keep sweeping only while the book half-step lowers the training
        RMSE by more than this amount
    max_iter: upper bound on the step counter, which advances twice per sweep
        (once after the user update, once after the book update)

    Returns a dict with keys 'rms' (training RMSE recorded after every
    half-step), 'U' (max user id x k) and 'B' (k x max book id).
    '''
    users = np.unique(train.newuser_id)
    books = np.unique(train.newbookid)

    # Factor matrices are sized by the largest id so that id - 1 is always a
    # valid row/column; every entry starts at 1 / sqrt(k).
    U = np.ones((max(users), k)) / np.sqrt(k)
    B = np.ones((k, max(books))) / np.sqrt(k)

    # Group the row positions of each user and each book once, instead of
    # building a boolean mask over the whole frame per id on every sweep.
    user_ids = train.newuser_id.to_numpy()
    book_ids = train.newbookid.to_numpy()
    observed = train.rating.to_numpy()
    rows_of_user = train.groupby('newuser_id').indices
    rows_of_book = train.groupby('newbookid').indices

    ridge_u = lamu * np.identity(k)
    ridge_b = lamb * np.identity(k)

    step = 1
    RMSE = 3
    dRMSE = 1
    rms = []

    while (dRMSE > stop) and (step < max_iter):
        # User half-step: with B fixed, solve a k x k ridge system per user.
        for u, rows in rows_of_user.items():
            sub_B = B[:, book_ids[rows] - 1]
            Ai = sub_B @ sub_B.T + ridge_u
            Vi = sub_B @ observed[rows]
            U[u - 1, :] = np.linalg.pinv(Ai) @ Vi

        nR = new_R(train, U, B)
        RMSE = rmse(nR, train.rating)
        rms.append(RMSE)
        step += 1
        print("step: ", step)

        # Book half-step: with U fixed, solve a k x k ridge system per book.
        for b, rows in rows_of_book.items():
            sub_U = U[user_ids[rows] - 1, :]
            Ai = sub_U.T @ sub_U + ridge_b
            Vi = sub_U.T @ observed[rows]
            B[:, b - 1] = np.linalg.pinv(Ai) @ Vi

        nR = new_R(train, U, B)
        new_RMSE = rmse(nR, train.rating)
        # Only the improvement achieved by the book half-step drives the
        # stopping rule.
        dRMSE = RMSE - new_RMSE
        RMSE = new_RMSE
        print("step: ", step)
        rms.append(RMSE)
        step += 1

    return {'rms': rms, 'U': U, 'B': B}

In [None]:
# Hold out 20% of train for validation, stratified on newuser_id, and give
# both parts a fresh 0..n-1 index.
traint, traincv = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train, test_size=0.20, stratify=train['newuser_id'], random_state=42
    )
)

In [None]:
# Grid search over the ALS penalties. Only betab varies here, which is why
# `ks` records the betab value of each run.
ks = []
trains = []
cvs = []
ndgs = []

for k in [3]:
    for alphau in [0.125]:
        for betab in [0.075, 0.1, 0.2, 1]:
            print("running for... alphau = ", alphau, " and betab = ", betab)
            w = ALS(traint, k, alphau, betab)
            CVpred = new_R(traincv, w['U'], w['B'])
            RMSE_CV = np.sqrt(np.mean((CVpred - traincv.rating) ** 2))

            # NDCG over the whole validation set treated as one ranked list;
            # sort by predicted score once and reuse the ordered ratings.
            ranked = traincv.filter(['rating']).assign(pred=CVpred)
            ranked_ratings = ranked.sort_values(by=['pred'], ascending=False).rating
            ndgcv = ndcg_k(ranked_ratings, len(ranked_ratings))

            ndgs.append(ndgcv)
            ks.append(betab)
            trains.append(w['rms'][-1])
            cvs.append(RMSE_CV)

            print("done for: k= ", k, "alphau= ", alphau, "betab= ", betab)
            print("RMSEtrain: ", w['rms'][-1])
            print("RMSECV: ", RMSE_CV)
            print("NDG: ", ndgcv)
            print ("w rms: ", w['rms'])

In [None]:
# Grid-search history, one list per line: the betab values tried, the final
# training RMSE of each run, and the validation RMSE of each run.
print(ks, trains, cvs, sep='\n')

In [None]:
# Fit the final model on the full training set and score every user/book pair.
w = ALS(train,  3, 0.1, 0.1)
R = w['U'].dot(w['B'])
# Row-major flattening: all books of user 1 first, then user 2, and so on.
rflat = R.ravel()

# R[u - 1, b - 1] is the score of user u for book b, so build the id grids
# from R's own shape. This keeps them aligned with rflat by construction,
# without hard-coded user/book counts or a dependency on the row order of
# another table.
user_count, book_count = R.shape
testy = np.repeat(np.arange(1, user_count + 1), book_count)
booky = np.tile(np.arange(1, book_count + 1), user_count)
booky

In [None]:
# Put the user-id column in ascending order (sorted in place) — presumably so
# each user's run of ids matches that user's row in the flattened score matrix.
testy.sort()
testy

In [None]:
# Build the frame column by column so the two id columns keep their own dtype;
# np.column_stack would first copy all three arrays into one float64 matrix.
predictions = pd.DataFrame({'newuser_id': testy, 'newbookid': booky, 'pred': rflat})
predictions

In [None]:
## DEFINING THE TAIL
# Number of ratings per book, most-rated books first.
tailcomp = (
    ratings.groupby(by='newbookid', as_index=False)
    .agg({'rating': 'count'})
    .sort_values(by='rating', ascending=False)
)

# Cumulative share of all ratings covered when walking down that ordering.
tot = tailcomp['rating'].sum()
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
tailcomp['category'] = np.where(tailcomp['popshare'] < 0.95, 'Head', 'Tail')

# Books from the point where the cumulative share reaches 95% onwards.
tail = tailcomp[tailcomp['popshare'] >= 0.95]
tail

In [None]:
# Attach the model score to every test rating that has a prediction, then
# order the rows by user id and score, both descending.
mfrank = (
    pd.merge(test, predictions, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
)
mfrank.head(5)

In [None]:
# Keep only the predictions whose (user, book) pair does not occur in train.
# The pairs are matched with an anti-join on both id columns: gluing the ids
# into one string is ambiguous (user 1 + book 23 and user 12 + book 3 both
# give "123"), and it would also add a helper column to train.
train_pairs = train[['newuser_id', 'newbookid']].drop_duplicates()
pair_origin = predictions[['newuser_id', 'newbookid']].merge(
    train_pairs, on=['newuser_id', 'newbookid'], how='left', indicator=True
)['_merge']
# A left join against unique pairs keeps one output row per prediction, in
# order, so the flag can be applied positionally.
predfin = predictions[(pair_origin == 'left_only').to_numpy()]
predfin.describe()

In [None]:
# One list of true ratings per user id 1..15000, in the row order of mfrank;
# position i holds user id i + 1, and ids with no rows get an empty list.
# A single groupby replaces one full-frame boolean mask per user.
ratings_by_user = {
    uid: user_ratings.tolist()
    for uid, user_ratings in mfrank.groupby('newuser_id', sort=False)['rating']
}
mflist = [ratings_by_user.get(uid, []) for uid in range(1, 15000 + 1)]
# Preview only: echoing all 15000 lists floods the cell output.
mflist[:5]

In [None]:
# NDCG of every user, each scored over that user's complete rating list.
b = np.array([ndcg_k(r, len(r)) for r in mflist])

# Draw on the explicit axes object so the figure is fully labelled.
facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5)
axes.set_title('Distribution of NDCG among Users for the MF model')
axes.set_xlabel('NDCG')
axes.set_ylabel('Number of users')
plt.show()

In [None]:
# Scores that are exactly 1; since each kept value is 1, their total divided
# by 15000 is the share of such scores among 15000 entries.
d = b[np.equal(b, 1)]
np.sum(d) / 15000

In [None]:
# Rank all predictions once (the frame is large) and take each user's
# highest-scored 10 and 50 rows from the same ordering.
by_score = predictions.sort_values('pred', ascending=False)
top10 = by_score.groupby('newuser_id').head(10)
top50 = by_score.groupby('newuser_id').head(50)

# Fraction of the recommended rows whose book is in the tail set. np.isin
# replaces the deprecated np.in1d.
div10 = np.isin(top10.newbookid, tail.newbookid).mean()
div50 = np.isin(top50.newbookid, tail.newbookid).mean()

print('(1) MF Model RMSE: ', np.round(rmse(mfrank['pred'],mfrank['rating']), decimals=3))
print('(2) MF Model NDCG: ', np.round(mean_ndcg(mflist), decimals=3))
print("(3) Median NDCG: ", np.round(np.median(b), decimals=3))
print("(4) Share of NDCG =1 among Users: ", np.round(sum(d)/15000, decimals=3))
print('(5) MF Model Div10 Score: ',np.round(div10, decimals=3))
print('(6) MF Model Div50 Score: ',np.round(div50, decimals=3))

In [None]:
# Attach the model score to every training rating that has a prediction, then
# order the rows by user id and score, both descending.
mfranktrain = (
    pd.merge(train, predictions, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
)

In [None]:
# One list of true training ratings per user id 1..15000, in the row order of
# mfranktrain; position i holds user id i + 1, and ids with no rows get an
# empty list. A single groupby replaces one full-frame boolean mask per user.
train_ratings_by_user = {
    uid: user_ratings.tolist()
    for uid, user_ratings in mfranktrain.groupby('newuser_id', sort=False)['rating']
}
mflisttrain = [train_ratings_by_user.get(uid, []) for uid in range(1, 15000 + 1)]
# Preview only: echoing all 15000 lists floods the cell output.
mflisttrain[:5]

In [None]:
# RMSE and mean per-user NDCG of the model on the training rows.
train_rmse = rmse(mfranktrain['pred'], mfranktrain['rating'])
train_ndcg = mean_ndcg(mflisttrain)
print('(1) MF Train Model RMSE: ', np.round(train_rmse, decimals=3))
print('(2) MF Train Model NDCG: ', np.round(train_ndcg, decimals=3))