In [34]:
import numpy as np
import pandas as pd
import math
import time

##### Load the full dataset

In [3]:
data = pd.read_csv('data/ml-100k/u.data',sep='\t', header=None, names=['userId', 'itemId', 'rating','timestamp'])

In [9]:
movie_info = pd.read_csv('data/ml-100k/u.item',sep='|', 
                         header=None,
                         index_col = False,
                         names=['itemId', 'title'],
                         usecols = [0,1],
                         encoding = "ISO-8859-1")

In [13]:
df = pd.merge(data, movie_info, left_on='itemId', right_on='itemId')

##### Load the train/test splitted dataset

In [35]:
#load the data split
train_1 = pd.read_csv('data/ml-100k/ua.base',sep='\t', header=None, names=['userId', 'itemId', 'rating','timestamp'])
test_1 = pd.read_csv('data/ml-100k/ua.test',sep='\t', header=None, names=['userId', 'itemId', 'rating','timestamp'])
train_2 = pd.read_csv('data/ml-100k/ub.base',sep='\t', header=None, names=['userId', 'itemId', 'rating','timestamp'])
test_2 = pd.read_csv('data/ml-100k/ub.test',sep='\t', header=None, names=['userId', 'itemId', 'rating','timestamp'])

In [55]:
#check how many ratings we have in the train and test splits
train_ratings = train_1.userId.count()
test_ratings = test_1.userId.count()
train_frac = train_ratings/(train_ratings+test_ratings)*100
print(f'Train/test split is: {train_frac, 100-train_frac}')

Train/test split is: (90.57, 9.430000000000007)


In [50]:
#look at the composition of the train set for a specific user
test_1[test_1.userId==4]

Unnamed: 0,userId,itemId,rating,timestamp
30,4,50,5,892003526
31,4,260,4,892004275
32,4,264,3,892004275
33,4,288,4,892001445
34,4,294,5,892004409
35,4,303,5,892002352
36,4,354,5,892002353
37,4,356,3,892003459
38,4,357,4,892003525
39,4,361,5,892002353


In [51]:
#look at the composition of the test set for a specific user
train_1[train_1.userId==4]

Unnamed: 0,userId,itemId,rating,timestamp
358,4,11,4,892004520
359,4,210,3,892003374
360,4,258,5,892001374
361,4,271,4,892001690
362,4,300,5,892001445
363,4,301,5,892002353
364,4,324,5,892002353
365,4,327,5,892002352
366,4,328,3,892001537
367,4,329,5,892002352


In [73]:
#convert the rating dataframe into the Rating train and test matrix
R_train1 = pd.pivot_table(train_1, values='rating', index='userId', columns='itemId')
R_test1 = pd.pivot_table(test_1, values='rating', index='userId', columns='itemId')

In [101]:
#Check the structure of the rating matrix R
R_train1.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [75]:
R_train1.shape

(943, 1680)

In [76]:
R_test1.shape

(943, 1129)

In [99]:
#Cerate the implicit feedback matrix
F = R_train1.copy()

for i in F.index:
    for j in F.columns:
        if np.isnan(F.loc[i,j]) == True:
            F.loc[i,j] = 0
        elif np.isnan(F.loc[i,j]) == False:
            F.loc[i,j] = 1
        else: continue

In [100]:
F.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
print(np.mean(np.nanmean(R_train1, axis=0)))
print(np.mean(np.nanmean(R_train1, axis=1)))

3.06679224254575
3.5877486062026698


In [91]:
#Calculate the global mean (bias)
count = 0
r_sum = 0

for i in R_train1.index:
    for j in R_train1.columns:
        if np.isnan(R_train1.loc[i,j]) == False:
            count += 1
            r_sum += R_train1.loc[i,j]
        else: continue

mu = r_sum/count

In [98]:
#Print the global mean
print(f'The global mean/bias for our matrix is: {mu}')

The global mean/bias for our matrix is: 3.5238268742409184


In [276]:
def SVDplusplus(R, F, mu, factors, steps, lrn_rate, regular, lrn_rate_bias, regular_bias):
    
    start = time.time()
    #get the dimentions of R
    n = len(R.index)
    m = len(R.columns)
    
    #initialize the user, item and global bias
    Bu = pd.DataFrame(np.random.rand(n), index=R.index) 
    Bi = pd.DataFrame(np.random.rand(m), index=R.columns)
    mu = mu
    
    #initialize the user-factor, item-factor and feedback-factor matrices
    P = pd.DataFrame(np.random.rand(n,factors), index=R.index)
    Q = pd.DataFrame(np.random.rand(m,factors), index=R.columns)
    Y = pd.DataFrame(np.random.rand(m,factors), index=R.columns)
    
    #precalculate implicit feedback metrics
    y_norm = pd.DataFrame(pow(np.sum(F, axis=1), -1/2), index=F.index)
    
    loss_total = []
    
    for s in range(steps):
        print(f'Iteration {s} started')
        for i in R.index:
            for j in R.columns:
                if np.isnan(R.loc[i,j]) == False:
                    
                    #print our vars for debugging purposes
                    print(f'''User: {i}, Item: {j}, Rating: {R.loc[i,j]},
                          mu: {mu}  \n,
                          Bu: {Bu.loc[i]}  \n,
                          Bi: {Bi.loc[j]} \n,
                          Q_j: {Q.loc[j]} \n,
                          P_i: {P.loc[i]} \n,
                          Y_j: {Y.loc[j]} \n,
                          y_norm_i: {y_norm.loc[i]} \n,
                          Y_sum: {np.sum(Y[F.loc[i]==1], axis=0)} \n''')
                    
                    
                    
                    eij = (R.loc[i,j] 
                           - mu 
                           - Bu.loc[i] 
                           - Bi.loc[j] 
                           - np.dot(Q.loc[j], P.loc[i] + float(y_norm.loc[i])*np.sum(Y[F.loc[i]==1], axis=0)))
                    
                    Bu.loc[i] = Bu.loc[i] + lrn_rate_bias*(eij - regular_bias*Bu.loc[i])
                    Bi.loc[j] = Bi.loc[j] + lrn_rate_bias*(eij - regular_bias*Bi.loc[j])
                    P.loc[i] = P.loc[i] + lrn_rate*(eij*Q.loc[j] - regular*P.loc[i])
                    Q.loc[j] = Q.loc[j] + lrn_rate*(eij*(P.loc[i] + (y_norm.loc[i])*np.sum(Y[F.loc[i]==1], axis=0))\
                                                    - regular*Q.loc[j])
                    Y.loc[j] = Y.loc[j] + lrn_rate*(eij*y_norm.loc[i]*Q.loc[j] - regular*Y.loc[j])
                    
                    #print our vars for debugging purposes
                    print(f'''y_norm*Y_sum: \n {float(y_norm.loc[i])*np.sum(Y[F.loc[i]==1], axis=0)} \n,
                          P_i + y_norm*Y_sum: \n {P.loc[i] + float(y_norm.loc[i])*np.sum(Y[F.loc[i]==1], axis=0)} \n,
                          dot product: {np.dot(Q.loc[j], P.loc[i].array + float(y_norm.loc[i])*np.sum(Y[F.loc[i]==1], axis=0))}\n,
                          eij: {eij} \n,
                          Bu: {Bu.loc[i]} \n,
                          Bi: {Bi.loc[j]} \n,
                          Q_j: {Q.loc[j]} \n,
                          P_i: {P.loc[i]} \n,
                          Y_j: {Y.loc[j]} \n,
                          Bu nan {np.sum(np.isnan(Bu))},
                          Bi nan {np.sum(np.isnan(Bi))},
                          P nan {np.sum(np.sum(np.isnan(P)))},
                          Q nan {np.sum(np.sum(np.isnan(Q)))},
                          Y nan {np.sum(np.sum(np.isnan(Y)))}''')
        
        #calculae the loss
        loss_iter = 0
        for i in R.index:
            for j in R.columns:
                if np.isnan(R.loc[i,j]) == False:
                    p_nrm = np.linalg.norm(P.loc[i])**2
                    q_nrm = np.linalg.norm(Q.loc[j])**2
                    y_nrm = np.linalg.norm(Y[F.loc[i]==1], axis=1)**2
                    loss_iter += (R.loc[i,j] 
                                  - mu 
                                  - Bu.loc[i] 
                                  - Bi.loc[j] 
                                  - np.dot(Q.loc[j], P.loc[i] + y_norm.loc[i]*np.sum(Y[F.loc[i]==1], axis=0)))**2 \
                                  + regular*(p_nrm + q_nrm + np.sum(y_nrm))\
                                  + regular_bias*(Bu.loc[i]**2 + Bi.loc[j]**2)
        loss_total.append(loss_iter)
        print(f'Iteration: {s} \t Loss: {loss_total[-1]} \t Time passed: {time.time()-start}')            
    
    return P, Q, Y, Bu, Bi, loss_total

In [277]:
P, Q, Y, Bu, Bi, loss = SVDplusplus(R_train1, F, mu, 15, 2, 0.02, 0.1, 0.01, 0.007)

Iteration 0 started
User: 1, Item: 1, Rating: 5.0,
                          mu: 3.5238268742409184  
,
                          Bu: 0    0.121615
Name: 1, dtype: float64  
,
                          Bi: 0    0.081209
Name: 1, dtype: float64 
,
                          Q_j: 0     0.477454
1     0.541664
2     0.059692
3     0.959312
4     0.444175
5     0.097926
6     0.538718
7     0.481462
8     0.180017
9     0.899358
10    0.917311
11    0.425035
12    0.384457
13    0.530907
14    0.252773
Name: 1, dtype: float64 
,
                          P_i: 0     0.792160
1     0.168703
2     0.186504
3     0.548293
4     0.472633
5     0.194894
6     0.418226
7     0.354146
8     0.436999
9     0.775014
10    0.099578
11    0.212453
12    0.293862
13    0.538690
14    0.529304
Name: 1, dtype: float64 
,
                          Y_j: 0     0.395121
1     0.789603
2     0.421468
3     0.079732
4     0.743599
5     0.673449
6     0.045113
7     0.656080
8     0.914177
9     0.389746
10    

AttributeError: 'Series' object has no attribute 'array'

In [269]:
F.loc[4] + F.loc[3]

itemId
1       0.0
2       0.0
3       0.0
4       0.0
5       0.0
6       0.0
7       0.0
8       0.0
9       0.0
10      0.0
11      1.0
12      0.0
13      0.0
14      0.0
15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      0.0
21      0.0
22      0.0
23      0.0
24      0.0
25      0.0
26      0.0
27      0.0
28      0.0
29      0.0
30      0.0
       ... 
1652    0.0
1654    0.0
1655    0.0
1656    0.0
1657    0.0
1658    0.0
1659    0.0
1660    0.0
1661    0.0
1662    0.0
1663    0.0
1664    0.0
1665    0.0
1666    0.0
1667    0.0
1668    0.0
1669    0.0
1670    0.0
1671    0.0
1672    0.0
1673    0.0
1674    0.0
1675    0.0
1676    0.0
1677    0.0
1678    0.0
1679    0.0
1680    0.0
1681    0.0
1682    0.0
Length: 1680, dtype: float64

In [162]:
#precalculate y_norm for rating prediction
y_norm = pd.DataFrame(pow(np.sum(F, axis=1), -1/2), index=F.index)


In [237]:
np.sum(np.sum(np.isnan(P)))

14145

In [235]:
Bi = pd.DataFrame(np.random.rand(len(R_train1.columns)), index=R_train1.columns)
Bu = pd.DataFrame(np.random.rand(len(R_train1.index)), index=R_train1.index)
print(Bu.loc[943])
print (Bi.loc[4])
print(Bu.loc[943]+Bi.loc[4])

0    0.077748
Name: 943, dtype: float64
0    0.876321
Name: 4, dtype: float64
0    0.95407
dtype: float64


In [187]:
#Let's evaluate the performance with RMSE and MAE metrics

counter = 0
se = 0
ae = 0

for i in R_test1.index:
    for j in R_test1.columns:
        if np.isnan(R_test1.loc[i,j]) == False:
            if j in Q.index == True:
                r_pred = (mu 
                          + Bu.loc[i]
                          + Bi.loc[j]
                          + np.dot(Q.loc[j], P.loc[i] + y_norm.loc[i]*np.sum(Y[F.loc[i]==1], axis=0)))
                se += (R_test1.loc[i,j] - r_pred)**2
                ae += np.abs(R_test1.loc[i,j] - r_pred)
                counter += 1
            elif j in Q.index == False:
                r_pred = mu + Bi[j]
                se += (R_test1.loc[i,j] - r_pred)**2
                ae += np.abs(R_test1.loc[i,j] - r_pred)
                counter += 1
            else: continue
    
rmse = (se/counter)**(1/2)
mae = ae/counter

print(f'The model RMSE: {rmse} \t The model MAE: {mae}')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()