In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

# Load the interaction table from the local file 'occf_data'; later cells read
# its watched_episodes, user_id and anime_id columns.
rating = pd.read_csv('occf_data')


In [8]:

# Give the row index an explicit label so it is shown as "index" in previews.
rating.index = rating.index.rename('index')
rating.head()

Unnamed: 0_level_0,watched_episodes,user_id,anime_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1,29
1,4,1,318
2,1,1,262
3,5,1,9
4,4,1,185


In [25]:
# Hold out every third row (positions 0, 3, 6, ...) as the test set.
# A positional stride slice covers the whole table whatever its length, so the
# split no longer depends on a hard-coded row count (which raises KeyError when
# the file is shorter and silently drops the tail when it is longer).
test_df = rating.iloc[0::3]
print(test_df.head())

       watched_episodes  user_id  anime_id
index                                     
0                     1        1        29
5                     4        1       254
10                   51        1        40
15                    1        1       144
20                  220        1         6


In [26]:
# The training set is the remaining two thirds of the rows: positions
# 1, 4, 7, ... followed by positions 2, 5, 8, ...  Positional stride slices
# are used so the split adapts to the actual number of rows instead of a
# hard-coded row count.
train1 = rating.iloc[1::3]
train2 = rating.iloc[2::3]

train_df = pd.concat([train1, train2])

In [39]:
# Column-wise maxima of the training frame, then of the test frame; the
# largest user_id / anime_id values indicate how big the dense matrices must be.
print(train_df.max(axis=0), test_df.max(axis=0), sep='\n')

watched_episodes    965
user_id             433
anime_id            999
dtype: int64
watched_episodes    975
user_id             433
anime_id            999
dtype: int64


In [47]:
def build_interaction_matrix(frame, shape):
    """Return a dense array of ``shape`` holding ``watched_episodes`` per (user, anime).

    Each row of ``frame`` writes its ``watched_episodes`` value at position
    ``[user_id - 1, anime_id - 1]``.  Cells that no row refers to stay 0, and
    if the same (user_id, anime_id) pair occurs more than once the last row
    wins.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``watched_episodes``, ``user_id`` and
        ``anime_id``.
    shape : tuple of int
        ``(number of users, number of items)`` of the returned array.
    """
    matrix = np.zeros(shape)
    columns = ['watched_episodes', 'user_id', 'anime_id']
    # One pass over plain tuples instead of three .iloc lookups per row.
    for watched, u_id, a_id in frame[columns].itertuples(index=False, name=None):
        matrix[u_id - 1, a_id - 1] = watched
    return matrix


# Size both matrices from the largest ids present in either split, so the
# shape follows the data instead of being copied by hand from printed output.
n_users = int(max(train_df['user_id'].max(), test_df['user_id'].max()))
n_items = int(max(train_df['anime_id'].max(), test_df['anime_id'].max()))

train = build_interaction_matrix(train_df, (n_users, n_items))
test = build_interaction_matrix(test_df, (n_users, n_items))


In [48]:
from scipy import sparse
# Re-wrap both dense interaction arrays as np.matrix objects, then show the
# training one.
train, test = (np.asmatrix(dense) for dense in (train, test))

print(train)

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  2.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [26.  1.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


In [86]:
from collections import defaultdict
from datetime import datetime

class OCCF():
    """Implicit-feedback matrix factorisation fitted by alternating least squares.

    Every interaction count ``r`` is converted into a binary preference
    ``p = 1 if r > 0 else 0`` and a confidence ``c = 1 + alpha * r``.  User
    factors ``X`` and item factors ``Y`` are updated in turn; each row is the
    solution of a regularised, confidence-weighted least-squares problem:

        x_u = (Y^T C^u Y + weight * I)^-1  Y^T C^u p(u)

    and symmetrically for ``y_i``, where ``C^u`` is the diagonal matrix of
    the confidences of user ``u``.

    Parameters
    ----------
    train, test : 2-D array or np.matrix, shape (users, items)
        Interaction counts used for fitting and for evaluation.
    f : int
        Number of latent factors.
    print_each : bool
        If true, report the metric and timing after every iteration;
        otherwise only the metric of the last iteration is printed.
    weight : float, optional
        L2 regularisation strength added to the diagonal of each system.
    alpha : float, optional
        Slope of the confidence ``1 + alpha * r``.
    n_iter : int, optional
        Number of alternating sweeps performed by ``optimize``.
    seed : int or None, optional
        If given, the factors are initialised from a private
        ``np.random.RandomState(seed)``; if None the global NumPy random
        state is used.

    Attributes
    ----------
    X, Y : ndarray
        User factors ``(no_user, f)`` and item factors ``(no_item, f)``.
    loss, loss_test : defaultdict(float)
        Metric returned by ``evaluate`` per iteration index.
    """

    def __init__(self, train, test, f, print_each,
                 weight=0.1, alpha=40, n_iter=10, seed=None):

        self.weight = weight
        self.alpha = alpha
        self.n_iter = n_iter
        self.print = print_each

        self.f = f
        self.no_user = train.shape[0]
        self.no_item = train.shape[1]
        self.R = train
        self.R_test = test
        self.P = np.array(train > 0, dtype=np.float16)
        self.P_test = np.array(test > 0, dtype=np.float16)

        # Confidence c = 1 + alpha * r, kept as a plain ndarray so that row
        # and column slices are 1-D even when `train` is an np.matrix.
        self.C = np.asarray(train, dtype=np.float64) * self.alpha + 1

        # np.random (module) and RandomState both expose standard_normal.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.X = rng.standard_normal((self.no_user, f))
        self.Y = rng.standard_normal((self.no_item, f))

        self.loss = defaultdict(float)
        self.loss_test = defaultdict(float)

    def pred_v(self):
        """Return the (no_user, no_item) matrix of predicted scores ``X Y^T``."""
        return np.dot(self.X, self.Y.T)

    def _solve_row(self, factors, gram, conf, pref):
        """Solve one confidence-weighted ridge system for a single factor row.

        ``factors`` is the fixed (n, f) factor matrix, ``gram`` its precomputed
        ``factors.T @ factors``, ``conf`` the (n,) confidences and ``pref`` the
        (n,) binary preferences of the row being updated.
        """
        # F^T C F = F^T F + F^T (C - I) F ; scaling the columns of F^T by
        # (conf - 1) avoids materialising an n x n diagonal matrix.
        lhs = (gram
               + np.dot(factors.T * (conf - 1), factors)
               + np.identity(self.f) * self.weight)
        # The right-hand side uses the full confidence C (not C - I).
        rhs = np.dot(factors.T, conf * pref)
        return np.linalg.solve(lhs, rhs)

    def upd_X(self, u, YtY):
        """Return the updated factor vector of user ``u``.

        ``YtY`` is the precomputed ``Y^T Y`` for the current item factors.
        """
        return self._solve_row(self.Y, YtY, self.C[u, :], self.P[u, :])

    def upd_Y(self, i, XtX):
        """Return the updated factor vector of item ``i``.

        ``XtX`` is the precomputed ``X^T X`` for the current user factors.
        """
        return self._solve_row(self.X, XtX, self.C[:, i], self.P[:, i])

    def optimize(self):
        """Run ``n_iter`` alternating sweeps, recording the metric after each.

        After every sweep the train and test metrics are stored in
        ``self.loss`` / ``self.loss_test`` under the iteration index.
        """
        start = datetime.now()
        for it in range(self.n_iter):

            start_iter = datetime.now()

            # Y is fixed while all users are updated, so Y^T Y is computed once.
            YtY = np.dot(self.Y.T, self.Y)
            for u in range(self.no_user):
                self.X[u, :] = self.upd_X(u, YtY)

            # Likewise X is fixed while all items are updated.
            XtX = np.dot(self.X.T, self.X)
            for i in range(self.no_item):
                self.Y[i, :] = self.upd_Y(i, XtX)

            loss1 = self.evaluate(type = 'Train')
            loss2 = self.evaluate(type = 'Test')

            self.loss[it] = loss1
            self.loss_test[it] = loss2

            if self.print:
                print(f'EPOCH {it+1} : Training RANK {self.loss[it]:.4f}, Test RANK {self.loss_test[it]:.4f}')

                print(f'Time per Iteration {datetime.now() - start_iter}')
            elif it == self.n_iter - 1:
                print(f'Training RANK {self.loss[it]:.4f}, Test RANK {self.loss_test[it]:.4f}')

        end = datetime.now()
        if self.print:
            print(f'Training time : {end-start}')

    def evaluate(self, type):
        """Return the interaction-weighted mean percentile rank.

        For every user the items are ordered by decreasing predicted score;
        the best item gets rank 0 and the worst ``(no_item - 1) / no_item``.
        The result is the average of these ranks weighted by the interaction
        counts, so lower is better.

        ``type == 'Train'`` evaluates against the training matrix, any other
        value against the test matrix.  NaN is returned when the chosen matrix
        sums to zero.
        """
        source = self.R if type == 'Train' else self.R_test
        R = np.asarray(source, dtype=np.float64)

        total = R.sum()
        if total == 0:
            return float('nan')

        # Double argsort along each row turns scores into 0-based ranks.
        order = (-self.pred_v()).argsort(axis=1)
        rank_mat = order.argsort(axis=1) / self.no_item

        return np.sum(R * rank_mat) / total
    
        

In [87]:
# Build a model with 30 latent factors that reports progress every iteration.
model = OCCF(train, test, f=30, print_each=True)

In [88]:
# Run the alternating update loop; because the model was built with
# print_each=True it prints the train/test rank and the time of every iteration.
model.optimize()

EPOCH 1 : TRAINING RANK 0.3332, VALID RANK 0.5241
Time per Iteration 0:00:02.351312
EPOCH 2 : TRAINING RANK 0.2262, VALID RANK 0.4191
Time per Iteration 0:00:02.300014
EPOCH 3 : TRAINING RANK 0.1935, VALID RANK 0.3817
Time per Iteration 0:00:02.324877
EPOCH 4 : TRAINING RANK 0.1821, VALID RANK 0.3595
Time per Iteration 0:00:02.334275
EPOCH 5 : TRAINING RANK 0.1753, VALID RANK 0.3431
Time per Iteration 0:00:02.365521
EPOCH 6 : TRAINING RANK 0.1697, VALID RANK 0.3295
Time per Iteration 0:00:02.281304
EPOCH 7 : TRAINING RANK 0.1652, VALID RANK 0.3175
Time per Iteration 0:00:02.329826
EPOCH 8 : TRAINING RANK 0.1605, VALID RANK 0.3051
Time per Iteration 0:00:02.306954
EPOCH 9 : TRAINING RANK 0.1564, VALID RANK 0.2947
Time per Iteration 0:00:02.336657
EPOCH 10 : TRAINING RANK 0.1530, VALID RANK 0.2871
Time per Iteration 0:00:02.315315
Training takes time 0:00:23.246506


In [89]:
# Sweep the number of latent factors from 20 to 200 in steps of 20; each model
# is fitted from scratch and prints only its final train/test rank.
for n_factors in range(20, 201, 20):
    model = OCCF(train, test, n_factors, print_each=False)
    model.optimize()

TRAINING RANK 0.1608, VALID RANK 0.2868
TRAINING RANK 0.1498, VALID RANK 0.3500
TRAINING RANK 0.1281, VALID RANK 0.3749
TRAINING RANK 0.1022, VALID RANK 0.4092
TRAINING RANK 0.0797, VALID RANK 0.4123
TRAINING RANK 0.0644, VALID RANK 0.4226
TRAINING RANK 0.0549, VALID RANK 0.4421
TRAINING RANK 0.0491, VALID RANK 0.4441
TRAINING RANK 0.0461, VALID RANK 0.4681
TRAINING RANK 0.0447, VALID RANK 0.4694
