In [1]:
import os
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, ndcg_score, recall_score
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
from typical import cf_user_based, MF
import utils

# Douban

In [3]:
root = os.path.join(os.getcwd(), "DoubanBook")
rel_p = os.path.join(root, "user_book.dat")

user_cnt = 13024
item_cnt = 22347

In [4]:
rel = utils.read_file(rel_p)
rel.head()

Unnamed: 0,cols_0,cols_1,cols_2
0,10855,938,4
1,10027,3,3
2,741,2426,5
3,453,1263,4
4,11665,7717,5


In [5]:
k = 5
kf = KFold(n_splits=5)

## Douban without filter

In [6]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=30, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
        
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 3121.5899


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3121.3403


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3121.3449


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3121.2396


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3121.2556


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
gts

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [8]:
preds

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [9]:
sum(mse_list)/len(mse_list)

17.081908049402763

In [10]:
sum(recall_list)/len(recall_list)

0.0

In [11]:
sum(ndcg_list)/len(ndcg_list)

0.0

## Douban with filter

In [12]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

rel.loc[rel['cols_2'] < 3] = 0

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=20, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 3113.6718


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3113.1917


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3113.3897


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3112.9214


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 3113.3379


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
sum(mse_list)/len(mse_list)

17.00082829918236

In [14]:
sum(recall_list)/len(recall_list)

0.0

In [15]:
sum(ndcg_list)/len(ndcg_list)

0.0

# Yelp

In [16]:
root = os.path.join(os.getcwd(), "Yelp")
rel_p = os.path.join(root, "user_business.dat")

user_cnt = 16239
item_cnt = 14284

In [17]:
rel = utils.read_file(rel_p)
rel.head()

Unnamed: 0,cols_0,cols_1,cols_2
0,1,8391,5
1,1,8971,5
2,2,186,5
3,2,205,5
4,2,209,4


In [18]:
k = 5
kf = KFold(n_splits=5)

## yelp without filter

In [19]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=20, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 1487.4866


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1487.2711


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1487.0861


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1486.7731


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1486.2068


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
sum(mse_list)/len(mse_list)

15.47277090968501

In [21]:
sum(recall_list)/len(recall_list)

0.0

In [22]:
sum(ndcg_list)/len(ndcg_list)

0.0

## yelp with filter

In [23]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

rel.loc[rel['cols_2'] < 3] = 0

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=20, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 1467.5738


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1467.6946


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1467.9586


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1466.8636


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 1466.4691


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
sum(mse_list)/len(mse_list)

15.053902877768191

In [25]:
sum(recall_list)/len(recall_list)

0.0

In [26]:
sum(ndcg_list)/len(ndcg_list)

0.0

# Movielens

In [27]:
root = os.path.join(os.getcwd(), "Movielens")
rel_p = os.path.join(root, "user_movie.dat")

user_cnt = 943
item_cnt = 1682

In [28]:
rel = utils.read_file(rel_p)
rel.head()

Unnamed: 0,cols_0,cols_1,cols_2,cols_3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [29]:
k = 5
kf = KFold(n_splits=5)

## movielens without filter

In [30]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=20, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 951.9534


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 951.6323


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 953.2988


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 954.0300


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 953.3073


  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
sum(mse_list)/len(mse_list)

12.640980773089101

In [32]:
sum(recall_list)/len(recall_list)

0.0

In [33]:
sum(ndcg_list)/len(ndcg_list)

0.0

## movielens with filter

In [34]:
fold_cnt = 0

mse_list = []
recall_list = []
ndcg_list = []

rel.loc[rel['cols_2'] < 3] = 0

for train_index, test_index in kf.split(rel):
    
    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))
    
    test_df = rel.iloc[test_index]
    
    train_index, valid_index = train_test_split(train_index, test_size=0.1)
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]
    
    train_m = csr_matrix((train_df.cols_2, (train_df.cols_0, train_df.cols_1)), shape=(user_cnt+1, item_cnt+1))
    train_m = utils.get_rep(train_m)
    mf = MF(R=train_m, K=20, alpha=0.01, beta=0.001, iterations=10)
    mf.train()
    
    preds = []
    gts = []
    for index, row in valid_df.iterrows():
        u, i, r = row['cols_0'], row['cols_1'], row['cols_2']
        pred = mf.get_rating(u-1, i-1)
        preds.append(pred)
        gts.append(r)
    preds = np.array(preds).reshape(-1, 1)
    gts = np.array(gts).reshape(-1, 1)
    
    mse = mean_squared_error(gts, preds)
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(preds >=3, 1, 0)
    recall = recall_score(gts, preds)
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))
    
    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

Iteration: 10 ; error = 938.4072


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 938.4870


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 940.0742


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 940.5924


  _warn_prf(average, modifier, msg_start, len(result))


Iteration: 10 ; error = 940.4940


  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
sum(mse_list)/len(mse_list)

12.306930131242586

In [36]:
sum(recall_list)/len(recall_list)

0.0

In [37]:
sum(ndcg_list)/len(ndcg_list)

0.0