In [1]:
import os
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, ndcg_score, recall_score
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [2]:
from typical import CF
import utils

# Douban

In [3]:
# Douban Book: location of the user-book rating file under the working directory.
root = os.path.join(os.getcwd(), "DoubanBook")
rel_p = os.path.join(root, "user_book.dat")

# Sizes used for the rating-matrix shape, (user_cnt + 1, item_cnt + 1).
user_cnt, item_cnt = 13024, 22347

In [4]:
# Load the Douban rating file through the project helper. The result is used
# below as a DataFrame whose cols_0 / cols_1 / cols_2 columns serve as
# user ID / item ID / rating.
rel = utils.read_file(rel_p)
rel.head()

Unnamed: 0,cols_0,cols_1,cols_2
0,10855,938,4
1,10027,3,3
2,741,2426,5
3,453,1263,4
4,11665,7717,5


In [5]:
# Number of cross-validation folds; the splitter is built from it so the two
# values cannot drift apart.
k = 5
kf = KFold(n_splits=k)

## Douban without filter

In [None]:
# 5-fold cross-validation of the CF model on the unfiltered Douban ratings.
# Within each fold, 10% of the training indices are split off as a validation
# set; the model is trained on the rest and scored on that validation set.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)



In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)

## Douban with filter

In [None]:
# 5-fold cross-validation of the CF model on the Douban ratings after
# filtering: ratings below 3 are replaced by 0 before splitting.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

# Zero only the rating column, and do it on a copy. Assigning 0 through
# `rel.loc[mask] = 0` overwrites entire rows (user and item IDs included) and
# mutates `rel`, so the cell would not be safe to re-run.
# NOTE(review): low ratings are kept as rating-0 rows rather than dropped, so
# they still take part in validation -- confirm this is the intended filter.
rel_filtered = rel.copy()
rel_filtered.loc[rel_filtered['cols_2'] < 3, 'cols_2'] = 0

for train_index, test_index in kf.split(rel_filtered):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel_filtered.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel_filtered.iloc[train_index]
    valid_df = rel_filtered.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    # Drop the explicitly stored zeros so filtered ratings read as "no entry".
    train_m.eliminate_zeros()
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)

# Yelp

In [None]:
# Yelp: location of the user-business rating file under the working directory.
root = os.path.join(os.getcwd(), "Yelp")
rel_p = os.path.join(root, "user_business.dat")

# Sizes used for the rating-matrix shape, (user_cnt + 1, item_cnt + 1).
user_cnt, item_cnt = 16239, 14284

In [None]:
# Load the Yelp rating file through the project helper. The result is used
# below as a DataFrame whose cols_0 / cols_1 / cols_2 columns serve as
# user ID / item ID / rating.
rel = utils.read_file(rel_p)
rel.head()

In [None]:
# Number of cross-validation folds; the splitter is built from it so the two
# values cannot drift apart.
k = 5
kf = KFold(n_splits=k)

## Yelp without filter

In [None]:
# 5-fold cross-validation of the CF model on the unfiltered Yelp ratings.
# Within each fold, 10% of the training indices are split off as a validation
# set; the model is trained on the rest and scored on that validation set.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)

## Yelp with filter

In [None]:
# 5-fold cross-validation of the CF model on the Yelp ratings after
# filtering: ratings below 3 are replaced by 0 before splitting.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

# Zero only the rating column, and do it on a copy. Assigning 0 through
# `rel.loc[mask] = 0` overwrites entire rows (user and item IDs included) and
# mutates `rel`, so the cell would not be safe to re-run.
# NOTE(review): low ratings are kept as rating-0 rows rather than dropped, so
# they still take part in validation -- confirm this is the intended filter.
rel_filtered = rel.copy()
rel_filtered.loc[rel_filtered['cols_2'] < 3, 'cols_2'] = 0

for train_index, test_index in kf.split(rel_filtered):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel_filtered.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel_filtered.iloc[train_index]
    valid_df = rel_filtered.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    # Drop the explicitly stored zeros so filtered ratings read as "no entry".
    train_m.eliminate_zeros()
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)

# Movielens

In [None]:
# Movielens: location of the user-movie rating file under the working directory.
root = os.path.join(os.getcwd(), "Movielens")
rel_p = os.path.join(root, "user_movie.dat")

# Sizes used for the rating-matrix shape, (user_cnt + 1, item_cnt + 1).
user_cnt, item_cnt = 943, 1682

In [None]:
# Load the Movielens rating file through the project helper. The result is
# used below as a DataFrame whose cols_0 / cols_1 / cols_2 columns serve as
# user ID / item ID / rating.
rel = utils.read_file(rel_p)
rel.head()

In [None]:
# Number of cross-validation folds; the splitter is built from it so the two
# values cannot drift apart.
k = 5
kf = KFold(n_splits=k)

## Movielens without filter

In [None]:
# 5-fold cross-validation of the CF model on the unfiltered Movielens ratings.
# Within each fold, 10% of the training indices are split off as a validation
# set; the model is trained on the rest and scored on that validation set.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

for train_index, test_index in kf.split(rel):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel.iloc[train_index]
    valid_df = rel.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)

## Movielens with filter

In [None]:
# 5-fold cross-validation of the CF model on the Movielens ratings after
# filtering: ratings below 3 are replaced by 0 before splitting.
fold_cnt = 0

# One entry per fold; averaged in the following cells.
mse_list = []
recall_list = []
ndcg_list = []

# Zero only the rating column, and do it on a copy. Assigning 0 through
# `rel.loc[mask] = 0` overwrites entire rows (user and item IDs included) and
# mutates `rel`, so the cell would not be safe to re-run.
# NOTE(review): low ratings are kept as rating-0 rows rather than dropped, so
# they still take part in validation -- confirm this is the intended filter.
rel_filtered = rel.copy()
rel_filtered.loc[rel_filtered['cols_2'] < 3, 'cols_2'] = 0

for train_index, test_index in kf.split(rel_filtered):

    fold_cnt += 1
    print("========= Fold: {} ==========".format(fold_cnt))

    # Held-out fold. It is materialized here but not scored in this cell.
    test_df = rel_filtered.iloc[test_index]

    # Seed the split with the fold number so a re-run reproduces the metrics.
    train_index, valid_index = train_test_split(
        train_index, test_size=0.1, random_state=fold_cnt
    )
    train_df = rel_filtered.iloc[train_index]
    valid_df = rel_filtered.iloc[valid_index]

    # Sparse rating matrix indexed by (cols_0, cols_1) with cols_2 as the value.
    # The +1 on each dimension presumably makes room for the largest ID -- TODO confirm.
    train_m = csr_matrix(
        (train_df.cols_2, (train_df.cols_0, train_df.cols_1)),
        shape=(user_cnt + 1, item_cnt + 1),
    )
    # Drop the explicitly stored zeros so filtered ratings read as "no entry".
    train_m.eliminate_zeros()
    train_m = utils.get_rep(train_m)
    cf = CF(R=train_m)
    cf.train()

    # One prediction per validation pair, read column-wise instead of iterrows().
    users = valid_df['cols_0'].to_numpy()
    items = valid_df['cols_1'].to_numpy()
    preds = np.array([cf.get_rating(u, i) for u, i in zip(users, items)]).reshape(-1, 1)
    gts = valid_df['cols_2'].to_numpy().reshape(-1, 1)

    mse = mean_squared_error(gts, preds)

    # Binarize at rating >= 3. The labels must come from the true ratings:
    # thresholding the already-binarized predictions (values 0/1) at 3 would
    # turn every ground-truth label into 0.
    preds = np.where(preds >= 3, 1, 0)
    gts = np.where(gts >= 3, 1, 0)
    recall = recall_score(gts.ravel(), preds.ravel())
    # NOTE(review): NDCG is fed the binarized predictions, so it cannot see
    # any ordering among items that share a label.
    ndcg = ndcg_score(gts.reshape(1, -1), preds.reshape(1, -1))

    mse_list.append(mse)
    recall_list.append(recall)
    ndcg_list.append(ndcg)

In [None]:
# Mean of the per-fold validation MSE values collected in mse_list.
sum(mse_list)/len(mse_list)

In [None]:
# Mean of the per-fold recall values collected in recall_list.
sum(recall_list)/len(recall_list)

In [None]:
# Mean of the per-fold NDCG values collected in ndcg_list.
sum(ndcg_list)/len(ndcg_list)