# Introduction
在設計一個推薦系統模型時，最常用的作法之一就是透過使用者(user)和物品(item)的關聯性來建立模型，模型的輸入是user和item，輸出是模型預測user喜歡item的分數。具體來說，模型是一個函式f(user, item)= rate, user,item和rate是在database的一筆資料，rate則是這位user喜歡這個item的分數。在這篇文章中，我們訓練資料選用MovieLens資料集，模型使用[Factorization Machine](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)的方法，建立函式如下:$$rate = {w_0} + \sum_{i=1}^{n} {w_i}{x_i} + \sum_{i=1}^{n}\sum_{j=i+1}^{n} <{v_i},{v_j}>{x_i}{x_j}$$，${x}$是欄位的向量表示法（以MovieLens資料集為例，$x_0$是user_id, $x_1$是movie_id），${w_i}$是一階交互關係，就是自己跟自己的關係， $<{v_i},{v_j}>$的運算是${v_i}$,${v_j}$兩個vector做dot product, 代表$x_i$和$x_j$的關係，又稱二階交互關係。而計算${v}$和${x}$的作法是這樣的，我們舉MovieLens欄位0為例, 也就是user id，一開始先初始化兩張table，第一個是用來查user id對應到的embedding,也就是${x_0}$, 另一個是查${v_0}$，也就是說一個user id對應到兩個embedding，其中${x}$包含了自己欄位的特徵，並透過${w}$和別的欄位做weighted sum，${v}$則是透過$<{v_i},{v_j}>$來計算和其他欄位的關聯性。最後我們只要給一個user，例如：Jun，和一部電影《鬼影特攻：以暴制暴》，模型就可以預測Jun對於《鬼影特攻：以暴制暴》的喜愛程度。

# MovieLens Data 
#### dataset download: http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [1]:
import pandas as pd
df = pd.read_csv("./ml-100k/ua.base", header=None, delimiter='\t')
df.columns = ["user_id", "movie_id", "rate", "timestamp"]
df

Unnamed: 0,user_id,movie_id,rate,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


# Parameters Setting

In [2]:
# import for Model
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import random
from torch.utils.data import DataLoader

# for time
from time import time

# load parameters
from bunch import Bunch
args = Bunch({
    'epochs':20,
    'lr':1e-3,
    'batch_size':128,
    'eval_step':500,
    'load_dir':'./model_save',
    'save_dir':'./model_save',
    'seed':1,
    'device': None
})

###############
"""device"""
use_gpu = args.device is not None
if torch.cuda.is_available() and not use_gpu:
    print("WARNING: You have a CUDA device")
# set cuda device and seed
if use_gpu:
    torch.cuda.set_device(args.device)
    
"""set seed"""
torch.cuda.manual_seed(args.seed)
torch.manual_seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)



## Dataset

In [3]:
import torch.utils.data as data
import pandas as pd
import json
class Dataset(data.Dataset):
    def __init__(self, csv_file, header=None, delimiter='\t'):
        """
        user_id: 1~943
        movie_id: 1~1682（其實裡面只有1680個，代表有兩個movie id是沒有在裡面的）
        """
        self.dataframe = pd.read_csv(csv_file, header=header, delimiter=delimiter)
        self.column_num = len(self.dataframe.columns)
        # rename column
        self.rename_column(['user_id', 'movie_id', 'rating', 'timestamp'])

        # however len(self.dataframe['movie_id'].value_counts()) == 1680
        self.user_num = self.dataframe['user_id'].value_counts().keys().max()
        self.movie_num = self.dataframe['movie_id'].value_counts().keys().max() # 1682
        
        # 分數是1~6分，我們將大於三分的label設成True, 小於的label設成False
        # set labels = True if rating >3 else labels=False
        self.dataframe['labels'] = self.dataframe['rating']>3
    
    def rename_column(self, name_list):
        assert self.column_num == len(name_list)
        self.dataframe.columns = name_list
        
    def show_labels_proportion(self):
        #label_True_count/Total_num_of_dataframe
        return self.dataframe['labels'].sum()/len(self.dataframe['labels'])
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        return json.loads(self.dataframe.iloc[idx].to_json(orient='columns'))

In [4]:
# load training dataset
train_dataset = Dataset('./ml-100k/ua.base')

# load testing dataset
test_dataset = Dataset('./ml-100k/ua.test')

print("Train labels proportion(True_count/Total_num)", train_dataset.show_labels_proportion())
print("Train labels proportion(True_count/Total_num)", test_dataset.show_labels_proportion())

Train labels proportion(True_count/Total_num) 0.5510213094843768
Train labels proportion(True_count/Total_num) 0.5799575821845175


In [5]:
user_num = train_dataset.user_num
movie_num = train_dataset.movie_num

In [6]:
# split train val dataset
from torch.utils.data import random_split
val_size = int(len(train_dataset)*0.1)
train_size = len(train_dataset) - val_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
print("train size:{}, validation size:{}, test size:{}".format(train_size, val_size, len(test_dataset)))

train size:81513, validation size:9057, test size:9430


In [7]:
# create DataLoader
train_iter = DataLoader(dataset=train_dataset,
                        batch_size=args.batch_size)

val_iter = DataLoader(dataset=val_dataset,
                        batch_size=args.batch_size)

test_iter = DataLoader(dataset=test_dataset,
                        batch_size=args.batch_size)

# Baic module

In [8]:
import torch
from torch.autograd import Variable
# ref: https://github.com/hpzhao/SummaRuNNer/tree/47de2c0cc81f0464490ec43a7504e6d3075a2742
class BasicModule(torch.nn.Module):

    def __init__(self, args):
        super(BasicModule,self).__init__()
        self.args = args
        self.model_name = str(type(self))
    
    def save(self):
        checkpoint = {'model':self.state_dict(), 'args': self.args}
        best_path = '%s_%s_seed_%d.pt' % (self.args.save_dir,self.model_name,self.args.seed)
        torch.save(checkpoint,best_path)

        return best_path

    def load(self, best_path):
        if best_path == "":
            best_path = self.args.load_dir
        if self.args.device is not None:
            data = torch.load(best_path)['model']
        else:
            data = torch.load(best_path, map_location=lambda storage, loc: storage)['model']
        self.load_state_dict(data)
        if self.args.device is not None:
            return self.cuda()
        else:
            return self

# FM model

In [9]:
# https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf
# https://www.kaggle.com/gennadylaptev/factorization-machine-implemented-in-pytorch/data
class FM(BasicModule):
    def __init__(self, args, n=None, k=None):
        super(FM, self).__init__(args)
        self.model_name = 'FM'
        # Initially we fill V with random values sampled from Gaussian distribution
        # use nn.Parameter to compute gradients
        self.args = args
        self.V = nn.Parameter(torch.FloatTensor(n, k).normal_(0,1))
        self.linear = nn.Linear(n, 1)
        
    def forward(self, x):
        out_1 = torch.matmul(x, self.V).pow(2).sum(1, keepdim=True) #S_1^2
        out_2 = torch.matmul(x.pow(2), self.V.pow(2)).sum(1, keepdim=True) # S_2
        # out_1, out_2 shape are (B, k) 
        out_inter = 0.5*(out_1 - out_2)
        out_lin = self.linear(x)
        out = out_inter + out_lin
        out = torch.sigmoid(out) # (B,1)
        return out.squeeze() # (B)

# FM with embedding layer 

In [10]:
"""
The same as FM, but using nn.Embedding can be more convient to extend the first and seconder order parameter, 
i.e., you can modify the dimension of nn.Embedding from nn.Embedding(feature_size,1) 
to nn.Embedding(feature_size, D_model), the value of D_model is up to you
"""
# ref: https://github.com/chenxijun1029/DeepFM_with_PyTorch
class FM_emb(BasicModule):
    def __init__(self, args, feature_sizes, k=None):
        super(FM_emb, self).__init__(args)
        self.model_name = 'FM_emb'
        self.feature_sizes = feature_sizes # list [field1_categories_num, field2_categories_num, ...]
        self.args = args
        self.embedding_size = k
        self.first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes])
        self.second_order_embeddings = nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes])
        
        
    def forward(self, x, values, use_sigmoid=True):
        fields_first_order = [torch.sum(emb(x[:,i]), dim=1, keepdim=True)*values[:,i].unsqueeze(1) for i, emb in enumerate(self.first_order_embeddings)] # [(B,1), (B,1)]
        fm_first_order = sum(fields_first_order)# (B,2) sum-> (B,1)
        
        # a,b are vector
        # use 2*a.dot(b)xy = (ax+by)^2 - a^2x^2 - b^2y^2 reduce calculation
        fm_second_order_emb_arr = [torch.sum(emb(x[:,i]), dim=1, keepdim=True)*values[:,i].unsqueeze(1) for i, emb in enumerate(self.second_order_embeddings)] # [(B,1), (B,1)]
        fm_sum_second_order_emb = sum(fm_second_order_emb_arr)
        fm_sum_second_order_emb_square = fm_sum_second_order_emb*fm_sum_second_order_emb # (x+y)^2
        fm_second_order_emb_square = [item*item for item in fm_second_order_emb_arr]
        fm_second_order_emb_square_sum = sum(fm_second_order_emb_square) #x^2+y^2
        fm_second_order = (fm_sum_second_order_emb_square - fm_second_order_emb_square_sum) * 0.5
        
        out = fm_first_order + fm_second_order
        if use_sigmoid:
            return torch.sigmoid(out).squeeze() # (B,1) squeeze-> (B)
        else:
            return out.squeeze() # (B)

# FFM

In [11]:
# https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
class FFM_emb(BasicModule):
    def __init__(self, args, feature_sizes, k=None):
        super(FFM_emb, self).__init__(args)
        self.model_name = 'FFM_emb'
        self.feature_sizes = feature_sizes # list [field1_categories_num, field2_categories_num, ...]
        self.field_sizes = len(feature_sizes)
        self.args = args
        self.embedding_size = k
        self.first_order_embeddings = nn.ModuleList([nn.Embedding(feature_size,1) for feature_size in self.feature_sizes])
        self.second_order_embeddings = nn.ModuleList([nn.ModuleList([nn.Embedding(feature_size, self.embedding_size) for _ in range(self.field_sizes)]) for feature_size in self.feature_sizes])
    
    def forward(self, x, values, use_sigmoid=True):
        # first order relation
        """
        x shape: (B, Field_num)
        value shape: (B, Field_num)
        """
        # or (torch.sum(emb(x[:,i]), dim=1, keepdim=True).t()*values[:,i]).t()
        fields_first_order = [torch.sum(emb(x[:,i]), dim=1, keepdim=True)*values[:,i].unsqueeze(1) for i, emb in enumerate(self.first_order_embeddings)] # [(B,1), (B,1)]
        fm_first_order = sum(fields_first_order)# (B,1)+(B,1)+...-> (B,1)
        
        # second order relation
        """
        fm_second_order_emb_arr (field_sizes, field_sizes)
        |[(B,1), (B,1), (B,1) ...]
        |[(B,1), (B,1), (B,1) ...]
        |[(B,1), (B,1), (B,1) ...]
                . 
                .
                .
        """
        ffm_second_order_emb_arr = [[torch.sum(emb(x[:,i]), dim=1, keepdim=True)*values[:,i].unsqueeze(1) for emb in field_embs] for i, field_embs in enumerate(self.second_order_embeddings)]
        ffm_wij_arr = []
        for i in range(self.field_sizes):
            for j in range(i+1, self.field_sizes):
                ffm_wij_arr.append(ffm_second_order_emb_arr[i][j]*ffm_second_order_emb_arr[j][i])
        ffm_second_order = sum(ffm_wij_arr)
        out = fm_first_order + ffm_second_order
        if use_sigmoid:
            return torch.sigmoid(out).squeeze() # (B,1) squeeze-> (B)
        else:
            return out.squeeze() # (B)

In [12]:
# build FM model
criterion = nn.BCELoss()
#model = FFM_emb(args, [user_num, movie_num], k=8)
model = FM_emb(args, [user_num, movie_num], k=64)

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
if use_gpu:
    model.cuda()
    criterion.cuda()

# Train

In [13]:
# preprocess in batch(convert id to one hot encoding)
def one_hot_encoding(idx_tensor, dim=None):
    # only FM need this, FM_emb, FFM_emb doesn't
    if dim==None:
        print('Please assign a dimension num, e.g. one_hot_encoding(...,dim=10)')
        return
    one_hot = torch.zeros(idx_tensor.size(0), dim)
    one_hot[torch.arange(idx_tensor.size(0)), idx_tensor] = 1
    return one_hot

def convert_feature(batch, use_one_hot=True):
    """
    'user_id': [user_id, ...]
    'movie_id': [movie_id, ...]
    'labels': [label, ...]
    """
    # convert one hot encoding
    # the index should be minus 1, due to the id is start from 1
    if use_one_hot:
        users_vec = one_hot_encoding(batch['user_id']-1, dim=user_num) # B x user_num
        movies_vec = one_hot_encoding(batch['movie_id']-1, dim=movie_num) # B x movie_num
    else:
        users_vec = (batch['user_id']-1).unsqueeze(1).type(torch.LongTensor)
        movies_vec = (batch['movie_id']-1).unsqueeze(1).type(torch.LongTensor)
    labels = batch['labels'].float()
    
    # concat fields vector
    features = torch.cat((users_vec, movies_vec), dim=-1) # B x (user_num+movie_num)
    return features, labels

In [14]:
from sklearn.metrics import roc_auc_score
def validation(model, val_iter, criterion):
    model.eval().cpu()
    y_pred = []
    y = []
    total_loss = 0.0
    for nbatch, batch in enumerate(val_iter):
        features, labels = convert_feature(batch, use_one_hot=False)
        values = torch.ones(features.shape)
        probs = model(features, values)
        loss = criterion(probs, labels)
        total_loss += loss.item()
        y.extend(labels.data.numpy())
        y_pred.extend(probs.data.numpy())
    total_loss = total_loss/(nbatch+1)
    results = roc_auc_score(y, y_pred)
    if use_gpu:
        model.train().cuda()
    else:
        model.train()
    return total_loss, results

In [15]:
stime = time()
model.train()
total_loss = []
best_roc = -float('inf')
for epoch in range(1,args.epochs+1):
    etime = time()
    for nbatch, batch in enumerate(train_iter):
        """
        feature shape: (B, Field_num)
        values shape: (B, Field_num)
        """
        # only FM requires ont_hot_encoding, FM_emb, FFM_emb do not need
        # so use_one_hot=True for FM, otherwise use_one_hot=False
        features, labels = convert_feature(batch, use_one_hot=False)
        
        # value in original category. However, in this datase all of values are 1
        values = torch.ones(features.shape)
        if use_gpu:
            features, labels, values = features.cuda(), labels.cuda(), values.cuda()
        probs = model(features, values)
        loss = criterion(probs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if nbatch % args.eval_step==0:
            cur_loss, roc_auc = validation(model, val_iter, criterion)
            if best_roc < roc_auc:
                best_roc = roc_auc
                best_path = model.save()
                print(f"[{epoch}, {nbatch}] save model:{best_path}")
            print(f"[{epoch}, {nbatch}], cur_loss {cur_loss}, roc_auc:{roc_auc}, batch_loss {loss.item()}")
    epoch_loss, roc_auc = validation(model, train_iter, criterion)
    total_loss.append(epoch_loss)
    print(f"Epoch {epoch}, train loss:{epoch_loss}, roc_auc:{roc_auc}")

[1, 0] save model:./model_save_FM_emb_seed_1.pt
[1, 0], cur_loss 11.345163056548213, roc_auc:0.5114522058932062, batch_loss 10.903482437133789
[1, 500] save model:./model_save_FM_emb_seed_1.pt
[1, 500], cur_loss 3.373172189148379, roc_auc:0.5141295337521764, batch_loss 3.0293455123901367
Epoch 1, train loss:2.5038247130916482, roc_auc:0.5307565864419732
[2, 0] save model:./model_save_FM_emb_seed_1.pt
[2, 0], cur_loss 2.5234663335370344, roc_auc:0.5217247304196952, batch_loss 2.1613426208496094
[2, 500] save model:./model_save_FM_emb_seed_1.pt
[2, 500], cur_loss 1.209699400713746, roc_auc:0.5351489105100254, batch_loss 0.8595420122146606
Epoch 2, train loss:1.0324237423377285, roc_auc:0.563045984870705
[3, 0] save model:./model_save_FM_emb_seed_1.pt
[3, 0], cur_loss 1.0336723671832555, roc_auc:0.5462986251704588, batch_loss 1.3454794883728027
[3, 500] save model:./model_save_FM_emb_seed_1.pt
[3, 500], cur_loss 0.8693184735069812, roc_auc:0.5626877654793541, batch_loss 0.8493895530700684

In [16]:
#model.load("model_save_FFM_emb_seed_1.pt")
model.load("model_save_FM_emb_seed_1.pt")
total_loss, roc_auc = validation(model, test_iter, criterion)
print(f"loss:{total_loss}, roc_auc:{roc_auc}")

loss:0.6902047137956362, roc_auc:0.7119764661012618


In [17]:
# FM_emb and FFM_emb result(use epoch 50)
# ffm: 0.74097810204624
# fm: 0.7313277392961333

# catboost

In [18]:
def integrate_all_data(data_iter):
    data_list = []
    label_list = []
    for nbatch, batch in enumerate(data_iter):
        """
        feature shape: (B, Field_num)
        """
        features, labels = convert_feature(batch, use_one_hot=False)
        data_list.append(features.data.numpy())
        label_list.extend(labels.numpy())
    return np.concatenate(data_list, axis=0), label_list

train_data, train_label = integrate_all_data(train_iter)
val_data, val_label = integrate_all_data(val_iter)
test_data, test_label = integrate_all_data(test_iter)

In [19]:
from catboost import CatBoostClassifier, Pool, cv
model = CatBoostClassifier(iterations=2,
                           depth=1,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           early_stopping_rounds=30,
                           verbose=False,
                           task_type='CPU',
                           devices='0')

In [20]:
cat_features = [0, 1]
model.fit(train_data, train_label, cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x7fe0d30b9710>

In [21]:
def validation_for_catboost(model, data, data_label):
    preds_proba = model.predict(data, prediction_type='Probability')
    probs = []
    for l, row in zip(data_label, preds_proba):
        probs.append(row[int(l)])
    results = roc_auc_score(data_label, probs)
    return results, probs

In [24]:
result, probs = validation_for_catboost(model, val_data, val_label)

In [25]:
result

0.7141420457106172