# Define Functions

In [42]:
# Define Layers

import numpy as np
import torch
import torch.nn.functional as F

class FeaturesLinear(torch.nn.Module):
    """First-order (linear) term computed on a dense feature vector.

    A fully connected layer maps the input to ``output_dim`` values and a
    second, separately learned bias vector (initialised to zero) is added on
    top of the layer's own bias.

    :param field_dims: iterable of per-field sizes; their sum is the input width
    :param output_dim: number of output units (default ``1``)
    """

    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        input_width = sum(field_dims)
        self.fc = torch.nn.Linear(input_width, output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, sum(field_dims))``
        :return: Float tensor of size ``(batch_size, output_dim)``
        """
        projected = self.fc(x)
        return projected + self.bias


class FeaturesEmbedding(torch.nn.Module):
    """Dense projection used in place of an embedding lookup.

    The layer is a ``Linear(sum(field_dims), embed_dim)`` whose weight matrix
    is re-initialised with Xavier-uniform values.

    :param field_dims: iterable of per-field sizes; their sum is the input width
    :param embed_dim: width of the projected representation
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        projection = torch.nn.Linear(sum(field_dims), embed_dim)
        torch.nn.init.xavier_uniform_(projection.weight.data)
        self.embedding = projection

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, sum(field_dims))``
        :return: Float tensor of size ``(batch_size, embed_dim)``
        """
        embedded = self.embedding(x)
        return embedded


class FieldAwareFactorizationMachine(torch.nn.Module):
    """Field-aware pairwise interactions built from per-field dense projections.

    One ``Linear(sum(field_dims), embed_dim)`` projection is created per field,
    each with Xavier-uniform weights.

    NOTE: the forward pass indexes the projected ``embed_dim`` axis with field
    indices, so ``embed_dim`` must be at least ``len(field_dims)``.

    :param field_dims: iterable of per-field sizes
    :param embed_dim: output width of every per-field projection
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.num_fields = len(field_dims)
        input_width = sum(field_dims)
        # Build every projection first and re-initialise afterwards so the
        # order of random draws is the same as layer-by-layer construction.
        projections = [
            torch.nn.Linear(input_width, embed_dim) for _ in range(self.num_fields)
        ]
        self.embeddings = torch.nn.ModuleList(projections)
        for projection in self.embeddings:
            torch.nn.init.xavier_uniform_(projection.weight.data)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, sum(field_dims))``
        :return: Float tensor of size ``(batch_size, num_fields * (num_fields - 1) / 2)``
        """
        projected = [projection(x) for projection in self.embeddings]
        # For each field pair (i, j), multiply column i of projection j by
        # column j of projection i.
        pair_terms = [
            projected[j][:, i] * projected[i][:, j]
            for i in range(self.num_fields - 1)
            for j in range(i + 1, self.num_fields)
        ]
        return torch.stack(pair_terms, dim=1)


class FactorizationMachine(torch.nn.Module):
    """Pairwise interaction pooling: ``0.5 * ((sum x)^2 - sum(x^2))`` over fields.

    :param reduce_sum: when true, additionally sum over the embedding axis and
        return one value per sample
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: ``(batch_size, 1)`` when ``reduce_sum`` is set, otherwise
            ``(batch_size, embed_dim)``
        """
        squared_total = x.sum(dim=1).pow(2)
        total_of_squares = x.pow(2).sum(dim=1)
        interactions = squared_total - total_of_squares
        if not self.reduce_sum:
            return 0.5 * interactions
        return 0.5 * interactions.sum(dim=1, keepdim=True)


class MultiLayerPerceptron(torch.nn.Module):
    """Feed-forward stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages.

    :param input_dim: width of the input features
    :param embed_dims: iterable with the width of each hidden stage
    :param dropout: dropout probability used after every hidden stage
    :param output_layer: when true, append a final ``Linear(..., 1)``
    """

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        stages = []
        width_in = input_dim
        for width_out in embed_dims:
            stages.extend([
                torch.nn.Linear(width_in, width_out),
                torch.nn.BatchNorm1d(width_out),
                torch.nn.ReLU(),
                torch.nn.Dropout(p=dropout),
            ])
            width_in = width_out
        if output_layer:
            stages.append(torch.nn.Linear(width_in, 1))
        self.mlp = torch.nn.Sequential(*stages)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        :return: ``(batch_size, 1)`` with the output layer, otherwise
            ``(batch_size, embed_dims[-1])``
        """
        hidden = self.mlp(x)
        return hidden


class InnerProductNetwork(torch.nn.Module):
    """Inner products between the embeddings of every unordered field pair."""

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: Float tensor of size ``(batch_size, num_fields * (num_fields - 1) / 2)``
        """
        num_fields = x.shape[1]
        pairs = [
            (first, second)
            for first in range(num_fields - 1)
            for second in range(first + 1, num_fields)
        ]
        left = [first for first, _ in pairs]
        right = [second for _, second in pairs]
        products = x[:, left] * x[:, right]
        return torch.sum(products, dim=2)


class OuterProductNetwork(torch.nn.Module):
    """Kernel-weighted products between the embeddings of every field pair.

    The learnable kernel has one of three shapes, selected by ``kernel_type``:

    * ``'mat'`` -- ``(embed_dim, num_pairs, embed_dim)``
    * ``'vec'`` -- ``(num_pairs, embed_dim)``
    * ``'num'`` -- ``(num_pairs, 1)``

    where ``num_pairs = num_fields * (num_fields - 1) // 2``. The kernel is
    initialised with Xavier-uniform values.

    :param num_fields: number of fields in the input
    :param embed_dim: embedding width of every field
    :param kernel_type: one of ``'mat'``, ``'vec'`` or ``'num'``
    :raises ValueError: if ``kernel_type`` is not one of the three names
    """

    def __init__(self, num_fields, embed_dim, kernel_type='mat'):
        super().__init__()
        num_ix = num_fields * (num_fields - 1) // 2
        shape_by_type = {
            'mat': (embed_dim, num_ix, embed_dim),
            'vec': (num_ix, embed_dim),
            'num': (num_ix, 1),
        }
        if kernel_type not in shape_by_type:
            raise ValueError('unknown kernel type: ' + kernel_type)
        self.kernel_type = kernel_type
        self.kernel = torch.nn.Parameter(torch.zeros(shape_by_type[kernel_type]))
        torch.nn.init.xavier_uniform_(self.kernel.data)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: Float tensor of size ``(batch_size, num_pairs)``
        """
        num_fields = x.shape[1]
        pairs = [
            (first, second)
            for first in range(num_fields - 1)
            for second in range(first + 1, num_fields)
        ]
        left = [first for first, _ in pairs]
        right = [second for _, second in pairs]
        p = x[:, left]
        q = x[:, right]
        if self.kernel_type != 'mat':
            # 'vec' and 'num' kernels broadcast against the pairwise product.
            return torch.sum(p * q * self.kernel.unsqueeze(0), -1)
        # 'mat': contract p with the kernel, then move the pair axis back to
        # position 1 before contracting with q.
        weighted = torch.sum(p.unsqueeze(1) * self.kernel, dim=-1)
        kp = weighted.permute(0, 2, 1)
        return torch.sum(kp * q, -1)


class CrossNetwork(torch.nn.Module):
    """Stack of cross layers: ``x_{l+1} = x_0 * (w_l . x_l) + b_l + x_l``.

    Each layer owns a bias-free ``Linear(input_dim, 1)`` weight and a separate
    zero-initialised bias vector of length ``input_dim``.

    :param input_dim: width of the flattened input
    :param num_layers: number of cross layers
    """

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        weights = [
            torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)
        ]
        biases = [
            torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)
        ]
        self.w = torch.nn.ModuleList(weights)
        self.b = torch.nn.ParameterList(biases)

    def forward(self, x):
        """
        :param x: Float tensor whose last dimension is ``input_dim``,
            e.g. ``(batch_size, input_dim)``
        :return: Float tensor with the same shape as ``x``
        """
        x0 = x
        layer = 0
        while layer < self.num_layers:
            scale = self.w[layer](x)  # one scalar per row
            x = x0 * scale + self.b[layer] + x
            layer += 1
        return x


class AttentionalFactorizationMachine(torch.nn.Module):
    """Attention-weighted pooling of pairwise field interactions.

    :param embed_dim: embedding width of every field
    :param attn_size: hidden width of the attention network
    :param dropouts: two dropout probabilities; the first is applied to the
        attention weights, the second to the pooled interaction vector
    """

    def __init__(self, embed_dim, attn_size, dropouts):
        super().__init__()
        self.attention = torch.nn.Linear(embed_dim, attn_size)
        self.projection = torch.nn.Linear(attn_size, 1)
        self.fc = torch.nn.Linear(embed_dim, 1)
        self.dropouts = dropouts

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: Float tensor of size ``(batch_size, 1)``
        """
        num_fields = x.shape[1]
        pairs = [
            (first, second)
            for first in range(num_fields - 1)
            for second in range(first + 1, num_fields)
        ]
        left = [first for first, _ in pairs]
        right = [second for _, second in pairs]
        pairwise = x[:, left] * x[:, right]

        # Score each pair, then normalise the scores across the pair axis.
        hidden = F.relu(self.attention(pairwise))
        weights = F.softmax(self.projection(hidden), dim=1)
        weights = F.dropout(weights, p=self.dropouts[0], training=self.training)

        pooled = torch.sum(weights * pairwise, dim=1)
        pooled = F.dropout(pooled, p=self.dropouts[1], training=self.training)
        return self.fc(pooled)


class CompressedInteractionNetwork(torch.nn.Module):
    """Compressed Interaction Network (CIN) built from 1x1 convolutions.

    Every layer takes the outer product of the original fields with the
    previous hidden feature maps and compresses it with a ``Conv1d`` of kernel
    size 1. With ``split_half`` set, every layer except the last passes only
    half of its feature maps to the next layer and keeps the other half for
    the output.

    :param input_dim: number of fields in the input
    :param cross_layer_sizes: number of feature maps produced by each layer
    :param split_half: whether non-final layers split their feature maps
    """

    def __init__(self, input_dim, cross_layer_sizes, split_half=True):
        super().__init__()
        self.num_layers = len(cross_layer_sizes)
        self.split_half = split_half
        self.conv_layers = torch.nn.ModuleList()
        final_index = self.num_layers - 1
        prev_dim = input_dim
        fc_input_dim = 0
        for index in range(self.num_layers):
            layer_size = cross_layer_sizes[index]
            conv = torch.nn.Conv1d(input_dim * prev_dim, layer_size, 1,
                                   stride=1, dilation=1, bias=True)
            self.conv_layers.append(conv)
            # Only half of the maps of a non-final layer are carried forward
            # (and only half are kept for the output) when splitting.
            halve = self.split_half and index != final_index
            prev_dim = layer_size // 2 if halve else layer_size
            fc_input_dim += prev_dim
        self.fc = torch.nn.Linear(fc_input_dim, 1)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: Float tensor of size ``(batch_size, 1)``
        """
        kept_maps = []
        x0 = x.unsqueeze(2)
        hidden = x
        final_index = self.num_layers - 1
        for index in range(self.num_layers):
            outer = x0 * hidden.unsqueeze(1)
            batch_size, f0_dim, fin_dim, embed_dim = outer.shape
            flat = outer.view(batch_size, f0_dim * fin_dim, embed_dim)
            activated = F.relu(self.conv_layers[index](flat))
            if self.split_half and index != final_index:
                kept, hidden = torch.split(activated, activated.shape[1] // 2, dim=1)
            else:
                kept = hidden = activated
            kept_maps.append(kept)
        # Sum-pool every kept feature map over the embedding axis.
        pooled = torch.sum(torch.cat(kept_maps, dim=1), 2)
        return self.fc(pooled)


class AnovaKernel(torch.nn.Module):
    """ANOVA kernel of a given order, evaluated by dynamic programming.

    For each sample and embedding coordinate, the result is the sum over all
    size-``order`` subsets of fields of the product of their values.

    :param order: interaction order (number of dynamic-programming passes)
    :param reduce_sum: when true, sum the result over the embedding axis
    """

    def __init__(self, order, reduce_sum=True):
        super().__init__()
        self.order = order
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: ``(batch_size, 1)`` when ``reduce_sum`` is set, otherwise
            ``(batch_size, embed_dim)``
        """
        batch_size, num_fields, embed_dim = x.shape
        table_shape = (batch_size, num_fields + 1, embed_dim)
        previous = torch.ones(table_shape, dtype=torch.float).to(x.device)
        for t in range(self.order):
            current = torch.zeros(table_shape, dtype=torch.float).to(x.device)
            # Entry k of this pass adds field k-1 to every subset counted in
            # the previous pass; the cumulative sum accumulates over k.
            current[:, t + 1:, :] += x[:, t:, :] * previous[:, t:-1, :]
            current = torch.cumsum(current, dim=1)
            previous = current
        final_row = current[:, -1, :]
        if self.reduce_sum:
            return torch.sum(final_row, dim=-1, keepdim=True)
        return final_row

In [46]:
# Define Models

import math
import torch
import torch.nn.functional as F

# ML Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier
from xgboost import XGBRegressor, XGBClassifier

# FM Model
class FactorizationMachine(torch.nn.Module):
    """Second-order Factorization Machine over a dense feature vector.

    The prediction is ``w . x + b + sum_{i<j} <v_i, v_j> x_i x_j``, where the
    pairwise term is evaluated in linear time as
    ``0.5 * sum_f ((x V)_f^2 - (x^2 V^2)_f)``.

    The class is written against ``torch.nn`` directly so it does not depend
    on an ``nn`` alias being imported elsewhere.

    NOTE: the pairwise term previously had the two operands swapped and so
    contributed the negated interaction. Factor matrices trained with the old
    sign will produce different predictions.

    :param input_dim: number of input features
    :param factors_num: number of latent factors per feature
    """

    def __init__(self, input_dim, factors_num):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
        # Latent factor matrix V of shape (input_dim, factors_num).
        self.interaction = torch.nn.Parameter(torch.empty(input_dim, factors_num))
        torch.nn.init.uniform_(self.interaction, -0.1, 0.1)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        :return: Float tensor of size ``(batch_size, 1)``
        """
        linear_term = self.linear(x)
        square_of_sum = torch.matmul(x, self.interaction).pow(2)
        sum_of_square = torch.matmul(x.pow(2), self.interaction.pow(2))
        interaction_term = 0.5 * torch.sum(square_of_sum - sum_of_square, 1, keepdim=True)
        return linear_term + interaction_term

# DeepFM Model
class DeepFM(torch.nn.Module):
    """DeepFM: an FM component and a deep component sharing field embeddings.

    The class is written against ``torch.nn`` directly so it does not depend
    on an ``nn`` alias being imported elsewhere.

    :param feature_sizes: sequence with the vocabulary size of every field
    :param embedding_size: width of the second-order / deep embeddings
    :param hidden_dims: sequence with the width of each hidden layer
    :param num_classes: stored on the instance; not used by the forward pass
    :param dropout: sequence of dropout probabilities, one per hidden layer
        (must have at least ``len(hidden_dims)`` entries)
    """

    def __init__(self, feature_sizes, embedding_size, hidden_dims, num_classes=1, dropout=(0.5, 0.5)):
        super().__init__()
        self.field_size = len(feature_sizes)
        self.feature_sizes = feature_sizes
        self.embedding_size = embedding_size
        self.hidden_dims = hidden_dims
        self.num_classes = num_classes

        # FM part: one scalar and one vector embedding table per field.
        self.fm_first_order_embeddings = torch.nn.ModuleList(
            [torch.nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes])
        self.fm_second_order_embeddings = torch.nn.ModuleList(
            [torch.nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes])

        # Deep part: layers are registered under numbered attribute names
        # (linear_1, batchNorm_1, ...). list() lets hidden_dims be a tuple.
        all_dims = [self.field_size * self.embedding_size] + list(self.hidden_dims)
        for i in range(1, len(self.hidden_dims) + 1):
            setattr(self, 'linear_' + str(i), torch.nn.Linear(all_dims[i - 1], all_dims[i]))
            setattr(self, 'batchNorm_' + str(i), torch.nn.BatchNorm1d(all_dims[i]))
            setattr(self, 'activation_' + str(i), torch.nn.ReLU())
            setattr(self, 'dropout_' + str(i), torch.nn.Dropout(dropout[i - 1]))

    @staticmethod
    def _scaled_field_embeddings(embeddings, Xi, Xv):
        """Look up each field's embedding, sum over its index axis and scale by its value."""
        return [
            torch.sum(emb(Xi[:, i, :]), 1) * Xv[:, i].unsqueeze(1)
            for i, emb in enumerate(embeddings)
        ]

    def forward(self, Xi, Xv):
        """
        :param Xi: Integer tensor of feature indices, size ``(batch_size, field_size, k)``
        :param Xv: Float tensor of feature values, size ``(batch_size, field_size)``
        :return: Float tensor of size ``(batch_size,)``; the sum of the
            first-order term, the second-order term and the activations of
            the last hidden layer (there is no separate output layer)
        """
        # FM part
        first_order_terms = self._scaled_field_embeddings(self.fm_first_order_embeddings, Xi, Xv)
        fm_first_order = torch.cat(first_order_terms, 1)

        second_order_terms = self._scaled_field_embeddings(self.fm_second_order_embeddings, Xi, Xv)
        summed = sum(second_order_terms)
        square_of_sum = summed * summed  # (x + y)^2
        sum_of_square = sum(term * term for term in second_order_terms)  # x^2 + y^2
        fm_second_order = (square_of_sum - sum_of_square) * 0.5

        # Deep part
        deep_out = torch.cat(second_order_terms, 1)
        for i in range(1, len(self.hidden_dims) + 1):
            deep_out = getattr(self, 'linear_' + str(i))(deep_out)
            deep_out = getattr(self, 'batchNorm_' + str(i))(deep_out)
            deep_out = getattr(self, 'activation_' + str(i))(deep_out)
            deep_out = getattr(self, 'dropout_' + str(i))(deep_out)

        # Sum part
        return torch.sum(fm_first_order, 1) + torch.sum(fm_second_order, 1) + torch.sum(deep_out, 1)

# AFM Model
class AttentionalFactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Attentional Factorization Machine.

    In this variant the forward pass treats the input itself as the field
    embeddings: it is reshaped to ``(batch_size, num_fields, embed_dim)``.
    ``self.embedding`` is constructed but not called in ``forward``.

    Reference:
        J Xiao, et al. Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks, 2017.
    """

    def __init__(self, field_dims, embed_dim, attn_size, dropouts):
        super().__init__()
        self.num_fields = len(field_dims)
        self.embed_dim = embed_dim
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.linear = FeaturesLinear(field_dims)
        self.afm = AttentionalFactorizationMachine(embed_dim, attn_size, dropouts)

    def forward(self, x):
        """
        :param x: Float tensor holding ``num_fields * embed_dim`` values per
            row, e.g. ``(batch_size, num_fields * embed_dim)``; the same tensor
            is also fed to the linear term
        :return: Float tensor of size ``(batch_size,)`` with raw scores
            (no sigmoid is applied)
        """
        field_view = x.view((x.size(0), self.num_fields, self.embed_dim))
        scores = self.linear(x) + self.afm(field_view)
        return scores.squeeze(1)

# DCN Model
class DeepCrossNetworkModel(torch.nn.Module):
    """
    A pytorch implementation of Deep & Cross Network.

    In this variant the forward pass treats the input itself as the
    concatenated field embeddings; ``self.embedding`` is constructed but not
    called in ``forward``.

    Reference:
        R Wang, et al. Deep & Cross Network for Ad Click Predictions, 2017.
    """

    def __init__(self, field_dims, embed_dim, num_layers, mlp_dims, dropout):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.num_fields = len(field_dims)
        self.embed_dim = embed_dim
        self.embed_output_dim = self.num_fields * embed_dim
        self.cn = CrossNetwork(self.embed_output_dim, num_layers)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout, output_layer=False)
        self.linear = torch.nn.Linear(mlp_dims[-1] + self.embed_output_dim, 1)

    def forward(self, x):
        """
        :param x: Float tensor holding ``num_fields * embed_dim`` values per
            row, e.g. ``(batch_size, num_fields * embed_dim)``
        :return: Float tensor of size ``(batch_size,)`` with raw scores
            (no sigmoid is applied)
        """
        # The first view checks that the input holds exactly
        # num_fields * embed_dim values per row before it is flattened again.
        per_field = x.view((x.size(0), self.num_fields, self.embed_dim))
        flat = per_field.view(-1, self.embed_output_dim)
        crossed = self.cn(flat)
        deep = self.mlp(flat)
        stacked = torch.cat([crossed, deep], dim=1)
        return self.linear(stacked).squeeze(1)

# xDeepFM Model
class ExtremeDeepFactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of xDeepFM.

    In this variant the forward pass treats the input itself as the field
    embeddings: it is reshaped to ``(batch_size, num_fields, embed_dim)``.
    ``self.embedding`` is constructed but not called in ``forward``.

    Reference:
        J Lian, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems, 2018.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, dropout, cross_layer_sizes, split_half=True):
        super().__init__()
        field_count = len(field_dims)
        self.num_fields = field_count
        self.embed_dim = embed_dim
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.embed_output_dim = field_count * embed_dim
        self.cin = CompressedInteractionNetwork(field_count, cross_layer_sizes, split_half)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)
        self.linear = FeaturesLinear(field_dims)

    def forward(self, x):
        """
        :param x: Float tensor holding ``num_fields * embed_dim`` values per
            row, e.g. ``(batch_size, num_fields * embed_dim)``; the same tensor
            is also fed to the linear term
        :return: Float tensor of size ``(batch_size,)`` with raw scores
            (no sigmoid is applied)
        """
        per_field = x.view((x.size(0), self.num_fields, self.embed_dim))
        linear_part = self.linear(x)
        cin_part = self.cin(per_field)
        deep_part = self.mlp(per_field.view(-1, self.embed_output_dim))
        scores = linear_part + cin_part + deep_part
        return scores.squeeze(1)

# AutoInt Model
class AutomaticFeatureInteractionModel(torch.nn.Module):
    """
    A pytorch implementation of AutoInt.

    In this variant the forward pass treats the input itself as the field
    embeddings: it is reshaped to ``(batch_size, num_fields, embed_dim)``.
    ``self.embedding`` is constructed but not called in ``forward``.

    Reference:
        W Song, et al. AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks, 2018.
    """

    def __init__(self, field_dims, embed_dim, atten_embed_dim, num_heads, num_layers, mlp_dims, dropouts, has_residual=True):
        super().__init__()
        field_count = len(field_dims)
        self.num_fields = field_count
        self.embed_dim = embed_dim
        self.linear = FeaturesLinear(field_dims)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.atten_embedding = torch.nn.Linear(embed_dim, atten_embed_dim)
        self.embed_output_dim = field_count * embed_dim
        self.atten_output_dim = field_count * atten_embed_dim
        self.has_residual = has_residual
        # dropouts[1] is used by the MLP, dropouts[0] by the attention layers.
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropouts[1])
        attention_stack = [
            torch.nn.MultiheadAttention(atten_embed_dim, num_heads, dropout=dropouts[0])
            for _ in range(num_layers)
        ]
        self.self_attns = torch.nn.ModuleList(attention_stack)
        self.attn_fc = torch.nn.Linear(self.atten_output_dim, 1)
        if self.has_residual:
            self.V_res_embedding = torch.nn.Linear(embed_dim, atten_embed_dim)

    def forward(self, x):
        """
        :param x: Float tensor holding ``num_fields * embed_dim`` values per
            row, e.g. ``(batch_size, num_fields * embed_dim)``; the same tensor
            is also fed to the linear term
        :return: Float tensor of size ``(batch_size,)`` with raw scores
        """
        per_field = x.view((x.size(0), self.num_fields, self.embed_dim))

        # The attention layers are fed fields-first: (num_fields, batch, dim).
        attended = self.atten_embedding(per_field).transpose(0, 1)
        for attention in self.self_attns:
            attended, _ = attention(attended, attended, attended)
        attended = attended.transpose(0, 1)

        if self.has_residual:
            attended += self.V_res_embedding(per_field)
        attended = F.relu(attended).contiguous().view(-1, self.atten_output_dim)

        deep_part = self.mlp(per_field.view(-1, self.embed_output_dim))
        scores = self.linear(x) + self.attn_fc(attended) + deep_part
        return scores.squeeze(1)

# AFN Model
class LNN(torch.nn.Module):
    """
    A pytorch implementation of LNN layer
    Input shape
        - A 3D tensor with shape: ``(batch_size,field_size,embedding_size)``.
    Output shape
        - 2D tensor with shape:``(batch_size,LNN_dim*embedding_size)``.
    Arguments
        - **num_fields**: int.The field size of feature.
        - **embed_dim**: int.The embedding size of every field.
        - **LNN_dim**: int.The number of Logarithmic neuron.
        - **bias**: bool.Whether or not use bias in LNN.
    """
    def __init__(self, num_fields, embed_dim, LNN_dim, bias=False):
        super().__init__()
        self.num_fields = num_fields
        self.embed_dim = embed_dim
        self.LNN_dim = LNN_dim
        self.lnn_output_dim = LNN_dim * embed_dim
        # Allocated uninitialised; reset_parameters() fills in the values.
        self.weight = torch.nn.Parameter(torch.Tensor(LNN_dim, num_fields))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = torch.nn.Parameter(torch.Tensor(LNN_dim, embed_dim))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight (and bias, if present) uniformly from ``[-1/sqrt(num_fields), 1/sqrt(num_fields)]``."""
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.uniform_(-bound, bound)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embedding_size)``
        :return: Float tensor of size ``(batch_size, LNN_dim * embedding_size)``
        """
        # Keep the inputs strictly positive before the logarithm.
        positive = torch.add(torch.abs(x), 1e-7)
        # Logarithmic transformation (log1p here, expm1 on the way back).
        log_space = torch.log1p(positive)
        mixed = torch.matmul(self.weight, log_space)
        if self.bias is not None:
            mixed += self.bias
        restored = torch.expm1(mixed)
        return F.relu(restored).contiguous().view(-1, self.lnn_output_dim)

class AdaptiveFactorizationNetwork(torch.nn.Module):
    """
    A pytorch implementation of AFN.

    In this variant the forward pass treats the input itself as the field
    embeddings: it is reshaped to ``(batch_size, num_fields, embed_dim)``.
    ``self.embedding`` is constructed but not called in ``forward``.

    Reference:
        Cheng W, et al. Adaptive Factorization Network: Learning Adaptive-Order Feature Interactions, 2019.
    """
    def __init__(self, field_dims, embed_dim, LNN_dim, mlp_dims, dropouts):
        super().__init__()
        self.num_fields = len(field_dims)
        self.linear = FeaturesLinear(field_dims)  # first-order term
        self.embed_dim = embed_dim
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.LNN_dim = LNN_dim
        self.LNN_output_dim = LNN_dim * embed_dim
        self.LNN = LNN(self.num_fields, embed_dim, LNN_dim)
        # Only dropouts[0] is used; it is the MLP's dropout probability.
        self.mlp = MultiLayerPerceptron(self.LNN_output_dim, mlp_dims, dropouts[0])

    def forward(self, x):
        """
        :param x: Float tensor holding ``num_fields * embed_dim`` values per
            row, e.g. ``(batch_size, num_fields * embed_dim)``; the same tensor
            is also fed to the linear term
        :return: Float tensor of size ``(batch_size,)`` with raw scores
            (no sigmoid is applied)
        """
        per_field = x.view((x.size(0), self.num_fields, self.embed_dim))
        log_features = self.LNN(per_field)
        scores = self.linear(x) + self.mlp(log_features)
        return scores.squeeze(1)

In [115]:
# Define Training and Validation (or Testing) Function

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def model_training_validtion_or_testing(X_train, y_train, X_valid, y_valid, 
                             model_name, hyperparameters, task_type, loss_type, optimizer_type,
                             dl_learning_rate, epochs_num, batch_size, save_records=True, testing=False):
    
    deep_learning_model_names = ["FM", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost"]
    
    model = 0

    if model_name in deep_learning_model_names:

        if model_name == "FM":
            model = FactorizationMachine(**hyperparameters)
        elif model_name == "DeepFM":
            model = DeepFM(**hyperparameters)
        elif model_name == "AFM":
            model = AttentionalFactorizationMachineModel(**hyperparameters)
        elif model_name == "DCN":
            model = DeepCrossNetworkModel(**hyperparameters)
        elif model_name == "xDeepFM":
            model = ExtremeDeepFactorizationMachineModel(**hyperparameters)
        elif model_name == "AutoInt":
            model = AutomaticFeatureInteractionModel(**hyperparameters)
        elif model_name == "AFN":
            model = AdaptiveFactorizationNetwork(**hyperparameters)
        else:
            print(f"Please choose a model in {deep_learning_model_names} !")

        # Convert to float tensor
        X_train_tensor = torch.from_numpy(X_train).float()
        Xi_train_tensor = torch.arange(X_train_tensor.size(1)).unsqueeze(0).unsqueeze(-1).repeat(X_train_tensor.size(0), 1, 1).int()
        y_train_tensor = torch.from_numpy(y_train).float()
        X_valid_tensor = torch.from_numpy(X_valid).float()
        Xi_valid_tensor = torch.arange(X_valid_tensor.size(1)).unsqueeze(0).unsqueeze(-1).repeat(X_valid_tensor.size(0), 1, 1).int()
        y_valid_tensor = torch.from_numpy(y_valid).float()

        # Convert data to DataLoader
        train_dataset = TensorDataset(Xi_train_tensor, X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        validation_dataset = TensorDataset(Xi_valid_tensor, X_valid_tensor, y_valid_tensor)
        validation_loader = DataLoader(validation_dataset, batch_size=len(validation_data), shuffle=True)

        criterion = nn.MSELoss() # Use mean squared error loss as default criterion type
#         metric_type = "RMSE" # Use RMSE as default metric type
        
        regression_loss_list = ["MSE, RMSE"]
        classification_loss_list = ["CrossEntropy", "BinaryCrossEntropy"]
#         classification_metric_list = ["Accuracy", "AUC", "ROC", "F1-score"]

        optimizer_type_list = ["Adam", "Adagrad", "RMSprop", "Adadelta", "Adamax", "Nadam"]
        optimizer = optim.Adam(model.parameters(), lr=dl_learning_rate)  # Use Adam optimizer as default type
        
        if optimizer_type == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=dl_learning_rate)
        elif optimizer_type == "Adagrad":
            optimizer = optim.Adagrad(model.parameters(), lr=dl_learning_rate)
        elif optimizer_type == "RMSprop":
            optimizer = optim.RMSprop(model.parameters(), lr=dl_learning_rate)
        elif optimizer_type == "Adadelta":
            optimizer = optim.Adadelta(model.parameters(), lr=dl_learning_rate)
        elif optimizer_type == "Adamax":
            optimizer = optim.Adamax(model.parameters(), lr=dl_learning_rate)
        elif optimizer_type == "Nadam":
            optimizer = optim.Nadam(model.parameters(), lr=dl_learning_rate)
        else:
            print(f"Please select an optimizer in {optimizer_name_list}")
            return 0
        
        # Train the model
        training_results_df = []
        
        print(f'Start training {model_name} model ...')
        for epoch in range(epochs_num):
            
            total_loss = 0.0
            total_batches = 0
            
            total_rmse = 0.0
            total_mse = 0.0
            total_accuracy = 0.0
            total_auc_score = 0.0
            total_f1 = 0.0
            total_precision = 0.0
            total_recall = 0.0
            total_specificity = 0.0

            for t, (xi, x, y_true) in enumerate(train_loader):
                
                optimizer.zero_grad()
                
                y_pred = 0
                if model_name == "DeepFM":
                    y_pred = model(xi, x)
                else:    
                    y_pred = model(x)
                
                loss = 0.0
                
                rmse = 0.0
                mse = 0.0
                accuracy = 0.0
                auc_score = 0.0
                f1 = 0.0
                precision = 0.0
                recall = 0.0
                specificity = 0.0
                
                if task_type == "Regression":
                    criterion = nn.MSELoss()
                    if loss_type == "RMSE":
                        eps = 1e-6
                        if model_name == "FM":
                            loss = torch.sqrt(criterion(y_pred, y_true.view(-1, 1)) + eps)
                        else:
                            loss = torch.sqrt(criterion(y_pred, y_true.view(-1)) + eps)
                    elif loss_type == "MSE":
                        if model_name == "FM":
                            loss = criterion(y_pred, y_true.view(-1, 1))
                        else:        
                            loss = criterion(y_pred, y_true.view(-1))
                    else:
                        print(f"Please make sure loss type is in {regression_loss_list}")
                        return 0
                    
                    y_true = y_true.detach().numpy()
                    y_pred = y_pred.detach().numpy()
                    
                    rmse = root_mean_squared_error(y_true, y_pred)
                    mse = mean_squared_error(y_true, y_pred)
                    
                elif task_type == "Classification":
                    if loss_type == "BCE":
                        criterion = nn.BCELoss()
                        y_pred = torch.sigmoid(y_pred)
                    elif loss_type == "CE":
                        criterion = nn.CrossEntropyLoss()
                    else:
                        print(f"Please make sure loss type is in {classification_loss_list}")
                        return 0
                    
                    loss = criterion(y_pred, y_true)
                    
                    y_true = y_true.detach().numpy()
                    y_pred = y_pred.detach().numpy()
                    
                    accuracy = accuracy_score(y_true, y_pred) # Accuracy
                    auc_score = roc_auc_score(y_true, y_pred) # AUC
                    f1 = f1_score(y_true, y_pred) # F1-score
                    precision = precision_score(y_true, y_pred) # Precision
                    recall = recall_score(y_true, y_pred) # Recall
                    specificity = recall_score(y_true, y_pred, pos_label=0) # Specificity (True Negative Rate)
                    
                else:
                    print("Please make sure the task is regression or classification !")
                    return 0
                
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                total_batches += 1
                
                total_rmse += rmse
                total_mse += mse
                total_accuracy += accuracy
                total_auc_score += auc_score
                total_f1 += f1
                total_precision += precision
                total_recall += recall
                total_specificity += specificity
            
            avg_loss = total_loss / total_batches
            
            avg_rmse = total_rmse / total_batches
            avg_mse = total_mse / total_batches
            
            avg_accuracy = total_accuracy / total_batches
            avg_auc_score = total_auc_score / total_batches
            avg_f1 = total_f1 / total_batches
            avg_precision = total_precision / total_batches
            avg_recall = total_recall / total_batches
            avg_specificity = total_specificity / total_batches
            
            value_list = [
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, 
                "for_testing" if testing else "for_validation",
                *hyperparameters.values(), task_type, loss_type,
                optimizer_type, dl_learning_rate, epochs_num, batch_size, f"{epoch + 1}th/{epochs_num}", 
                avg_loss, avg_rmse, avg_mse, avg_accuracy, avg_auc_score, 
                avg_f1, avg_precision, avg_recall, avg_specificity
            ]
#             print(f'Epoch {epoch + 1}/{epochs_num}, Average RMSE: {average_rmse:.4f}')
            
            training_results_df.append(value_list)

        # Make predictions on validation data
        
        validation_results_df = []
        
        print("Start validating ...")
        all_predictions = []
        
        with torch.no_grad():
            for t, (xi, x, y_true) in enumerate(validation_loader):
                
                predictions = 0
                if model_name == "DeepFM":
                    predictions = model(xi, x)
                else:
                    predictions = model(x)
                
                all_predictions.append(predictions.numpy())

        # Concatenate all predictions into a single numpy array
        all_predictions = np.concatenate(all_predictions)

        # Reshape all_predictions if needed
        y_pred = all_predictions.squeeze()

        # Calculate metric values
        rmse = 0.0
        mse = 0.0
        accuracy = 0.0
        auc_score = 0.0
        f1 = 0.0
        precision = 0.0
        recall = 0.0
        specificity = 0.0
        
        y_true = y_true.detach().numpy() # Convert PyTorch tensor to NumPy array
        
        if task_type == "Regression":
            rmse = root_mean_squared_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
        elif task_type == "Classification":
            accuracy = accuracy_score(y_true, y_pred) # Accuracy
            auc_score = roc_auc_score(y_true, y_pred) # AUC
            f1 = f1_score(y_true, y_pred) # F1-score
            precision = precision_score(y_true, y_pred) # Precision
            recall = recall_score(y_true, y_pred) # Recall
            specificity = recall_score(y_true, y_pred, pos_label=0) # Specificity (True Negative Rate)
        else:
            print("Please make sure the task is regression or classification !")
            return 0
        
        value_list = [
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, *hyperparameters.values(), 
            task_type, loss_type, optimizer_type, dl_learning_rate, epochs_num, batch_size, 
            rmse, mse, accuracy, auc_score, f1, precision, recall, specificity
        ]
        
        validation_results_df.append(value_list)
#         print(f"RMSE on the validation set: {rmse}")
        
        # Save training and validation results to file
        training_column_list = [
            "Timestamp", "Model_name", "Training_for", *[f"{model_name}_{key}" for key in list(hyperparameters.keys())], 
            "Task_type", "Loss_type", "Optimizer_type", "DL_learning_rate", "Epochs_num", "Batch_size", 
            "Epoch/Epochs_num", "Avg_Loss", "Avg_RMSE", "Avg_MSE", "Avg_Accuracy", 
            "Avg_AUC_score", "Avg_F1", "Avg_Precision", "Avg_Recall", "Avg_Specificity"
        ]
        validation_column_list = [
            "Timestamp", "Model_name", *[f"{model_name}_{key}" for key in list(hyperparameters.keys())], 
            "Task_type", "Loss_type", "Optimizer_type", "DL_learning_rate", "Epochs_num", "Batch_size", 
            "RMSE", "MSE", "Accuracy", "AUC_score", "F1", "Precision", "Recall", "Specificity"
        ]
        
        training_records_df = pd.DataFrame(training_results_df, columns=training_column_list)
        validation_records_df = pd.DataFrame(validation_results_df, columns=validation_column_list)
        
#         output = ', '.join([f"{col} : {val}" for col, val in validation_records_df.iloc[0].items()])
#         print(output)
        
        if save_records:
            
            # Create folder if not exists
            folder_name = f"{model_name}_result_records"
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            
            training_file_path = os.path.join(folder_name, f'{model_name}_training_result_records.csv')
            
            if not os.path.exists(training_file_path):
                training_records_df.to_csv(training_file_path, index=False, float_format='%.4f')
            else:
                training_records_df.to_csv(training_file_path, mode='a', header=False, index=False, float_format='%.4f')
            
            # If testing, then change validation results csv file path to testing results csv file path
            validation_file_path = os.path.join(folder_name, f'{model_name}_validation_result_records.csv')
            
            if testing:
                validation_file_path = os.path.join(folder_name, f'{model_name}_testing_result_records.csv')
            else: 
                validation_file_path = validation_file_path
            
            if not os.path.exists(validation_file_path):
                validation_records_df.to_csv(validation_file_path, index=False, float_format='%.4f')
            else:
                validation_records_df.to_csv(validation_file_path, mode='a', header=False, index=False, float_format='%.4f')
        else:
            print("Training and validation results are not saved to csv !")

    elif model_name in machine_learning_model_names:

        if model_name == "Linear":
            model = LinearRegression() if task_type == "Regression" else LogisticRegression()
        elif model_name == "KNN":
            model = KNeighborsRegressor(**hyperparameters) if task_type == "Regression" else KNeighborsClassifier(**hyperparameters)
        elif model_name == "SVM":
            model = SVR(**hyperparameters) if task_type == "Regression" else SVC(**hyperparameters)
        elif model_name == "DecisionTree":
            model = DecisionTreeRegressor(**hyperparameters) if task_type == "Regression" else DecisionTreeClassifier(**hyperparameters)
        elif model_name == "RandomForest":
            model = RandomForestRegressor(**hyperparameters) if task_type == "Regression" else RandomForestClassifier(**hyperparameters)
        elif model_name == "AdaBoost":
            model = AdaBoostRegressor(**hyperparameters) if task_type == "Regression" else AdaBoostClassifier(**hyperparameters)
        elif model_name == "XGBoost":
            model = XGBRegressor(**hyperparameters) if task_type == "Regression" else XGBClassifier(**hyperparameters)
        else:
            print(f"Please choose a model in {machine_learning_model_names} !")

        # Train the model
        print(f'Start training {model_name} model ...')
        model.fit(X_train, y_train)

        # Make predictions on validation data
        
        validation_results_df = []
        
        print("Start validating ...")
        y_pred = model.predict(X_valid)

        # Calculate metric values
        rmse = 0.0
        mse = 0.0
        accuracy = 0.0
        auc_score = 0.0
        f1 = 0.0
        precision = 0.0
        recall = 0.0
        specificity = 0.0
        
        if task_type == "Regression":
            rmse = root_mean_squared_error(y_valid, y_pred)
            mse = mean_squared_error(y_valid, y_pred)
        elif task_type == "Classification":
            accuracy = accuracy_score(y_valid, y_pred) # Accuracy
            auc_score = roc_auc_score(y_valid, y_pred) # AUC
            f1 = f1_score(y_valid, y_pred) # F1-score
            precision = precision_score(y_valid, y_pred) # Precision
            recall = recall_score(y_valid, y_pred) # Recall
            specificity = recall_score(y_valid, y_pred, pos_label=0) # Specificity (True Negative Rate)
        else:
            print("Please make sure the task is regression or classification !")
            return 0
        
        value_list = [
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, *hyperparameters.values(), task_type, 
            rmse, mse, accuracy, auc_score, f1, precision, recall, specificity
        ]
        
        validation_results_df.append(value_list)       
#         print(f"RMSE on the validation set: {rmse}")
        
        # Save validation results to file
        training_records_df = f"{model_name} has no training result records."
        validation_column_list = [
            "Timestamp", "Model_name", *[f"{model_name}_{key}" for key in list(hyperparameters.keys())], "Task_type",
            "RMSE", "MSE", "Accuracy", "AUC_score", "F1", "Precision", "Recall", "Specificity"
        ]
        validation_records_df = pd.DataFrame(validation_results_df, columns=validation_column_list)
        
#         output = ', '.join([f"{col} : {val}" for col, val in validation_records_df.iloc[0].items()])
#         print(output)
        
        if save_records:
            folder_name = f"{model_name}_result_records"
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
            
            file_path = os.path.join(folder_name, f'{model_name}_validation_result_records.csv')
            
            # If testing, then change validation results csv file path to testing results csv file path
            if testing:
                file_path = os.path.join(folder_name, f'{model_name}_testing_result_records.csv')
            else: 
                file_path = file_path
            
            if not os.path.exists(file_path):
                validation_records_df.to_csv(file_path, index=False, float_format='%.4f')
            else:
                validation_records_df.to_csv(file_path, mode='a', header=False, index=False, float_format='%.4f')
        else:
            print("Training and validation results are not saved to csv !")

    else:
        print(f"Please select a model in {deep_learning_model_names} and {machine_learning_model_names} !")
        return 0
    
    return training_records_df, validation_records_df

# Experiments: Training and Validation

In [68]:
import os
import pandas as pd
import numpy as np

# File names of the prepared training and validation splits.
# data_path = '../Data_preprocessing/'
training_data_path = 'research_training_set_with_concatenated_reviews_and_feature_vectors.csv'
validation_data_path = 'research_validation_set_with_concatenated_reviews_and_feature_vectors.csv'

# Read each split into its own DataFrame.
training_data = pd.read_csv(training_data_path)
validation_data = pd.read_csv(validation_data_path)
# training_data.head()
# validation_data.head()

In [77]:
# Training and Validation Data preprocessing

import ast


def _stack_feature_columns(frame, columns):
    """Turn stringified vector columns into one feature matrix.

    For every row, the cells of ``columns`` are parsed and the resulting
    vectors are concatenated in the order given by ``columns``.

    The cells are parsed with ``ast.literal_eval`` instead of ``eval`` so
    that text read from the CSV can never execute code. This assumes every
    cell is a plain Python literal (e.g. ``"[0.1, 0.2]"``) -- TODO confirm
    against the preprocessing step that wrote the file.

    :param frame: DataFrame that contains the columns named in ``columns``
    :param columns: column names to parse and concatenate, in order
    :return: ``np.ndarray`` holding one concatenated vector per row
    :raises ValueError: if a cell is not a valid Python literal
    """
    rows = [
        np.concatenate([np.array(ast.literal_eval(cell)) for cell in cells])
        # Iterate the columns in lockstep (positional order), so the result
        # does not depend on the DataFrame having a 0..n-1 index.
        for cells in zip(*(frame[col] for col in columns))
    ]
    return np.array(rows)


# columns_to_train = ['user_feature_vector', 'business_feature_vector']
columns_to_train = ['user_feature_vector', 'business_feature_vector', "categories_feature_vector"]

X_train = _stack_feature_columns(training_data, columns_to_train)
y_train = np.array(training_data['stars'])

X_valid = _stack_feature_columns(validation_data, columns_to_train)
y_valid = np.array(validation_data['stars'])

In [114]:
# FM
# input_dim, factors_num
# field_dims, embed_dim=16

# Sizes shared by the parameter grids in this cell.
field_num = len(columns_to_train)  # one field per feature-vector column
embedding_size = 768
input_dim = len(X_train[0])  # length of one concatenated feature row

# FM grid: every key maps to the list of values to try.
FM_params = {
    "hyperparameters": {
        "input_dim": [input_dim],
        "factors_num": [5, 10, 15],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# DeepFM
# feature_sizes, embedding_size, hidden_dims, num_classes=1, dropout=[0.5, 0.5]

# Paper:
# test activation function: relu, tanh (relu is better)
# embedding size seems to be 5
# dropout: test 1, 0.9, 0.8, 0.7, 0.6, 0.5 (when 0.6 ~ 0.9 is better)
# hidden layer num: tested 1, 3, 5, 7; performance improves at the beginning, but gets worse as more layers are added
# shape: given layer num and total neuron num, test 4 shape, such as 3, 600 =>
# constant (200-200-200), increasing (100-200-300), decreasing (300-200-100), and diamond (150-300-150)
# constant is better, which is consistent with previous studies

# DeepFM grid: every key maps to the list of values to try.
DeepFM_params = {
    "hyperparameters": {
        "feature_sizes": [[embedding_size] * field_num],
        "embedding_size": [4, 8],
        "hidden_dims": [[64, 32], [128, 64], [256, 128]],
        "num_classes": [1],
        "dropout": [(0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# AFM
# field_dims, embed_dim, attn_size, dropouts 
# embed_dim=16, attn_size=16, dropouts=(0.2, 0.2)

# Paper
# optimizer: mini-batch Adagrad. The batch size for Frappe and MovieLens is set to 128 and 4096 
# The embedding size is set to 256 for all methods
# without special mention, the attention factor is also 256 (test on 1, 4, 8, 16, 32, 64, 128, 256)
# Validation error: performance is stable across different size of attention factors
# also test dropout for fm, afm, fifm, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8
# Specifically, for AFM, the optimal dropout ratio on Frappe and MovieLens is 0.2 and 0.5
# also tested L2 regularization (attention network) over 0, 0.5, 1, 2, 4, 8, 16
# shows that using L2 is better; as L2 gets larger, RMSE decreases gently

# AFM grid: every key maps to the list of values to try.
AFM_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "attn_size": [8, 16],
        "dropouts": [(0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# DCN
# field_dims, embed_dim, num_layers, mlp_dims, dropout 
# embed_dim=16, num_layers=3, mlp_dims=(16, 16), dropout=0.2

# Paper:
# cross layer(num_layers): 1~6, hidden layer: 2 ~ 5, with size 32 ~ 1024, 
# Based on test loss, 2 deep layers of size 1024 and 6 cross layers for the DCN model
# 5 deep layers of size 1024 for the DNN
# Based on validation loss, more cross layer num DCN loss decrease
 
# initial learning rate was tuned from 0.0001 to 0.001 with increments of 0.0001
# early stopping: at training step 150,000
# optimization: mini-batch stochastic optimization with Adam optimizer. 
# batch size: 512. Batch normalization was applied to the deep network
# gradient clip: 100
# Real-valued features are normalized by applying a log transform. 
# For categorical features, we embed the features in dense vectors of dimension 6 × (category cardinality)^(1/4).
# Concatenating all embeddings results in a vector of dimension 1026

# Not CTR task, grid search:
# deep layers num ranged from 1 to 10 with layer size from 50 to 300. 
# The number of cross layers ranged from 4 to 10. 
# The number of residual units ranged from 1 to 5 with their input dim and cross dim from 50 to 300.
# For DCN, the input vector was fed to the cross network directly

# DCN grid: every key maps to the list of values to try.
DCN_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "num_layers": [3],
        "mlp_dims": [(16, 16)],
        "dropout": [0.5],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# xDeepFM
# embed_dim, mlp_dims, dropout, cross_layer_sizes, split_half=True 
# embed_dim=16, cross_layer_sizes=(16, 16), split_half=False, mlp_dims=(16, 16), dropout=0.2

# Paper
# Learning rate : 0.001. optimization method: Adam, mini-batch size: 4096. 
# L2 regularization with λ = 0.0001 for DNN, DCN, Wide&Deep, DeepFM and xDeepFM, dropout 0.5 for PNN
# embed_dim = 10, 
# Neuron num per layer: (1) 400 for DNN layers; 
# (2) 200 for CIN layers on Criteo dataset, and 100 for CIN layers on Dianping and Bing News datasets
# tested CIN num_layer 1, 2, 3, 4 (3 is best, performance degrades after 3)
# test cin num neurons 20, 40, 100, 200, Bing news is increasing(200 best), Dianping is 20 ~ 100 better, 200 degrades 
# test cin sigmoid, tanh, relu, identity 4 activation, identity is best, then relu
# best cross depth and dnn depth is (3, 2)

# xDeepFM grid: every key maps to the list of values to try.
xDeepFM_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "cross_layer_sizes": [(16, 16)],
        "split_half": [False],
        "mlp_dims": [(16, 16)],
        "dropout": [0.5],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# AutoInt
# field_dims, embed_dim, atten_embed_dim, num_heads, num_layers, mlp_dims, dropouts, has_residual=True
# embed_dim=16, atten_embed_dim=64, num_heads=2, num_layers=3, mlp_dims=(400, 400), dropouts=(0, 0, 0)

# Paper:
# embed_dim is set to 16, batch size set to 1024
# num_head is 2, num_layer(interaction layer) is 3, num of hidden units (interaction layer) is 32
# test dropout from 0.1 ~ 0.9
# optimizer is Adam
# tested num_layer (interaction layer) from 0 ~ 4: performance increases dramatically at 1, then becomes stable for 1 ~ 4
# test atten_embed_dim 8, 16, 24, 32, movie len is getting better, for KDD12, 24 is best, then decrease
# hidden units shape is set to (1, 200) or (4, 100)
# residual connection is crucial

# AutoInt grid: every key maps to the list of values to try.
AutoInt_params = {
    'hyperparameters': {
        'field_dims': [[embedding_size for i in range(field_num)]], 
        'embed_dim': [768], 
        # NOTE(review): this list holds ONE candidate, the tuple (64, 32).
        # The signature comment for this model shows atten_embed_dim=64 (an
        # int); if the two sizes 64 and 32 were meant to be tried separately
        # this should be [64, 32]. Confirm against the AutoInt constructor.
        'atten_embed_dim': [(64, 32)],
        "num_heads": [2],
        "num_layers": [3],
        'mlp_dims': [(16, 16), (400, 400)],
        'dropouts': [(0.5, 0.5, 0.5)],
        "has_residual": [True]
    },
    'func_params':{
        'task_type': ['Regression'], 'loss_type': ['RMSE'], 'optimizer_type': ['Adam'], 
        'dl_learning_rate': [0.01, 0.02, 0.001], 'epochs_num': [10, 20, 30], 'batch_size': [100, 500, 1000]
    }
}

# AFN
# field_dims, embed_dim, LNN_dim, mlp_dims, dropouts
# embed_dim=16, LNN_dim=1500, mlp_dims=(400, 400, 400), dropouts=(0, 0, 0)

# Paper:
# LNN_dim: 1500, 1200, 800, 600, embed_dim: 10
# batch size: 4096
# optimizer: Adam, learning rate: 0.001
# mlp_dims: (400, 400, 400)
# All the other hyperparameters are tuned on the validation set

# AFN grid: every key maps to the list of values to try.
AFN_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "LNN_dim": [1500, 1000],
        "mlp_dims": [(400, 400, 400)],
        "dropouts": [(0.5, 0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 200, 500],
    },
}

# # Linear parameters
# Linear grid: no hyperparameters; the loss / optimizer / epoch / batch
# settings are single ``None`` placeholders.
Linear_params = {
    "hyperparameters": {},
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # K-Nearest Neighbors parameters
# n_neighbors: int, default=5  # Number of neighbors
# weights: {'uniform', 'distance'}, default='uniform'  # Weight function used in prediction
# metric: {'euclidean', 'manhattan', 'chebyshev', 'minkowski', 
#          'wminkowski', 'seuclidean', 'mahalanobis'}, default='minkowski'  # Distance metric

# KNN grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
KNN_params = {
    "hyperparameters": {
        "n_neighbors": [5],
        "weights": ["uniform"],
        "metric": ["euclidean"],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # Support Vector Machine parameters
# C: float, default=1.0  # Penalty parameter C of the error term
# kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'  # Kernel function
# gamma: {'scale', 'auto'} or float, default='scale'  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'

# SVM grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
SVM_params = {
    "hyperparameters": {
        "C": [1.0],
        "kernel": ["rbf", "poly", "sigmoid"],
        "gamma": ["scale"],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # Decision Tree parameters
# max_depth: int or None, default=None  # Maximum depth of the tree
# min_samples_split: int or float, default=2  # Minimum number of samples required to split an internal node
# min_samples_leaf: int or float, default=1  # Minimum number of samples required to be at a leaf node

# Decision-tree grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
DecisionTree_params = {
    "hyperparameters": {
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # Random Forest parameters
# max_depth: int or None, default=None  # Maximum depth of the tree
# min_samples_split: int or float, default=2  # Minimum number of samples required to split an internal node
# min_samples_leaf: int or float, default=1  # Minimum number of samples required to be at a leaf node
# n_estimators: int, default=100  # Number of trees in the forest

# Random-forest grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
RandomForest_params = {
    "hyperparameters": {
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "n_estimators": [50, 100],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # AdaBoost parameters
# n_estimators: int, default=50  # Maximum number of estimators at which boosting is terminated
# learning_rate: float, default=1.0  # Learning rate shrinks the contribution of each classifier

# AdaBoost grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
AdaBoost_params = {
    "hyperparameters": {
        "n_estimators": [50, 100],
        "learning_rate": [0.01, 0.02, 0.001],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# # XGBoost parameters
# learning_rate: float, default=0.1  # Boosting learning rate (xgb's "eta")
# n_estimators: int, default=100  # Number of boosted trees to fit
# subsample: float, default=1  # Subsample ratio of the training instances (xgb's "subsample")
# colsample_bytree: float, default=1  # Subsample ratio of columns when constructing each tree (xgb's "colsample_bytree")

# XGBoost grid: every key maps to the list of values to try; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
XGBoost_params = {
    "hyperparameters": {
        "learning_rate": [0.01, 0.02, 0.001],
        "n_estimators": [50, 100],
        "subsample": [1],
        "colsample_bytree": [1],
    },
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# Training hyperparameter combinations setting

# Lookup from model name to its parameter grid (deep-learning models first,
# then the classical machine-learning models).
func_params_list = dict(
    FM=FM_params,
    DeepFM=DeepFM_params,
    AFM=AFM_params,
    DCN=DCN_params,
    xDeepFM=xDeepFM_params,
    AutoInt=AutoInt_params,
    AFN=AFN_params,
    Linear=Linear_params,
    KNN=KNN_params,
    SVM=SVM_params,
    DecisionTree=DecisionTree_params,
    RandomForest=RandomForest_params,
    AdaBoost=AdaBoost_params,
    XGBoost=XGBoost_params,
)

In [117]:
# Start Training and Validation

from itertools import product

model_names = ["FM", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN",
               "Linear", "KNN", "SVM", "DecisionTree", "RandomForest", "AdaBoost", "XGBoost"]

# test_model_names = ["Linear"]

# For every model, run one training + validation pass per element of the
# cartesian product (model hyperparameters) x (run settings).
for model_name in model_names:
    print('-----------------------------------------------------------------------------------')
    print(f"{model_name}")
    print('-----------------------------------------------------------------------------------')

    model_params = func_params_list[model_name]["hyperparameters"]
    model_params_combinations = list(product(*model_params.values()))

    func_params = func_params_list[model_name]['func_params']
    # Kept as a list because it is iterated again for every model setting.
    func_params_combinations = list(product(*func_params.values()))

    # Distinct loop variables: the inner loop previously reused the name
    # ``combination`` and silently overwrote the outer loop's value.
    for model_values in model_params_combinations:
        model_params_combination = dict(zip(model_params.keys(), model_values))

        for func_values in func_params_combinations:
            func_params_combination = dict(zip(func_params.keys(), func_values))

            merge_params = {"hyperparameters": model_params_combination, **func_params_combination}

            # Training and validation then save results to csv file
            model_training_validtion_or_testing(X_train, y_train, X_valid, y_valid,
                             model_name, **merge_params, save_records=True, testing=False)

-----------------------------------------------------------------------------------
Linear
-----------------------------------------------------------------------------------
Start training Linear model ...
Start validating ...


# Experiments: Testing

In [None]:
import os
import pandas as pd
import numpy as np

# File names of the prepared training and test splits.
# data_path = '../Data_preprocessing/'
training_data_path = 'research_training_set_with_concatenated_reviews_and_feature_vectors.csv'
test_data_path = 'research_test_set_with_concatenated_reviews_and_feature_vectors.csv'

# Read each split into its own DataFrame.
training_data = pd.read_csv(training_data_path)
test_data = pd.read_csv(test_data_path)
# training_data.head()
# test_data.head()

In [None]:
# Training and Test Data preprocessing

import ast


def _stack_feature_columns(frame, columns):
    """Turn stringified vector columns into one feature matrix.

    For every row, the cells of ``columns`` are parsed and the resulting
    vectors are concatenated in the order given by ``columns``.

    The cells are parsed with ``ast.literal_eval`` instead of ``eval`` so
    that text read from the CSV can never execute code. This assumes every
    cell is a plain Python literal (e.g. ``"[0.1, 0.2]"``) -- TODO confirm
    against the preprocessing step that wrote the file.

    :param frame: DataFrame that contains the columns named in ``columns``
    :param columns: column names to parse and concatenate, in order
    :return: ``np.ndarray`` holding one concatenated vector per row
    :raises ValueError: if a cell is not a valid Python literal
    """
    rows = [
        np.concatenate([np.array(ast.literal_eval(cell)) for cell in cells])
        # Iterate the columns in lockstep (positional order), so the result
        # does not depend on the DataFrame having a 0..n-1 index.
        for cells in zip(*(frame[col] for col in columns))
    ]
    return np.array(rows)


# columns_to_train = ['user_feature_vector', 'business_feature_vector']
columns_to_train = ['user_feature_vector', 'business_feature_vector', "categories_feature_vector"]

X_train = _stack_feature_columns(training_data, columns_to_train)
y_train = np.array(training_data['stars'])

X_test = _stack_feature_columns(test_data, columns_to_train)
y_test = np.array(test_data['stars'])

In [None]:
# FM
# input_dim, factors_num

# Sizes shared by the parameter dictionaries in this cell.
field_num = len(columns_to_train)  # one field per feature-vector column
embedding_size = 768
input_dim = len(X_train[0])  # length of one concatenated feature row

# FM settings for the testing section; every key maps to a list of values.
best_FM_params = {
    "hyperparameters": {
        "input_dim": [input_dim],
        "factors_num": [5, 10, 15],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# DeepFM
# feature_sizes, embedding_size, hidden_dims, num_classes=1, dropout=[0.5, 0.5]

# DeepFM settings for the testing section; every key maps to a list of values.
best_DeepFM_params = {
    "hyperparameters": {
        "feature_sizes": [[embedding_size] * field_num],
        "embedding_size": [4, 8],
        "hidden_dims": [[64, 32], [128, 64], [256, 128]],
        "num_classes": [1],
        "dropout": [(0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# AFM
# field_dims, embed_dim, attn_size, dropouts 
# embed_dim=16, attn_size=16, dropouts=(0.2, 0.2)

# AFM settings for the testing section; every key maps to a list of values.
best_AFM_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "attn_size": [8, 16],
        "dropouts": [(0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# DCN
# field_dims, embed_dim, num_layers, mlp_dims, dropout 
# embed_dim=16, num_layers=3, mlp_dims=(16, 16), dropout=0.2

# DCN settings for the testing section; every key maps to a list of values.
best_DCN_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "num_layers": [3],
        "mlp_dims": [(16, 16)],
        "dropout": [0.5],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# xDeepFM
# embed_dim, mlp_dims, dropout, cross_layer_sizes, split_half=True 
# embed_dim=16, cross_layer_sizes=(16, 16), split_half=False, mlp_dims=(16, 16), dropout=0.2

# xDeepFM settings for the testing section; every key maps to a list of values.
best_xDeepFM_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "cross_layer_sizes": [(16, 16)],
        "split_half": [False],
        "mlp_dims": [(16, 16)],
        "dropout": [0.5],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 500, 1000],
    },
}

# AutoInt
# field_dims, embed_dim, atten_embed_dim, num_heads, num_layers, mlp_dims, dropouts, has_residual=True
# embed_dim=16, atten_embed_dim=64, num_heads=2, num_layers=3, mlp_dims=(400, 400), dropouts=(0, 0, 0)

# AutoInt settings for the testing section; every key maps to a list of values.
best_AutoInt_params = {
    'hyperparameters': {
        'field_dims': [[embedding_size for i in range(field_num)]], 
        'embed_dim': [768], 
        # NOTE(review): this list holds ONE candidate, the tuple (64, 32).
        # The signature comment for this model shows atten_embed_dim=64 (an
        # int); if the two sizes 64 and 32 were meant to be tried separately
        # this should be [64, 32]. Confirm against the AutoInt constructor.
        'atten_embed_dim': [(64, 32)],
        "num_heads": [2],
        "num_layers": [3],
        'mlp_dims': [(16, 16), (400, 400)],
        'dropouts': [(0.5, 0.5, 0.5)],
        "has_residual": [True]
    },
    'func_params':{
        'task_type': ['Regression'], 'loss_type': ['RMSE'], 'optimizer_type': ['Adam'], 
        'dl_learning_rate': [0.01, 0.02, 0.001], 'epochs_num': [10, 20, 30], 'batch_size': [100, 500, 1000]
    }
}

# AFN
# field_dims, embed_dim, LNN_dim, mlp_dims, dropouts
# embed_dim=16, LNN_dim=1500, mlp_dims=(400, 400, 400), dropouts=(0, 0, 0)

# AFN settings for the testing section; every key maps to a list of values.
best_AFN_params = {
    "hyperparameters": {
        "field_dims": [[embedding_size] * field_num],
        "embed_dim": [768],
        "LNN_dim": [1500, 1000],
        "mlp_dims": [(400, 400, 400)],
        "dropouts": [(0.5, 0.5, 0.5)],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": ["RMSE"],
        "optimizer_type": ["Adam"],
        "dl_learning_rate": [0.01, 0.02, 0.001],
        "epochs_num": [10, 20, 30],
        "batch_size": [100, 200, 500],
    },
}

# # Linear parameters
# Linear settings for the testing section: no hyperparameters; the loss /
# optimizer / epoch / batch settings are single ``None`` placeholders.
best_Linear_params = {
    "hyperparameters": {},
    "func_params": {
        "task_type": ["Regression"],
        **{
            placeholder_key: [None]
            for placeholder_key in (
                "loss_type",
                "optimizer_type",
                "dl_learning_rate",
                "epochs_num",
                "batch_size",
            )
        },
    },
}

# K-Nearest Neighbors search grid
#   n_neighbors: int, default=5 -- number of neighbors
#   weights:     {'uniform', 'distance'}, default='uniform' -- weight function
#                used in prediction
#   metric:      {'euclidean', 'manhattan', 'chebyshev', 'minkowski',
#                 'wminkowski', 'seuclidean', 'mahalanobis'},
#                default='minkowski' -- distance metric
#
# Each value is a list of candidates expanded with itertools.product by the
# training loop; the deep-learning settings in "func_params" are all None.

best_KNN_params = {
    "hyperparameters": {
        "n_neighbors": [5],
        "weights": ["uniform"],
        "metric": ["euclidean"],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# Support Vector Machine search grid
#   C:      float, default=1.0 -- penalty parameter of the error term
#   kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'},
#           default='rbf' -- kernel function
#   gamma:  {'scale', 'auto'} or float, default='scale' -- kernel coefficient
#           for 'rbf', 'poly' and 'sigmoid'
#
# Three kernels are compared; C and gamma each have a single candidate.

best_SVM_params = {
    "hyperparameters": {
        "C": [1.0],
        "kernel": ["rbf", "poly", "sigmoid"],
        "gamma": ["scale"],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# Decision Tree search grid
#   max_depth:         int or None, default=None -- maximum depth of the tree
#   min_samples_split: int or float, default=2 -- minimum number of samples
#                      required to split an internal node
#   min_samples_leaf:  int or float, default=1 -- minimum number of samples
#                      required at a leaf node
#
# Every hyperparameter has a single candidate equal to the documented default.

best_DecisionTree_params = {
    "hyperparameters": {
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# Random Forest search grid
#   max_depth:         int or None, default=None -- maximum depth of the tree
#   min_samples_split: int or float, default=2 -- minimum number of samples
#                      required to split an internal node
#   min_samples_leaf:  int or float, default=1 -- minimum number of samples
#                      required at a leaf node
#   n_estimators:      int, default=100 -- number of trees in the forest
#
# Only the forest size is varied (50 vs. 100 trees).

best_RandomForest_params = {
    "hyperparameters": {
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "n_estimators": [50, 100],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# AdaBoost search grid
#   n_estimators:  int, default=50 -- maximum number of estimators at which
#                  boosting is terminated
#   learning_rate: float, default=1.0 -- shrinks the contribution of each
#                  estimator
#
# Note: "learning_rate" here is the booster's own hyperparameter; the separate
# "dl_learning_rate" entry in "func_params" is None for this model.

best_AdaBoost_params = {
    "hyperparameters": {
        "n_estimators": [50, 100],
        "learning_rate": [0.01, 0.02, 0.001],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# XGBoost search grid
#   learning_rate:    float, default=0.1 -- boosting learning rate (xgb's "eta")
#   n_estimators:     int, default=100 -- number of boosted trees to fit
#   subsample:        float, default=1 -- subsample ratio of the training
#                     instances (xgb's "subsample")
#   colsample_bytree: float, default=1 -- subsample ratio of columns when
#                     constructing each tree (xgb's "colsample_bytree")
#
# Learning rate and tree count are varied; both sampling ratios stay at 1.

best_XGBoost_params = {
    "hyperparameters": {
        "learning_rate": [0.01, 0.02, 0.001],
        "n_estimators": [50, 100],
        "subsample": [1],
        "colsample_bytree": [1],
    },
    "func_params": {
        "task_type": ["Regression"],
        "loss_type": [None],
        "optimizer_type": [None],
        "dl_learning_rate": [None],
        "epochs_num": [None],
        "batch_size": [None],
    },
}

# Registry of search grids, keyed by model name.
# Despite the "_list" suffix this is a dict: the training loop looks up
# best_func_params_list[model_name] and reads its "hyperparameters" and
# "func_params" sub-dicts.

best_func_params_list = dict(
    # Deep-learning models
    FM=best_FM_params,
    DeepFM=best_DeepFM_params,
    AFM=best_AFM_params,
    DCN=best_DCN_params,
    xDeepFM=best_xDeepFM_params,
    AutoInt=best_AutoInt_params,
    AFN=best_AFN_params,
    # Classical machine-learning models
    Linear=best_Linear_params,
    KNN=best_KNN_params,
    SVM=best_SVM_params,
    DecisionTree=best_DecisionTree_params,
    RandomForest=best_RandomForest_params,
    AdaBoost=best_AdaBoost_params,
    XGBoost=best_XGBoost_params,
)

In [None]:
# Start Training and Testing

from itertools import product

# Models to run, in execution order; each name must be a key of
# best_func_params_list.
model_names = [
    "FM", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN",
    "Linear", "KNN", "SVM", "DecisionTree", "RandomForest", "AdaBoost", "XGBoost",
]

# For a quick smoke run, loop over a shorter list instead, e.g. ["Linear"].

separator = '-----------------------------------------------------------------------------------'

for model_name in model_names:
    print(separator)
    print(f"{model_name}")
    print(separator)

    model_grid = best_func_params_list[model_name]
    model_params = model_grid["hyperparameters"]
    func_params = model_grid['func_params']

    # Expand each grid into the Cartesian product of its candidate lists.
    # An empty "hyperparameters" dict yields exactly one (empty) combination.
    model_params_combinations = list(product(*model_params.values()))
    func_params_combinations = list(product(*func_params.values()))

    for model_values in model_params_combinations:
        # Pair each chosen value with its hyperparameter name.
        model_params_combination = dict(zip(model_params.keys(), model_values))

        for func_values in func_params_combinations:
            func_params_combination = dict(zip(func_params.keys(), func_values))

            # Model hyperparameters travel as one nested dict; the training
            # settings are flattened into individual keyword arguments.
            merge_params = {"hyperparameters": model_params_combination, **func_params_combination}

            # One run per (model, training) combination with testing=True and
            # save_records=True (per the original note, results go to a CSV file).
            model_training_validtion_or_testing(
                X_train, y_train, X_test, y_test,
                model_name, **merge_params, save_records=True, testing=True)

    # To count runs without training, collect each merge_params in a list
    # instead of calling the trainer and print its length per model.