In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X.iloc[idx].values, dtype=torch.float32), torch.tensor(self.y.iloc[idx], dtype=torch.float32)

In [4]:
class DeepFM(nn.Module):
    def __init__(self, sparse_input_dims, dense_input_dim, embedding_dim, dnn_hidden_units):
        super(DeepFM, self).__init__()
        
        # Embedding layers for sparse features
        self.embeddings = nn.ModuleList([nn.Embedding(input_dim, embedding_dim) for input_dim in sparse_input_dims])
        
        # Linear part
        self.linear = nn.ModuleList([nn.Embedding(input_dim, 1) for input_dim in sparse_input_dims])
        
        # DNN part
        dnn_input_dim = embedding_dim * len(sparse_input_dims) + dense_input_dim
        layers = []
        for i in range(len(dnn_hidden_units)):
            if i == 0:
                layers.append(nn.Linear(dnn_input_dim, dnn_hidden_units[i]))
            else:
                layers.append(nn.Linear(dnn_hidden_units[i-1], dnn_hidden_units[i]))
            layers.append(nn.ReLU())
        self.dnn = nn.Sequential(*layers)
        self.dnn_output = nn.Linear(dnn_hidden_units[-1], 1)
        
    def forward(self, x_sparse, x_dense):
        # Linear part
        linear_logit = sum([self.linear[i](x_sparse[:, i]) for i in range(x_sparse.shape[1])]).squeeze(1)
        
        # Embedding and interaction part
        embeddings = [self.embeddings[i](x_sparse[:, i]) for i in range(x_sparse.shape[1])]
        fm_logit = sum([torch.sum(embed_i * embed_j, dim=1, keepdim=True) 
                        for i, embed_i in enumerate(embeddings)
                        for j, embed_j in enumerate(embeddings) if i < j]).squeeze(1)
        
        # DNN part
        dnn_input = torch.cat(embeddings + [x_dense], dim=1)
        dnn_logit = self.dnn(dnn_input)
        dnn_logit = self.dnn_output(dnn_logit).squeeze(1)
        
        # Final output
        logit = linear_logit + fm_logit + dnn_logit
        output = torch.sigmoid(logit)
        return output


In [5]:
def train_model(X_train, y_train, sparse_input_dims, dense_input_dim, embedding_dim, dnn_hidden_units, epochs, learning_rate):
    model = DeepFM(sparse_input_dims, dense_input_dim, embedding_dim, dnn_hidden_units).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        X_train, y_train = X_train.to(device), y_train.to(device)
        X_sparse = X_train[:, :len(sparse_input_dims)].long()
        X_dense = X_train[:, len(sparse_input_dims):]
        optimizer.zero_grad()
        outputs = model(X_sparse, X_dense)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        
        predicted = (outputs > 0.5).float()
        correct = (predicted == y_train).sum().item()
        accuracy = correct / y_train.size(0)

        if (epoch + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")
            
    return model


In [6]:
def evaluate_model(model, X_test, y_test, sparse_input_dims):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        X_test, y_test = X_test.to(device), y_test.to(device)
        X_sparse = X_test[:, :len(sparse_input_dims)].long()
        X_dense = X_test[:, len(sparse_input_dims):]
        outputs = model(X_sparse, X_dense)
        y_true.extend(y_test.cpu().numpy())
        y_pred.extend(outputs.cpu().numpy())
    
    y_pred_labels = np.where(np.array(y_pred) > 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred_labels)
    roc_auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred_labels)
    recall = recall_score(y_true, y_pred_labels)
    log_loss_value = log_loss(y_true, y_pred)

    return accuracy, roc_auc, precision, recall, log_loss_value, y_true, y_pred

In [7]:
def load_data(item_path, review_path):
    item_data = pd.read_csv(item_path)
    review_data = pd.read_csv(review_path)

    data_merge = pd.merge(review_data, item_data, on="N_id")
    data_merge['Target'] = np.where(data_merge['Target'] > 3, 1, 0)

    User_category = data_merge.pivot_table("Target", index="User", columns="Smell", aggfunc="mean")
    User_category_matrix = User_category.fillna(0)

    data_merge = pd.merge(data_merge, User_category_matrix, on="User")

    sparse_features = ['Company', 'Smell', 'Gender', 'Year']
    dense_features = User_category_matrix.columns.tolist()
    sparse_input_dims = []
    for feat in sparse_features:
        lbe = LabelEncoder()
        data_merge[feat] = lbe.fit_transform(data_merge[feat])
        max_val = data_merge[feat].max()
        print(f"{feat}: {len(lbe.classes_)} classes, max value: {max_val}")
        sparse_input_dims.append(len(lbe.classes_) + 1)
        
    mms = MinMaxScaler()
    data_merge[dense_features] = mms.fit_transform(data_merge[dense_features])

    x = data_merge[sparse_features + dense_features]
    y = data_merge['Target']
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=52, shuffle=True, stratify=y)
    
    return x_train, x_test, y_train, y_test, sparse_features, dense_features, sparse_input_dims


In [8]:
item_path = 'data/item.csv'
review_path = 'data/review.csv'
x_train, x_test, y_train, y_test, sparse_features, dense_features, sparse_input_dims = load_data(item_path, review_path)

embedding_dim = 8
dnn_hidden_units = [256, 128]
epochs = 1000
learning_rate = 0.001

dense_input_dim = len(dense_features)

print(f"Sparse input dimensions: {sparse_input_dims}")

# Convert the train and test data to tensors
X_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Train the model
model = train_model(X_train_tensor, y_train_tensor, sparse_input_dims, dense_input_dim, embedding_dim, dnn_hidden_units, epochs, learning_rate)

# Evaluate the model
accuracy, roc_auc, precision, recall, log_loss_value, y_true, y_pred = evaluate_model(model, X_test_tensor, y_test_tensor, sparse_input_dims)

print(f"Final Evaluation - Accuracy: {accuracy}, ROC-AUC: {roc_auc}, Log Loss: {log_loss_value}")


Company: 2906 classes, max value: 2905
Smell: 32 classes, max value: 31
Gender: 4 classes, max value: 3
Year: 166 classes, max value: 165
Sparse input dimensions: [2907, 33, 5, 167]
Epoch [100/1000], Loss: 1.5008, Accuracy: 0.7708
Epoch [200/1000], Loss: 0.5442, Accuracy: 0.8097
Epoch [300/1000], Loss: 0.3774, Accuracy: 0.8523
Epoch [400/1000], Loss: 0.3359, Accuracy: 0.8673
Epoch [500/1000], Loss: 0.3103, Accuracy: 0.8758
Epoch [600/1000], Loss: 0.2927, Accuracy: 0.8830
Epoch [700/1000], Loss: 0.2767, Accuracy: 0.8879
Epoch [800/1000], Loss: 0.2652, Accuracy: 0.8920
Epoch [900/1000], Loss: 0.2564, Accuracy: 0.8956
Epoch [1000/1000], Loss: 0.2479, Accuracy: 0.8985
Final Evaluation - Accuracy: 0.8817889153947703, ROC-AUC: 0.8767623655092128, Log Loss: 0.2931595459306287
