In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

### Load the train, val, test dataset

In [None]:
root = 'KuaiRec 2.0/'

# Training data
train = pd.read_csv(root + "joined_train_data_FE.csv")

# Validation data
val = pd.read_csv(root + "val_data.csv")

# Test data
# test = pd.read_csv(root + "test_data.csv")

print(f'Total number of training data: {len(train)}')
print(f'Total number of validation data: {len(val)}')
# print(f'Total number of test data: {len(test)}')

Total number of training data: 4054501
Total number of validation data: 227390
Total number of test data: 103558


In [155]:
# Convert to datetime
train['time'] = pd.to_datetime(train['time'])
val['time'] = pd.to_datetime(val['time'])
test['time'] = pd.to_datetime(test['time'])

In [102]:
train.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'author_id', 'video_type', 'video_tag_name', 'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
       'caption', 'topic_tag', 'first_level_category_id',
       'first_level_category_name', 'second_level_category_id',
       'second_level_category_name', 'third_level_category_id',
       'third_level_category_name', 'is_new_user', 'total_connections',
       'is_content_creator', 'hour', 'day_of_week', 'watch_frequency',
       'is_weekend_interaction', 'is_weekend', 'time_period',
       'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time', 'top_3_categories',
       'avg_watch_

### Filter and sort

In [156]:
columns = ['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
    #    'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
    #    'author_id', 'video_type', 
      #  'video_tag_name', 
       'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
    #    'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
    #    'caption', 'topic_tag', 
       'first_level_category_id',
    #    'first_level_category_name', 
       'second_level_category_id',
    #    'second_level_category_name', 
       'third_level_category_id',
    #    'third_level_category_name', 
       'is_new_user', 'total_connections',
    #    'is_content_creator', 'hour', 'day_of_week', 
       'watch_frequency',
       'is_weekend_interaction', 'is_weekend', 
      #  'time_period',
       'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time', 'top_3_categories',
       'avg_watch_ratio', 'total_likes_given', 
    #    'video_length_category', 'video_length_long', 'video_length_medium', 
       'engagement_rate',
    #    'diversity_score', 'previous_time', 'time_since_last_interaction'
       ]
filtered_train = train[columns]

# Sort by user_id and time
filtered_train = filtered_train.sort_values(by=['user_id', 'time'], ascending=True)
val = val.sort_values(by=['user_id', 'time'], ascending=True)
test = test.sort_values(by=['user_id', 'time'], ascending=True)

In [157]:
# Get unique users and items from the training data
train_users = set(filtered_train['user_id'].unique())
train_items = set(filtered_train['video_id'].unique())

# Filter the validation set
filtered_val = val[
    val['user_id'].isin(train_users) & val['video_id'].isin(train_items)
]

# Filter the test set
filtered_test = test[
    test['user_id'].isin(train_users) & test['video_id'].isin(train_items)
]

In [158]:
# Preprocessing
user_encoder = LabelEncoder()
video_encoder = LabelEncoder()

filtered_train['user_id'] = user_encoder.fit_transform(filtered_train['user_id'])
filtered_train['video_id'] = video_encoder.fit_transform(filtered_train['video_id'])

filtered_val['user_id'] = user_encoder.transform(filtered_val['user_id'])
filtered_val['video_id'] = video_encoder.transform(filtered_val['video_id'])

filtered_test['user_id'] = user_encoder.transform(filtered_test['user_id'])
filtered_test['video_id'] = video_encoder.transform(filtered_test['video_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_val['user_id'] = user_encoder.transform(filtered_val['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_val['video_id'] = video_encoder.transform(filtered_val['video_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test['user_id'] = user_encoder.transform(filt

In [159]:
# Choose random 15000 samples from the training data
filtered_train2 = filtered_train.sample(n=15000, random_state=1)

train_users = set(filtered_train2['user_id'].unique())
train_items = set(filtered_train2['video_id'].unique())

# Filter the validation set
filtered_val = val[
    val['user_id'].isin(train_users) & val['video_id'].isin(train_items)
]

# Filter the test set
filtered_test = test[
    test['user_id'].isin(train_users) & test['video_id'].isin(train_items)
]

len(filtered_train2)

15000

In [160]:
print(len(filtered_train2))
print(len(filtered_val))
print(len(filtered_test))

15000
14562
5049


### Create Dataloader

In [173]:
class KuaiShouDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'user_id': torch.tensor(row['user_id'], dtype=torch.long),
            'video_id': torch.tensor(row['video_id'], dtype=torch.long),
            'time': torch.tensor(row['time'].timestamp(), dtype=torch.float),  # datetime to float (seconds since epoch)
            'watch_ratio': torch.tensor(row['watch_ratio'], dtype=torch.float),
            'follow_user_num': torch.tensor(row['follow_user_num'], dtype=torch.long),
            'fans_user_num': torch.tensor(row['fans_user_num'], dtype=torch.long),
            'friend_user_num': torch.tensor(row['friend_user_num'], dtype=torch.long),
            'register_days': torch.tensor(row['register_days'], dtype=torch.float),
            'video_duration': torch.tensor(row['video_duration'], dtype=torch.float),
            'show_cnt': torch.tensor(row['show_cnt'], dtype=torch.long),
            'play_cnt': torch.tensor(row['play_cnt'], dtype=torch.long),
            'play_duration': torch.tensor(row['play_duration'], dtype=torch.float),
            'like_cnt': torch.tensor(row['like_cnt'], dtype=torch.long),
            'comment_cnt': torch.tensor(row['comment_cnt'], dtype=torch.long),
            'first_level_category_id': torch.tensor(row['first_level_category_id'], dtype=torch.long),
            'second_level_category_id': torch.tensor(row['second_level_category_id'], dtype=torch.long),
            'third_level_category_id': torch.tensor(row['third_level_category_id'], dtype=torch.long),
            'is_new_user': torch.tensor(row['is_new_user'], dtype=torch.long),
            'total_connections': torch.tensor(row['total_connections'], dtype=torch.long),
            'watch_frequency': torch.tensor(row['watch_frequency'], dtype=torch.float),
            'is_weekend_interaction': torch.tensor(row['is_weekend_interaction'], dtype=torch.long),
            'is_weekend': torch.tensor(row['is_weekend'], dtype=torch.long),
            'count_afternoon_views': torch.tensor(row['count_afternoon_views'], dtype=torch.long),
            'count_evening_views': torch.tensor(row['count_evening_views'], dtype=torch.long),
            'count_midnight_views': torch.tensor(row['count_midnight_views'], dtype=torch.long),
            'count_morning_views': torch.tensor(row['count_morning_views'], dtype=torch.long),
            'avg_daily_watch_time': torch.tensor(row['avg_daily_watch_time'], dtype=torch.float),
            # 'top_3_categories': torch.tensor(row['top_3_categories'], dtype=torch.long),
            'avg_watch_ratio': torch.tensor(row['avg_watch_ratio'], dtype=torch.float),
            'total_likes_given': torch.tensor(row['total_likes_given'], dtype=torch.long),
            'engagement_rate': torch.tensor(row['engagement_rate'], dtype=torch.float),
        }

In [174]:
# Create DataLoad
# dataset_train = KuaiShouDataset(filtered_train)
dataset_train = KuaiShouDataset(filtered_train2)
dataset_val = KuaiShouDataset(filtered_val)
dataset_test = KuaiShouDataset(filtered_test)

# Set the batch size and other DataLoader parameters
batch_size = 512

# Initialise the DataLoader
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

### Define Neural Factorisation Machine architecture

In [188]:
features = ['time', 
            # 'user_active_degree',
    #    'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
    #    'author_id', 'video_type', 
      #  'video_tag_name', 
       'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
    #    'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
    #    'caption', 'topic_tag', 
       'first_level_category_id',
    #    'first_level_category_name', 
       'second_level_category_id',
    #    'second_level_category_name', 
       'third_level_category_id',
    #    'third_level_category_name', 
       'is_new_user', 'total_connections',
    #    'is_content_creator', 'hour', 'day_of_week', 
       'watch_frequency',
       'is_weekend_interaction', 'is_weekend', 
      #  'time_period',
       'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time', 
      #  'top_3_categories',
       'avg_watch_ratio', 'total_likes_given', 
    #    'video_length_category', 'video_length_long', 'video_length_medium', 
       'engagement_rate',
    #    'diversity_score', 'previous_time', 'time_since_last_interaction'
       ]
len(features)

27

#### GMF and MLP

In [195]:
class NFM_MLP(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, mlp_hidden_layers, num_additional_features):
        super(NFM_MLP, self).__init__()

        # GMF Components
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings_gmf = nn.Embedding(num_items, embedding_dim)

        # MLP Components
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings_mlp = nn.Embedding(num_items, embedding_dim)

        # Calculate input size for MLP based on embedding and additional features
        input_size = 2 * embedding_dim + num_additional_features  # Adjusted input size

        # Create MLP layers dynamically from the hidden layer sizes list
        self.mlp_layers = nn.ModuleList()
        for hidden_size in mlp_hidden_layers:
            self.mlp_layers.append(nn.Linear(input_size, hidden_size))
            input_size = hidden_size  # Update input size for the next layer

        # Final combined layers
        self.fc1_combined = nn.Linear(embedding_dim + mlp_hidden_layers[-1], num_items)
        self.fc2_combined = nn.Linear(num_items, 1) # Output size set to num_items

    def forward(self, user_id, item_id, additional_features):
        # GMF: Generalized Matrix Factorization
        user_emb_gmf = self.user_embeddings_gmf(user_id)
        item_emb_gmf = self.item_embeddings_gmf(item_id)
        gmf_output = user_emb_gmf * item_emb_gmf  # Element-wise multiplication

        # MLP: Multi-Layer Perceptron
        user_emb_mlp = self.user_embeddings_mlp(user_id)
        item_emb_mlp = self.item_embeddings_mlp(item_id)
        mlp_input = torch.cat([user_emb_mlp, item_emb_mlp, additional_features], dim=-1)

        # Pass through the dynamic MLP layers
        mlp_output = mlp_input  # Start with concatenated input
        for layer in self.mlp_layers:
            mlp_output = torch.relu(layer(mlp_output))

        # Combine GMF and MLP outputs
        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = torch.relu(self.fc1_combined(combined_input))
        combined_output = self.fc2_combined(combined_output)  # Final output layer

        return combined_output.squeeze()  # Remove extra dimension for output

#### GMF and CNN

In [None]:
# class NFM_CNN(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim, cnn_hidden_layers):
#         super(NFM_CNN, self).__init__()

#         # GMF Components
#         self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
#         self.item_embeddings_gmf = nn.Embedding(num_items, embedding_dim)

#         # CNN Components
#         self.user_embeddings_cnn = nn.Embedding(num_users, embedding_dim)
#         self.item_embeddings_cnn = nn.Embedding(num_items, embedding_dim)

#         # Create CNN layers dynamically from the hidden layer sizes list
#         self.cnn_layers = nn.ModuleList()
#         input_channels = 1  # Since we will use 1D convolutions
#         input_size = 2 * embedding_dim  # Input size for CNN
#         for hidden_size in cnn_hidden_layers:
#             self.cnn_layers.append(nn.Conv1d(input_channels, hidden_size, kernel_size=3, padding=1))
#             input_channels = hidden_size  # Update input channels for the next layer

#         # Final combined layers
#         self.fc1_combined = nn.Linear(embedding_dim + cnn_hidden_layers[-1], 128)
#         self.fc2_combined = nn.Linear(128, 1)

#     def forward(self, user_id, item_id):
#         # GMF: Generalized Matrix Factorization
#         user_emb_gmf = self.user_embeddings_gmf(user_id)
#         item_emb_gmf = self.item_embeddings_gmf(item_id)
#         gmf_output = user_emb_gmf * item_emb_gmf  # Element-wise multiplication

#         # CNN: Convolutional Neural Network
#         user_emb_cnn = self.user_embeddings_cnn(user_id)
#         item_emb_cnn = self.item_embeddings_cnn(item_id)
#         cnn_input = torch.cat([user_emb_cnn, item_emb_cnn], dim=-1).unsqueeze(1)  # Add channel dimension

#         # Pass through the dynamic CNN layers
#         cnn_output = cnn_input
#         for layer in self.cnn_layers:
#             cnn_output = torch.relu(layer(cnn_output))
#             cnn_output = torch.max_pool1d(cnn_output, kernel_size=2)  # Apply max pooling

#         # Flatten the CNN output
#         cnn_output = cnn_output.view(cnn_output.size(0), -1)

#         # Combine GMF and CNN outputs
#         combined_input = torch.cat([gmf_output, cnn_output], dim=-1)
#         combined_output = F.relu(self.fc1_combined(combined_input))
#         combined_output = self.fc2_combined(combined_output)  # Final output layer

#         return combined_output.squeeze()  # Remove extra dimension for output

### Train the model

#### GMF and MLP

In [196]:
# Define model parameters
num_users = len(user_encoder.classes_)
num_items = len(video_encoder.classes_)
EMBEDDING_DIM = 64
MLP_HIDDEN_LAYERS = [16, 8]  # Custom MLP hidden layer sizes
LEARNING_RATE = 0.001

# Instantiate the model
num_additional_features = len(features) # Number of features excluding user_id and video_id, watch_ratio
model = NFM_MLP(num_users, num_items, EMBEDDING_DIM, MLP_HIDDEN_LAYERS, num_additional_features)

# Define loss function and optimizer
criterion = nn.MSELoss() 
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for batch in train_loader:
        user_id = batch['user_id']
        video_id = batch['video_id']
        ratio = batch['watch_ratio'].float()

        # Gather additional features
        additional_features = torch.cat([
            batch['time'].unsqueeze(1),
            batch['follow_user_num'].unsqueeze(1),
            batch['fans_user_num'].unsqueeze(1),
            batch['friend_user_num'].unsqueeze(1),
            batch['register_days'].unsqueeze(1),
            batch['video_duration'].unsqueeze(1),
            batch['show_cnt'].unsqueeze(1),
            batch['play_cnt'].unsqueeze(1),
            batch['play_duration'].unsqueeze(1),
            batch['like_cnt'].unsqueeze(1),
            batch['comment_cnt'].unsqueeze(1),
            batch['first_level_category_id'].unsqueeze(1),
            batch['second_level_category_id'].unsqueeze(1),
            batch['third_level_category_id'].unsqueeze(1),
            batch['is_new_user'].unsqueeze(1),
            batch['total_connections'].unsqueeze(1),
            batch['watch_frequency'].unsqueeze(1),
            batch['is_weekend_interaction'].unsqueeze(1),
            batch['is_weekend'].unsqueeze(1),
            batch['count_afternoon_views'].unsqueeze(1),
            batch['count_evening_views'].unsqueeze(1),
            batch['count_midnight_views'].unsqueeze(1),
            batch['count_morning_views'].unsqueeze(1),
            batch['avg_daily_watch_time'].unsqueeze(1),
            batch['avg_watch_ratio'].unsqueeze(1),
            batch['total_likes_given'].unsqueeze(1),
            batch['engagement_rate'].unsqueeze(1),
        ], dim=-1)

        # Forward pass
        outputs = model(user_id, video_id, additional_features)
        loss = criterion(outputs, ratio)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss at the end of each epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 9469384465034772480.0000
Epoch [2/10], Loss: 235268223891668992.0000
Epoch [3/10], Loss: 25701601842823168.0000
Epoch [4/10], Loss: 293813780217856.0000
Epoch [5/10], Loss: 184321440743424.0000
Epoch [6/10], Loss: 78267214200832.0000
Epoch [7/10], Loss: 40721188913152.0000
Epoch [8/10], Loss: 24082416402432.0000
Epoch [9/10], Loss: 13983483428864.0000
Epoch [10/10], Loss: 9771708055552.0000


In [200]:
# Get recommendations for user 0
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    user_id = torch.tensor(filtered_train2['user_id'])
    video_id = torch.tensor(filtered_train2['video_id'])
    ratio = torch.tensor(filtered_train2['watch_ratio'].values, dtype=torch.float)

    # Gather additional features
    additional_features = torch.cat([
        filtered_train2['time'].unsqueeze(1),
        filtered_train2['follow_user_num'].unsqueeze(1),
        filtered_train2['fans_user_num'].unsqueeze(1),
        filtered_train2['friend_user_num'].unsqueeze(1),
        filtered_train2['register_days'].unsqueeze(1),
        filtered_train2['video_duration'].unsqueeze(1),
        filtered_train2['show_cnt'].unsqueeze(1),
        filtered_train2['play_cnt'].unsqueeze(1),
        filtered_train2['play_duration'].unsqueeze(1),
        filtered_train2['like_cnt'].unsqueeze(1),
        filtered_train2['comment_cnt'].unsqueeze(1),
        filtered_train2['first_level_category_id'].unsqueeze(1),
        filtered_train2['second_level_category_id'].unsqueeze(1),
        filtered_train2['third_level_category_id'].unsqueeze(1),
        filtered_train2['is_new_user'].unsqueeze(1),
        filtered_train2['total_connections'].unsqueeze(1),
        filtered_train2['watch_frequency'].unsqueeze(1),
        filtered_train2['is_weekend_interaction'].unsqueeze(1),
        filtered_train2['is_weekend'].unsqueeze(1),
        filtered_train2['count_afternoon_views'].unsqueeze(1),
        filtered_train2['count_evening_views'].unsqueeze(1),
        filtered_train2['count_midnight_views'].unsqueeze(1),
        filtered_train2['count_morning_views'].unsqueeze(1),
        filtered_train2['avg_daily_watch_time'].unsqueeze(1),
        filtered_train2['avg_watch_ratio'].unsqueeze(1),
        filtered_train2['total_likes_given'].unsqueeze(1),
        filtered_train2['engagement_rate'].unsqueeze(1),
    ], dim=-1)

    scores = model(user_id, video_id, additional_features)  # Get output scores
    top_n_indices = scores.argsort(descending=True)[:20]  # Get indices of top N scores
    top_n_videos = video_id[top_n_indices]  # Get top N video IDs based on sorted scores

    print(f'Top 20 recommended videos for user 0: {top_n_videos}')

ValueError: could not determine the shape of object type 'Series'

### Evaluation on test set

#### GMP and MLP

In [None]:
# # After training, evaluate on the test set
# model.eval()  # Set the model to evaluation mode
# test_loss = 0
# with torch.no_grad():  # Disable gradient calculation for testing
#     for test_batch in test_loader:
#         user_id_test = test_batch['user_id']
#         video_id_test = test_batch['video_id']
#         rating_test = test_batch['watch_ratio'].float()  # Test target

#         test_outputs = model(user_id_test, video_id_test)
#         test_loss += criterion(test_outputs, rating_test).item()

# # Calculate average test loss
# test_loss /= len(test_loader)
# print(f'Test Loss: {test_loss:.4f}')