In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader

## Load the train and validation dataset

In [4]:
root = '../KuaiRec 2.0/'

# Training data
train = pd.read_csv(root + "data_exports/joined_train_data_segmented.csv")
val = pd.read_csv(root + "data_exports/joined_val_data_FE.csv")

print(f'Total number of training data: {len(train)}')
print(f'Total number of validation data: {len(val)}')

Total number of training data: 2552082
Total number of validation data: 1376299


In [10]:
train.head()

Unnamed: 0,user_id,video_id,time,watch_ratio,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,fans_user_num,...,avg_daily_watch_time,top_3_categories,cluster,News_Politics,Auto_Tech,Lifestyle,Sports_Fitness,Entertainment,Culture,Others
0,14,148,2020-07-05 05:27:48.378,0.722103,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
1,14,183,2020-07-05 05:28:00.057,1.907377,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
2,14,3649,2020-07-05 05:29:09.479,2.063311,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
3,14,5262,2020-07-05 05:30:43.285,0.566388,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
4,14,8234,2020-07-05 05:35:43.459,0.418364,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1


### Get the current date
This is necessary to calculate the age of the videos, which will be used for the time decay component of our model.
We assume it to be the day of the latest interaction. 

In [None]:
# Convert type to datetime
train['time'] = pd.to_datetime(train['time'])

# Assume current date is the next day of the last date
CURRENT_DATE_TRAIN = train['time'].dt.date.max()

# Just the date portion
print(f'Current date: {CURRENT_DATE_TRAIN}')

Current date: 2020-08-03
Current date: 2020-08-19


### Calculate age of video

In [12]:
video_info = pd.read_csv(root + 'data/item_daily_features.csv', usecols=['video_id', 'upload_dt']).drop_duplicates()

video_info['upload_dt'] = pd.to_datetime(video_info['upload_dt'])

In [81]:
# Get video age for training data
video_info_train = video_info[video_info['video_id'].isin(train['video_id'].unique())]
video_info_train['video_age'] = (CURRENT_DATE_TRAIN - video_info_train['upload_dt'].dt.date).dt.days
video_age_dict = video_info_train.set_index('video_id')['video_age'].to_dict()    # Convert to dictionary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  video_info_train['video_age'] = (CURRENT_DATE_TRAIN - video_info_train['upload_dt'].dt.date).dt.days


In [14]:
train.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'author_id', 'video_type', 'video_tag_name', 'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
       'caption', 'topic_tag', 'first_level_category_name',
       'second_level_category_name', 'third_level_category_name',
       'english_caption', 'english_first_level_category_name',
       'english_second_level_category_name',
       'english_third_level_category_name', 'english_topic_tag', 'is_new_user',
       'total_connections', 'is_content_creator', 'hour', 'day_of_week',
       'watch_frequency', 'is_weekend_interaction', 'is_weekend',
       'time_period', 'count_afternoon_views', 'count_evening_views',
       'count_midnight_views', 'count_m

## Preprocessing for feeding into Neural Network portion of NCF

### One hot encode categorical variables

In [17]:
train.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'author_id', 'video_type', 'video_tag_name', 'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
       'caption', 'topic_tag', 'first_level_category_name',
       'second_level_category_name', 'third_level_category_name',
       'english_caption', 'english_first_level_category_name',
       'english_second_level_category_name',
       'english_third_level_category_name', 'english_topic_tag', 'is_new_user',
       'total_connections', 'is_content_creator', 'hour', 'day_of_week',
       'watch_frequency', 'is_weekend_interaction', 'is_weekend',
       'time_period', 'count_afternoon_views', 'count_evening_views',
       'count_midnight_views', 'count_m

In [None]:
# One hot encode 'user_active_degree', 'time_period'
train_processed = pd.get_dummies(train, columns=['user_active_degree', 'time_period'])

# Remove the column for user_active_degree = UNKNOWN
train_processed = train_processed.drop(columns=['user_active_degree_UNKNOWN'])

In [19]:
train_processed = train_processed.drop(columns=['author_id', 'video_type', 
                                    'video_tag_name', 
                                    'manual_cover_text', 'caption', 'topic_tag', 
                                    'first_level_category_name', 'second_level_category_name', 'third_level_category_name',
                                    'english_caption', 'english_first_level_category_name',
                                    'english_second_level_category_name',
                                    'english_third_level_category_name', 'english_topic_tag',
                                    'top_3_categories',
                                    'play_duration', 'hour', 'day_of_week'
                                    ])

### Scale continuous variables

Below, we can see that the values are all on different scales. For example, follow_user_num is in the tens-thousands while like_cnt can range form millions to billions. This will affect the training of the model, therefore scaling is needed. 

Note that we scale the validation data with the scaler fitted to the training data.

In [None]:
train_processed[['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time']].describe()

Unnamed: 0,follow_user_num,fans_user_num,friend_user_num,register_days,video_duration,show_cnt,play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,count_afternoon_views,count_evening_views,count_midnight_views,count_morning_views,avg_daily_watch_time
count,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0
mean,53.81411,3.872561,1.331606,265.3341,11647.91,6959049.0,7052437.0,204478.0,8935.899,3805.251,20932.72,285.876,465.8341,280.9108,457.9366,610.0598,8062631000000.0
std,141.8902,9.716679,4.924868,264.0708,13441.16,9275605.0,9511481.0,320943.1,21119.83,12695.3,63310.06,1337.505,284.4922,238.5123,433.9834,330.5712,706882700000.0
min,0.0,0.0,0.0,8.0,3066.0,644.0,331.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4632392000000.0
25%,7.0,0.0,0.0,119.0,7333.0,832913.0,762922.0,15528.0,345.0,64.0,1002.0,5.0,249.0,84.0,53.0,374.0,7686325000000.0
50%,15.0,1.0,0.0,200.0,9383.0,3127692.0,3071419.0,73590.0,2171.0,414.0,4968.0,28.0,444.0,225.0,356.0,569.0,8158000000000.0
75%,43.0,4.0,1.0,302.0,11500.0,9372330.0,9544620.0,251209.0,8918.0,2275.0,17978.0,133.0,656.0,419.0,748.0,806.0,8518700000000.0
max,1811.0,251.0,71.0,2002.0,294520.0,65255080.0,64795780.0,2762854.0,338365.0,206105.0,1215372.0,29197.0,1477.0,1435.0,1852.0,1727.0,12772440000000.0


In [None]:
scaler = StandardScaler()

columns_to_scale = ['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 
       'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 
       'total_connections',
       'watch_frequency', 
       'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 
       'avg_daily_watch_time', 
       ]

train_processed[columns_to_scale] = scaler.fit_transform(train_processed[columns_to_scale])

We now see that the mean of all the columns is (close to) 0 and the standard deviation is 1

In [22]:
train_processed[['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time']].describe()

Unnamed: 0,follow_user_num,fans_user_num,friend_user_num,register_days,video_duration,show_cnt,play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,count_afternoon_views,count_evening_views,count_midnight_views,count_morning_views,avg_daily_watch_time
count,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0
mean,3.875563e-18,-3.162816e-17,1.933327e-17,-6.521637e-17,-2.266133e-16,4.790998e-17,8.17432e-18,-2.5257980000000003e-17,-1.905485e-17,-1.854256e-18,-9.449469e-18,1.789107e-17,-5.862346000000001e-17,-3.456824e-17,2.940082e-17,1.021901e-16,-2.135391e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.379266,-0.3985478,-0.2703841,-0.9744891,-0.6384799,-0.7501836,-0.7414311,-0.6371098,-0.4231048,-0.299737,-0.3306382,-0.2137384,-1.637424,-1.177763,-1.055194,-1.845472,-4.852628
25%,-0.329932,-0.3985478,-0.2703841,-0.5541473,-0.321022,-0.6604569,-0.6612552,-0.5887337,-0.4067695,-0.2946957,-0.3148114,-0.2100001,-0.7621797,-0.8255794,-0.9330694,-0.7140968,-0.5323456
50%,-0.2735504,-0.295632,-0.2703841,-0.2474113,-0.1685054,-0.4130575,-0.4185488,-0.407823,-0.3203104,-0.2671265,-0.2521673,-0.1928039,-0.07674775,-0.2344147,-0.234886,-0.1242086,0.1349154
75%,-0.07621467,0.01311554,-0.06733294,0.1388488,-0.011004,0.2601751,0.2620184,0.1456054,-0.0008475032,-0.1205368,-0.04667066,-0.1142995,0.6684398,0.5789608,0.6683745,0.5927325,0.6451838
max,12.38413,25.43333,14.14625,6.576517,21.04522,6.284877,6.070911,7.971434,15.5981,15.93501,18.86651,21.61572,3.554284,4.838701,3.212251,3.37882,6.662791


## Create the Dataset

In [29]:
class KuaiShouDataset(Dataset):
    def __init__(self, data, user_id_col, video_id_col, user_feature_cols, video_feature_cols, watch_ratio_col, video_age_dict):
        self.user_feature_cols = user_feature_cols
        self.video_feature_cols = video_feature_cols

        # Initialise and fit LabelEncoders
        self.user_encoder = LabelEncoder()
        self.video_encoder = LabelEncoder()
        
        self.user_indices = torch.tensor(self.user_encoder.fit_transform(data[user_id_col]), dtype=torch.long)
        self.video_indices = torch.tensor(self.video_encoder.fit_transform(data[video_id_col]), dtype=torch.long)

        # Convert user and video features and watch ratios to tensors
        self.user_features = torch.tensor(data[user_feature_cols].values, dtype=torch.float32)
        self.video_features = torch.tensor(data[video_feature_cols].values, dtype=torch.float32)
        self.watch_ratios = torch.tensor(data[watch_ratio_col].values, dtype=torch.float32)

        # Time related features
        self.video_age_dict = video_age_dict

    def __len__(self):
        return len(self.user_indices)

    def __getitem__(self, idx):
        return self.user_indices[idx], self.video_indices[idx], self.user_features[idx], self.video_features[idx], self.watch_ratios[idx]

    def inverse_transform_user_ids(self, encoded_user_idx):
        """Decode encoded user indices to original user_ids."""
        return self.user_encoder.inverse_transform(encoded_user_idx)
    
    def inverse_transform_video_ids(self, encoded_video_idx):
        """Decode encoded video indices to original video_ids."""
        return self.video_encoder.inverse_transform(encoded_video_idx)
    
    def get_video_age(self, video_idx):
        """Get video age."""
        video_ids = self.inverse_transform_video_ids(video_idx)

        ages = []
        for i in range(len(video_idx)):
            ages.append(self.video_age_dict[video_ids[i]])
        return torch.tensor(ages, dtype=torch.float32)
    
    def get_decoded_user_video_pairs(self):
        """Get decoded user-video pairs."""
        return self.inverse_transform_user_ids(self.user_indices), self.inverse_transform_video_ids(self.video_indices)

## Time Infused Neural Collaborative Filtering

### Defining the model architecture

In [25]:
class NCF(nn.Module):
    def __init__(self, num_users, num_videos, embedding_dim, num_user_features, num_video_features, dropout):
        super(NCF, self).__init__()

        # Hyperparameters
        self.dropout = dropout
        
        # GMF Components for embeddings
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.video_embeddings_gmf = nn.Embedding(num_videos, embedding_dim)

        # MLP Components for embeddings
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.video_embeddings_mlp = nn.Embedding(num_videos, embedding_dim)

        # MLP layers for user and video embeddings
        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.fc2_mlp = nn.Linear(128, 64)

        # MLP layers for user and video features
        self.user_video_features_fc = nn.Linear(num_user_features + num_video_features, 64)

        # Final layers combining GMF, MLP for embeddings, and additional features
        self.fc1_combined = nn.Linear(embedding_dim + 64 + 64, 128)
        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, user_idx, video_idx, user_features, video_features):
        ####### GMF Embedding branch #######
        user_emb_gmf = self.user_embeddings_gmf(user_idx)
        video_emb_gmf = self.video_embeddings_gmf(video_idx)
        gmf_output = user_emb_gmf * video_emb_gmf                                   # dimension: (batch_size, embedding_dim)

        ####### MLP Embedding branch #######
        user_emb_mlp = self.user_embeddings_mlp(user_idx)
        video_emb_mlp = self.video_embeddings_mlp(video_idx)
        mlp_input = torch.cat([user_emb_mlp, video_emb_mlp], dim=-1)                # dimension: (batch_size, 2 * embedding_dim)

        # First fully connected layer with BatchNorm and ReLU
        mlp_output = self.fc1_mlp(mlp_input)
        if self.training:
            mlp_output = nn.BatchNorm1d(128)(mlp_output)
        mlp_output = torch.relu(mlp_output)
        mlp_output = nn.Dropout(self.dropout)(mlp_output)

        # Second fully connected layer with BatchNorm and ReLU
        mlp_output = self.fc2_mlp(mlp_output)                                       # dimension: (batch_size, 64)
        if self.training:
            mlp_output = nn.BatchNorm1d(64)(mlp_output)
        mlp_output = torch.relu(mlp_output)
        mlp_output = nn.Dropout(self.dropout)(mlp_output)

        ####### MLP Feature processing branch #######
        user_video_features = torch.cat([user_features, video_features], dim=-1)
        user_video_features_processed = self.user_video_features_fc(user_video_features)  # dimension: (batch_size, 64)
        user_video_features_processed = torch.relu(user_video_features_processed)
        user_video_features_processed = nn.Dropout(self.dropout)(user_video_features_processed)

        ####### Combine GMF, MLP, and additional features #######
        combined_input = torch.cat([gmf_output, mlp_output, user_video_features_processed], dim=-1)
        combined_output = self.fc1_combined(combined_input)
        combined_output = torch.relu(combined_output)
        combined_output = nn.Dropout(self.dropout)(combined_output)

        combined_output = self.fc2_combined(combined_output)
        combined_output = torch.sigmoid(combined_output) * 5
        
        return combined_output.squeeze()

### Building the Recommendation System

In [86]:
class KuaiShou_NCF_RecSys:
    def __init__(self, dataset_train: KuaiShouDataset, model: nn.Module, embedding_dim: int, dropout: float, decay: float):
        self.dataset_train = dataset_train
        self.num_users = len(dataset_train.user_encoder.classes_)
        self.num_videos = len(dataset_train.video_encoder.classes_)
        self.num_user_features = len(dataset_train.user_feature_cols)
        self.num_video_features = len(dataset_train.video_feature_cols)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Move model to GPU if available
        
        # Initialise the model
        self.model: nn.Module = model(self.num_users, self.num_videos, embedding_dim, self.num_user_features, self.num_video_features, dropout)

        # Time decay constants
        self.decay = decay

    def train(self, batch_size: int, num_epochs: int, lr: int, criterion, optimizer):
        # Initialise the DataLoader
        train_loader = DataLoader(self.dataset_train, batch_size=batch_size, shuffle=True)

        self.model.to(self.device)
        print(f"Model moved to {self.device}")

        # Optimizer and loss function
        optimizer = optimizer(self.model.parameters(), lr=lr)
        criterion = criterion

        # Training loop
        for epoch in range(num_epochs):
            self.model.train()
            total_loss = 0
            
            for user_idx, video_idx, user_features, video_features, watch_ratio in train_loader:
                user_idx, video_idx, user_features, video_features, watch_ratio = user_idx.to(self.device), video_idx.to(self.device), user_features.to(self.device), video_features.to(self.device), watch_ratio.to(self.device)
                
                # Forward pass
                optimizer.zero_grad()
                outputs = self.model(user_idx, video_idx, user_features, video_features)
                loss = criterion(outputs, watch_ratio)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                # Accumulate loss for reporting
                total_loss += loss.item()

            # Print loss for each epoch
            avg_loss = total_loss / len(train_loader)
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    def predict(self, user_ids, video_ids, batch_size=1024):
        """
        Generates a dataframe with predicted watch ratios for each user-video pair in batches.
        """
        self.model.eval()  # Set model to evaluation mode
        predictions_df = pd.DataFrame(columns=['user_id', 'video_id', 'watch_ratio'])

        # Transform user and video ids to the same indices as the training data
        user_indices = self.dataset_train.user_encoder.transform(user_ids)
        video_indices = self.dataset_train.video_encoder.transform(video_ids)

        # Batch prediction
        for start_idx in range(0, len(user_ids), batch_size):
            end_idx = min(start_idx + batch_size, len(user_ids))

            batch_user_indices = user_indices[start_idx:end_idx]
            batch_video_indices = video_indices[start_idx:end_idx]

            # Prepare batch of user and video indices
            user_batch = torch.tensor(batch_user_indices, dtype=torch.long).to(self.device)
            video_batch = torch.tensor(batch_video_indices, dtype=torch.long).to(self.device)
            
            # Get user and video features from validation data
            user_features_batch = self.dataset_train.user_features[batch_user_indices].to(self.device)
            video_features_batch = self.dataset_train.video_features[batch_video_indices].to(self.device)
            
            # Get video age in batch
            video_age_batch = self.dataset_train.get_video_age(batch_video_indices).to(self.device)

            # Predict in batch
            with torch.no_grad():
                predicted_watch_ratios = self.model(user_batch, video_batch, user_features_batch, video_features_batch)

            # Apply time decay
            decay_weights = self.calculate_exponential_weight(video_age_batch)
            predicted_watch_ratios = predicted_watch_ratios * decay_weights

            # Append predictions to DataFrame
            batch_predictions_df = pd.DataFrame({'user_id': self.dataset_train.inverse_transform_user_ids(batch_user_indices),
                                                 'video_id': self.dataset_train.inverse_transform_video_ids(batch_video_indices),
                                                 'watch_ratio': predicted_watch_ratios.cpu().numpy()})
            predictions_df = pd.concat([predictions_df, batch_predictions_df])
            
        return predictions_df
    
    def get_parameters(self):
        """
        Returns the model parameters.
        """
        return self.model.state_dict()
    
    def calculate_exponential_weight(self, video_age_days):
        """
        Returns the decay weight based on the defined decay constant and the number of days since the video has been uploaded.
        """
        return torch.exp(-self.decay * video_age_days)

### Fitting the Training Data to the Model and Generating Predictions

In [68]:
# Define the columns for user and video features in the user-item interaction data
user_cols = ['is_lowactive_period',
             'is_live_streamer', 'is_video_author', 'follow_user_num',
             'fans_user_num', 'friend_user_num', 'register_days', 'is_new_user',
             'total_connections', 'is_content_creator',
             'watch_frequency', 'is_weekend_interaction', 'is_weekend',
             'count_afternoon_views', 'count_evening_views', 'count_midnight_views', 'count_morning_views', 
             'avg_daily_watch_time', 
             'user_active_degree_full_active', 'user_active_degree_high_active', 'user_active_degree_middle_active', 
             'time_period_afternoon', 'time_period_evening', 'time_period_midnight', 'time_period_morning'
            ]
video_cols = ['video_duration', 'show_cnt', 'play_cnt', 
              'like_cnt', 'comment_cnt', 'share_cnt', 'follow_cnt', 'collect_cnt', 
              'News_Politics', 'Auto_Tech', 'Lifestyle', 'Sports_Fitness', 'Entertainment', 'Culture', 'Others',
            ]

Let's create a function which allows us to train and predict using the NCF model.

In [93]:
def train_and_predict(hyperparameters: dict, train_data: pd.DataFrame, val_data: pd.DataFrame, video_age_train_dict, **kwargs):
    cluster = kwargs.get('cluster', None)

    # Set seed for reproducibility
    torch.manual_seed(0)

    BATCH_SIZE = hyperparameters['batch_size']
    NUM_EPOCHS = hyperparameters['num_epochs']
    LEARNING_RATE = hyperparameters['lr']
    EMBEDDING_DIM = hyperparameters['embedding_dim']
    DROPOUT = hyperparameters['dropout']
    DECAY = hyperparameters['decay']

    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimiser = optim.Adam

    print(f"----- Training {'' if cluster == None else f'for cluster {cluster} '}-----")

    # Create the dataset
    dataset_train = KuaiShouDataset(train_data, 'user_id', 'video_id', user_cols, video_cols, 'watch_ratio', video_age_train_dict)

    # Initialise the NCF model
    print("Initialising...")
    ncf_rec_sys = KuaiShou_NCF_RecSys(dataset_train, NCF, EMBEDDING_DIM, DROPOUT, DECAY)

    # Train on data
    ncf_rec_sys.train(BATCH_SIZE, NUM_EPOCHS, LEARNING_RATE, criterion, optimiser)

    # Generate predictions
    print("Generating predictions...")
    
    # Filter as we can only predict for users and videos that are in the training data
    users = train_data['user_id'].unique()
    videos = train_data['video_id'].unique()
    val_data = val_data[val_data['user_id'].isin(users) & val_data['video_id'].isin(videos)]

    predictions_df = ncf_rec_sys.predict(val_data['user_id'], val_data['video_id'])
    
    print("Complete!")
    return cluster, predictions_df

#### Example: Fitting to Cluster 0

In [87]:
params = {
    'batch_size': 512,
    'num_epochs': 2,
    'lr': 0.001,
    'embedding_dim': 64,
    'dropout': 0.3,
    'decay': 0.01
}

cluster = 0
train_cluster = train_processed[train_processed['cluster'] == cluster]

val_cluster = val[val['cluster'] == cluster]

cluster, cluster_0_predictions, params = train_and_predict(params, train_cluster, val_cluster, video_age_dict, **{'cluster': 0})

----- Training for cluster 0 -----
Initialising...
Model moved to cpu
Epoch [1/2], Loss: 0.4148
Epoch [2/2], Loss: 0.3706
Generating predictions...
Complete!


In [84]:
# Plot the distribution of predicted watch ratios
# cluster_0_predictions['watch_ratio'].hist(bins=50)

cluster_0_predictions

Unnamed: 0,user_id,video_id,watch_ratio
0,14,8825,1.469138
1,14,2739,0.881861
2,14,7328,1.561320
3,14,2677,0.679518
4,14,8773,1.398021
...,...,...,...
725,7162,8814,1.488533
726,7162,5901,1.132553
727,7162,5681,1.397783
728,7162,5776,1.023636


### Grid Search for Hyperparameter Tuning

In [89]:
import itertools

In [94]:
# Is there a way to parallelise this?
def train_by_cluster_and_without(params: dict, train_data: pd.DataFrame, val_data: pd.DataFrame, video_age_dict: dict,
                                     train_by_cluster: bool = True, train_without_clustering: bool = False):
    param_str = '_'.join([f'{key}{val}' for key, val in params.items()])

    # Train for each cluster
    if train_by_cluster:
        cluster_predictions = {}
        for cluster in sorted(train_data['cluster'].unique()):
            train_cluster = train_data[train_data['cluster'] == cluster]
            val_cluster = val_data[val_data['cluster'] == cluster]

            cluster, predictions_df = train_and_predict(params, train_cluster, val_cluster, video_age_dict, **{'cluster': cluster})
            cluster_predictions[cluster] = predictions_df
        
        # Combine predictions
        watch_ratio_predictions_df = pd.DataFrame()
        for cluster, df in cluster_predictions.items():
            cluster_predictions_df = df
            cluster_predictions_df['cluster'] = cluster
            
            watch_ratio_predictions_df = pd.concat([watch_ratio_predictions_df, cluster_predictions_df])
        
        # Save predictions
        output_file = root + f'results/w_clustering_{param_str}.csv'
        watch_ratio_predictions_df.to_csv(output_file, index=False)
        print(f'Predictions with segmentation saved to {output_file}')
    
    # Train without clustering
    if train_without_clustering:
        _, predictions_df = train_and_predict(params, train_data, val_data, video_age_dict)

        # Save predictions
        output_file = root + f'results/wo_clustering_{param_str}.csv'
        predictions_df.to_csv(output_file, index=False)
        print(f'Predictions without segmentation saved to {output_file}')

In [96]:
# hyperparameters = {
#     'batch_size': [256, 512],
#     'num_epochs': [10],
#     'lr': [0.001, 0.01],
#     'embedding_dim': [32, 64],
#     'dropout': [0.3, 0.5],
#     'alpha': [0.01, 0.05],
#     'beta': [0.01, 0.05]
# }
hyperparameters = {
    'batch_size': [512],
    'num_epochs': [30],
    'lr': [0.001],
    'embedding_dim': [64],
    'dropout': [0.3],
    'decay': [0.01]
}

# Generate all possible combinations of hyperparameters
param_combinations = list(itertools.product(*hyperparameters.values()))

# Train for each combination of hyperparameters
for params in param_combinations:
    params_dict = {key: val for key, val in zip(hyperparameters.keys(), params)}
    print(f"Training with hyperparameters: {params_dict}")
    train_by_cluster_and_without(params_dict, train_processed, val, video_age_dict, train_by_cluster=True, train_without_clustering=False)

Training with hyperparameters: {'batch_size': 512, 'num_epochs': 30, 'lr': 0.001, 'embedding_dim': 64, 'dropout': 0.3, 'decay': 0.01}
----- Training for cluster 0 -----
Initialising...
Model moved to cpu
Epoch [1/30], Loss: 0.4148
Epoch [2/30], Loss: 0.3706
Epoch [3/30], Loss: 0.3628
Epoch [4/30], Loss: 0.3572
Epoch [5/30], Loss: 0.3528
Epoch [6/30], Loss: 0.3492
Epoch [7/30], Loss: 0.3465
Epoch [8/30], Loss: 0.3433
Epoch [9/30], Loss: 0.3405
Epoch [10/30], Loss: 0.3368
Epoch [11/30], Loss: 0.3333
Epoch [12/30], Loss: 0.3294
Epoch [13/30], Loss: 0.3258
Epoch [14/30], Loss: 0.3220
Epoch [15/30], Loss: 0.3178
Epoch [16/30], Loss: 0.3139
Epoch [17/30], Loss: 0.3097
Epoch [18/30], Loss: 0.3056
Epoch [19/30], Loss: 0.3013
Epoch [20/30], Loss: 0.2969
Epoch [21/30], Loss: 0.2928
Epoch [22/30], Loss: 0.2886
Epoch [23/30], Loss: 0.2853
Epoch [24/30], Loss: 0.2813
Epoch [25/30], Loss: 0.2772
Epoch [26/30], Loss: 0.2735
Epoch [27/30], Loss: 0.2706
Epoch [28/30], Loss: 0.2667
Epoch [29/30], Loss: 

### Tuned Model

In [None]:
params = {
    'batch_size': [512],
    'num_epochs': [10],
    'lr': [0.001],
    'embedding_dim': [64],
    'dropout': [0.3],
    'alpha': [0.001],
    'beta': [0.001]
}

train_by_cluster_and_without(params, train_processed, train_by_cluster=True, train_without_clustering=True)