In [37]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader

## Load the train dataset

In [38]:
root = '../'

# Training data
train = pd.read_csv(root + "data_exports/joined_train_data_segmented.csv")

print(f'Total number of training data: {len(train)}')

Total number of training data: 2552082


In [39]:
train.head()

Unnamed: 0,user_id,video_id,time,watch_ratio,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,fans_user_num,...,avg_daily_watch_time,top_3_categories,cluster,News_Politics,Auto_Tech,Lifestyle,Sports_Fitness,Entertainment,Culture,Others
0,14,148,2020-07-05 05:27:48.378,0.722103,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
1,14,183,2020-07-05 05:28:00.057,1.907377,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
2,14,3649,2020-07-05 05:29:09.479,2.063311,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
3,14,5262,2020-07-05 05:30:43.285,0.566388,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1
4,14,8234,2020-07-05 05:35:43.459,0.418364,full_active,0,0,1,73,6,...,8360719000000.0,"['Car', 'Pets', 'Real estate家居']",0,0,1,1,0,0,0,1


### Get the current date
This is necessary to calculate the number of days since the last interaction as well as the age of the videos, which will be used for the time decay component of our model.
We assume it to be the day of the latest interaction in the training data. 

In [40]:
# Convert type to datetime
train['time'] = pd.to_datetime(train['time'])

# Assume current date is the next day of the last date in the training data
CURRENT_DATE = train['time'].dt.date.max()

# Just the date portion
print(f'Current date: {CURRENT_DATE}')

Current date: 2020-08-03


### Calculate age of video

In [41]:
video_info = pd.read_csv(root + 'data/item_daily_features.csv', usecols=['video_id', 'upload_dt']).drop_duplicates()

video_info['upload_dt'] = pd.to_datetime(video_info['upload_dt'])

In [42]:
# Get videos in the training data
video_info_filtered = video_info[video_info['video_id'].isin(train['video_id'].unique())]

# Calculate video age
# video_info_filtered['Current Date'] = CURRENT_DATE
video_info_filtered['video_age'] = (pd.to_datetime(CURRENT_DATE) - video_info_filtered['upload_dt']).dt.days

# Convert into dictionary
video_age_dict = video_info_filtered.set_index('video_id')['video_age'].to_dict()
video_age_dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  video_info_filtered['video_age'] = (pd.to_datetime(CURRENT_DATE) - video_info_filtered['upload_dt']).dt.days


{103: 38,
 109: 37,
 120: 36,
 122: 35,
 128: 34,
 130: 33,
 131: 33,
 133: 33,
 136: 33,
 137: 33,
 139: 33,
 142: 33,
 145: 32,
 146: 32,
 147: 32,
 148: 32,
 151: 31,
 152: 31,
 153: 31,
 154: 31,
 164: 30,
 166: 30,
 168: 30,
 169: 30,
 170: 30,
 171: 30,
 173: 30,
 175: 30,
 179: 30,
 180: 30,
 183: 30,
 186: 30,
 188: 30,
 203: 29,
 206: 29,
 207: 29,
 210: 29,
 211: 29,
 217: 29,
 223: 28,
 229: 28,
 237: 28,
 238: 28,
 250: 28,
 251: 28,
 254: 28,
 255: 28,
 256: 28,
 258: 28,
 262: 28,
 265: 28,
 267: 28,
 270: 28,
 272: 28,
 275: 27,
 279: 27,
 280: 27,
 282: 27,
 285: 27,
 286: 27,
 288: 27,
 289: 27,
 290: 27,
 296: 27,
 297: 27,
 300: 27,
 302: 27,
 304: 27,
 306: 27,
 307: 27,
 314: 27,
 318: 27,
 319: 27,
 324: 26,
 331: 26,
 336: 26,
 340: 26,
 349: 26,
 350: 26,
 351: 26,
 361: 25,
 364: 25,
 365: 25,
 368: 25,
 369: 25,
 372: 25,
 373: 25,
 378: 25,
 388: 25,
 389: 25,
 390: 25,
 391: 25,
 392: 25,
 394: 25,
 395: 25,
 400: 25,
 402: 25,
 403: 25,
 408: 24,
 413: 24,


### Calculate age of the interaction

In [43]:
# Get interaction age as dictionary
interaction = train[['user_id', 'video_id', 'time']]
interaction['interaction_age'] = (pd.to_datetime(CURRENT_DATE) - interaction['time']).dt.days

interaction_age_dict = interaction.groupby(['user_id', 'video_id'])['interaction_age'].max().to_dict()
interaction_age_dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction['interaction_age'] = (pd.to_datetime(CURRENT_DATE) - interaction['time']).dt.days


{(14, 103): 25,
 (14, 109): 24,
 (14, 120): 24,
 (14, 122): 24,
 (14, 128): 24,
 (14, 130): 7,
 (14, 131): 12,
 (14, 133): 13,
 (14, 136): 20,
 (14, 137): 12,
 (14, 139): 27,
 (14, 142): 12,
 (14, 145): 26,
 (14, 146): 18,
 (14, 147): 26,
 (14, 148): 28,
 (14, 151): 3,
 (14, 153): 12,
 (14, 154): 13,
 (14, 164): 21,
 (14, 166): 15,
 (14, 168): 19,
 (14, 169): 23,
 (14, 170): 21,
 (14, 171): 28,
 (14, 173): 12,
 (14, 175): 28,
 (14, 179): 28,
 (14, 180): 25,
 (14, 183): 28,
 (14, 186): 28,
 (14, 188): 21,
 (14, 203): 27,
 (14, 206): 27,
 (14, 207): 19,
 (14, 210): 23,
 (14, 211): 27,
 (14, 217): 27,
 (14, 223): 26,
 (14, 229): 26,
 (14, 237): 16,
 (14, 238): 17,
 (14, 250): 5,
 (14, 251): 13,
 (14, 254): 26,
 (14, 255): 13,
 (14, 256): 26,
 (14, 258): 26,
 (14, 262): 26,
 (14, 265): 26,
 (14, 267): 17,
 (14, 270): 15,
 (14, 272): 23,
 (14, 275): 26,
 (14, 279): 26,
 (14, 280): 26,
 (14, 282): 24,
 (14, 285): 26,
 (14, 286): 26,
 (14, 288): 25,
 (14, 289): 26,
 (14, 290): 26,
 (14, 296):

In [44]:
train.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'author_id', 'video_type', 'video_tag_name', 'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
       'caption', 'topic_tag', 'first_level_category_name',
       'second_level_category_name', 'third_level_category_name',
       'english_caption', 'english_first_level_category_name',
       'english_second_level_category_name',
       'english_third_level_category_name', 'english_topic_tag', 'is_new_user',
       'total_connections', 'is_content_creator', 'hour', 'day_of_week',
       'watch_frequency', 'is_weekend_interaction', 'is_weekend',
       'time_period', 'count_afternoon_views', 'count_evening_views',
       'count_midnight_views', 'count_m

In [45]:
# Segregate by 
# 'cluster'

# One hot encode
# 'user_active_degree', 'time_period'

# Remove
# 'author_id', 'video_type' (all 'NORMAL'), 
# 'manual_cover_text', 'video_tag_name'
# 'caption', 'topic_tag', 'first_level_category_name',
# 'second_level_category_name', 'third_level_category_name',
# 'english_caption', 'english_first_level_category_name',
# 'english_second_level_category_name',
# 'english_third_level_category_name', 'english_topic_tag'
# 'top_3_categories' (use the OHE categories)
# 'play_duration' (as it is highly correlated to target variable 'watch_ratio' (watch_ratio = play_duration / video_duration))

# To incorporate (time decay)
# 'time', 'video_age'

## Preprocessing for feeding into Neural Network portion of NCF

### One hot encode categorical variables

In [46]:
# One hot encode 'user_active_degree', 'time_period'
train_processed = pd.get_dummies(train, columns=['user_active_degree', 'time_period'])

# Remove the column for user_active_degree = UNKNOWN
train_processed = train_processed.drop(columns=['user_active_degree_UNKNOWN'])

In [47]:
train_processed = train_processed.drop(columns=['author_id', 'video_type', 
                                    'video_tag_name', 
                                    'manual_cover_text', 'caption', 'topic_tag', 
                                    'first_level_category_name', 'second_level_category_name', 'third_level_category_name',
                                    'english_caption', 'english_first_level_category_name',
                                    'english_second_level_category_name',
                                    'english_third_level_category_name', 'english_topic_tag',
                                    'top_3_categories',
                                    'play_duration'
                                    ])

### Scale continuous variables

In [48]:
train_processed.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'is_lowactive_period',
       'is_live_streamer', 'is_video_author', 'follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 'like_cnt', 'comment_cnt', 'share_cnt',
       'follow_cnt', 'collect_cnt', 'is_new_user', 'total_connections',
       'is_content_creator', 'hour', 'day_of_week', 'watch_frequency',
       'is_weekend_interaction', 'is_weekend', 'count_afternoon_views',
       'count_evening_views', 'count_midnight_views', 'count_morning_views',
       'avg_daily_watch_time', 'cluster', 'News_Politics', 'Auto_Tech',
       'Lifestyle', 'Sports_Fitness', 'Entertainment', 'Culture', 'Others',
       'user_active_degree_full_active', 'user_active_degree_high_active',
       'user_active_degree_middle_active', 'time_period_afternoon',
       'time_period_evening', 'time_period_midnight', 'time_period_morning'],
      dtype='object')

In [49]:
# We can see that the values are all on different scales. For example, follow_user_num is in the tens-thousands while like_cnt can range form millions to billions. This will affect the training of the model, therefore scaling is needed

train_processed[['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time']].describe()

Unnamed: 0,follow_user_num,fans_user_num,friend_user_num,register_days,video_duration,show_cnt,play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,count_afternoon_views,count_evening_views,count_midnight_views,count_morning_views,avg_daily_watch_time
count,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0
mean,53.81411,3.872561,1.331606,265.3341,11647.91,6959049.0,7052437.0,204478.0,8935.899,3805.251,20932.72,285.876,465.8341,280.9108,457.9366,610.0598,8062631000000.0
std,141.8902,9.716679,4.924868,264.0708,13441.16,9275605.0,9511481.0,320943.1,21119.83,12695.3,63310.06,1337.505,284.4922,238.5123,433.9834,330.5712,706882700000.0
min,0.0,0.0,0.0,8.0,3066.0,644.0,331.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4632392000000.0
25%,7.0,0.0,0.0,119.0,7333.0,832913.0,762922.0,15528.0,345.0,64.0,1002.0,5.0,249.0,84.0,53.0,374.0,7686325000000.0
50%,15.0,1.0,0.0,200.0,9383.0,3127692.0,3071419.0,73590.0,2171.0,414.0,4968.0,28.0,444.0,225.0,356.0,569.0,8158000000000.0
75%,43.0,4.0,1.0,302.0,11500.0,9372330.0,9544620.0,251209.0,8918.0,2275.0,17978.0,133.0,656.0,419.0,748.0,806.0,8518700000000.0
max,1811.0,251.0,71.0,2002.0,294520.0,65255080.0,64795780.0,2762854.0,338365.0,206105.0,1215372.0,29197.0,1477.0,1435.0,1852.0,1727.0,12772440000000.0


In [50]:
scaler = StandardScaler()

columns_to_scale = ['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 
       'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 
       'total_connections',
       'watch_frequency', 
       'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 
       'avg_daily_watch_time', 
       ]

train_processed[columns_to_scale] = scaler.fit_transform(train_processed[columns_to_scale])

In [51]:
train_processed[['follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'video_duration',
       'show_cnt', 'play_cnt', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'count_afternoon_views', 'count_evening_views', 'count_midnight_views',
       'count_morning_views', 'avg_daily_watch_time']].describe()

# We now see that the mean of all the columns is (close to) 0 and the standard deviation is 1

Unnamed: 0,follow_user_num,fans_user_num,friend_user_num,register_days,video_duration,show_cnt,play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,count_afternoon_views,count_evening_views,count_midnight_views,count_morning_views,avg_daily_watch_time
count,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0,2552082.0
mean,3.875563e-18,-3.162816e-17,1.933327e-17,-6.521637e-17,-2.266133e-16,4.790998e-17,8.17432e-18,-2.5257980000000003e-17,-1.905485e-17,-1.854256e-18,-9.449469e-18,1.789107e-17,-5.862346000000001e-17,-3.456824e-17,2.940082e-17,1.021901e-16,-2.135391e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.379266,-0.3985478,-0.2703841,-0.9744891,-0.6384799,-0.7501836,-0.7414311,-0.6371098,-0.4231048,-0.299737,-0.3306382,-0.2137384,-1.637424,-1.177763,-1.055194,-1.845472,-4.852628
25%,-0.329932,-0.3985478,-0.2703841,-0.5541473,-0.321022,-0.6604569,-0.6612552,-0.5887337,-0.4067695,-0.2946957,-0.3148114,-0.2100001,-0.7621797,-0.8255794,-0.9330694,-0.7140968,-0.5323456
50%,-0.2735504,-0.295632,-0.2703841,-0.2474113,-0.1685054,-0.4130575,-0.4185488,-0.407823,-0.3203104,-0.2671265,-0.2521673,-0.1928039,-0.07674775,-0.2344147,-0.234886,-0.1242086,0.1349154
75%,-0.07621467,0.01311554,-0.06733294,0.1388488,-0.011004,0.2601751,0.2620184,0.1456054,-0.0008475032,-0.1205368,-0.04667066,-0.1142995,0.6684398,0.5789608,0.6683745,0.5927325,0.6451838
max,12.38413,25.43333,14.14625,6.576517,21.04522,6.284877,6.070911,7.971434,15.5981,15.93501,18.86651,21.61572,3.554284,4.838701,3.212251,3.37882,6.662791


## Create the Dataset

In [64]:
class KuaiShouDataset(Dataset):
    def __init__(self, data, user_id_col, video_id_col, user_feature_cols, video_feature_cols, watch_ratio_col, interaction_age_dict, video_age_dict):
        self.user_feature_cols = user_feature_cols
        self.video_feature_cols = video_feature_cols

        # Initialise and fit LabelEncoders
        self.user_encoder = LabelEncoder()
        self.video_encoder = LabelEncoder()
        
        self.user_indices = torch.tensor(self.user_encoder.fit_transform(data[user_id_col]), dtype=torch.long)
        self.video_indices = torch.tensor(self.video_encoder.fit_transform(data[video_id_col]), dtype=torch.long)

        # Convert to correct numpy dtype before creating tensors
        user_features = data[user_feature_cols].values.astype(np.float32)
        video_features = data[video_feature_cols].values.astype(np.float32)
        
        # Convert to tensors
        self.user_features = torch.tensor(user_features, dtype=torch.float32)
        self.video_features = torch.tensor(video_features, dtype=torch.float32)
        self.watch_ratios = torch.tensor(data[watch_ratio_col].values, dtype=torch.float32)

        # Time related features
        self.interaction_age_dict = interaction_age_dict
        self.video_age_dict = video_age_dict

    def __len__(self):
        return len(self.user_indices)

    def __getitem__(self, idx):
        return self.user_indices[idx], self.video_indices[idx], self.user_features[idx], self.video_features[idx], self.watch_ratios[idx]

    def inverse_transform_user_ids(self, encoded_user_idx):
        """Decode encoded user indices to original user_ids."""
        return self.user_encoder.inverse_transform(encoded_user_idx)
    
    def inverse_transform_video_ids(self, encoded_video_idx):
        """Decode encoded video indices to original video_ids."""
        return self.video_encoder.inverse_transform(encoded_video_idx)
    
    def get_interaction_age(self, user_idx, video_idx):
        """Get days since interaction."""
        user_ids = self.inverse_transform_user_ids(user_idx)
        video_ids = self.inverse_transform_video_ids(video_idx)

        ages = []
        for i in range(len(user_ids)):
            ages.append(self.interaction_age_dict[(user_ids[i], video_ids[i])])
        return torch.tensor(ages, dtype=torch.float32)
    
    def get_video_age(self, video_idx):
        """Get video age."""
        video_ids = self.inverse_transform_video_ids(video_idx)

        ages = []
        for i in range(len(video_idx)):
            ages.append(self.video_age_dict[video_ids[i]])
        return torch.tensor(ages, dtype=torch.float32)

## Time Infused Neural Collaborative Filtering

### Defining the model architecture

In [65]:
class NCF(nn.Module):
    def __init__(self, num_users, num_videos, embedding_dim, num_user_features, num_video_features, dropout, alpha, beta):
        super(NCF, self).__init__()

        # Hyperparameters
        self.dropout = dropout
        self.alpha = alpha
        self.beta = beta
        
        # GMF Components for embeddings
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.video_embeddings_gmf = nn.Embedding(num_videos, embedding_dim)

        # MLP Components for embeddings
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.video_embeddings_mlp = nn.Embedding(num_videos, embedding_dim)

        # MLP layers for user and video embeddings
        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.fc2_mlp = nn.Linear(128, 64)

        # MLP layers for user and video features
        # self.user_features_fc = nn.Linear(num_user_features, embedding_dim)
        # self.video_features_fc = nn.Linear(num_video_features, embedding_dim)
        self.user_video_features_fc = nn.Linear(num_user_features + num_video_features, 64)

        # Final layers combining GMF, MLP for embeddings, and additional features
        # self.fc1_combined = nn.Linear(embedding_dim + 64 + 2 * embedding_dim, 128)
        self.fc1_combined = nn.Linear(embedding_dim + 64 + 64, 128)
        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, user_idx, video_idx, user_features, video_features, interaction_days, video_age_days):
        ####### GMF Embedding branch #######
        user_emb_gmf = self.user_embeddings_gmf(user_idx)
        video_emb_gmf = self.video_embeddings_gmf(video_idx)
        gmf_output = user_emb_gmf * video_emb_gmf                                   # dimension: (batch_size, embedding_dim)

        ####### MLP Embedding branch #######
        user_emb_mlp = self.user_embeddings_mlp(user_idx)
        video_emb_mlp = self.video_embeddings_mlp(video_idx)
        mlp_input = torch.cat([user_emb_mlp, video_emb_mlp], dim=-1)                # dimension: (batch_size, 2 * embedding_dim)

        # First fully connected layer with BatchNorm and ReLU
        mlp_output = self.fc1_mlp(mlp_input)
        if self.training:
            mlp_output = nn.BatchNorm1d(128)(mlp_output)
        mlp_output = torch.relu(mlp_output)
        mlp_output = nn.Dropout(self.dropout)(mlp_output)

        # Second fully connected layer with BatchNorm and ReLU
        mlp_output = self.fc2_mlp(mlp_output)                                       # dimension: (batch_size, 64)
        if self.training:
            mlp_output = nn.BatchNorm1d(64)(mlp_output)
        mlp_output = torch.relu(mlp_output)
        mlp_output = nn.Dropout(self.dropout)(mlp_output)

        ####### MLP Feature processing branch #######
        # user_features_processed = self.user_features_fc(user_features)              # dimension: (batch_size, embedding_dim)
        # user_features_processed = torch.relu(user_features_processed)
        # user_features_processed = nn.Dropout(self.dropout)(user_features_processed)

        # video_features_processed = self.video_features_fc(video_features)           # dimension: (batch_size, embedding_dim)
        # video_features_processed = torch.relu(video_features_processed)
        # video_features_processed = nn.Dropout(self.dropout)(video_features_processed)
        user_video_features = torch.cat([user_features, video_features], dim=-1)
        user_video_features_processed = self.user_video_features_fc(user_video_features)  # dimension: (batch_size, 64)
        user_video_features_processed = torch.relu(user_video_features_processed)
        user_video_features_processed = nn.Dropout(self.dropout)(user_video_features_processed)

        ####### Combine GMF, MLP, and additional features #######
        combined_input = torch.cat([gmf_output, mlp_output, user_video_features_processed], dim=-1)
        combined_output = self.fc1_combined(combined_input)
        # if self.training:
        #     combined_output = nn.BatchNorm1d(128)(combined_output)
        combined_output = torch.relu(combined_output)
        combined_output = nn.Dropout(self.dropout)(combined_output)

        combined_output = self.fc2_combined(combined_output)
        combined_output = torch.relu(combined_output).squeeze()

        ######## Apply decay factors based on interaction time and video age #######
        decay_weight = self.calculate_exponential_weight(interaction_days, video_age_days)
        final_output = combined_output * decay_weight

        return final_output
    
    def calculate_exponential_weight(self, interaction_days, video_age_days):
        """
        Returns the decay weight based on the defined decay constant and time since the transaction was made.
        """
        decay_interaction = torch.exp(-self.alpha * interaction_days)
        decay_video_age = torch.exp(-self.beta * video_age_days)
        
        return decay_interaction * decay_video_age

### Creating the Recommendation System

In [74]:
class KuaiShou_NCF_RecSys:
    def __init__(self, dataset: KuaiShouDataset, model: nn.Module, embedding_dim: int, dropout: float, alpha: float, beta: float):
        self.dataset = dataset
        self.num_users = len(dataset.user_encoder.classes_)
        self.num_videos = len(dataset.video_encoder.classes_)
        self.num_user_features = len(dataset.user_feature_cols)
        self.num_video_features = len(dataset.video_feature_cols)
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Move model to GPU if available
        
        # Initialise the model
        self.model: nn.Module = model(self.num_users, self.num_videos, embedding_dim, self.num_user_features, self.num_video_features, dropout, alpha, beta)

    def train(self, batch_size, num_epochs, lr, criterion, optimizer):
        # Initialise the DataLoader
        train_loader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)

        self.model.to(self.device)
        print(f"Model moved to {self.device}")

        # Optimizer and loss function
        optimizer = optimizer(self.model.parameters(), lr=lr)
        criterion = criterion

        # Training loop
        for epoch in range(num_epochs):
            self.model.train()
            total_loss = 0
            
            for user_idx, video_idx, user_features, video_features, watch_ratio in train_loader:
                user_idx, video_idx, user_features, video_features, watch_ratio = user_idx.to(self.device), video_idx.to(self.device), user_features.to(self.device), video_features.to(self.device), watch_ratio.to(self.device)
                
                # Get the interaction days and video age
                interaction_days = self.dataset.get_interaction_age(user_idx, video_idx)
                video_age = self.dataset.get_video_age(video_idx)
                
                # Forward pass
                optimizer.zero_grad()
                outputs = self.model(user_idx, video_idx, user_features, video_features, interaction_days, video_age)
                loss = criterion(outputs, watch_ratio)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                # Accumulate loss for reporting
                total_loss += loss.item()

            # Print loss for each epoch
            avg_loss = total_loss / len(train_loader)
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    def predict(self, batch_size=512):
        """
        Generates a dataframe with predicted watch ratios for each user-video pair in batches.
        """
        self.model.eval()  # Set model to evaluation mode
        all_predictions = []

        for start_user_idx in range(0, self.num_users, batch_size):
            end_user_idx = min(start_user_idx + batch_size, self.num_users)
            user_indices = torch.arange(start_user_idx, end_user_idx, dtype=torch.long).to(self.device)

            # Gather user features in batch
            user_features_batch = self.dataset.user_features[user_indices].to(self.device)
            
            for video_idx in range(self.num_videos):
                video_tensor = torch.tensor([video_idx], dtype=torch.long).to(self.device)
                video_feature_tensor = self.dataset.video_features[video_idx].unsqueeze(0).to(self.device)
                video_age = self.dataset.get_video_age(video_tensor).to(self.device)
                
                # Repeat video data for the entire user batch
                video_tensor_batch = video_tensor.expand(len(user_indices))
                video_feature_batch = video_feature_tensor.expand(len(user_indices), -1)
                video_age_batch = video_age.expand(len(user_indices))

                # Set interaction days to 0 for prediction
                interaction_days = torch.zeros(len(user_indices), dtype=torch.float32).to(self.device)

                # Predict in batch
                with torch.no_grad():
                    predicted_watch_ratios = self.model(
                        user_indices, video_tensor_batch, user_features_batch,
                        video_feature_batch, interaction_days, video_age_batch
                    )

                # Collect predictions
                for user_idx, watch_ratio in zip(user_indices.tolist(), predicted_watch_ratios.tolist()):
                    all_predictions.append((user_idx, video_idx, watch_ratio))

        # Convert to DataFrame at once
        watch_ratio_df = pd.DataFrame(all_predictions, columns=['user_id', 'video_id', 'watch_ratio'])

        # Inverse transform user and video indices
        watch_ratio_df['user_id'] = self.dataset.inverse_transform_user_ids(watch_ratio_df['user_id'].astype(int))
        watch_ratio_df['video_id'] = self.dataset.inverse_transform_video_ids(watch_ratio_df['video_id'].astype(int))

        return watch_ratio_df

    
    def get_parameters(self):
        """
        Returns the model parameters.
        """
        return self.model.state_dict()

### Fitting the Training Data to the Model and Generating Predictions

In [75]:
# Define the columns for user and video features in the user-item interaction data
user_cols = ['is_lowactive_period',
             'is_live_streamer', 'is_video_author', 'follow_user_num',
             'fans_user_num', 'friend_user_num', 'register_days', 'is_new_user',
             'total_connections', 'is_content_creator', 'hour', 'day_of_week',
             'watch_frequency', 'is_weekend_interaction', 'is_weekend',
             'count_afternoon_views', 'count_evening_views', 'count_midnight_views', 'count_morning_views', 
             'avg_daily_watch_time', 
             'user_active_degree_full_active', 'user_active_degree_high_active', 'user_active_degree_middle_active', 
             'time_period_afternoon', 'time_period_evening', 'time_period_midnight', 'time_period_morning'
            ]
video_cols = ['video_duration', 'show_cnt', 'play_cnt', 
              'like_cnt', 'comment_cnt', 'share_cnt', 'follow_cnt', 'collect_cnt', 
              'News_Politics', 'Auto_Tech', 'Lifestyle', 'Sports_Fitness', 'Entertainment', 'Culture', 'Others',
            ]

In [76]:
def train_and_predict(hyperparameters: dict, train_data: pd.DataFrame, **kwargs):
    cluster = kwargs.get('cluster', None)

    # Set seed for reproducibility
    torch.manual_seed(0)

    BATCH_SIZE = hyperparameters['batch_size']
    NUM_EPOCHS = hyperparameters['num_epochs']
    LEARNING_RATE = hyperparameters['lr']
    EMBEDDING_DIM = hyperparameters['embedding_dim']
    DROPOUT = hyperparameters['dropout']
    ALPHA = hyperparameters['alpha']
    BETA = hyperparameters['beta']

    # Loss function and optimizer
    criterion = nn.MSELoss()
    optimiser = optim.Adam

    print(f"----- Training {'' if cluster == None else f'for cluster {cluster} '}-----")

    # Create the dataset
    dataset_train = KuaiShouDataset(train_data, 'user_id', 'video_id', user_cols, video_cols, 'watch_ratio', interaction_age_dict, video_age_dict)

    # Initialise the NCF model
    print("Initialising...")
    ncf_rec_sys = KuaiShou_NCF_RecSys(dataset_train, NCF, EMBEDDING_DIM, DROPOUT, ALPHA, BETA)

    # Train on data
    ncf_rec_sys.train(BATCH_SIZE, NUM_EPOCHS, LEARNING_RATE, criterion, optimiser)

    # Generate predictions
    print("Generating predictions...")
    predictions_df = ncf_rec_sys.predict()
    
    print("Complete!")
    return cluster, predictions_df

##### Example: Fitting to Cluster 0

In [80]:
params = {
    'batch_size': 512,
    'num_epochs': 10,
    'lr': 0.001,
    'embedding_dim': 64,
    'dropout': 0.3,
    'alpha': 0.01,  # Decay constant for days since interaction (days)
    'beta': 0.01    # Decay constant for video age (days)
}

cluster = 0
train_cluster = train_processed[train_processed['cluster'] == cluster]

cluster, cluster_0_predictions = train_and_predict(params, train_cluster, **{'cluster': 0})

----- Training for cluster 0 -----
Initialising...
Model moved to cpu
Epoch [1/10], Loss: 0.4294
Epoch [2/10], Loss: 0.3764
Epoch [3/10], Loss: 0.3639
Epoch [4/10], Loss: 0.3581
Epoch [5/10], Loss: 0.3541
Epoch [6/10], Loss: 0.3520
Epoch [7/10], Loss: 0.3497
Epoch [8/10], Loss: 0.3471
Epoch [9/10], Loss: 0.3447
Epoch [10/10], Loss: 0.3418
Generating predictions...
Complete!


In [81]:
cluster_0_predictions

Unnamed: 0,user_id,video_id,watch_ratio
0,14,103,1.548594
1,21,103,1.339207
2,24,103,0.704573
3,51,103,1.320418
4,64,103,1.833451
...,...,...,...
634835,7086,10130,1.105983
634836,7116,10130,1.049223
634837,7132,10130,1.121811
634838,7141,10130,1.258493


### Grid Search for Hyperparameter Tuning

In [79]:
import itertools

In [83]:
# Is there a way to parallelise this?
def train_by_cluster_and_without(params: dict, train_data: pd.DataFrame, train_by_cluster: bool = True, train_without_clustering: bool = False):
    param_str = '_'.join([f'{key}{val}' for key, val in params.items()])

    # Train for each cluster
    if train_by_cluster:
        cluster_predictions = {}
        for cluster in sorted(train_data['cluster'].unique()):
            train_cluster = train_data[train_data['cluster'] == cluster]

            cluster, predictions_df = train_and_predict(params, train_cluster, **{'cluster': cluster})
            cluster_predictions[cluster] = predictions_df

            # Include parameters in the output file name
            output_file = root + f'results/cluster{cluster}_{param_str}.csv'
            predictions_df.to_csv(output_file, index=False)

            print(f'Predictions for cluster {cluster} saved to {output_file}')
        
        # Save combined predictions to a single file
        watch_ratio_predictions_df = pd.DataFrame()
        for cluster, df in cluster_predictions.items():
            cluster_predictions_df = df
            cluster_predictions_df['cluster'] = cluster
            
            watch_ratio_predictions_df = pd.concat([watch_ratio_predictions_df, cluster_predictions_df])
        
        # Include parameters in the output file name
        output_file = root + f'results/w_clustering_{param_str}.csv'
        watch_ratio_predictions_df.to_csv(output_file, index=False)
        print(f'Predictions with segmentation saved to {output_file}')
    
    # Train without clustering
    if train_without_clustering:
        _, predictions_df = train_and_predict(params, train_data)

        # Include parameters in the output file name
        output_file = root + f'results/wo_clustering_{param_str}.csv'
        predictions_df.to_csv(output_file, index=False)
        print(f'Predictions without segmentation saved to {output_file}')

In [None]:
hyperparameters = {
    'batch_size': [512, 256],
    'num_epochs': [10, 20],
    'lr': [0.001, 0.0001],
    'embedding_dim': [64, 128],
    'dropout': [0.3, 0.5],
    'alpha': [0.01, 0.05],
    'beta': [0.01, 0.05]
}

# Generate all possible combinations of hyperparameters
param_combinations = list(itertools.product(*hyperparameters.values()))

# Train for each combination of hyperparameters
for params in param_combinations:
    params_dict = {key: val for key, val in zip(hyperparameters.keys(), params)}
    train_by_cluster_and_without(params_dict, train_processed, True, True)

----- Training for cluster 0 -----
Initialising...
Model moved to cpu
Epoch [1/10], Loss: 0.4294
Epoch [2/10], Loss: 0.3764
Epoch [3/10], Loss: 0.3639
Epoch [4/10], Loss: 0.3581
Epoch [5/10], Loss: 0.3541
Epoch [6/10], Loss: 0.3520
Epoch [7/10], Loss: 0.3497
Epoch [8/10], Loss: 0.3471
Epoch [9/10], Loss: 0.3447
Epoch [10/10], Loss: 0.3418
Generating predictions...
Complete!
Predictions for cluster 0 saved to ../results/cluster0_batch_size512_num_epochs10_lr0.001_embedding_dim64_dropout0.3_alpha0.01_beta0.01.csv
----- Training for cluster 1 -----
Initialising...
Model moved to cpu
Epoch [1/10], Loss: 0.2275
Epoch [2/10], Loss: 0.1863
Epoch [3/10], Loss: 0.1792
Epoch [4/10], Loss: 0.1766
Epoch [5/10], Loss: 0.1750
Epoch [6/10], Loss: 0.1737
Epoch [7/10], Loss: 0.1727
Epoch [8/10], Loss: 0.1713
Epoch [9/10], Loss: 0.1700
Epoch [10/10], Loss: 0.1688
Generating predictions...
Complete!
Predictions for cluster 1 saved to ../results/cluster1_batch_size512_num_epochs10_lr0.001_embedding_dim64_

In [None]:
# class NCF(nn.Module):
#     def __init__(self, num_users, num_videos, embedding_dim, num_user_features, num_video_features):
#         super(NCF, self).__init__()
        
#         # GMF Components for embeddings
#         self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
#         self.video_embeddings_gmf = nn.Embedding(num_videos, embedding_dim)

#         # MLP Components for embeddings
#         self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
#         self.video_embeddings_mlp = nn.Embedding(num_videos, embedding_dim)

#         # MLP layers for user and video embeddings
#         self.fc1_mlp = nn.Linear(2 * embedding_dim + num_user_features + num_video_features, 128)
#         self.fc2_mlp = nn.Linear(128, 64)

#         # Final layers combining GMF, MLP for embeddings, and additional features
#         self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
#         self.fc2_combined = nn.Linear(128, 1)

#     def forward(self, user_id, video_id, user_features, video_features, dropout=0.5):
#         ##### GMF Embedding branch
#         user_emb_gmf = self.user_embeddings_gmf(user_id)
#         video_emb_gmf = self.video_embeddings_gmf(video_id)
#         gmf_output = user_emb_gmf * video_emb_gmf   # dimension: (batch_size, embedding_dim)

#         ##### MLP Embedding branch
#         user_emb_mlp = self.user_embeddings_mlp(user_id)
#         video_emb_mlp = self.video_embeddings_mlp(video_id)

#         mlp_input = torch.cat([user_emb_mlp, video_emb_mlp, user_features, video_features], dim=-1) 

#         # First fully connected layer with BatchNorm and ReLU
#         mlp_output = self.fc1_mlp(mlp_input)
#         # if self.training:
#         #     mlp_output = nn.BatchNorm1d(128)(mlp_output)
#         mlp_output = torch.relu(mlp_output)
#         mlp_output = nn.Dropout(dropout)(mlp_output)

#         # Second fully connected layer with BatchNorm and ReLU
#         mlp_output = self.fc2_mlp(mlp_output)
#         # if self.training:
#         #     mlp_output = nn.BatchNorm1d(64)(mlp_output)
#         mlp_output = torch.relu(mlp_output)
#         mlp_output = nn.Dropout(dropout)(mlp_output)

#         ##### Combine GMF, MLP
#         combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
#         combined_output = self.fc1_combined(combined_input)
#         # if self.training:
#         #     combined_output = nn.BatchNorm1d(128)(combined_output)
#         combined_output = torch.relu(combined_output)
#         combined_output = nn.Dropout(dropout)(combined_output)

#         ##### Scale wach_ratio to range [0, 5]
#         combined_output = self.fc2_combined(combined_output)
#         combined_output = torch.relu(combined_output)

#         return combined_output.squeeze()