In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
import warnings
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Load Data and Merge Features

In [None]:
# Shared encoder instance. It is re-fit (fit_transform) on every column it is
# applied to, so no mapping is carried over from one column to the next.
enc = LabelEncoder()

def merge_feacture_dataset(df:pd.DataFrame)->pd.DataFrame:
    """Left-join every song metadata table onto ``df``.

    Each metadata parquet file is read, de-duplicated on ``song_id``, has its
    id column(s) label-encoded with the module-level ``enc`` (missing values
    are replaced by a placeholder first), and is then left-merged onto ``df``.
    ``pd.merge`` is called without ``on=``, so the join uses whatever columns
    the two frames have in common (presumably only ``song_id`` -- verify
    against the parquet schemas).

    :param df: listening records; must contain a ``song_id`` column.
    :return: a new frame with the metadata columns appended; row count is
        unchanged.
    :raises AssertionError: if a merge changes the number of rows.
    """
    shape = df.shape[0]
    # (parquet path, {column to label-encode: placeholder used for missing values})
    metadata_tables = [
        ("../data/meta_song_composer.parquet",  {'composer_id': 'nan'}),
        ("../data/meta_song_genre.parquet",     {'genre_id': 'nan'}),
        ("../data/meta_song_lyricist.parquet",  {'lyricist_id': 'nan'}),
        ("../data/meta_song_producer.parquet",  {'producer_id': 'nan'}),
        ("../data/meta_song_titletext.parquet", {'title_text_id': 'nan'}),
        ("../data/meta_song.parquet",           {'artist_id': float(-1),
                                                 'album_id': float(-1),
                                                 'language_id': float(-1)}),
    ]
    for path, encoded_columns in metadata_tables:
        # One row per song, otherwise the left merge would duplicate listening records.
        meta = pd.read_parquet(path).drop_duplicates("song_id")
        for column, placeholder in encoded_columns.items():
            meta[column] = enc.fit_transform(meta[column].fillna(placeholder))
        df = pd.merge(df, meta, how='left')
        assert df.shape[0] == shape, (
            f"origin shape: {shape}, merge after shape: {df.shape[0]} (after merging {path})"
        )
    print(f"Merge finish!, now shape is : {df.shape}")
    return df

# Read each label table, order it by (session_id, listening_order) so every
# session's rows are chronological, then attach the song metadata.
train_source = merge_feacture_dataset(
    pd.read_parquet("../data/label_train_source.parquet")
    .sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
)
train_target = merge_feacture_dataset(
    pd.read_parquet("../data/label_train_target.parquet")
    .sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
)
test_source = merge_feacture_dataset(
    pd.read_parquet("../data/label_test_source.parquet")
    .sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
)

# Data Preprocessing

In [None]:
def get_song_ID_encode_dict(train_source:pd.DataFrame, train_target:pd.DataFrame, test_source:pd.DataFrame)->tuple:
    """Build the song-id vocabulary shared by all three splits.

    Index 0 is reserved for the ``"SOS"`` start token; real songs get indices
    1..N. The ids are sorted before numbering so the mapping is identical on
    every run -- iterating a ``set`` directly gives an order that can change
    between kernel sessions when the ids are strings, which would make a saved
    model's embedding rows meaningless after a restart.

    :param train_source: frame with a ``song_id`` column.
    :param train_target: frame with a ``song_id`` column.
    :param test_source: frame with a ``song_id`` column.
    :return: ``(ID_to_IDX, IDX_to_ID)`` -- the forward and inverse mappings.
    """
    unique_song_ids = set(train_source['song_id'].tolist() +
                          train_target['song_id'].tolist() +
                          test_source['song_id'].tolist())
    # key=str keeps the ordering deterministic without assuming the ids are
    # mutually comparable.
    ordered_song_ids = sorted(unique_song_ids, key=str)
    ID_to_IDX = {song_id: idx for idx, song_id in enumerate(ordered_song_ids, start=1)}
    ID_to_IDX["SOS"] = 0
    IDX_to_ID = {idx: song_id for song_id, idx in ID_to_IDX.items()}
    return ID_to_IDX, IDX_to_ID

def preprocessing(df:pd.DataFrame, label_encoder:"LabelEncoder", ID_to_IDX:dict)->pd.DataFrame:
    """Turn a merged listening table into purely numeric model features.

    ``df`` is modified in place and also returned. Every fill is written back
    with an explicit assignment: the chained ``df[col].fillna(..., inplace=True)``
    form operates on a temporary Series and leaves ``df`` untouched when pandas
    Copy-on-Write is active.

    NOTE(review): ``label_encoder`` is re-fit on each call, so the integer codes
    for ``play_status`` / ``login_type`` are only consistent across splits if
    every split contains the same set of values -- confirm against the data.

    :param df: merged frame containing the id, status, length, time and
        ``song_id`` columns referenced below.
    :param label_encoder: object exposing ``fit_transform``.
    :param ID_to_IDX: mapping from raw song id to vocabulary index.
    :return: the same ``df`` object, transformed.
    """
    # IDs Data: shift the encoded ids by one so that 0 is free to mark rows
    # whose song had no entry in the metadata table (NaN after the left merge).
    for col in ['composer_id', 'genre_id', 'lyricist_id', 'producer_id', 'title_text_id']:
        df[col] = (df[col] + 1).fillna(0)
    for col in ['play_status', 'login_type']:
        df[col] = label_encoder.fit_transform(df[col].fillna('nan'))
    # numerical data: unparsable lengths become NaN and are replaced by the mean
    song_length = pd.to_numeric(df['song_length'], errors='coerce')
    df['song_length'] = song_length.fillna(song_length.mean())
    # time data: split the unix timestamp (seconds) into calendar components
    played_at = pd.to_datetime(df['unix_played_at'], unit='s')
    df['unix_second'] = played_at.dt.second
    df['unix_minute'] = played_at.dt.minute
    df['unix_hour']   = played_at.dt.hour
    df['unix_month']  = played_at.dt.month
    df['unix_year']   = played_at.dt.year
    # album month: missing / unparsable dates are replaced by the mean date
    album_month = pd.to_datetime(df['album_month'], errors='coerce')
    album_month = album_month.fillna(album_month.mean())
    df['album_Month'] = album_month.dt.month
    df['album_Year']  = album_month.dt.year
    # the raw time columns are no longer needed once decomposed
    df.drop(columns=['unix_played_at', 'album_month'], inplace=True)
    # Song ID encode
    df['song_id'] = df['song_id'].map(ID_to_IDX)

    return df

# Build one song vocabulary over all splits, then preprocess every split with it
# (train_source first, then train_target, then test_source).
ID_to_IDX, IDX_to_ID = get_song_ID_encode_dict(train_source, train_target, test_source)
train_source, train_target, test_source = [
    preprocessing(split_df, enc, ID_to_IDX)
    for split_df in (train_source, train_target, test_source)
]

# Build sequential Data format

In [None]:
def build_sequential_data(df:pd.DataFrame, for_target:bool=False)->(list, list):
    """Group rows by session and emit one array per session.

    Rows of each session are ordered by ``listening_order``. For targets the
    array is that session's ``song_id`` sequence; otherwise it is the transposed
    table of all remaining columns (``session_id`` and ``listening_order``
    removed), i.e. one row per feature and one column per listening step.

    :param df: frame with ``session_id``, ``listening_order`` and ``song_id``.
    :param for_target: when True only the song-id sequence is kept.
    :return: ``(session_ids, arrays)`` as two parallel lists, in groupby order.
    """
    session_list, feature_list = [], []
    for session_id, session_rows in tqdm(df.groupby('session_id')):
        ordered_rows = session_rows.sort_values(by='listening_order')
        if for_target:
            sequence = ordered_rows['song_id'].to_numpy().T
        else:
            feature_rows = ordered_rows.drop(columns=['session_id', 'listening_order'])
            sequence = feature_rows.to_numpy().T
        session_list.append(session_id)
        feature_list.append(sequence)
    return session_list, feature_list

# Per-session feature matrices for the encoder inputs, and per-session song-id
# sequences for the training targets.
train_source_session_list, train_source_feature_list = build_sequential_data(
    df=train_source, for_target=False)
train_target_session_list, train_target_feature_list = build_sequential_data(
    df=train_target, for_target=True)
test_source_session_list, test_source_feature_list = build_sequential_data(
    df=test_source, for_target=False)

# Parameters

In [None]:
class HyperParameters:
    """Container for the notebook's tunable settings, with one getter per field."""

    def __init__(self, batch_size=512,
                 learning_rate=0.001,
                 epochs=10, # 1 for testing
                 hidden_size = 256,
                 embedding_dim = 256,
                 lstm_num_layers = 1,
                 train_size = 0.8,
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 save_dir = "../model/encoder-decoder/"
                 ):
        # optimisation settings
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        # model dimensions
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.lstm_num_layers = lstm_num_layers
        # fraction of the labelled sessions used for training (rest: validation)
        self.train_size = train_size
        # runtime / output locations
        self.device = device
        self.save_dir = save_dir

    def get_batch_size(self):
        """Return the mini-batch size."""
        return self.batch_size

    def get_learning_rate(self):
        """Return the optimizer learning rate."""
        return self.learning_rate

    def get_epochs(self):
        """Return the number of training epochs."""
        return self.epochs

    def get_hidden_size(self):
        """Return the LSTM hidden size."""
        return self.hidden_size

    def get_embedding_dim(self):
        """Return the song embedding dimension."""
        return self.embedding_dim

    def get_lstm_num_layers(self):
        """Return the number of stacked LSTM layers."""
        return self.lstm_num_layers

    def get_train_size(self):
        """Return the training fraction of the train/validation split."""
        return self.train_size

    def get_device(self):
        """Return the torch device tensors and the model are placed on."""
        return self.device

    def get_save_dir(self):
        """Return the directory model checkpoints are written to."""
        return self.save_dir


hyperparams = HyperParameters()

# Dataset and Dataloader

In [None]:
class RankingDatset(Dataset):
    """Per-session dataset of feature matrices and, optionally, target song ids.

    :param session_id: sequence of session identifiers.
    :param feature: sequence of per-session feature arrays (same order).
    :param label: sequence of per-session target song-id arrays, or None.
    :param train: when False the labels are discarded even if given.
    """
    def __init__(self, session_id, feature, label=None, train = True):
        self.session_id = session_id
        self.feature    = feature
        self.label      = label if train else None
    def __len__(self):
        return len(self.session_id)
    def __getitem__(self, idx):
        """Return one sample as a dict.

        The ``'label'`` key is only present when the dataset holds labels.
        Indexing ``self.label`` unconditionally raised ``TypeError`` for the
        unlabeled test dataset, and a ``None`` placeholder could not be batched
        by the default DataLoader collate function.
        """
        sample = {
            'session_id': self.session_id[idx],
            'feature': torch.tensor(self.feature[idx], dtype=torch.float),
        }
        if self.label is not None:
            sample['label'] = torch.tensor(self.label[idx], dtype=torch.long)
        return sample

In [None]:
# Features and labels are paired purely by list position, so both lists must
# describe the same sessions in the same order.
assert train_source_session_list == train_target_session_list, \
    "train source/target sessions are not aligned"
train_dataset = RankingDatset(train_source_session_list, train_source_feature_list, train_target_feature_list)
train_size = int(hyperparams.get_train_size() * len(train_dataset))
val_size = len(train_dataset) - train_size
# A seeded generator keeps the train/validation membership identical across
# kernel restarts, so validation losses from different runs are comparable.
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)
test_dataset  = RankingDatset(test_source_session_list, test_source_feature_list, train=False)

In [None]:
# Only the training loader reshuffles between epochs; validation and test keep
# dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=hyperparams.get_batch_size(), shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=hyperparams.get_batch_size(), shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=hyperparams.get_batch_size(), shuffle=False)

# Model

In [None]:
# Define the Encoder
class Encoder(nn.Module):
    """LSTM encoder over a session's per-step feature vectors.

    :param hidden_size: LSTM hidden size.
    :param num_feature: number of features per listening step (LSTM input size).
    :param num_layers: number of stacked LSTM layers.
    """
    def __init__(self, hidden_size, num_feature, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers  = num_layers
        self.lstm = nn.LSTM(input_size=num_feature,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True)

    def forward(self, feature):
        """Encode a batch of sessions.

        :param feature: tensor of shape (batch, num_feature, seq_length).
        :return: ``(out, hidden)`` where ``out`` has shape
            (batch, seq_length, hidden_size) and ``hidden`` is the LSTM's
            final ``(h_n, c_n)`` pair.
        """
        # The LSTM is batch_first, so move the feature axis last:
        # (batch, num_feature, seq_length) -> (batch, seq_length, num_feature)
        steps_first = feature.transpose(1, 2)
        out, hidden = self.lstm(steps_first)
        return out, hidden

# Define the Decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores every song in the vocabulary.

    :param num_songs: vocabulary size (embedding rows and output logits).
    :param embedding_dim: size of the song embedding.
    :param enc_hidden_size: hidden size of the encoder outputs.
    :param hidden_size: decoder LSTM hidden size.
    :param num_layers: number of stacked LSTM layers.
    :param source_len: number of encoder time steps that ``fc1`` collapses into
        one context vector. Defaults to 20, the value that used to be
        hard-coded; pass the real source length for other sequence lengths.
    """
    def __init__(self, num_songs, embedding_dim, enc_hidden_size, hidden_size, num_layers=1, source_len=20):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.source_len = source_len

        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+enc_hidden_size, hidden_size, num_layers, batch_first=True)
        # Learned weighted sum over the encoder time axis.
        self.fc1 = nn.Linear(source_len, 1)
        self.fc2 = nn.Linear(hidden_size, num_songs)

    def forward(self, decode_song_ids, last_hidden, encoder_hidden):
        """Run one decoding step.

        :param decode_song_ids: previous song index, shape (batch, 1, 1).
        :param last_hidden: ``(h, c)`` state fed to the decoder LSTM.
        :param encoder_hidden: encoder outputs, shape
            (batch, source_len, enc_hidden_size).
        :return: ``(logits, lstm_hidden)`` with ``logits`` of shape
            (batch, num_songs).
        """
        # (batch, 1, 1, embedding_dim) -> (batch, 1, embedding_dim)
        embedded = self.embedding(decode_song_ids).squeeze(1)
        # Collapse the time axis: (batch, enc_hidden_size, source_len) -> (batch, enc_hidden_size, 1)
        encoder_hidden = self.fc1(encoder_hidden.transpose(1,2))
        # (batch, 1, embedding_dim + enc_hidden_size)
        lstm_input = torch.cat((embedded, encoder_hidden.transpose(1,2)), dim=2)
        # Forward propagate LSTM
        out, lstm_hidden = self.lstm(lstm_input, last_hidden)
        out = self.fc2(out.squeeze(1)) # (batch, num_songs)
        return out, lstm_hidden

# Seq2Seq model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a fixed number of songs per session."""
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, features, target_song_ids=None, use_teacher_forcing=True, return_logits=False):
        """Decode one song per target position.

        :param features: encoder input batch; its device is used for the
            tensors created here.
        :param target_song_ids: ground-truth song indices, shape
            (batch, target_len), or None to decode 5 steps without targets.
        :param use_teacher_forcing: feed the ground-truth song of step ``t`` as
            the next decoder input. Ignored (greedy decoding is used) when
            ``target_song_ids`` is None, because there is nothing to feed.
        :param return_logits: when True return the raw scores of shape
            (batch, target_len, num_songs) on the model's device. These are
            differentiable, unlike the default argmax indices.
        :return: by default a CPU LongTensor (batch, target_len) with the
            highest-scoring song index per step.
        """
        encoder_output, hidden = self.encoder(features)
        batch_size = features.shape[0]
        device = features.device
        if target_song_ids is None:
            target_len = 5
            use_teacher_forcing = False
        else:
            target_song_ids = target_song_ids.squeeze(1)
            target_len = target_song_ids.size(-1)

        # SOS token (index 0) as the first decoder input, shape (batch, 1)
        decoder_input = torch.zeros(batch_size, 1, dtype=torch.long, device=device)
        step_logits, step_predictions = [], []
        for t in range(target_len):
            decoder_output, hidden = self.decoder(decoder_input.unsqueeze(1), hidden, encoder_output)
            # keepdim avoids the squeeze/unsqueeze round trip that failed for batch size 1
            predicted_ids = decoder_output.argmax(dim=1, keepdim=True)
            if return_logits:
                step_logits.append(decoder_output)
            else:
                # Keep only the winning index per step; stacking the full
                # (batch, target_len, num_songs) score tensor is very large
                # for a big vocabulary and is only needed for return_logits.
                step_predictions.append(predicted_ids)
            if use_teacher_forcing:
                decoder_input = target_song_ids[:, t].unsqueeze(1)
            else:
                decoder_input = predicted_ids

        if return_logits:
            if not step_logits:
                return torch.zeros(batch_size, 0, self.decoder.fc2.out_features, device=device)
            return torch.stack(step_logits, dim=1)
        if not step_predictions:
            return torch.zeros(batch_size, 0, dtype=torch.long)
        # Predictions are handed back on the CPU, as before.
        return torch.cat(step_predictions, dim=1).cpu()

In [None]:
# The encoder input size is the number of feature rows of one sample; encoder
# and decoder share the same hidden size.
encoder = Encoder(hidden_size=hyperparams.get_hidden_size(),
                  num_feature=train_dataset[0]['feature'].shape[0],
                  num_layers=hyperparams.get_lstm_num_layers())
decoder = Decoder(num_songs=len(ID_to_IDX),
                  embedding_dim=hyperparams.get_embedding_dim(),
                  enc_hidden_size=hyperparams.get_hidden_size(),
                  hidden_size=hyperparams.get_hidden_size(),
                  num_layers=hyperparams.get_lstm_num_layers())
model = Seq2Seq(encoder, decoder).to(hyperparams.get_device())

In [None]:
def testing_model(model=model, train_dataloader=train_dataloader, inference=False):
    """Smoke-test the model's forward pass on the last batch of a loader.

    The last batch is used because it is the one that may be smaller than the
    configured batch size.

    :param model: model to call.
    :param train_dataloader: loader yielding dicts with 'feature' and 'label'.
    :param inference: when True decode without targets or teacher forcing.
    :return: ``(output.shape, output)``.
    :raises ValueError: if the loader yields no batches.
    """
    # Walk the loader to its final batch.
    batch_sample = None
    for batch_sample in train_dataloader:
        pass
    if batch_sample is None:
        raise ValueError("train_dataloader yielded no batches")
    features = batch_sample['feature'].to(hyperparams.get_device())
    print("features: ", features.shape)
    target   = batch_sample['label'].to(hyperparams.get_device())
    print("target: ", target.shape)
    # Gradients are not needed for a shape check.
    with torch.no_grad():
        if inference:
            output = model(features, None, False)
        else:
            output = model(features, target)
        return output.shape, output
testing_model(inference=True)

# Loss function, Optimizer, Scheduler

In [None]:
class listNetLoss(torch.nn.Module):
    """ListNet-style cross entropy between softmax distributions of two score lists.

    :param eps: added to the predicted softmax before the log for stability.
    :param padded_value_indicator: stored for API compatibility; the forward
        pass below does not apply any padding mask.
    """
    def __init__(self, eps=1e-10, padded_value_indicator=-1):
        super(listNetLoss, self).__init__()
        self.eps = eps
        self.padded_value_indicator = padded_value_indicator
        self.softmax = torch.nn.Softmax(dim=1)
    def forward(self, y_pred, y_true):
        """
        ListNet loss introduced in "Learning to Rank: From Pairwise Approach to Listwise Approach".

        Both inputs are sorted along dim 1 before the softmax, so positions are
        compared by rank within the row rather than by original slot.

        :param y_pred: predictions from the model, shape [batch_size, slate_length]
        :param y_true: ground truth labels, shape [batch_size, slate_length]
        :return: loss value, a torch.Tensor
        """
        y_pred = y_pred.float()
        y_true = y_true.float()
        # Only fabricate a gradient-requiring leaf when the input carries no
        # graph (e.g. integer indices), so that loss.backward() still runs.
        # Detaching unconditionally cut differentiable predictions off from
        # the model, so nothing upstream ever received gradients.
        if not y_pred.requires_grad:
            y_pred = y_pred.detach().requires_grad_(True)
        y_pred,_ = torch.sort(y_pred, dim=1)
        y_true,_ = torch.sort(y_true, dim=1)
        preds_smax = self.softmax(y_pred) + self.eps
        true_smax  = self.softmax(y_true)
        preds_log = torch.log(preds_smax)
        return torch.mean(-torch.sum(true_smax * preds_log, dim=1))
    
def test_Loss(loss_fn=listNetLoss()):
    """Smoke-test ``loss_fn`` on a fixed 16x6 batch of song-index rows.

    Column 0 of both tensors is dropped before the loss is computed; the loss
    is back-propagated once and its value printed.
    """
    # Example predicted rows
    predicted_rows = torch.tensor(
        [[     0, 627674, 217020, 131695, 131695, 131695],
         [     0,  43503, 502994, 472149, 639739, 585053],
         [     0,  43503, 169674, 169674, 217020, 585053],
         [     0, 696212, 231735, 231735, 272798, 272798],
         [     0,  43503, 512256, 512256, 667592, 137733],
         [     0,  43503, 231735, 169674, 169674, 169674],
         [     0,  43503, 144857, 667592, 667592, 137733],
         [     0,  43503, 217020, 585053, 667592, 585053],
         [     0,  43503, 231735, 217020, 667592, 634661],
         [     0,  43503, 144857, 355345, 137733, 137733],
         [     0,  43503, 169674, 169674, 169674, 169674],
         [     0,  43503, 231735, 512256, 639739, 169674],
         [     0, 696212, 231735, 231735, 231735, 231232],
         [     0,  43503, 231735, 217020, 169674, 639739],
         [     0,  43503, 562567, 169674, 634661, 634661],
         [     0,  43503, 217020, 667592, 667592, 634661]],
        dtype=torch.float)
    # Example ground-truth rows (a few entries differ from the predictions)
    true_rows = torch.tensor(
        [[     0, 627674, 217020, 131695, 131695, 131695],
         [     0,  43503, 502994, 472149, 639739, 585053],
         [     0,  43503, 169674, 169674, 217020, 585053],
         [     0, 696212, 231735, 231735, 272798, 272798],
         [     0,  43503, 512256, 512256, 667592, 137733],
         [     0,  43503, 2, 3, 169674, 169674],
         [     0,  43503, 144857, 667592, 667592, 137733],
         [     0,  43503, 217020, 585053, 667592, 585053],
         [     0,  43503, 231735, 217020, 667592, 634661],
         [     0,  43503, 144857, 355345, 137733, 137733],
         [     0,  43503, 169674, 169674, 5, 169674],
         [     0,  43503, 231735, 512256, 639739, 169674],
         [     0, 696212, 231735, 4, 231735, 231232],
         [     0,  43503, 231735, 217020, 169674, 639739],
         [     0,  43503, 562567, 169674, 634661, 634661],
         [     0,  43503, 217020, 667592, 667592, 634661]],
        dtype=torch.float)
    # Make the predictions a gradient-requiring leaf so backward() can run.
    trainable_predictions = predicted_rows[:, 1:].detach().requires_grad_(True)
    loss = loss_fn(trainable_predictions, true_rows[:, 1:])
    loss.backward()
    print(f"Loss: {loss.item()}")

test_Loss()

In [None]:
loss_fn = listNetLoss()
# loss_fn = nn.MSELoss()
# loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams.get_learning_rate())
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = hyperparams.get_epochs() * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Training

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the summed batch loss.

    Uses the module-level ``loss_fn``.

    NOTE(review): ``model(features, target)`` returns argmax song indices,
    which carry no gradient, so the loss computed from them cannot propagate
    back to the model parameters. Confirm whether the model should expose its
    logits to the loss instead.

    :param model: Seq2Seq model called as ``model(features, target)``.
    :param data_loader: loader yielding dicts with 'feature' and 'label'.
    :param optimizer: optimizer stepped once per batch.
    :param scheduler: LR scheduler stepped once per batch.
    :param device: device the batch tensors are moved to.
    :return: sum of ``loss.item()`` over all batches.
    """
    model.train()
    total_loss = 0.0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        features = batch['feature'].to(device)
        target   = batch['label'].squeeze(1).to(device)
        outputs = model(features, target)
        # The model emits exactly one prediction per target position (there is
        # no leading SOS column), so every position is scored. Slicing with
        # [:, 1:] discarded the first real prediction.
        loss = loss_fn(outputs.cpu(), target.cpu())
        total_loss+=loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss

def evaluate(model, data_loader, device):
    """Compute the summed batch loss over a loader without updating the model.

    Uses the module-level ``loss_fn``.

    :param model: Seq2Seq model called as ``model(features, target)``.
    :param data_loader: loader yielding dicts with 'feature' and 'label'.
    :param device: device the batch tensors are moved to.
    :return: sum of ``loss.item()`` over all batches.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            features = batch['feature'].to(device)
            target   = batch['label'].squeeze(1).to(device)
            outputs = model(features, target)
            # The model emits one prediction per target position (no leading
            # SOS column), so all positions are scored. Slicing with [:, 1:]
            # discarded the first real prediction.
            loss = loss_fn(outputs.cpu(), target.cpu())
            total_loss+=loss.item()
    return total_loss

def training_Process():
    """Train for the configured number of epochs, validating after each one.

    Relies on the module-level ``model``, loaders, ``optimizer``, ``scheduler``
    and ``hyperparams``.

    :return: ``(train_losses, valid_losses)``, one entry per epoch.
    """
    history = {"train": [], "valid": []}
    print("\nStart Training:")
    for epoch_idx in range(hyperparams.get_epochs()):
        print(f"Epoch {epoch_idx + 1}/{hyperparams.get_epochs()}")
        epoch_train_loss = train(model, train_dataloader, optimizer, scheduler, hyperparams.get_device())
        epoch_valid_loss = evaluate(model, val_dataloader, hyperparams.get_device())
        history["train"].append(epoch_train_loss)
        history["valid"].append(epoch_valid_loss)
        print(f"Training loss   : {epoch_train_loss:.4f}")
        print(f"Validation loss : {epoch_valid_loss:.4f}")
    return history["train"], history["valid"]

train_losses, valid_losses = training_Process()

In [None]:
def save_model_and_loss(model, folder_path, valid_losses):
    """Save the model's state dict under a name containing the mean validation loss.

    The file is written to ``folder_path/model_loss_<mean>.pth`` with the mean
    formatted to four decimals (``nan`` when ``valid_losses`` is empty).

    :param model: module whose ``state_dict()`` is saved.
    :param folder_path: target directory; created if missing.
    :param valid_losses: sequence of validation losses.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(folder_path, exist_ok=True)
    # An empty history would otherwise raise ZeroDivisionError before anything is saved.
    if len(valid_losses) > 0:
        mean_valid_loss = sum(valid_losses) / len(valid_losses)
    else:
        mean_valid_loss = float('nan')
    torch.save(model.state_dict(), os.path.join(folder_path, f'model_loss_{mean_valid_loss:.4f}.pth'))

def load_model(model, model_path):
    """Load a saved state dict into ``model`` and switch it to eval mode.

    :param model: module with the same architecture as the saved one.
    :param model_path: path of a file written with ``torch.save(state_dict)``.
    :return: the same ``model`` object, in evaluation mode.
    """
    # Deserialize onto the CPU so a checkpoint saved on a GPU also loads on a
    # CPU-only machine; load_state_dict then copies the values into the
    # model's existing parameters on whatever device they live on.
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    # Set the model to evaluation mode after loading
    model.eval()
    return model

# Persist the trained weights; the file name carries the mean validation loss.
save_model_and_loss(model=model,
                    folder_path=hyperparams.get_save_dir(),
                    valid_losses=valid_losses)

In [None]:
 # Plotting
# One point per epoch for each curve.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_losses, label='Training Loss')
ax.plot(valid_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Losses')
ax.legend()
plt.show()

# Generate Submission

In [None]:
def generate_prediction(model, test_dataloader, device, batch=False):
    """Predict songs for the sessions of a test loader.

    Defects fixed in the original implementation:
    * the full-dataset path passed ``prediction=True``, a keyword the model's
      ``forward`` does not accept;
    * ``outputs[:, 1:]`` dropped the first predicted song, although the model
      emits one column per prediction and no leading SOS column;
    * results were written into buffers sized ``len(loader) * batch_size``,
      which breaks on a smaller last batch and leaves zero padding rows.

    :param model: model called as
        ``model(features, target_song_ids=None, use_teacher_forcing=False)``.
    :param test_dataloader: loader yielding dicts with 'session_id' and 'feature'.
    :param device: device the features are moved to.
    :param batch: when True only the first batch is predicted (quick check).
    :return: ``(session_ids, predictions)``; predictions is a CPU tensor with
        one row per session and one column per predicted song.
    """
    def predict(samples):
        features = samples['feature'].to(device)
        outputs = model(features, target_song_ids=None, use_teacher_forcing=False)
        return samples['session_id'], outputs.detach().cpu()

    model.eval()
    with torch.no_grad():
        if batch:
            return predict(next(iter(test_dataloader)))

        collected_ids, collected_outputs = [], []
        for samples in tqdm(test_dataloader):
            session_id, outputs = predict(samples)
            # assumes session ids are numeric so they collate to a tensor -- TODO confirm
            collected_ids.append(torch.as_tensor(session_id).cpu())
            collected_outputs.append(outputs)
    if not collected_outputs:
        return torch.empty(0, dtype=torch.long), torch.empty(0, 5, dtype=torch.long)
    return torch.cat(collected_ids), torch.cat(collected_outputs)

# NOTE(review): batch=True predicts only the first batch of the test loader, so
# the file written below covers at most one batch of sessions -- presumably a
# smoke-test setting; confirm before submitting. The frame below also expects
# five prediction columns per session.
total_session_ids, total_outputs = generate_prediction(model, test_dataloader, hyperparams.get_device(), batch=True)

# One column per rank: top1 ... top5.
final_submittion_df = pd.DataFrame({
    "session_id": total_session_ids.numpy(),
    **{f"top{rank}": total_outputs[:, rank - 1].numpy() for rank in range(1, 6)},
})
# Translate vocabulary indices back to the original song ids.
for rank_column in [f"top{rank}" for rank in range(1, 6)]:
    final_submittion_df[rank_column] = final_submittion_df[rank_column].map(IDX_to_ID)
# save the final submission under a timestamped file name
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M")
save_folder = "../submission/"
file_name = f'{current_time}.csv'
final_submittion_df.to_csv(save_folder+file_name, index=False)