# Libraries

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Hyper-Parameters

In [2]:
class HyperParameters:
    def __init__(self, batch_size=16, 
                 learning_rate=0.0001, 
                 epochs=10,
                 hidden_size = 256,
                 embedding_dim = 128,
                 lstm_num_layers = 3,
                 train_size = 0.8,
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                 ):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.lstm_num_layers = lstm_num_layers
        self.train_size = train_size
        self.device = device

    def get_batch_size(self):
        return self.batch_size

    def get_learning_rate(self):
        return self.learning_rate

    def get_epochs(self):
        return self.epochs

    def get_hidden_size(self):
        return self.hidden_size
    
    def get_embedding_dim(self):
        return self.embedding_dim
    
    def get_lstm_num_layers(self):
        return self.lstm_num_layers
    
    def get_train_size(self):
        return self.train_size
    
    def get_device(self):
        return self.device
    
hyperparams = HyperParameters()

# Load Data

In [3]:
# Read the Parquet file and drop duplicates based on "song_id" column
df = pd.read_parquet("../data/meta_song.parquet").drop_duplicates("song_id")
# Count the number of missing values (NaNs) in the DataFrame
na_count = df.isna().sum()
print("Number of missing values (NaNs) in the dataset: meta_song.parquet")
print(na_count)

Number of missing values (NaNs) in the dataset: meta_song.parquet
song_id             0
artist_id      128866
song_length    128866
album_id       323497
language_id    323497
album_month    323523
dtype: int64


In [4]:
def merge_feacture_dataset(df:pd.DataFrame)->pd.DataFrame:
    shape = df.shape[0]
    df1 = pd.read_parquet("../data/meta_song_composer.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df1['song_id'].unique())))
    df = pd.merge(df, df1, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df2 = pd.read_parquet("../data/meta_song_genre.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df2['song_id'].unique())))
    df = pd.merge(df, df2, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df3 = pd.read_parquet("../data/meta_song_lyricist.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df3['song_id'].unique())))
    df = pd.merge(df, df3, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df4 = pd.read_parquet("../data/meta_song_producer.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df4['song_id'].unique())))
    df = pd.merge(df, df4, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df5 = pd.read_parquet("../data/meta_song_titletext.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df5['song_id'].unique())))
    df = pd.merge(df, df5, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df6 = pd.read_parquet("../data/meta_song.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df6['song_id'].unique())))
    df = pd.merge(df, df6, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    print(f"Merge finish!, now shape is : {df.shape}")
    return df

train_source = pd.read_parquet("../data/label_train_source.parquet")
train_target = pd.read_parquet("../data/label_train_target.parquet")
test_source  = pd.read_parquet("../data/label_test_source.parquet")

train_source = train_source.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
train_target = train_target.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
test_source  = test_source.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])

#train_source = merge_feacture_dataset(train_source)
#train_target = merge_feacture_dataset(train_target)
#test_source  = merge_feacture_dataset(test_source)

# Encode the feactures

In [5]:
train_source.head(20)

Unnamed: 0,session_id,song_id,unix_played_at,play_status,login_type,listening_order
10952316,1,f6f06a71bb8bc38af6c0b7dae9cab00d,1660012505,0,7,1
10952317,1,7b48a87effd31c9c07b68ed212062854,1660012730,0,7,2
10952318,1,61c46d6401aab1dde7c7de23dc55c037,1660015113,0,7,3
10952319,1,7e54c9199aad70e35fe256d23701bad0,1660015289,0,7,4
10952320,1,6178580fa01b62e9b52787902c0d8ae6,1660015841,0,7,5
10952321,1,ab694649c65477d0bc574bf391a3f4a0,1660015842,0,7,6
10952322,1,5b3387fa195672dcfe979d17e4a62c9e,1660015846,0,7,7
10952323,1,2790c612d8d301e2f35550c75aea8c75,1660015846,0,7,8
10952324,1,d36c6cf30154e18e6c972704206d6b1e,1660015848,0,7,9
10952325,1,1cbcc681ecf7acef4948bff2eb8e39d7,1660015850,0,7,10


In [6]:
def fill_NaNs(df:pd.DataFrame, numerical_columns:list=None, string_columns:list=None)->pd.DataFrame:
    for column in numerical_columns:
        df[column].fillna(df[column].mean(), inplace=True)
    for column in string_columns:
        df[column].fillna(0)
    return df

def encode_unix_time(df:pd.DataFrame, sin_cos = False):
    # Convert 'unix_played_at' to a datetime column
    df['played_at_datetime'] = pd.to_datetime(df['unix_played_at'], unit='s')
    if sin_cos:
        df['hour_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.hour / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.hour / 24)
        df['minute_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.minute / 60)
        df['minute_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.minute / 60)
        df['second_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.second / 60)
        df['second_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.second / 60)
        df['month_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.month / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.month / 12)
        df['year_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.year / 2023)
        df['year_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.year / 2023)
    else:
        df['hour_of_day'] = df['played_at_datetime'].dt.hour / 24
        df['minute_of_hour'] = df['played_at_datetime'].dt.minute / 60
        df['second_of_minute'] = df['played_at_datetime'].dt.second / 60
        df['month'] = df['played_at_datetime'].dt.month / 12
        df['year'] = df['played_at_datetime'].dt.year / 2023
    # Drop the specified columns from the DataFrame
    df.drop(columns=['unix_played_at', 'played_at_datetime'], inplace=True)
    return df

def get_song_ID_encode_dict(train:pd.DataFrame, test:pd.DataFrame)->dict:
    unique_song_ids = set(train['song_id'].tolist()+test['song_id'].tolist())
    ID_IDX = {song_id:i+1 for i,song_id in enumerate(unique_song_ids)}
    ID_IDX["SOS"]=0
    return ID_IDX

def encode_song_id(source_df:pd.DataFrame, target_df:pd.DataFrame, id2idx:dict):
    source_df['song_id'] = source_df['song_id'].map(id2idx)
    target_df['song_id'] = target_df['song_id'].map(id2idx)
    return source_df, target_df

In [7]:
ID_IDX = get_song_ID_encode_dict(train_source, test_source)
train_source, train_target = encode_song_id(train_source, train_target, ID_IDX)
train_source = encode_unix_time(train_source)
train_target = encode_unix_time(train_target)
test_source  = encode_unix_time(test_source)

In [8]:
def convert_per_N(df:pd.DataFrame, n:int, label = False):
    data = []
    pre_session_id = int(df['session_id'].iloc[0])
    if label:
        row = [0]
        for i in tqdm(range(df.shape[0])):
            # next session id
            if pre_session_id != int(df['session_id'].iloc[i]):
                data.append((pre_session_id, np.array(row).reshape(-1, 6)))
                pre_session_id, row = int(df['session_id'].iloc[i]), [0]
            # append 5 values
            row.append(df['song_id'].iloc[i])
        # append last session id
        data.append((df['session_id'].iloc[-1], np.array(row).reshape(-1, 6))) #last one
    else:
        row = []
        song_id = []
        for i in tqdm(range(df.shape[0])):
            # next session id
            if pre_session_id != int(df['session_id'].iloc[i]):
                data.append((pre_session_id, 
                             np.array(row).reshape(-1, 20), 
                             np.array(song_id).reshape(-1, 20)))
                pre_session_id, row,song_id = int(df['session_id'].iloc[i]), [], []
            # append 20 values
            song_id.append(df['song_id'].iloc[i])
            row.append([df['play_status'].iloc[i],
                        df['login_type'].iloc[i],
                        df['second_of_minute'].iloc[i],
                        df['minute_of_hour'].iloc[i],
                        df['hour_of_day'].iloc[i],
                        df['month'].iloc[i], 
                        df['year'].iloc[i]])
        # append last session id
        data.append((df['session_id'].iloc[-1], 
                     np.array(row).reshape(-1, 20),
                     np.array(song_id).reshape(-1, 20)))
    return data

train_source_data  = convert_per_N(train_source, 20)
train_source_label = convert_per_N(train_target, 5, label=True)
test_source_data   = convert_per_N(test_source, 20)

  0%|          | 0/11445180 [00:00<?, ?it/s]

100%|██████████| 11445180/11445180 [08:30<00:00, 22437.65it/s]
100%|██████████| 2861295/2861295 [00:32<00:00, 88638.59it/s]
100%|██████████| 2861280/2861280 [02:09<00:00, 22095.43it/s]


# convert to dataloader

In [9]:
class RankingDatset(Dataset):
    def __init__(self, data, label=None, train = True):
        self.session_id = [session_id for session_id,_,_ in data]
        self.feature    = [feature    for _,feature,_    in data]
        self.song_id    = [song_id    for _,_,song_id    in data]
        if train:
            self.label  = [label for _,label in label]
        else:
            self.label  = [0 for _ in data]
    def __len__(self):
        return len(self.session_id)
    def __getitem__(self, idx):
        session_id = self.session_id[idx]
        feature = torch.tensor(self.feature[idx], dtype=torch.long)
        song_id = torch.tensor(self.song_id[idx], dtype=torch.long)
        label = torch.tensor(self.label[idx], dtype=torch.long)
        return {'session_id': session_id, 'feature': feature, 'song_id': song_id, 'label': label}

In [10]:
train_dataset = RankingDatset(train_source_data, train_source_label)
train_size = int(hyperparams.get_train_size() * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])
test_dataset  = RankingDatset(test_source_data, train=False)
train_dataset[0]

{'session_id': 47890,
 'feature': tensor([[ 1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,
           0,  0],
         [ 0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,
           0,  0],
         [ 0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,
           0,  0],
         [ 0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1,
          49,  0],
         [ 0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,
           1, 49],
         [ 0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,
           0,  1],
         [49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,  0,  0,  1, 49,  0,  0,  0,
           0,  0]]),
 'song_id': tensor([[657176, 149727, 494131,  54032, 361890, 625642,  82272, 517085, 390513,
          662164, 452647, 637402, 286463, 172938, 118541, 428527, 478109,  41996,
          612006, 705332]]),
 'label': tensor([[     0, 551608, 461066, 544642,  57

# Convert to Dataloader with batch size

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=hyperparams.get_batch_size(), shuffle=True)
val_dataloader   = DataLoader(val_dataset, batch_size=hyperparams.get_batch_size())
test_dataloader  = DataLoader(test_dataset, batch_size=hyperparams.get_batch_size())

# Model

In [12]:
# Define the Encoder
class Encoder(nn.Module):
    def __init__(self, batch_size, num_songs, embedding_dim, hidden_size, num_feature, num_layers=1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+num_feature, hidden_size, num_layers, batch_first=True)

    def forward(self, song_ids, feature):
        embedded = self.embedding(song_ids) .squeeze(1)
        # input feature cat with song embed
        lstm_input = torch.cat((embedded, feature.view(self.batch_size,20,-1)), dim=2)  # Concatenate along the feature dimension
        # Forward propagate LSTM
        out, hidden = self.lstm(lstm_input)  # out: tensor of shape (batch_size, seq_length, hidden_size)
        return out, hidden

# Define the Decoder
class Decoder(nn.Module):
    def __init__(self, batch_size, num_songs, embedding_dim, enc_hidden_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+enc_hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(20, 1)
        self.fc2 = nn.Linear(hidden_size, num_songs)

    def forward(self, decode_song_ids, last_hidden, encoder_hidden):
        # Forward propagate LSTM
        embedded = self.embedding(decode_song_ids).squeeze(1) # torch.Size([64, 1, 128])
        encoder_hidden = self.fc1(encoder_hidden.reshape(self.batch_size,-1,20)) # torch.Size([64, 256, 1])
        lstm_input = torch.cat((embedded, encoder_hidden.view(self.batch_size,1,-1)), dim=2) # torch.Size([64, 1, 386])
        out, lstm_hidden = self.lstm(lstm_input, last_hidden)
        out = self.fc2(out.squeeze(1)) # torch.Size([64, 716557])
        return out, lstm_hidden

# Seq2Seq model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, song_ids, features, target_song_ids, use_teacher_forcing=True):
        encoder_output, hidden = self.encoder(song_ids, features)
        target_song_ids = target_song_ids.squeeze(1) # torch.Size([64, 6])
        batch_size = target_song_ids.size(0)  # 64
        target_len = target_song_ids.size(-1) # 6
        target_song_size = self.decoder.fc2.out_features
        
        outputs = torch.zeros(batch_size, target_len, target_song_size) # torch.Size([64, 6, 716557])
        decoder_input = target_song_ids[:, 0].unsqueeze(1)  # SOS token as the first input, torch.Size([64, 1])
        for t in range(1, target_len):
            decoder_output, hidden = self.decoder(decoder_input.unsqueeze(1), hidden, encoder_output)
            outputs[:, t, :] = decoder_output
            # using teacher forcing
            if use_teacher_forcing:
                decoder_input = target_song_ids[:, t].unsqueeze(1)
            else:
                decoder_input = torch.argmax(decoder_output.squeeze(0),dim = 1).unsqueeze(1)
        return torch.argmax(outputs, dim = 2)

In [13]:
encoder = Encoder(hyperparams.get_batch_size(), 
                  len(list(ID_IDX.values())), 
                  hyperparams.get_embedding_dim(), 
                  hyperparams.get_hidden_size(), 
                  train_dataset[0]['feature'].shape[0], 
                  hyperparams.get_lstm_num_layers()
                  )
decoder = Decoder(hyperparams.get_batch_size(),
                  len(list(ID_IDX.values())), 
                  hyperparams.get_embedding_dim(), 
                  hyperparams.get_hidden_size(), 
                  hyperparams.get_hidden_size(), 
                  hyperparams.get_lstm_num_layers()
                  )
model = Seq2Seq(encoder, decoder).to(hyperparams.get_device())

In [14]:
def testing_model(model=model, train_dataloader=train_dataloader):
    batch_sample = next(iter(train_dataloader))
    song_ids = batch_sample['song_id'].to(hyperparams.get_device())
    features = batch_sample['feature'].to(hyperparams.get_device())
    target   = batch_sample['label'].to(hyperparams.get_device())
    # Set the model to evaluation mode
    with torch.no_grad():
        output = model(song_ids, features, target)
        return output.shape, output
testing_model()

(torch.Size([16, 6]),
 tensor([[     0, 627674, 217020, 131695, 131695, 131695],
         [     0,  43503, 502994, 472149, 639739, 585053],
         [     0,  43503, 169674, 169674, 217020, 585053],
         [     0, 696212, 231735, 231735, 272798, 272798],
         [     0,  43503, 512256, 512256, 667592, 137733],
         [     0,  43503, 231735, 169674, 169674, 169674],
         [     0,  43503, 144857, 667592, 667592, 137733],
         [     0,  43503, 217020, 585053, 667592, 585053],
         [     0,  43503, 231735, 217020, 667592, 634661],
         [     0,  43503, 144857, 355345, 137733, 137733],
         [     0,  43503, 169674, 169674, 169674, 169674],
         [     0,  43503, 231735, 512256, 639739, 169674],
         [     0, 696212, 231735, 231735, 231735, 231232],
         [     0,  43503, 231735, 217020, 169674, 639739],
         [     0,  43503, 562567, 169674, 634661, 634661],
         [     0,  43503, 217020, 667592, 667592, 634661]]))

# Loss function, Optimizer

In [15]:
class NDCGLoss(torch.nn.Module):
    def __init__(self):
        super(NDCGLoss, self).__init__()

    def forward(self, predictions, labels):
        # Calculate nDCG loss
        # Compare predictions and labels element-wise
        gain = (predictions != labels).float()
        weightage = torch.tensor([1.0, 0.63, 0.5, 0.43, 0.38]).float()
        nDCG = torch.dot(gain, weightage)
        return nDCG

def test_NDCGLoss():
    # Example usage:
    predictions = torch.tensor([0, 5, 3, 5, 2, 1])  # Example predicted scores
    labels      = torch.tensor([0, 5, 4, 5, 2, 1])  # Example true relevance scores 
    # Define nDCG loss criterion
    criterion = NDCGLoss()  
    # Calculate nDCG loss
    loss = criterion(predictions[1:], labels[1:])
    print(f"nDCG Loss: {loss.item()}")
test_NDCGLoss()

nDCG Loss: 0.6299999952316284


In [16]:
# compute coverage
# torch.unique(torch.tensor([[0, 1, 2, 3, 4, 5],[6, 7, 8, 9, 10, 11]]))

In [17]:
loss_fn = NDCGLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams.get_learning_rate())
total_steps = len(train_dataloader) * hyperparams.get_epochs()
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training

In [19]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0.0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        song_ids = batch['song_id'].to(device)
        features = batch['feature'].to(device)
        target   = batch['label'].to(device)
        outputs = model(song_ids, features, target)
        loss = loss_fn(outputs[1:].cpu(), target[1:].cpu())
        total_loss+=loss.items()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss

def evaluate(model, data_loader, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            song_ids = batch['song_id'].to(device)
            features = batch['feature'].to(device)
            target   = batch['label'].to(device)
            outputs = model(song_ids, features, target)
            loss = loss_fn(outputs[1:], target[1:])
            total_loss+=loss.items()
    return total_loss

for epoch in range(hyperparams.get_epochs()):
        print(f"Epoch {epoch + 1}/{hyperparams.get_epochs()}")
        train_loss = train(model, train_dataloader, optimizer, scheduler, hyperparams.get_device())
        valid_loss = evaluate(model, val_dataloader, hyperparams.get_device())
        print(f"Training loss: {train_loss:.4f}")
        print(f"Validation loss: {valid_loss:.4f}")

Epoch 1/10


  0%|          | 0/28613 [00:00<?, ?it/s]


RuntimeError: 1D tensors expected, but got 3D and 1D tensors

# Predict tesing Result

# Generate Submittion