In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
# Read the Parquet file and drop duplicates based on "song_id" column
df = pd.read_parquet("../data/meta_song.parquet").drop_duplicates("song_id")
# Count the number of missing values (NaNs) in the DataFrame
na_count = df.isna().sum()
print("Number of missing values (NaNs) in the dataset: meta_song.parquet")
print(na_count)

Number of missing values (NaNs) in the dataset: meta_song.parquet
song_id             0
artist_id      128866
song_length    128866
album_id       323497
language_id    323497
album_month    323523
dtype: int64


# Load Data

In [3]:
train_source = pd.read_parquet("../data/label_train_source.parquet")
train_target = pd.read_parquet("../data/label_train_target.parquet")
test_source  = pd.read_parquet("../data/label_test_source.parquet")

train_source = train_source.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
train_target = train_target.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])
test_source  = test_source.sort_values(by=['session_id', 'listening_order'], ascending=[True, True])

def merge_feacture_dataset(df:pd.DataFrame)->pd.DataFrame:
    shape = df.shape[0]
    # df1 = pd.read_parquet("../data/meta_song_composer.parquet").drop_duplicates("song_id")
    # print(len(set(df['song_id'].unique())-set(df1['song_id'].unique())))
    # df = pd.merge(df, df1, how='left') 
    # assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    # df2 = pd.read_parquet("../data/meta_song_genre.parquet").drop_duplicates("song_id")
    # print(len(set(df['song_id'].unique())-set(df2['song_id'].unique())))
    # df = pd.merge(df, df2, how='left') 
    # assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    # df3 = pd.read_parquet("../data/meta_song_lyricist.parquet").drop_duplicates("song_id")
    # print(len(set(df['song_id'].unique())-set(df3['song_id'].unique())))
    # df = pd.merge(df, df3, how='left') 
    # assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    # df4 = pd.read_parquet("../data/meta_song_producer.parquet").drop_duplicates("song_id")
    # print(len(set(df['song_id'].unique())-set(df4['song_id'].unique())))
    # df = pd.merge(df, df4, how='left') 
    # assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    # df5 = pd.read_parquet("../data/meta_song_titletext.parquet").drop_duplicates("song_id")
    # print(len(set(df['song_id'].unique())-set(df5['song_id'].unique())))
    # df = pd.merge(df, df5, how='left') 
    # assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    df6 = pd.read_parquet("../data/meta_song.parquet").drop_duplicates("song_id")
    print(len(set(df['song_id'].unique())-set(df6['song_id'].unique())))
    df = pd.merge(df, df6, how='left') 
    assert df.shape[0] == shape, f"origin shape: {shape}, merge after shape: {df.shape[0]}"
    print(f"Merge finish!, now shape is : {df.shape}")
    return df

#train_source = merge_feacture_dataset(train_source)
#train_target = merge_feacture_dataset(train_target)
#test_source  = merge_feacture_dataset(test_source)

# Encode the feactures

In [4]:
train_source.head(20)

Unnamed: 0,session_id,song_id,unix_played_at,play_status,login_type,listening_order
10952316,1,f6f06a71bb8bc38af6c0b7dae9cab00d,1660012505,0,7,1
10952317,1,7b48a87effd31c9c07b68ed212062854,1660012730,0,7,2
10952318,1,61c46d6401aab1dde7c7de23dc55c037,1660015113,0,7,3
10952319,1,7e54c9199aad70e35fe256d23701bad0,1660015289,0,7,4
10952320,1,6178580fa01b62e9b52787902c0d8ae6,1660015841,0,7,5
10952321,1,ab694649c65477d0bc574bf391a3f4a0,1660015842,0,7,6
10952322,1,5b3387fa195672dcfe979d17e4a62c9e,1660015846,0,7,7
10952323,1,2790c612d8d301e2f35550c75aea8c75,1660015846,0,7,8
10952324,1,d36c6cf30154e18e6c972704206d6b1e,1660015848,0,7,9
10952325,1,1cbcc681ecf7acef4948bff2eb8e39d7,1660015850,0,7,10


In [5]:
def fill_NaNs(df:pd.DataFrame, numerical_columns:list=None, string_columns:list=None)->pd.DataFrame:
    for column in numerical_columns:
        df[column].fillna(df[column].mean(), inplace=True)
    for column in string_columns:
        df[column].fillna(0)
    return df

def encode_unix_time(df:pd.DataFrame, sin_cos = False):
    # Convert 'unix_played_at' to a datetime column
    df['played_at_datetime'] = pd.to_datetime(df['unix_played_at'], unit='s')
    if sin_cos:
        df['hour_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.hour / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.hour / 24)
        df['minute_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.minute / 60)
        df['minute_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.minute / 60)
        df['second_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.second / 60)
        df['second_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.second / 60)
        df['month_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.month / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.month / 12)
        df['year_sin'] = np.sin(2 * np.pi * df['played_at_datetime'].dt.year / 2023)
        df['year_cos'] = np.cos(2 * np.pi * df['played_at_datetime'].dt.year / 2023)
    else:
        df['hour_of_day'] = df['played_at_datetime'].dt.hour / 24
        df['minute_of_hour'] = df['played_at_datetime'].dt.minute / 60
        df['second_of_minute'] = df['played_at_datetime'].dt.second / 60
        df['month'] = df['played_at_datetime'].dt.month / 12
        df['year'] = df['played_at_datetime'].dt.year / 2023
    # Drop the specified columns from the DataFrame
    df.drop(columns=['unix_played_at', 'played_at_datetime'], inplace=True)
    return df

def get_song_ID_encode_dict(train:pd.DataFrame, test:pd.DataFrame)->dict:
    unique_song_ids = set(train['song_id'].tolist()+test['song_id'].tolist())
    ID_IDX = {song_id:i+1 for i,song_id in enumerate(unique_song_ids)}
    ID_IDX["SOS"]=0
    return ID_IDX

def encode_song_id(source_df:pd.DataFrame, target_df:pd.DataFrame, id2idx:dict):
    source_df['song_id'] = source_df['song_id'].map(id2idx)
    target_df['song_id'] = target_df['song_id'].map(id2idx)
    return source_df, target_df

In [6]:
ID_IDX = get_song_ID_encode_dict(train_source, test_source)
train_source, train_target = encode_song_id(train_source, train_target, ID_IDX)
train_source = encode_unix_time(train_source)
train_target = encode_unix_time(train_target)

In [11]:
def convert_per_N(df:pd.DataFrame, n:int, label = False):
    data = []
    pre_session_id = int(df['session_id'].iloc[0])
    if label:
        row = [0]
        for i in tqdm(range(df.shape[0])):
            # next session id
            if pre_session_id != int(df['session_id'].iloc[i]):
                data.append((pre_session_id, np.array(row).reshape(-1, 6)))
                pre_session_id, row = int(df['session_id'].iloc[i]), [0]
            # append 5 values
            row.append(df['song_id'].iloc[i])
        # append last session id
        data.append((df['session_id'].iloc[-1], np.array(row).reshape(-1, 6))) #last one
    else:
        row = []
        song_id = []
        for i in tqdm(range(df.shape[0])):
            # next session id
            if pre_session_id != int(df['session_id'].iloc[i]):
                data.append((pre_session_id, 
                             np.array(row).reshape(-1, 20), 
                             np.array(song_id).reshape(-1, 20)))
                pre_session_id, row,song_id = int(df['session_id'].iloc[i]), [], []
            # append 20 values
            song_id.append(df['song_id'].iloc[i])
            row.append([df['play_status'].iloc[i],
                        df['login_type'].iloc[i],
                        df['second_of_minute'].iloc[i],
                        df['minute_of_hour'].iloc[i],
                        df['hour_of_day'].iloc[i],
                        df['month'].iloc[i], 
                        df['year'].iloc[i]])
        # append last session id
        data.append((df['session_id'].iloc[-1], 
                     np.array(row).reshape(-1, 20),
                     np.array(song_id).reshape(-1, 20)))
    return data

train_source_data  = convert_per_N(train_source, 20)
train_source_label = convert_per_N(train_target, 5, label=True)

  0%|          | 4608/11445180 [00:00<08:22, 22777.99it/s]

100%|██████████| 11445180/11445180 [08:37<00:00, 22104.61it/s]
100%|██████████| 2861295/2861295 [00:31<00:00, 91232.28it/s]


# convert to dataloader

In [12]:
class RankingDatset(Dataset):
    def __init__(self, data, label):
        self.session_id = [session_id for session_id,_,_ in data]
        self.feature    = [feature    for _,feature,_    in data]
        self.song_id    = [song_id    for _,_,song_id    in data]
        self.label      = [label for _,label in label]
    def __len__(self):
        return len(self.session_id)
    def __getitem__(self, idx):
        session_id = self.session_id[idx]
        feature = torch.tensor(self.feature[idx], dtype=torch.float32)
        song_id = torch.tensor(self.song_id[idx], dtype=torch.float32)
        label = torch.tensor(self.label[idx], dtype=torch.float32)
        return {'session_id': session_id, 'feature': feature, 'song_id': song_id, 'label': label}

In [13]:
dataset = RankingDatset(train_source_data, train_source_label)
dataset[0]

{'session_id': 1,
 'feature': tensor([[0.0000, 7.0000, 0.0833, 0.5833, 0.0833, 0.6667, 0.9995, 0.0000, 7.0000,
          0.8333, 0.6333, 0.0833, 0.6667, 0.9995, 0.0000, 7.0000, 0.5500, 0.3000,
          0.1250, 0.6667],
         [0.9995, 0.0000, 7.0000, 0.4833, 0.3500, 0.1250, 0.6667, 0.9995, 0.0000,
          7.0000, 0.6833, 0.5000, 0.1250, 0.6667, 0.9995, 0.0000, 7.0000, 0.7000,
          0.5000, 0.1250],
         [0.6667, 0.9995, 0.0000, 7.0000, 0.7667, 0.5000, 0.1250, 0.6667, 0.9995,
          0.0000, 7.0000, 0.7667, 0.5000, 0.1250, 0.6667, 0.9995, 0.0000, 7.0000,
          0.8000, 0.5000],
         [0.1250, 0.6667, 0.9995, 0.0000, 7.0000, 0.8333, 0.5000, 0.1250, 0.6667,
          0.9995, 0.0000, 7.0000, 0.8667, 0.5000, 0.1250, 0.6667, 0.9995, 0.0000,
          7.0000, 0.8833],
         [0.5000, 0.1250, 0.6667, 0.9995, 0.0000, 7.0000, 0.9167, 0.5000, 0.1250,
          0.6667, 0.9995, 0.0000, 7.0000, 0.9500, 0.5000, 0.1250, 0.6667, 0.9995,
          0.0000, 7.0000],
         [0.9833

# Model

In [14]:
# Define the Encoder
class Encoder(nn.Module):
    def __init__(self, batch_size, num_songs, hidden_size, num_feature, embedding_dim, num_layers=1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+num_feature, hidden_size, num_layers, batch_first=True)

    def forward(self, song_ids, feature):
        embedded = self.embedding(song_ids) 
        # input feature cat with song embed
        lstm_input = torch.cat((embedded, feature.view(self.batch_size,20,-1)), dim=2)  # Concatenate along the feature dimension
        # Forward propagate LSTM
        out, _ = self.lstm(lstm_input)  # out: tensor of shape (batch_size, seq_length, hidden_size)
        return out

# Define the Decoder
class Decoder(nn.Module):
    def __init__(self, batch_size, num_songs, embedding_dim, enc_hidden_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+enc_hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(20, 1)
        self.fc2 = nn.Linear(hidden_size, num_songs)

    def forward(self, decode_song_ids, last_hidden, encoder_hidden):
        # Forward propagate LSTM
        embedded = self.embedding(decode_song_ids) 
        encoder_hidden = self.fc1(encoder_hidden.view(self.batch_size,-1,20))
        lstm_input = torch.cat((embedded, encoder_hidden.view(self.batch_size,1,-1)), dim=2)
        out, lstm_hidden = self.lstm(lstm_input, last_hidden)
        out = self.fc2(out.squeeze(1))
        return out, lstm_hidden

# Seq2Seq model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, song_ids, features, target_song_ids, teacher_forcing_ratio=0.5):
        encoder_output = self.encoder(song_ids, features)
        
        batch_size = target_song_ids.size(0)
        target_len = target_song_ids.size(1)
        target_song_size = self.decoder.fc2.out_features
        outputs = torch.zeros(batch_size, target_len, target_song_size)

        decoder_input = target_song_ids[:, 0]  # SOS token as the first input
        decoder_hidden = encoder_output[:, -1, :].unsqueeze(0), torch.zeros_like(encoder_output[:, -1, :]).unsqueeze(0)
        for t in range(1, target_len):
            decoder_output, decoder_hidden = self.decoder(decoder_input.unsqueeze(1), decoder_hidden, encoder_output)
            outputs[:, t] = decoder_output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = decoder_output.max(1)[1]
            decoder_input = target_song_ids[:, t] if teacher_force and t < target_len else top1

        return torch.argmax(outputs.squeeze(0),dim = 1)


In [15]:
# Create the Encoder and Decoder instances
num_songs = len(list(ID_IDX.values()))
embedding_dim = 128
hidden_size = 256
enc_hidden_size = hidden_size
dec_hidden_size = hidden_size
num_layers = 1
batch_size = 1 # 1 for testing
num_features = dataset[0]['feature'].shape[0]

# Create Encoder and Decoder instances
encoder = Encoder(batch_size, num_songs, enc_hidden_size, num_features, embedding_dim, num_layers)
decoder = Decoder(batch_size, num_songs, embedding_dim, enc_hidden_size, dec_hidden_size, num_layers)
seq2seq_model = Seq2Seq(encoder, decoder)

# Sample data
data = {
    'session_id': 1,
    'feature': dataset[0]['feature'].unsqueeze(0),  # tensor data for features
    'song_id': dataset[0]['song_id'],
    'label': dataset[0]['label'] # 0 is for the start input for decoder
}

# Assuming you want to use this data for prediction:
song_ids = data['song_id'].to(torch.long)
features = data['feature'].to(torch.long)
target   = data['label'].to(torch.long)
# Set the model to evaluation mode
seq2seq_model.eval()
with torch.no_grad():
    predicted_output = seq2seq_model(song_ids, features, target)
    
# Print the predicted output (just for demonstration)
print(predicted_output)


tensor([     0, 109169, 320771, 220825, 362916, 265796])
