In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Load every song-metadata table from its parquet file.
song_df = pd.read_parquet('data/meta_song.parquet')
composer_df = pd.read_parquet('data/meta_song_composer.parquet')
genre_df = pd.read_parquet('data/meta_song_genre.parquet')
lyricist_df = pd.read_parquet('data/meta_song_lyricist.parquet')
producer_df = pd.read_parquet('data/meta_song_producer.parquet')
titletext_df = pd.read_parquet('data/meta_song_titletext.parquet')

# Left-join each attribute table onto the song table, keyed by song_id.
# NOTE(review): the preview shows the same song_id on several rows, so at least
# one attribute table appears to hold multiple rows per song — confirm before
# treating a row of the merged frame as "one song".
merged_df = (
    song_df
    .merge(composer_df, on='song_id', how='left')
    .merge(genre_df, on='song_id', how='left')
    .merge(lyricist_df, on='song_id', how='left')
    .merge(producer_df, on='song_id', how='left')
    .merge(titletext_df, on='song_id', how='left')
)
meta_merged_df = merged_df.copy()

# Preview the merged DataFrame.
print(meta_merged_df.head())

                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
1  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
2  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
3  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
4  7feec55b825a1c93d55b3ef9af9f9be5     4147.0        306.0   19017.0   

   language_id album_month                       composer_id  \
0          3.0     2000-01                               NaN   
1          3.0     2000-01                               NaN   
2          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
3          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
4          3.0     2015-06  3211cf51bfa3afff4f264a110212f615   

                           genre_id                       lyricist_id  \
0  ce4db56f6a48426643b08038139a8a75                               NaN   
1  b856b6781d370a3645c6dde0c20

In [4]:
# Number of missing values in each column of the merged metadata.
merged_df.isna().sum()

song_id                0
artist_id         185318
song_length       185318
album_id          401967
language_id       401967
album_month       402019
composer_id       726444
genre_id          403030
lyricist_id      1299883
producer_id      1579454
title_text_id     185318
dtype: int64

In [None]:
# Load the session tables: training inputs, training targets and test inputs.
label_train_source = pd.read_parquet('data/label_train_source.parquet')
label_train_target = pd.read_parquet('data/label_train_target.parquet')
label_test_source = pd.read_parquet('data/label_test_source.parquet')

# Label encoding: fit a single encoder on every song id that occurs in any of
# the three tables, so they all share one integer vocabulary.
all_songs = pd.concat(
    [
        label_train_source['song_id'],
        label_train_target['song_id'],
        label_test_source['song_id'],
    ]
).unique()
song_encoder = LabelEncoder().fit(all_songs)

# Replace the raw song ids with their integer codes.
for split_df in (label_train_source, label_train_target, label_test_source):
    split_df['song_id'] = song_encoder.transform(split_df['song_id'])

# Optional cleaning / preprocessing steps (currently disabled).
# De-duplicate:
# label_train_source.drop_duplicates(inplace=True)
# label_train_target.drop_duplicates(inplace=True)
# label_test_source.drop_duplicates(inplace=True)

# Sort:
# label_train_source.sort_values(by=['session_id', 'unix_played_at'], inplace=True)
# label_train_target.sort_values(by=['session_id', 'unix_played_at'], inplace=True)
# label_test_source.sort_values(by=['session_id', 'unix_played_at'], inplace=True)

In [None]:
# Preview the first rows of each split. Each label prints its own frame
# (previously all three printed label_train_source).
print("train source\n",label_train_source.head())
print("=====================================================================================================")
print("train target\n",label_train_target.head())
print("=====================================================================================================")
print("test source\n",label_test_source.head())

In [None]:
def create_sequences_optimized(df_source, df_target, sequence_length=20, target_length=5):
    """Build aligned per-session input/target song sequences.

    Both frames must have the columns ``session_id``, ``unix_played_at`` and
    ``song_id``. Rows are ordered by play time within each session, collected
    into one list per session, and only sessions that have exactly
    ``sequence_length`` source rows AND exactly ``target_length`` target rows
    are kept.

    Args:
        df_source: DataFrame with the songs used as model input.
        df_target: DataFrame with the songs to be predicted.
        sequence_length: required number of source songs per session.
        target_length: required number of target songs per session
            (defaults to 5, the value that used to be hard-coded).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. Row ``i`` of both arrays belongs to
        the same session. When at least one session qualifies the shapes are
        ``(n_sessions, sequence_length)`` and ``(n_sessions, target_length)``.
    """
    # Make sure rows are ordered by session and then by play time, so the
    # per-session lists below are chronological.
    source_sorted = df_source.sort_values(by=['session_id', 'unix_played_at'])
    target_sorted = df_target.sort_values(by=['session_id', 'unix_played_at'])

    # One list of song ids per session.
    X = source_sorted.groupby('session_id')['song_id'].agg(list)
    y = target_sorted.groupby('session_id')['song_id'].agg(list)

    # Keep only sessions whose sequences have exactly the requested lengths.
    X_filtered = X[X.map(len) == sequence_length]
    y_filtered = y[y.map(len) == target_length]

    # Keep the sessions present on both sides; indexing both series with the
    # same index keeps inputs and targets aligned row by row.
    common_sessions = X_filtered.index.intersection(y_filtered.index)
    X_final = X_filtered.loc[common_sessions].tolist()
    y_final = y_filtered.loc[common_sessions].tolist()

    return np.array(X_final), np.array(y_final)

# Aligned training arrays: 20-song inputs and their 5-song targets.
X_train, y_train = create_sequences_optimized(label_train_source, label_train_target)

# Build the test inputs the same way as the training inputs: order each
# session's plays chronologically, collect the song ids per session, and keep
# only sessions with exactly 20 plays. Without the sort the sequence order
# would depend on the row order of the parquet file.
# Assumes label_test_source has the same 'unix_played_at' column as the
# training tables — TODO confirm.
test_sequences = (
    label_test_source
    .sort_values(by=['session_id', 'unix_played_at'])
    .groupby('session_id')['song_id']
    .agg(list)
)
test_sequences = test_sequences[test_sequences.map(len) == 20]
X_test = np.array(test_sequences.tolist())

# No one-hot encoding of the targets is needed: the training loop uses
# nn.CrossEntropyLoss, which takes integer class indices directly.

# y_train holds 5 class indices per session; only the LAST target song of each
# session is used as the label to predict.
y_train_last = [targets[-1] for targets in y_train]


In [None]:
# Convert the data to PyTorch int64 tensors (song indices / class labels).
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_last, dtype=torch.long)
# Stack the per-session sequences into one ndarray first: creating a tensor
# straight from a Python list of ndarrays is very slow and PyTorch warns about it.
X_test_tensor = torch.tensor(np.array(list(X_test)), dtype=torch.long)

In [None]:
class MusicPredictionModel(nn.Module):
    """Embedding -> LSTM -> multi-head self-attention -> linear classifier.

    Given a batch of song-index sequences, produces one logit per song in the
    vocabulary, computed from the attention output at the last time step.

    Args:
        num_songs: vocabulary size (number of embeddings and of output logits).
        embedding_dim: size of each song embedding.
        lstm_units: LSTM hidden size; also the attention embedding size, so it
            must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        dropout: dropout applied to the attention weights.
    """

    def __init__(self, num_songs, embedding_dim=50, lstm_units=64, num_heads=2, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(num_songs, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
        # batch_first=True is required here: the LSTM emits (batch, seq, feature),
        # while MultiheadAttention defaults to (seq, batch, feature). With the
        # default layout the attention would run across the samples of a batch
        # instead of across the time steps of each sequence.
        self.attention = nn.MultiheadAttention(
            lstm_units, num_heads, dropout=dropout, batch_first=True
        )
        self.fc = nn.Linear(lstm_units, num_songs)

    def forward(self, x):
        """Map song indices of shape (batch_size, seq_length) to logits (batch_size, num_songs)."""
        embedded = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_length, lstm_units)
        # Self-attention over the time steps of each sequence.
        attn_output, _ = self.attention(lstm_out, lstm_out, lstm_out)  # (batch_size, seq_length, lstm_units)
        # Classify from the attention output at the last time step.
        out = self.fc(attn_output[:, -1, :])
        return out


In [None]:
# Use the CUDA GPU when PyTorch can see one, otherwise fall back to the CPU,
# and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available." if device.type == "cuda" else "GPU is not available, using CPU.")


In [None]:
def train(model, train_loader, epochs=10, lr=0.001, save_path='model.pth'):
    """Train ``model`` with Adam and cross-entropy loss, then save its weights.

    Prints the mean batch loss after every epoch and writes the final
    ``state_dict`` to ``save_path``.

    Args:
        model: ``nn.Module`` that maps a batch of inputs to per-class logits.
        train_loader: iterable yielding ``(inputs, labels)`` batches, where
            ``labels`` are integer class indices.
        epochs: number of passes over ``train_loader``.
        lr: learning rate for Adam.
        save_path: file the trained ``state_dict`` is saved to.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Run on whatever device the model already lives on, instead of relying on
    # a notebook-level global that may be stale or undefined.
    device = next(model.parameters()).device

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Fail with a clear message instead of a ZeroDivisionError below.
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")

        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_batches}')

    torch.save(model.state_dict(), save_path)


# Seed PyTorch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

# Wrap the training tensors in a shuffled mini-batch loader.
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

# Instantiate the model and train it.
# The vocabulary size is the number of distinct song ids the encoder was fitted
# on (this line previously referenced the undefined name `all_son_songs`).
num_songs = len(all_songs)
model = MusicPredictionModel(num_songs).to(device)
train(model, train_loader, 2)


In [None]:
def predict(model, trained_pth, test_loader):
    """Load saved weights into ``model`` and predict one class per test sample.

    Args:
        model: ``nn.Module`` with the same architecture the weights were saved from.
        trained_pth: path to a ``state_dict`` file written by ``torch.save``.
        test_loader: iterable yielding input batches (tensors only, no labels).

    Returns:
        List with the arg-max class index of every sample, in loader order.
    """
    # Run on the device the model lives on rather than on a notebook global.
    device = next(model.parameters()).device
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(trained_pth, map_location=device))
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_loader:
            # Move the INPUTS to the model's device before the forward pass
            # (previously only the output was moved, which fails on a GPU).
            outputs = model(inputs.to(device))
            predicted = outputs.argmax(dim=1)
            predictions.extend(predicted.cpu().numpy())

    return predictions

# Batch the test tensor for inference (default loader order, no shuffling).
test_loader = DataLoader(X_test_tensor, batch_size=32)

# Predict one song index per test sequence using the saved weights.
predictions = predict(model, 'model.pth', test_loader)

# Map the predicted indices back to the original song ids and write them to a
# CSV, one row per test sequence in the order of X_test_tensor.
predicted_songs = song_encoder.inverse_transform(predictions)
pd.DataFrame({'predicted_song_id': predicted_songs}).to_csv('predictions.csv', index=False)