In [109]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [110]:
# 讀取meta的各個檔案
composer_df = pd.read_parquet('data/meta_song_composer.parquet')
song_df = pd.read_parquet('data/meta_song.parquet')
genre_df = pd.read_parquet('data/meta_song_genre.parquet')
lyricist_df = pd.read_parquet('data/meta_song_lyricist.parquet')
producer_df = pd.read_parquet('data/meta_song_producer.parquet')
titletext_df = pd.read_parquet('data/meta_song_titletext.parquet')

# 將資料以 song_id 作為主體合併
merged_df = pd.merge(song_df, composer_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, genre_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, lyricist_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, producer_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, titletext_df, on='song_id', how='left')
meta_merged_df = merged_df.copy()

# 顯示合併後的 DataFrame
print(meta_merged_df.head())

                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
1  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
2  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
3  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
4  7feec55b825a1c93d55b3ef9af9f9be5     4147.0        306.0   19017.0   

   language_id album_month                       composer_id  \
0          3.0     2000-01                               NaN   
1          3.0     2000-01                               NaN   
2          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
3          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
4          3.0     2015-06  3211cf51bfa3afff4f264a110212f615   

                           genre_id                       lyricist_id  \
0  ce4db56f6a48426643b08038139a8a75                               NaN   
1  b856b6781d370a3645c6dde0c20

In [138]:
merged_df.isnull().sum()

song_id                0
artist_id         185318
song_length       185318
album_id          401967
language_id       401967
album_month       402019
composer_id       726444
genre_id          403030
lyricist_id      1299883
producer_id      1579454
title_text_id     185318
dtype: int64

In [111]:
# 处理缺失值，这里简单地用0填充
meta_merged_df.fillna(0, inplace=True)

# 对分类特征进行编码
label_encoders = {}
for column in ['artist_id', 'album_id', 'composer_id', 'genre_id', 'lyricist_id', 'producer_id', 'title_text_id']:
    encoder = LabelEncoder()
    meta_merged_df[column] = encoder.fit_transform(meta_merged_df[column].astype(str))
    label_encoders[column] = encoder

# 对数值特征进行标准化
scaler = StandardScaler()
meta_merged_df['song_length'] = scaler.fit_transform(meta_merged_df['song_length'].values.reshape(-1, 1))

print(meta_merged_df.head())

                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61      85469    -0.095027    152662   
1  10a46165bb84e056438e06c35763ee61      85469    -0.095027    152662   
2  ff025522d0f8e7198a75a4e03edce55c      14082     0.624485    309734   
3  ff025522d0f8e7198a75a4e03edce55c      14082     0.624485    309734   
4  7feec55b825a1c93d55b3ef9af9f9be5      92456     0.889568    147347   

   language_id album_month  composer_id  genre_id  lyricist_id  producer_id  \
0          3.0     2000-01            0        31            0        33946   
1          3.0     2000-01            0        26            0        33946   
2          3.0     1995-01        54451        31            0        32408   
3          3.0     1995-01        54451        26            0        32408   
4          3.0     2015-06        38307        31        23503         1980   

   title_text_id  
0           1729  
1           1729  
2           1866  
3         

In [112]:
# 将所有特征拼接成一个向量
feature_columns = ['artist_id', 'song_length', 'album_id', 'composer_id', 'genre_id', 'lyricist_id', 'producer_id', 'title_text_id']
song_vectors = meta_merged_df.groupby('song_id')[feature_columns].mean()


In [113]:
# 例如，获取特定歌曲的嵌入向量
song_id = '10a46165bb84e056438e06c35763ee61'
vector = song_vectors.loc[song_id]


In [137]:
song_vectors

Unnamed: 0_level_0,artist_id,song_length,album_id,composer_id,genre_id,lyricist_id,producer_id,title_text_id
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00002d2ddb6c789bcb8ef14f52339e39,5912.0,-0.767203,188982.0,0.0,13.0,0.0,0.0,2221.0
000049836b2e92a6a361e40d7f0c73db,29454.0,0.605551,248850.0,75735.0,11.0,21458.0,0.0,261.0
00005d588ff5b96c84556f17b716e227,37505.0,0.056449,128132.0,0.0,6.0,0.0,33474.0,1525.0
00005ee00af3e6bc97c0a694bae95728,96740.0,0.226860,127637.0,67.0,9.0,0.0,21796.0,2200.0
000067e6edb45097956ee81bbd1edb63,41108.0,1.164119,223967.0,0.0,26.0,0.0,9235.0,2070.0
...,...,...,...,...,...,...,...,...
ffffb17f494477ddef244df20d662e01,66760.0,0.151122,187329.0,0.0,18.5,0.0,55313.0,243.0
ffffc0ee02f3fac2b4a10ed7a6c3423f,1419.0,0.567681,26662.0,0.0,35.0,0.0,0.0,1540.0
ffffcf62a064a729b55618a2648c6af0,101311.0,1.088381,155636.0,12348.0,10.0,7499.0,0.0,908.0
ffffe1b0071c87b3716f708f7066a22f,32398.0,0.662354,54531.0,79588.5,18.5,0.0,0.0,243.0


In [114]:
label_train_source = pd.read_parquet('data/label_train_source.parquet')
label_train_target = pd.read_parquet('data/label_train_target.parquet')
label_test_source = pd.read_parquet('data/label_test_source.parquet')

def replace_song_id_with_vectors(df, song_vectors):
    # 将向量表示扩展为多列
    vector_cols = [f'vector_{i}' for i in range(song_vectors.shape[1])]
    song_vectors_expanded = song_vectors.copy()
    song_vectors_expanded.columns = vector_cols
    
    # 替换 song_id 为向量
    df_with_vectors = df.join(song_vectors_expanded, on='song_id')
    
    # 移除原始的 song_id 列
    df_with_vectors.drop(columns=['song_id'], inplace=True)
    
    return df_with_vectors

label_train_source_vectors = replace_song_id_with_vectors(label_train_source, song_vectors)
label_train_target_vectors = replace_song_id_with_vectors(label_train_target, song_vectors)
label_test_source_vectors = replace_song_id_with_vectors(label_test_source, song_vectors)


In [115]:
label_test_source_vectors

Unnamed: 0,session_id,unix_played_at,play_status,login_type,listening_order,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7
0,598,1660177283,1,7,1,99310.0,0.065916,288459.0,0.0,26.0,0.0,14213.0,882.0
1,598,1660177503,1,7,2,60868.0,0.207925,89743.0,67111.5,28.0,0.0,27304.0,893.0
2,598,1660177582,1,7,3,86607.0,0.179524,175463.0,0.0,22.0,0.0,33461.0,894.0
3,598,1660177584,1,7,4,4097.0,0.037515,197093.0,0.0,26.0,0.0,30120.0,1866.0
4,598,1660177587,1,7,5,67004.0,-0.180232,0.0,0.0,0.0,0.0,0.0,1866.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2861275,714498,1666147178,0,7,16,91992.0,-1.221631,138464.0,0.0,17.5,0.0,0.0,1341.0
2861276,714498,1666147262,0,7,17,91992.0,-1.032286,138464.0,0.0,17.5,0.0,0.0,1341.0
2861277,714498,1666147366,0,7,18,91992.0,-1.079622,138464.0,0.0,17.5,0.0,0.0,1341.0
2861278,714498,1666147464,0,7,19,91992.0,-1.136426,138464.0,0.0,17.5,0.0,0.0,1341.0


In [116]:
label_train_source_vectors.head()

Unnamed: 0,session_id,unix_played_at,play_status,login_type,listening_order,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7
0,751,1659598909,1,7,1,2028.0,0.473009,232540.0,103483.0,23.5,63355.0,0.0,971.0
1,751,1659599182,1,7,2,60525.0,-0.966015,0.0,55025.0,0.0,33634.0,47870.0,1866.0
2,751,1659599184,0,7,3,60525.0,-0.966015,0.0,55025.0,0.0,33634.0,47870.0,1866.0
3,751,1659599185,0,7,4,60525.0,-0.966015,0.0,55025.0,0.0,33634.0,47870.0,1866.0
4,751,1659599218,1,7,5,82831.0,0.757027,0.0,10087.0,0.0,6159.0,0.0,409.0


In [117]:
unique_sessions = label_train_source_vectors['session_id'].unique()
print(len(unique_sessions))

572259


In [118]:
def create_dataset(source_df, target_df, session_col='session_id', sample_ratio=0.3):
    # 随机抽取一定比例的会话
    unique_sessions = source_df[session_col].unique()
    sampled_sessions = np.random.choice(unique_sessions, size=int(len(unique_sessions) * sample_ratio), replace=False)
    
    # 用于存储数据的列表
    X, y = [], []

    for session in sampled_sessions:
        session_source = source_df[source_df[session_col] == session].drop(columns=[session_col])
        session_target = target_df[target_df[session_col] == session].drop(columns=[session_col])
        
        if len(session_source) == 20 and len(session_target) == 5:
            X.append(session_source.values)
            y.append(session_target.values)

    return np.array(X), np.array(y)

# 假设 label_train_source_vectors 和 label_train_target_vectors 已经是处理好的 DataFrame
X, y = create_dataset(label_train_source_vectors, label_train_target_vectors)


In [119]:
y.shape

(171677, 5, 12)

In [120]:
X = X.astype(float)
y = y.astype(float)

In [121]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# 创建 DataLoader
train_data = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [122]:
if torch.cuda.is_available():
    print("GPU is available.")
    device = torch.device("cuda")
else:
    print("GPU is not available, using CPU.")
    device = torch.device("cpu")

GPU is available.


In [133]:
class MultiheadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiheadAttention, self).__init__()
        self.attention = nn.MultiheadAttention(hidden_size, num_heads)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return attn_output

class SongPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, num_heads):
        super(SongPredictor, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.multihead_attn = MultiheadAttention(hidden_size, num_heads)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        gru_out, _ = self.gru(x)  # GRU层
        attn_out = self.multihead_attn(gru_out)  # 注意力机制
        out = self.fc(attn_out[:, -1, :])  # 全连接层
        return out

# 模型参数
input_size = X.shape[2]  # 输入特征的维度
hidden_size = 128  # 隐藏层大小
num_layers = 2  # GRU层数
output_size = y.shape[2] * 5  # 输出大小（例如，如果是回归问题，就是目标特征的数量）
num_heads = 4  # 注意力头数

model = SongPredictor(input_size, hidden_size, num_layers, output_size, num_heads).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [134]:
# 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(inputs)
            
            # 计算损失
            loss = criterion(outputs, targets.view(targets.size(0), -1))
            total_loss += loss.item()
            
            # 反向传播和优化
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}')
        torch.save(model.state_dict(), 'model.pth')

# 训练过程
epochs = 10
train_model(model, train_loader, criterion, optimizer, epochs)


Epoch [1/10], Loss: 36554147399.8613
Epoch [2/10], Loss: 13427743964.6419
Epoch [3/10], Loss: 172950236891.0196
Epoch [4/10], Loss: 154876056260.4973
Epoch [5/10], Loss: 305410438371.5131
Epoch [6/10], Loss: 391690373934.8101
Epoch [7/10], Loss: 394590994921.3823
Epoch [8/10], Loss: 334478347307.3267
Epoch [9/10], Loss: 417793553947.3894
Epoch [10/10], Loss: 592812340533.9675


In [129]:
label_test_source_vectors

Unnamed: 0,session_id,unix_played_at,play_status,login_type,listening_order,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7
0,598,1660177283,1,7,1,99310.0,0.065916,288459.0,0.0,26.0,0.0,14213.0,882.0
1,598,1660177503,1,7,2,60868.0,0.207925,89743.0,67111.5,28.0,0.0,27304.0,893.0
2,598,1660177582,1,7,3,86607.0,0.179524,175463.0,0.0,22.0,0.0,33461.0,894.0
3,598,1660177584,1,7,4,4097.0,0.037515,197093.0,0.0,26.0,0.0,30120.0,1866.0
4,598,1660177587,1,7,5,67004.0,-0.180232,0.0,0.0,0.0,0.0,0.0,1866.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2861275,714498,1666147178,0,7,16,91992.0,-1.221631,138464.0,0.0,17.5,0.0,0.0,1341.0
2861276,714498,1666147262,0,7,17,91992.0,-1.032286,138464.0,0.0,17.5,0.0,0.0,1341.0
2861277,714498,1666147366,0,7,18,91992.0,-1.079622,138464.0,0.0,17.5,0.0,0.0,1341.0
2861278,714498,1666147464,0,7,19,91992.0,-1.136426,138464.0,0.0,17.5,0.0,0.0,1341.0


In [130]:
# 假设 predict_data_tensor 是您的预测数据张量
predict_data_tensor = label_test_source_vectors.astype(float)
predict_data_tensor = torch.tensor(predict_data_tensor, dtype=torch.float32)

predict_dataset = TensorDataset(predict_data_tensor)
predict_loader = DataLoader(predict_dataset, batch_size=32)

def predict(model, predict_loader, device):
    model.load_state_dict(torch.load('model.pth'))
    model.eval()  # 将模型设置为评估模式
    predictions = []

    with torch.no_grad():  # 在预测时不计算梯度
        for inputs in predict_loader:
            inputs = inputs[0].to(device)  # DataLoader 返回一个元组
            outputs = model(inputs)
            predictions.append(outputs.cpu())  # 如果在 GPU 上，则移动到 CPU

    # 将预测结果拼接成一个张量
    return torch.cat(predictions, dim=0)

# 使用模型进行预测
model_predictions = predict(model, predict_loader, device)

# 将预测结果转换为 numpy 数组，如果需要
predictions_np = model_predictions.numpy()


ValueError: could not determine the shape of object type 'DataFrame'

# ignore

In [125]:
label_data = ['data/label_train_source.parquet', 'data/label_train_target.parquet', 'data/label_test_source.parquet']
meta_data = ['data/meta_song_composer.parquet', 'data/meta_song.parquet', 'data/meta_song_genre.parquet',
            'data/meta_song_lyricist.parquet', 'data/meta_song_producer.parquet', 'data/meta_song_titletext.parquet']

for i in range(len(label_data)):
    df1 = pd.read_parquet(label_data[i])
    print("===============================================================================================")
    print(label_data[i])
    print(df1.head(21))

for i in range(len(meta_data)):
    df2 = pd.read_parquet(meta_data[i])
    print("===============================================================================================")
    print(meta_data[i])
    print(df2.head())

data/label_train_source.parquet
    session_id                           song_id  unix_played_at  play_status  \
0          751  6027767fad949f3ca5e772df04924949      1659598909            1   
1          751  041547bddb0a3e730f32db84c65868ca      1659599182            1   
2          751  041547bddb0a3e730f32db84c65868ca      1659599184            0   
3          751  041547bddb0a3e730f32db84c65868ca      1659599185            0   
4          751  8b32f88104ecf859be934d9b45f30cd1      1659599218            1   
5          751  e4a125e3163e4c1bd40060614c79bd53      1659599219            1   
6          751  8b32f88104ecf859be934d9b45f30cd1      1659599225            1   
7          751  5ef6718f4517d2d3c316fc45226f41dc      1659599229            1   
8          751  e4a125e3163e4c1bd40060614c79bd53      1659599252            0   
9          751  041547bddb0a3e730f32db84c65868ca      1659599253            0   
10         751  e7efab54028017e35a35d1b1637e210c      1659599287            1

In [126]:
# 讀取meta的各個檔案
composer_df = pd.read_parquet('data/meta_song_composer.parquet')
song_df = pd.read_parquet('data/meta_song.parquet')
genre_df = pd.read_parquet('data/meta_song_genre.parquet')
lyricist_df = pd.read_parquet('data/meta_song_lyricist.parquet')
producer_df = pd.read_parquet('data/meta_song_producer.parquet')
titletext_df = pd.read_parquet('data/meta_song_titletext.parquet')

# 將資料以 song_id 作為主體合併
merged_df = pd.merge(song_df, composer_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, genre_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, lyricist_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, producer_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, titletext_df, on='song_id', how='left')

# 顯示合併後的 DataFrame
print(merged_df.head())

# 將結果保存為一個新的 Parquet 檔案
merged_df.to_parquet('merged_song_data.parquet', index=False)


                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
1  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
2  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
3  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
4  7feec55b825a1c93d55b3ef9af9f9be5     4147.0        306.0   19017.0   

   language_id album_month                       composer_id  \
0          3.0     2000-01                               NaN   
1          3.0     2000-01                               NaN   
2          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
3          3.0     1995-01  47580d357c6779d6a244c514b0acdc72   
4          3.0     2015-06  3211cf51bfa3afff4f264a110212f615   

                           genre_id                       lyricist_id  \
0  ce4db56f6a48426643b08038139a8a75                               NaN   
1  b856b6781d370a3645c6dde0c20

In [127]:
merged_df.inf()

AttributeError: 'DataFrame' object has no attribute 'inf'

In [None]:
merged_df.isnull().sum()

song_id                0
artist_id         185318
song_length       185318
album_id          401967
language_id       401967
album_month       402019
composer_id       726444
genre_id          403030
lyricist_id      1299883
producer_id      1579454
title_text_id     185318
dtype: int64

In [None]:
# 讀取 source 和 target 資料
source_df = pd.read_parquet('data/label_train_source.parquet')
target_df = pd.read_parquet('data/label_train_target.parquet')

# 合併 source 和 target 資料
merged_df = pd.merge(source_df, target_df, on='session_id', suffixes=('_source', '_target'))

# 創建一個包含所有不同 song_id 的列表
all_song_ids = pd.concat([merged_df['song_id_source'], merged_df['song_id_target']])
unique_song_ids = all_song_ids.unique()

# 將 song_id 轉換成編碼的字典
song_id_to_encoded = {song_id: idx for idx, song_id in enumerate(unique_song_ids)}

# 將編碼後的結果重新分配回 source_df 和 target_df
source_df['song_id_encoded'] = source_df['song_id'].map(song_id_to_encoded)
target_df['song_id_encoded'] = target_df['song_id'].map(song_id_to_encoded)


Modified source_df:
   session_id                           song_id  unix_played_at  play_status  \
0         751  6027767fad949f3ca5e772df04924949      1659598909            1   
1         751  041547bddb0a3e730f32db84c65868ca      1659599182            1   
2         751  041547bddb0a3e730f32db84c65868ca      1659599184            0   
3         751  041547bddb0a3e730f32db84c65868ca      1659599185            0   
4         751  8b32f88104ecf859be934d9b45f30cd1      1659599218            1   

   login_type  listening_order  song_id_encoded  
0           7                1                0  
1           7                2                1  
2           7                3                1  
3           7                4                1  
4           7                5                2  

Modified target_df:
   session_id                           song_id  unix_played_at  play_status  \
0         307  75c2aa348888f982d85e3f870e6ba5b2      1659942029            1   
1         307  0ca

In [None]:
# 顯示修改後的 source_df 和 target_df
print("Modified source_df:")
print(source_df.head(21))
print("\nModified target_df:")
print(target_df.head(6))

Modified source_df:
    session_id                           song_id  unix_played_at  play_status  \
0          751  6027767fad949f3ca5e772df04924949      1659598909            1   
1          751  041547bddb0a3e730f32db84c65868ca      1659599182            1   
2          751  041547bddb0a3e730f32db84c65868ca      1659599184            0   
3          751  041547bddb0a3e730f32db84c65868ca      1659599185            0   
4          751  8b32f88104ecf859be934d9b45f30cd1      1659599218            1   
5          751  e4a125e3163e4c1bd40060614c79bd53      1659599219            1   
6          751  8b32f88104ecf859be934d9b45f30cd1      1659599225            1   
7          751  5ef6718f4517d2d3c316fc45226f41dc      1659599229            1   
8          751  e4a125e3163e4c1bd40060614c79bd53      1659599252            0   
9          751  041547bddb0a3e730f32db84c65868ca      1659599253            0   
10         751  e7efab54028017e35a35d1b1637e210c      1659599287            1   
11      