In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [5]:
# 讀取meta的各個檔案
composer_df = pd.read_parquet('data/meta_song_composer.parquet')
song_df = pd.read_parquet('data/meta_song.parquet')
genre_df = pd.read_parquet('data/meta_song_genre.parquet')
lyricist_df = pd.read_parquet('data/meta_song_lyricist.parquet')
producer_df = pd.read_parquet('data/meta_song_producer.parquet')
titletext_df = pd.read_parquet('data/meta_song_titletext.parquet')

# 將資料以 song_id 作為主體合併
merged_df = pd.merge(song_df, genre_df, on='song_id', how='left')
# merged_df = pd.merge(merged_df, composer_df, on='song_id', how='left')
# merged_df = pd.merge(merged_df, lyricist_df, on='song_id', how='left')
# merged_df = pd.merge(merged_df, producer_df, on='song_id', how='left')
merged_df = pd.merge(merged_df, titletext_df, on='song_id', how='left')
meta_merged_df = merged_df.copy()

# 顯示合併後的 DataFrame
print(meta_merged_df.head())

                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
1  10a46165bb84e056438e06c35763ee61       39.0        202.0     202.0   
2  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
3  ff025522d0f8e7198a75a4e03edce55c      133.0        278.0    9615.0   
4  7feec55b825a1c93d55b3ef9af9f9be5     4147.0        306.0   19017.0   

   language_id album_month                          genre_id  \
0          3.0     2000-01  ce4db56f6a48426643b08038139a8a75   
1          3.0     2000-01  b856b6781d370a3645c6dde0c20b3597   
2          3.0     1995-01  ce4db56f6a48426643b08038139a8a75   
3          3.0     1995-01  b856b6781d370a3645c6dde0c20b3597   
4          3.0     2015-06  ce4db56f6a48426643b08038139a8a75   

                      title_text_id  
0  b2153fe5a86fd746903746a219d40083  
1  b2153fe5a86fd746903746a219d40083  
2  c1079ef109db2aba72f78c632ab73803  
3  c1079ef109db2aba72f78

In [6]:
# 处理缺失值，这里简单地用0填充
meta_merged_df.fillna(0, inplace=True)

# 对分类特征进行编码
label_encoders = {}
for column in ['genre_id', 'title_text_id']:
    encoder = LabelEncoder()
    meta_merged_df[column] = encoder.fit_transform(meta_merged_df[column].astype(str))
    label_encoders[column] = encoder

# 对数值特征进行标准化
scaler = StandardScaler()
meta_merged_df['song_length'] = scaler.fit_transform(meta_merged_df['song_length'].values.reshape(-1, 1))

print(meta_merged_df.head())

                            song_id  artist_id  song_length  album_id  \
0  10a46165bb84e056438e06c35763ee61       39.0     0.019449     202.0   
1  10a46165bb84e056438e06c35763ee61       39.0     0.019449     202.0   
2  ff025522d0f8e7198a75a4e03edce55c      133.0     0.620591    9615.0   
3  ff025522d0f8e7198a75a4e03edce55c      133.0     0.620591    9615.0   
4  7feec55b825a1c93d55b3ef9af9f9be5     4147.0     0.842064   19017.0   

   language_id album_month  genre_id  title_text_id  
0          3.0     2000-01        31           1729  
1          3.0     2000-01        26           1729  
2          3.0     1995-01        31           1866  
3          3.0     1995-01        26           1866  
4          3.0     2015-06        31           1866  


In [8]:
# 将所有特征拼接成一个向量
feature_columns = ['artist_id', 'song_length', 'album_id', 'language_id','album_month', 'genre_id', 'title_text_id']
song_vectors = meta_merged_df.groupby('song_id')[feature_columns].mean()

# 例如，获取特定歌曲的嵌入向量
song_id = '10a46165bb84e056438e06c35763ee61'
vector = song_vectors.loc[song_id]

TypeError: agg function failed [how->mean,dtype->object]

In [None]:
label_train_source = pd.read_parquet('data/label_train_source.parquet')
label_train_target = pd.read_parquet('data/label_train_target.parquet')
label_test_source = pd.read_parquet('data/label_test_source.parquet')

def replace_song_id_with_vectors(df, song_vectors):
    # 将向量表示扩展为多列
    vector_cols = [f'vector_{i}' for i in range(song_vectors.shape[1])]
    song_vectors_expanded = song_vectors.copy()
    song_vectors_expanded.columns = vector_cols
    
    # 替换 song_id 为向量
    df_with_vectors = df.join(song_vectors_expanded, on='song_id')
    
    # 移除原始的 song_id 列
    df_with_vectors.drop(columns=['song_id'], inplace=True)
    
    return df_with_vectors

label_train_source_vectors = replace_song_id_with_vectors(label_train_source, song_vectors)
label_train_target_vectors = replace_song_id_with_vectors(label_train_target, song_vectors)
label_test_source_vectors = replace_song_id_with_vectors(label_test_source, song_vectors)


In [None]:
def create_dataset(source_df, target_df, session_col='session_id', sample_ratio=0.3):
    # 随机抽取一定比例的会话
    unique_sessions = source_df[session_col].unique()
    sampled_sessions = np.random.choice(unique_sessions, size=int(len(unique_sessions) * sample_ratio), replace=False)
    
    # 用于存储数据的列表
    X, y = [], []

    for session in sampled_sessions:
        session_source = source_df[source_df[session_col] == session].drop(columns=[session_col])
        session_target = target_df[target_df[session_col] == session].drop(columns=[session_col])
        
        if len(session_source) == 20 and len(session_target) == 5:
            X.append(session_source.values)
            y.append(session_target.values)

    return np.array(X), np.array(y)

# 假设 label_train_source_vectors 和 label_train_target_vectors 已经是处理好的 DataFrame
X, y = create_dataset(label_train_source_vectors, label_train_target_vectors)
