# chord2vecを入力にcapo予測

In [1]:
from pathlib import Path
import sys,os
sys.path.append(os.pardir)
from tools.preprocess.common import CommonPreprocessor
from tools.preprocess.interaction_matrix_generator import InteractionMatrixGenerator
from tools.preprocess.bow_vectorizer import BOWVectorizer
from tools.preprocess.cls_preprocessor import LabelGenarater

In [2]:
def evaluate(model,X_train,y_train,X_test,y_test):
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    return accuracy_score(y_test,y_pred)

In [3]:
orignal_path=Path("../data/preprocessed_50k.txt")

In [4]:
# data取得
cp=CommonPreprocessor(rare_capo_list=['capo6', 'capo7', "whole_down"],test_rate=0.2,split_seed=0)
songs=cp.get_song_list(orignal_path,shuffle=True)
songs=cp.remove_rare_capo_song(songs)
songs_train, songs_test=cp.split_dataset(songs,shuffle=False)

In [5]:
chord_stat_train=cp.retrieve_chord_stat(songs_train)
capo_stat_train=cp.retrieve_capo_stat(songs_train)
capo_stat_train

Counter({'capo0': 15571,
         'capo1': 5254,
         'capo2': 6418,
         'capo3': 5297,
         'capo4': 3857,
         'capo5': 2341,
         'half_down': 1167})

In [6]:
# label作成
lg=LabelGenarater()
y_train=lg.fit_transform(songs_train)
y_test=lg.fit_transform(songs_test)

In [16]:
bow_threshold=1000
vectorizer=BOWVectorizer(chord_stat=chord_stat_train,threshold=bow_threshold)

In [17]:
# bowデータへ変換
X_train_bow=vectorizer.get_chord_features(songs_train)
X_test_bow=vectorizer.get_chord_features(songs_test)
X_train_bow.shape,X_test_bow.shape

((39905, 180), (9977, 180))

In [18]:
# LGBM
import lightgbm as lgb
import numpy as np
from sklearn.metrics import accuracy_score

lgbm_cls = lgb.LGBMClassifier(objective='multiclass',num_class= 7)
evaluate(lgbm_cls,X_train_bow,y_train,X_test_bow,y_test)

  if diff:


0.730580334769971

In [19]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
evaluate(lr,X_train_bow,y_train,X_test_bow,y_test)

0.7179512879623133

## Chord2Vecモデル

In [20]:
from gensim.models import word2vec

class Chord2VecAvg:
    """gensimのword2vecを用いてベクトルを得て、曲ごとに平均をとる."""
    def __init__(self,model_name,vectorizer):
        self.c2v = word2vec.Word2Vec.load(model_name)
        self.bow_vectorizer=vectorizer 
        self.embeddings= self.create_embedding_mat()
    
    def transform(self,songs):
        bow=self.bow_vectorizer.get_chord_features(songs)
        return bow @ self.embeddings / np.expand_dims(np.sum(bow,axis=1),axis=1)
        
    def create_embedding_mat(self):
        embedding_dim= len(self.c2v.wv.vectors[0])
        embeddings=np.zeros((len(self.bow_vectorizer.chord_encoder),embedding_dim))
        for chord, index in self.bow_vectorizer.chord_encoder.items():
            embeddings[index] = self.c2v.wv[chord]
        return embeddings

In [21]:
embedding_dim=100
min_count=5
window_size=5
iter_num=500
model_name=f"../result/w2v/{embedding_dim}_{min_count}_{window_size}_{iter_num}.model"

c2v_vectorizer=Chord2VecAvg(model_name,vectorizer)

In [22]:
# C2V変換
X_train_c2v=c2v_vectorizer.transform(songs_train)
X_test_c2v=c2v_vectorizer.transform(songs_test)
X_train_c2v.shape,X_test_c2v.shape

  if sys.path[0] == '':


((39905, 100), (9977, 100))

In [23]:
# LGBM
lgbm_cls = lgb.LGBMClassifier(objective='multiclass',num_class= 7)
evaluate(lgbm_cls,X_train_c2v,y_train,X_test_c2v,y_test)

  if diff:


0.7251678861381177

In [24]:
# Logistic regression
lr = LogisticRegression()
evaluate(lr,X_train_c2v,y_train,X_test_c2v,y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').