# capoデータにLightFMを適用してみる

In [136]:
from pathlib import Path
import sys,os
sys.path.append(os.pardir)
from tools.preprocess.common import CommonPreprocessor
from tools.preprocess.interaction_matrix_generator import InteractionMatrixGenerator
from tools.preprocess.bow_vectorizer import BOWVectorizer
from scipy.sparse import csr_matrix

## data部分

In [137]:
# Location of the preprocessed song corpus, relative to the notebook's working directory.
orignal_path = Path("../data/preprocessed_50k.txt")
# Threshold forwarded to BOWVectorizer; also embedded in the exported JSON file name.
bow_threshold = 10_000

In [138]:
# Preprocessor configured with the capo labels treated as rare, a 20% test
# rate and a fixed split seed.
cp = CommonPreprocessor(
    rare_capo_list=["capo6", "capo7", "whole_down"],
    test_rate=0.2,
    split_seed=0,
)

# Load the songs (shuffled on load), filter them through remove_rare_capo_song,
# then split into train/test without shuffling a second time.
songs = cp.get_song_list(orignal_path, shuffle=True)
songs = cp.remove_rare_capo_song(songs)
songs_train, songs_test = cp.split_dataset(songs, shuffle=False)

In [139]:
# Build the train/test interaction matrices from the full (filtered) song list
# and store both as CSR sparse matrices.
# NOTE(review): this split uses the generator's own test_rate=0.1 and is
# independent of songs_train/songs_test above -- confirm that is intended.
generator = InteractionMatrixGenerator(test_rate=0.1)
X_train, X_test = (csr_matrix(matrix) for matrix in generator.generate_matrices(songs))

In [140]:
# Chord and capo statistics are taken from the training songs only.
chord_stat_train = cp.retrieve_chord_stat(songs_train)
capo_stat_train = cp.retrieve_capo_stat(songs_train)

In [141]:
# Bag-of-words chord vectorizer built from the training-set chord statistics.
vectorizer = BOWVectorizer(
    chord_stat=chord_stat_train,
    threshold=bow_threshold,
)

In [142]:
# Chord features for every song; the shape is printed before the matrix is
# converted to CSR for use as LightFM item features.
songs_features = vectorizer.get_chord_features(songs)
print(songs_features.shape)
songs_features = csr_matrix(songs_features)

(49882, 62)


## model部分

In [143]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

In [144]:
import numpy as np
def get_prediction(model, n_songs, item_ids=None):
    """Collect ``model.predict`` scores for ids ``0 .. n_songs - 1``.

    Each ``i`` in ``range(n_songs)`` is passed to ``model.predict`` as
    ``user_ids`` together with the same list of ``item_ids``; the results are
    stacked into one array with one row per ``i``.

    Args:
        model: Object exposing ``predict(user_ids=..., item_ids=...)``.
        n_songs: Number of consecutive ids (starting at 0) to score.
        item_ids: Ids scored for every ``i``. Defaults to ``[0, 1, ..., 6]``,
            the previously hard-coded value.

    Returns:
        ``np.ndarray`` built from the per-id prediction results.

    NOTE(review): elsewhere in this notebook capo embeddings are read from
    ``user_embeddings`` and the models are fitted with ``item_features``; this
    helper passes song indices as ``user_ids`` and forwards no item features.
    Confirm the intended user/item orientation before relying on it.
    """
    if item_ids is None:
        item_ids = [0, 1, 2, 3, 4, 5, 6]
    # A fresh list per call keeps the caller's sequence untouched.
    scores = [
        model.predict(user_ids=i, item_ids=list(item_ids))
        for i in range(n_songs)
    ]
    return np.array(scores)

In [145]:
def train_and_evaluate(model,X_train,X_test,songs_features,epoch,k):
    """Fit ``model`` and print precision@k and AUC on the train and test matrices.

    Args:
        model: Object with a ``fit(interactions, item_features=..., epochs=...)``
            method that ``precision_at_k`` and ``auc_score`` can evaluate.
        X_train: Interactions used for fitting and for the "train" metrics.
        X_test: Interactions used for the "test" metrics.
        songs_features: Item features passed to fitting and to every metric.
        epoch: Number of epochs forwarded to ``model.fit``.
        k: Cut-off for precision@k.

    Returns:
        The same ``model`` object after fitting.
    """
    model.fit(X_train, item_features=songs_features, epochs=epoch)

    splits = (X_train, X_test)

    # Evaluation order matches the original: both precisions first, then both AUCs.
    train_precision, test_precision = [
        precision_at_k(model, interactions, item_features=songs_features, k=k).mean()
        for interactions in splits
    ]
    train_auc, test_auc = [
        auc_score(model, interactions, item_features=songs_features).mean()
        for interactions in splits
    ]

    print(f"Precision`@{k}: train {train_precision:.2f}, test {test_precision:.2f}.")
    print(f"AUC: train {train_auc:.2f}, test {test_auc:.2f}.")
    return model

In [146]:
# Hyper-parameters shared by every model trained below.
embedding_dim = 8  # passed to LightFM as no_components
lr = 0.01          # passed to LightFM as learning_rate
epoch = 100        # number of training epochs
k = 3              # cut-off for precision@k

In [147]:
# WARP-loss model. random_state seeds LightFM's data shuffling and parameter
# initialisation so the metrics are reproducible on Restart & Run All
# (numbers from earlier, unseeded runs may differ slightly).
warp_model = LightFM(
    no_components=embedding_dim,
    learning_rate=lr,
    loss="warp",
    random_state=0,
)
warp_model = train_and_evaluate(warp_model, X_train, X_test, songs_features, epoch, k)

Precision`@3: train 0.33, test 0.22.
AUC: train 0.97, test 0.68.


In [148]:
# BPR-loss model. random_state seeds LightFM's data shuffling and parameter
# initialisation so the metrics are reproducible on Restart & Run All
# (numbers from earlier, unseeded runs may differ slightly).
bpr_model = LightFM(
    no_components=embedding_dim,
    learning_rate=lr,
    loss="bpr",
    random_state=0,
)
bpr_model = train_and_evaluate(bpr_model, X_train, X_test, songs_features, epoch, k)

Precision`@3: train 0.33, test 0.12.
AUC: train 1.00, test 0.44.


In [149]:
# Logistic-loss model (its embeddings are exported further down). random_state
# seeds LightFM's data shuffling and parameter initialisation so the metrics and
# the exported embeddings are reproducible on Restart & Run All (numbers from
# earlier, unseeded runs may differ slightly).
log_model = LightFM(
    no_components=embedding_dim,
    learning_rate=lr,
    loss="logistic",
    random_state=0,
)
log_model = train_and_evaluate(log_model, X_train, X_test, songs_features, epoch, k)

Precision`@3: train 0.23, test 0.23.
AUC: train 0.67, test 0.67.


In [150]:
from tools.model.baseline import BaselineModel
# Baseline built from the generator and the training-set capo statistics,
# scored through the same helper as the LightFM models for comparison.
baseline_model = BaselineModel(generator, capo_stat_train)
baseline_model = train_and_evaluate(
    baseline_model, X_train, X_test, songs_features, epoch, k
)

Precision`@3: train 0.16, test 0.17.
AUC: train 0.51, test 0.52.


In [151]:
def get_chord_embeddings(model,vectorizer):
    """Map every chord known to ``vectorizer`` to its embedding vector.

    Args:
        model: Object exposing ``item_embeddings``, indexable by row.
        vectorizer: Object whose ``chord_encoder`` maps chord name -> row index.

    Returns:
        Dict of chord name -> embedding as a plain list, in encoder order.
    """
    return {
        chord_name: model.item_embeddings[row].tolist()
        for chord_name, row in vectorizer.chord_encoder.items()
    }

def get_capo_embeddings(model,generator):
    """Map every capo label to its embedding vector.

    Feature names are read from ``generator.capo_encoder``; the text after the
    last ``"="`` in each name is used as the capo label, and the i-th label is
    paired with ``model.user_embeddings[i]``.

    Args:
        model: Object exposing ``user_embeddings``, indexable by row.
        generator: Object whose ``capo_encoder`` exposes ``get_feature_names_out``
            or the older ``get_feature_names``.

    Returns:
        Dict of capo label -> embedding as a plain list.
    """
    encoder = generator.capo_encoder
    # scikit-learn removed get_feature_names() in 1.2 in favour of
    # get_feature_names_out(); prefer the new method and fall back to the old
    # one so both old and new encoder versions work.
    # NOTE(review): assumes capo_encoder is a scikit-learn style encoder -- confirm.
    list_feature_names = getattr(encoder, "get_feature_names_out", None)
    if list_feature_names is None:
        list_feature_names = encoder.get_feature_names

    capo_embeddings = {}
    for i, feature_name in enumerate(list_feature_names()):
        # str() normalises numpy string scalars returned by the newer API.
        capo = str(feature_name).split("=")[-1]
        capo_embeddings[capo] = model.user_embeddings[i].tolist()
    return capo_embeddings

def get_embeddings(model,vectorzier,generator):
    """Bundle the capo and chord embeddings of ``model`` into one dict.

    Args:
        model: Fitted model passed on to both embedding helpers.
        vectorzier: Chord vectorizer passed to ``get_chord_embeddings``. The
            misspelled parameter name is kept so existing keyword callers
            continue to work.
        generator: Interaction-matrix generator passed to ``get_capo_embeddings``.

    Returns:
        ``{"capo": {...}, "chord": {...}}`` with the two helpers' results.
    """
    # Use the argument itself: the previous body read the notebook-global
    # ``vectorizer`` and silently ignored whatever the caller passed in.
    return {
        "capo": get_capo_embeddings(model, generator),
        "chord": get_chord_embeddings(model, vectorzier),
    }

In [152]:
# Export source: embeddings learned by the logistic-loss model.
embeddings = get_embeddings(
    log_model,
    vectorizer,
    generator,
)

In [153]:
# Display the capo -> embedding mapping as this cell's output.
embeddings["capo"]

{'capo0': [-0.032704759389162064,
  -0.03392348811030388,
  -0.041662879288196564,
  0.02466939389705658,
  -0.028732426464557648,
  0.033674366772174835,
  -0.02698836289346218,
  -0.059981659054756165],
 'capo1': [0.04086771234869957,
  0.04646647348999977,
  -0.06187903508543968,
  -0.014015750959515572,
  0.05240894854068756,
  -0.0033026905730366707,
  -0.060563694685697556,
  -0.03207661956548691],
 'capo2': [0.039411623030900955,
  0.03453795611858368,
  -0.014579328708350658,
  0.050754815340042114,
  0.036991093307733536,
  0.01430797204375267,
  0.0013079560594633222,
  -0.002948142820969224],
 'capo3': [0.017677849158644676,
  0.04294221103191376,
  -0.03524398431181908,
  -0.04251010715961456,
  0.05712592601776123,
  -0.02344915084540844,
  0.02664530836045742,
  0.059563469141721725],
 'capo4': [-0.04160243272781372,
  0.03850145265460014,
  -0.005813950672745705,
  -0.040117740631103516,
  0.0538114458322525,
  0.04875163361430168,
  -0.026449205353856087,
  0.0070815486

In [154]:
# Display the chord -> embedding mapping as this cell's output.
embeddings["chord"]

{'A': [0.018569201231002808,
  -0.053790152072906494,
  0.04686899483203888,
  -0.016804130747914314,
  0.014284375123679638,
  0.05659664049744606,
  0.052604757249355316,
  0.06061399728059769],
 'A#': [0.035849276930093765,
  0.03901899605989456,
  0.051895011216402054,
  -0.02304685488343239,
  0.020751256495714188,
  -0.029419075697660446,
  -0.022495435550808907,
  0.06426219642162323],
 'A#7': [-0.004783291835337877,
  0.004390386864542961,
  0.025544974952936172,
  -0.020611733198165894,
  0.030123593285679817,
  -0.005770729389041662,
  -0.037554606795310974,
  0.007520030252635479],
 'A#m': [0.0597066693007946,
  0.009691721759736538,
  0.048694152384996414,
  0.014613013714551926,
  -0.031140189617872238,
  0.0280451662838459,
  -0.023389562964439392,
  0.0476754829287529],
 'A#m7': [0.03605591505765915,
  -0.04999714344739914,
  0.034190285950899124,
  -0.059664174914360046,
  -0.006104225292801857,
  -0.010992315597832203,
  -0.031616006046533585,
  0.014966948889195919],


 ## jsonに保存

In [155]:
import json
from pathlib import Path
# Output file name records the BOW threshold and embedding size used for this run.
json_path = Path(f"../result/embeddings_{bow_threshold}_{embedding_dim}.json")
# Create the result directory if needed; otherwise open() raises
# FileNotFoundError on a fresh checkout.
json_path.parent.mkdir(parents=True, exist_ok=True)
with json_path.open("w", encoding="utf-8") as f:
    json.dump(embeddings, f)