# capoデータにlightFMを適用してみる

In [1]:
from pathlib import Path
import sys,os
sys.path.append(os.pardir)
from tools.preprocess.common import CommonPreprocessor
from tools.preprocess.interaction_matrix_generator import InteractionMatrixGenerator
from tools.preprocess.bow_vectorizer import BOWVectorizer
from scipy.sparse import csr_matrix

## data部分

In [2]:
orignal_path=Path("../data/original.txt")

In [3]:
cp=CommonPreprocessor(rare_capo_list=['-6', '-7'],test_rate=0.2,split_seed=0)
songs=cp.get_song_list(orignal_path,shuffle=True)
songs=cp.remove_rare_capo_song(songs)
songs_train, songs_test=cp.split_dataset(songs,shuffle=False)

In [4]:
generator=InteractionMatrixGenerator(test_rate=0.1)
X_train,X_test=generator.generate_matrices(songs)
X_train=csr_matrix(X_train)
X_test=csr_matrix(X_test)

In [72]:
chord_stat_train=cp.retrieve_chord_stat(songs_train)
capo_stat_train=cp.retrieve_capo_stat(songs_train)

In [11]:
vectorizer=BOWVectorizer(chord_stat=chord_stat_train,threshold=10000)

In [13]:
songs_features=vectorizer.get_chord_features(songs)
print(songs_features.shape)
songs_features=csr_matrix(songs_features)

(39995, 60)


## model部分

In [14]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

In [42]:
embedding_dim=4
lr=0.05
epoch=100
k=2

In [89]:
warp_model = LightFM(no_components=embedding_dim,learning_rate=lr, loss="warp")
warp_model.fit(X_train,item_features=songs_features, epochs=epoch)

train_precision = precision_at_k(warp_model, X_train,item_features=songs_features, k=k).mean()
test_precision = precision_at_k(warp_model, X_test,item_features=songs_features, k=k).mean()

train_auc = auc_score(warp_model, X_train, item_features=songs_features).mean()
test_auc = auc_score(warp_model, X_test, item_features=songs_features).mean()

print(f"Precision`@{k}: train {train_precision:.2f}, test {test_precision:.2f}.")
print(f"AUC: train {train_auc:.2f}, test {test_auc:.2f}.")

Precision`@2: train 0.49, test 0.26.
AUC: train 0.98, test 0.69.


In [108]:
class BaselineModel:
    def __init__(self,generator,capo_stat_train):
        self.generator=generator
        self.capo_stat_train=capo_stat_train
        
    def predict(self,X_train):
        songs_num=X_train.toarray().shape[0]
        """trainでおすすめcapoだった回数に比例して予測を行う.baseline用"""
        hoge=self.generator.capo_encoder.transform([{"rec_capo": capo} for capo in self.capo_stat_train.keys()]).toarray()
        counts= np.array([count for count in self.capo_stat_train.values()])
        return np.array([sum(hoge*counts)]).repeat(songs_num,axis=0)

In [112]:
baseline_model=BaselineModel(generator,capo_stat_train)

train_precision = precision_at_k(baseline_model, X_train,item_features=songs_features, k=k).mean()
test_precision = precision_at_k(baseline_model, X_test,item_features=songs_features, k=k).mean()

train_auc = auc_score(baseline_model, X_train, item_features=songs_features).mean()
test_auc = auc_score(baseline_model, X_test, item_features=songs_features).mean()

print(f"Precision`@{k}: train {train_precision:.2f}, test {test_precision:.2f}.")
print(f"AUC: train {train_auc:.2f}, test {test_auc:.2f}.")

AttributeError: 'BaselineModel' object has no attribute 'predict_rank'

In [111]:
warp_model.predict(user_ids=4,item_ids=[0,1,2,3,4,5,6])

array([ 0.75338387,  1.33822286,  0.56030935, -1.85259485,  0.01740953,
       -1.21652448, -0.09501847])

In [114]:
warp_model.predict_rank(test_interactions=X_train)

ValueError: Incorrect number of features in item_features

In [58]:
import numpy as np
def get_prediction(model,n_songs):
    prediction=[]
    for i in range(n_songs):
        prediction.append(model.predict(user_ids=i,item_ids=[0,1,2,3,4,5,6]))
    return np.array(prediction)

In [88]:
prediction=get_prediction(warp_model,X_train.toarray().shape[0])

39995