# original.txtの中身を調べる

* すでに表記ゆれはある程度前処理されている
* original.txtからデータセットを作成する方法はいくつかある
* capoの方は、半音下げ(-1) ~ capo5(-5)までが大半を占めているので、6と7は削除でよさそう
* chordの方はいろいろある。低頻度のコードを削除する、何らかのコードで置き換えるなど
* とりあえず削除する方向で

In [1]:
from pathlib import Path

In [2]:
orignal_path=Path("../data/original.txt")

In [3]:
def get_song_list(original_path):
    with orignal_path.open("r") as f:
        songs = f.readlines()
    result=[]
    for song in songs:
        song=song.rstrip('\n')
        divided=song.split(" ")
        song_dic={"chords":divided[:-1],"rec_capo":divided[-1]}
        result.append(song_dic)
    return result

def remove_rare_capo_song(songs):
    """-6,-7はほとんどないので除去する."""
    removed=[]
    for song in songs:
        if song["rec_capo"] not in ['-6','-7']:
            removed.append(song)
    return removed


from sklearn.model_selection import train_test_split

def split_dataset(songs,val_rate, test_rate,seed=None):
    songs_train_val,songs_test= train_test_split(songs,test_size=test_rate,random_state=seed)
    songs_train,songs_val= train_test_split(songs,test_size=val_rate,random_state=seed)
    return songs_train,songs_val,songs_test

In [4]:
from collections import Counter

def retrieve_chord_stat(songs):
    chord_dict = Counter()
    for song in songs:
        for chord in song["chords"]: 
            chord_dict[chord]+=1
    return chord_dict

def retrieve_capo_stat(songs):
    capo_dict =  Counter()
    for song in songs:
        capo_dict[song["rec_capo"]]+=1
    return capo_dict

In [5]:
songs=get_song_list(orignal_path)
songs=remove_rare_capo_song(songs)
songs_train,songs_valid,songs_test=split_dataset(songs,0.1,0.2)

In [6]:
chord_stat=retrieve_chord_stat(songs_train)
capo_stat=retrieve_capo_stat(songs_train)

## chordの出現頻度調査
* 低頻度chordはparseミスとかも含むので除去したい
* 5回以下しか出現しない約140個のコードを対象？

In [7]:
import numpy as np
import matplotlib.pyplot as plt

print(f"Total chords :{len(chord_stat.keys())}")

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
edges = range(0,100,5)
ax.hist(list(chord_stat.values()), bins=edges)
ax.set_title('histogram of chord')
ax.set_xlabel('frequency')
ax.set_ylabel('the number of chords')
fig.show()

Total chords :853


  "matplotlib is currently using a non-GUI backend, "


In [8]:
def tokenize_rare_chord(chords_list,onehot_dict,token):
    result=[]
    for chords in chords_list:
        result_inner=[]
        for i,chord in enumerate(chords):
            if chord not in onehot_dict.keys():
                result_inner.append(token)
            else:
                result_inner.append(chord)
        result.append(result_inner)
    return result

def get_onehot_dict(chord_stat,threshold,token):
    onehot_dict={}
    i=0
    for chord,freq in chord_stat.items():
        if freq>threshold:
            onehot_dict[chord]=i
            i+=1
    onehot_dict[token]=i
    return onehot_dict  

def count_and_normalize_chord_features(tokenized,onehot_dict):
    X=[]
    features_num=len(onehot_dict)
    for chords in tokenized:
        x=[0]*features_num
        for chord in chords:
            x[onehot_dict[chord]]+=1
        x=np.array(x)
        X.append(x/sum(x))
    return np.array(X)

def get_chord_features(songs,onehot_dict=None,threshold=5,token="<UNK>"):
    """train : onehot_dictがNone. valid or test : onehot_dictを与える."""
    chords_list=[song["chords"] for song in songs]
    if onehot_dict is None:
        chord_stat=retrieve_chord_stat(songs)
        onehot_dict=get_onehot_dict(chord_stat,threshold,token)
    tokenized=tokenize_rare_chord(chords_list,onehot_dict,token)
    X=count_and_normalize_chord_features(tokenized,onehot_dict)
    return X, onehot_dict


In [9]:
X_train, onehot_dict=get_chord_features(songs_train)
X_valid, _ =get_chord_features(songs_valid,onehot_dict)
X_test, _ =get_chord_features(songs_test,onehot_dict)

In [10]:
X_train.shape,X_valid.shape,X_test.shape

((35995, 687), (4000, 687), (7999, 687))

In [11]:
onehot_dict

{'<UNK>': 686,
 'A': 29,
 'A#': 37,
 'A#-5': 487,
 'A#/A': 367,
 'A#/B': 486,
 'A#/C': 88,
 'A#/C#': 524,
 'A#/D': 169,
 'A#/D#': 60,
 'A#/E': 411,
 'A#/F': 402,
 'A#/F#': 346,
 'A#/G': 347,
 'A#/G#': 59,
 'A#6': 77,
 'A#7': 51,
 'A#7+5': 628,
 'A#7-5': 281,
 'A#7-9': 146,
 'A#7/D#': 438,
 'A#7/G#': 583,
 'A#7sus4': 231,
 'A#9': 199,
 'A#M9': 680,
 'A#add9': 164,
 'A#aug': 113,
 'A#aug/E': 320,
 'A#dim': 95,
 'A#m': 14,
 'A#m/A': 639,
 'A#m/C': 437,
 'A#m/C#': 542,
 'A#m/D': 545,
 'A#m/D#': 141,
 'A#m/F#': 562,
 'A#m/G': 572,
 'A#m/G#': 344,
 'A#m6': 310,
 'A#m6/C': 436,
 'A#m7': 101,
 'A#m7-5': 184,
 'A#m7-5/D#': 560,
 'A#m7/D#': 153,
 'A#m7/F#': 646,
 'A#m7/G#': 631,
 'A#m9': 470,
 'A#mM7': 372,
 'A#madd9': 489,
 'A#maj7': 36,
 'A#maj7/C': 315,
 'A#maj9': 261,
 'A#sus4': 63,
 'A#sus4/C': 389,
 'A#sus4/D': 658,
 'A#sus4/G': 443,
 'A-5': 455,
 'A/A#': 582,
 'A/B': 134,
 'A/C': 356,
 'A/C#': 188,
 'A/D': 186,
 'A/D#': 456,
 'A/E': 397,
 'A/F': 521,
 'A/F#': 494,
 'A/G': 147,
 'A/G#': 24

In [12]:


capo_binarize(["-1","-3","-1","1","0"])

(array([[1, 0, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0]]), array(['-1', '-3', '0', '1'], dtype='<U2'))

In [14]:
capo_stat.keys()

dict_keys(['-2', '-3', '0', '-5', '-1', '-4', '1'])

In [17]:
songs_train[0]["rec_capo"]

'-2'

In [46]:
def duplicate_capos(songs,capo_kind=np.array(['1','0','-1','-2','-3','-4','-5'])):
    """正解capo以外の不正解capoを追加する.
    
    7i番目のindexがi番目の正解capo,7i+1 ~ 7i+6がi番目の不正解capo
    """
    duplicated_capos=[]
    for song in songs:
        duplicated_capos.append(song["rec_capo"])
        duplicated_capos.extend(capo_kind[capo_kind!=song["rec_capo"]])
    return duplicated_capos

from sklearn.preprocessing import LabelBinarizer

def capo_onehot_encode(capo):
    """文字列のcapoをonehot encoding"""
    binarizer=LabelBinarizer()
    binarized=binarizer.fit_transform(capo)
    return binarized, binarizer.classes_

def create_compares(logs_num):
    compares=[]
    for i in range(logs_num):
        compares.extend([[7*i,7*i+j] for j in range(1,7)])
    return compares

In [43]:
duplicated_capos=duplicate_capos(songs_train)
binariazed,capo_classes = capo_onehot_encode(duplicated_capos)

In [44]:
binarized.shape

(251965, 7)

In [45]:
classes

array(['-1', '-2', '-3', '-4', '-5', '0', '1'], dtype='<U2')

In [48]:
compares=create_compares(len(songs_train))