In [23]:
import pandas as pd
import numpy as np
import random
import gc
from os import listdir
from xgboost import XGBClassifier
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import SGD, RMSprop
import pickle
from sklearn.externals import joblib
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split, cross_val_score

In [32]:
%alias_magic t timeit

Created `%t` as an alias for `%timeit`.
Created `%%t` as an alias for `%%timeit`.


# Note types
- 0: nothing
- 1: step
- 2: hold start
- 3: hold/roll end
- 4: roll start
- M: mine

# Classes
- 0: nothing
- 1: one note
- 2: two notes
- 3: three or four notes
- 4: hold start
- 5: roll start
- 6: mine

# Remember
To reload a saved model: `clf = joblib.load('filename.pkl')`

In [2]:
# Number of consecutive beat rows (the current row plus the ones before it)
# whose features are concatenated into a single training sample.
samples_back_included = 8
# Number of output classes (0-6 in the "Classes" list above).
num_classes = 7
# Length of the zero padding used for rows before the start of a song;
# presumably equal to the width of one beat-feature row - TODO confirm.
num_features = 40
# Width of one sample: the stacked beat features plus the 4 positional
# features appended in get_features_for_song.
num_features_total = (num_features * samples_back_included) + 4
# File names present in the data directory; used to check that a song has
# both its beat-features CSV and its notes CSV before loading it.
save_files = listdir('data')

def get_features_for_index(beat_features, notes, index):
    """Return the feature row at `index`, or zero padding for a negative index.

    A negative index means "before the first row", in which case a list of
    `num_features` zeros is returned instead of indexing into `beat_features`.
    `notes` is part of the signature but is not read.
    """
    return beat_features[index] if index >= 0 else [0] * num_features

def get_class_for_index(notes, index):
    """Return the 7-element 0/1 label vector for the note row at `index`.

    The vector positions are: nothing, one note, two notes, three or more
    notes, hold start present, roll start present, mine present. A negative
    index (before the first row) is labelled "nothing".

    Hold starts ('2') and roll starts ('4') count as steps alongside plain
    steps ('1'), so more than one position can be set for the same row.
    """
    if index < 0:
        return [1, 0, 0, 0, 0, 0, 0]

    symbols = notes[index][0]
    hold_count = symbols.count('2')
    roll_count = symbols.count('4')
    mine_count = symbols.count('M')
    step_count = symbols.count('1') + hold_count + roll_count

    flags = (
        step_count == 0 and mine_count == 0,
        step_count == 1,
        step_count == 2,
        step_count > 2,
        hold_count > 0,
        roll_count > 0,
        mine_count > 0,
    )
    return [int(flag) for flag in flags]
    
# Divisors checked in priority order; the position of the first one that
# evenly divides a row index is that row's importance rank (0 = highest).
importance_rankings = [48, 24, 12, 16, 6, 8, 3, 4, 2, 1]
def get_beat_importance(index):
    """Return the position of the first divisor in `importance_rankings` that divides `index`.

    Returns None if no divisor matches, which cannot happen for integer
    indices while the list ends with 1.
    """
    return next(
        (rank for rank, divisor in enumerate(importance_rankings) if index % divisor == 0),
        None,
    )

def get_features_for_song(key, is_full):
    """Build the sample matrix and label rows for one song.

    Reads `data/<key>_beat_features.csv` and `data/<key>_notes.csv`. If either
    file name is absent from `save_files`, two empty arrays are returned.

    Each sample is the concatenation of the feature rows for the current
    index and the `samples_back_included - 1` indices before it (zero padded
    before the start of the song), followed by four positional features.

    Args:
        key: Song identifier used as the CSV file-name prefix.
        is_full: When true, every row is kept. When false, a row labelled
            "nothing" is kept only when `random.randint(0, 5)` returns 0
            (about one in six); all other rows are always kept.

    Returns:
        Tuple `(X, y)` of numpy arrays: one feature row per kept sample and
        the matching label vectors from `get_class_for_index`.
    """
    X = []
    y = []
    features_file = '{0}_beat_features.csv'.format(key)
    notes_file = '{0}_notes.csv'.format(key)
    if features_file not in save_files or notes_file not in save_files:
        return np.array(X), np.array(y)

    beat_features_rotated = pd.read_csv('data/{0}'.format(features_file)).values
    notes = pd.read_csv('data/{0}'.format(notes_file), converters={'0': lambda x: str(x)}).values
    # rot90 followed by flipud transposes the 2-D table, so that
    # beat_features can be indexed by the same row index as notes.
    beat_features = np.flipud(np.rot90(np.array(beat_features_rotated)))
    num_notes = min(len(notes), len(beat_features))
    for i in range(num_notes):
        row_y = get_class_for_index(notes, i)
        # row_y is a list of 0/1 flags, so comparing the whole list with 0 is
        # always False; the "nothing" class is signalled by its first flag.
        is_nothing = row_y[0] == 1
        if not is_full and is_nothing and random.randint(0, 5) != 0:
            continue
        features = [
            feature
            for j in range(samples_back_included)
            for feature in get_features_for_index(beat_features, notes, i - j)
        ]
        features.append(i % 48)
        features.append(get_beat_importance(i))
        # NOTE(review): `/` is integer division on Python 2 and true division
        # on Python 3 - confirm which interpreter the saved models assume.
        features.append(i / 48)
        # NOTE(review): operator precedence makes this num_notes - (i / 48).
        # Confirm whether (num_notes - i) / 48 was intended before changing
        # it, because any already-trained model depends on the current value.
        features.append(num_notes - i / 48)
        X.append(features)
        y.append(row_y)
    return np.array(X), np.array(y)

# Total 243 songs
songs_to_use = pd.read_csv('data/songs_to_use.csv').values
def build_training_data(is_full, start, end):
    """Concatenate the samples of the songs in `songs_to_use[start:end]`.

    Uses the module-level `songs_to_use` table instead of re-reading the CSV
    into a local of the same name on every call.

    Args:
        is_full: Forwarded to `get_features_for_song`; when false, "nothing"
            rows are randomly down-sampled.
        start: Index of the first song to include.
        end: Index one past the last song to include.

    Returns:
        Tuple `(X, y)` of numpy arrays holding the stacked feature rows and
        label vectors of all selected songs.
    """
    X = []
    y = []
    for song_data in songs_to_use[start:end]:
        # The first column of each row is the song key used in file names.
        song_X, song_y = get_features_for_song(song_data[0], is_full)
        X.extend(song_X)
        y.extend(song_y)
    return np.array(X), np.array(y)

In [3]:
# Songs 0-193 form the training split; every row is kept (is_full=True).
X_train, y_train = build_training_data(True, 0, 194)
# Collapse each label vector to the index of its first maximum.
y_train = np.array(list(map(np.argmax, y_train)))

In [4]:
# Songs 194-242 form the held-out split; every row is kept (is_full=True).
X_test, y_test = build_training_data(True, 194, 243)
# Collapse each label vector to the index of its first maximum.
y_test = np.array(list(map(np.argmax, y_test)))

In [5]:
# Number of training samples returned by build_training_data for the training split.
len(X_train)

534504

In [6]:
# Run a garbage-collection pass; the cell output is the number of unreachable objects found.
gc.collect()

35

In [None]:
# Gradient boosting with depth-7 trees, fitted on the first 100,000 training rows.
gb_clf = GradientBoostingClassifier(
    random_state=0,
    learning_rate=0.12,
    n_estimators=25,
    max_depth=7,
    subsample=0.85,
    max_features=200,
    verbose=True,
)
gb_clf.fit(X_train[:100000], y_train[:100000])
# Accuracy on the fitted subset, then on the held-out songs.
print(gb_clf.score(X_train[:100000], y_train[:100000]))
print(gb_clf.score(X_test, y_test))
joblib.dump(gb_clf, 'gb_clf1.pkl')
gc.collect()
# Recorded score: 0.891653430684

In [13]:
# Gradient boosting with depth-10 trees, fitted on the first 100,000 training rows.
gb_clf = GradientBoostingClassifier(
    random_state=0,
    learning_rate=0.12,
    n_estimators=25,
    max_depth=10,
    subsample=0.85,
    max_features=200,
    verbose=True,
)
gb_clf.fit(X_train[:100000], y_train[:100000])
# Accuracy on the fitted subset, then on the held-out songs.
print(gb_clf.score(X_train[:100000], y_train[:100000]))
print(gb_clf.score(X_test, y_test))
joblib.dump(gb_clf, 'gb_clf2.pkl')
gc.collect()
# Recorded train accuracy: 0.97099
# Recorded test accuracy:  0.864501730052

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       77317.0745        3012.9254           48.89m
         2       64503.8453        2125.2472           49.59m
         3       54740.1825        1621.9064           49.25m
         4       46966.5138        1249.8163           49.24m
         5       40731.9701         993.5149           47.87m
         6       35804.0404         790.2325           45.72m
         7       31545.0480         642.4337           43.98m
         8       28232.2055         502.3321           42.17m
         9       25536.8471         414.1295           39.81m
        10       22930.9312         338.0288           37.60m
        20       11501.9519          54.8783           12.57m
0.97099
0.864501730052


0

In [14]:
# Gradient boosting with depth-20 trees, fitted on the first 100,000 training rows.
gb_clf = GradientBoostingClassifier(
    random_state=0,
    learning_rate=0.12,
    n_estimators=25,
    max_depth=20,
    subsample=0.85,
    max_features=200,
    verbose=True,
)
gb_clf.fit(X_train[:100000], y_train[:100000])
# Accuracy on the fitted subset, then on the held-out songs.
print(gb_clf.score(X_train[:100000], y_train[:100000]))
print(gb_clf.score(X_test, y_test))
joblib.dump(gb_clf, 'gb_clf3.pkl')
gc.collect()
# Recorded train accuracy: 1.0
# Recorded test accuracy:  0.870732912928

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       73806.7067        3027.5629          204.83m
         2       59173.8587        2107.8607          266.74m
         3       48155.8397        1594.7845          304.16m
         4       39490.5831        1236.5233          320.75m
         5       32628.4095         995.1022          324.76m
         6       27022.2342         796.0695          325.48m
         7       22424.2507         638.3685          334.62m
         8       18690.6681         512.6263          335.57m
         9       15627.6145         423.7054          329.96m
        10       13047.0904         340.6485          321.32m
        20        2505.5305          47.9474          113.01m
1.0
0.870732912928


0

In [18]:
# Random forest (25 trees, leaves of at least 4 samples) fitted on the first 100,000 training rows.
rf_clf = RandomForestClassifier(
    n_estimators=25,
    max_features=200,
    min_samples_leaf=4,
    verbose=True,
)
rf_clf.fit(X_train[:100000], y_train[:100000])
# Accuracy on the fitted subset, then on the held-out songs.
print(rf_clf.score(X_train[:100000], y_train[:100000]))
print(rf_clf.score(X_test, y_test))
joblib.dump(rf_clf, 'rf_clf_final.pkl')
gc.collect()
# Recorded score: 0.900850706427

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 15.9min finished
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    1.1s finished


0.97472


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.5s finished


0.87673941373


169

In [None]:
# Random forest (100 trees, leaves of at least 12 samples) fitted on the whole training split.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_features=200,
    min_samples_leaf=12,
    verbose=True,
)
rf_clf.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out songs.
print(rf_clf.score(X_train, y_train))
print(rf_clf.score(X_test, y_test))
joblib.dump(rf_clf, 'rf_clf_final.pkl')
gc.collect()

## Not Using

In [None]:
%%time
# XGBoost with depth-3 trees, fitted on the whole training split.
xgb_clf = XGBClassifier(
    max_depth=3,
    min_child_weight=8,
    learning_rate=0.05,
    seed=0,
    n_estimators=100,
    subsample=0.80,
    colsample_bytree=0.80,
    objective="multi:softprob",
)
xgb_clf.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out songs.
print(accuracy_score(y_train, xgb_clf.predict(X_train)))
print(accuracy_score(y_test, xgb_clf.predict(X_test)))
joblib.dump(xgb_clf, 'xgb_clf.pkl')
gc.collect()

0.885299268107
0.881996974281
CPU times: user 1h 55min 59s, sys: 33.4 s, total: 1h 56min 33s
Wall time: 31min 36s


In [None]:
%%time
# XGBoost with depth-4 trees, fitted on the whole training split.
xgb_clf = XGBClassifier(
    max_depth=4,
    min_child_weight=8,
    learning_rate=0.05,
    seed=0,
    n_estimators=100,
    subsample=0.80,
    colsample_bytree=0.80,
    objective="multi:softprob",
)
xgb_clf.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out songs.
print(accuracy_score(y_train, xgb_clf.predict(X_train)))
print(accuracy_score(y_test, xgb_clf.predict(X_test)))
joblib.dump(xgb_clf, 'xgb_clf.pkl')
gc.collect()

In [None]:
# Linear baseline: SGD with logistic loss, fitted on the whole training split.
sgd_clf = SGDClassifier(loss="log", verbose=True)
sgd_clf.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out songs.
print (sgd_clf.score(X_train, y_train))
print (sgd_clf.score(X_test, y_test))
# Save the estimator fitted in this cell, under a file name that matches it.
joblib.dump(sgd_clf, 'sgd_clf.pkl')
gc.collect()
# 0.849 accuracy

In [None]:
# Fully connected network: two 512-unit hidden blocks (Dense -> BatchNorm ->
# ReLU -> Dropout) followed by a softmax over the note classes.

# y_train_2 / y_test_2 are not defined in any other visible cell, so this cell
# cannot run on a fresh kernel without them.
# NOTE(review): reconstructed here as one-hot encodings of the integer class
# labels, which is the target shape categorical_crossentropy expects - confirm
# this matches what the recorded accuracy below was computed with.
y_train_2 = np.eye(num_classes)[y_train]
y_test_2 = np.eye(num_classes)[y_test]

model = Sequential()

# Input width comes from the feature-building config instead of a literal 324.
model.add(Dense(512, input_shape=(num_features_total,)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(512))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(num_classes))
model.add(BatchNormalization())
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['accuracy'])

# NOTE(review): `nb_epoch` is the Keras 1 spelling; Keras 2 renamed it to
# `epochs`. Kept as-is to match the Keras version this notebook was run with.
model.fit(X_train, y_train_2, nb_epoch=50, batch_size=64, verbose=1)
print (model.evaluate(X_test, y_test_2, batch_size=64))
model.save('models/song_class_model.h5')
# Drop the reference so the collector can reclaim the model's memory.
model = None
gc.collect()
# 0.88877616359785194 accuracy