In [1]:
"""
Models Group 1:
Based on examples represented as flattened arrays of 30-second chromagrams.
""";

In [2]:
import MTheory as mt
import imlearn

In [3]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn import svm, decomposition
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.style as ms
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and the old names were removed in a later release. Prefer the new name when
# it is available; otherwise fall back to the original name so older installs
# behave exactly as before.
muted_style = next(
    (name for name in ('seaborn-v0_8-muted', 'seaborn-muted') if name in ms.available),
    'seaborn-muted',
)
ms.use(muted_style)
import seaborn as sns

In [5]:
import warnings
# Suppress every warning category for the rest of the session. This keeps cell
# output clean, but it also hides any deprecation or convergence warnings that
# the libraries used below may raise.
warnings.filterwarnings("ignore")

In [6]:
# Location of the pickled feature table. The original local path is kept as the
# default; set the RAW_ARRAYS_PKL environment variable to load the file from
# somewhere else without editing the notebook.
data_path = os.environ.get(
    'RAW_ARRAYS_PKL',
    '/Users/ilanmoscovitz/github/sf18_ds11/projects/03-mcnulty/Raw_Arrays.pkl',
)

# NOTE: unpickling can execute arbitrary code -- only load files you created or trust.
df = pd.read_pickle(data_path)

# Normalise every column label to a whitespace-stripped string so that later
# lookups by name (e.g. 'Song', 'Composer') match exactly.
df.columns = [str(col).strip() for col in df.columns]

In [7]:
# Preview the first five rows of the loaded table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15485,15486,15487,15488,15489,15490,15491,Song,Composer,Collection
0,0.578548,0.833667,0.641761,0.406082,0.466758,0.315837,0.178356,0.574957,1.0,0.645824,...,0.109619,0.105598,0.057196,0.137472,0.224815,0.132209,0.043358,can1,Bach,Art_of_Fugue
1,0.037789,0.516816,1.0,0.555567,0.111366,0.08975,0.105569,0.052371,0.114781,0.203799,...,0.07102,0.107716,0.063728,0.069789,0.122905,0.569828,1.0,can1,Bach,Art_of_Fugue
2,0.530424,0.247869,0.464648,0.249256,0.060218,0.047878,0.083007,0.053147,0.069365,0.120075,...,0.559203,1.0,0.534866,0.494115,0.791598,0.339638,0.11494,can1,Bach,Art_of_Fugue
3,0.209824,0.370963,0.199028,0.092517,0.113985,0.563249,1.0,0.509652,0.430535,0.890912,...,1.0,0.519597,0.085054,0.059381,0.057043,0.024744,0.320728,can1,Bach,Art_of_Fugue
4,0.496677,0.250995,0.07756,0.055055,0.534108,1.0,0.533525,0.0913,0.05627,0.049875,...,0.191431,0.062972,0.03426,0.524069,1.0,0.516596,0.039233,can1,Bach,Art_of_Fugue


In [8]:
# Each row is a 30-second snippet, so a single song spans several rows.
# Splitting row-by-row would put parts of one song in both train and test,
# which would be cheating, so the split is made along whole songs instead.
# The custom split, CustomCV, and CustomGridCV code is located in imlearn.py.

train, test = imlearn.custom_train_test_split(
    df,
    'Song',
    test_size=0.3,
    random_seed=50,
)

In [9]:
# Sanity check: fraction of rows that landed in the training set. It should be
# reasonably close to 1 - test_size (see the comment in
# imlearn.custom_train_test_split).
n_train_rows, n_total_rows = len(train), len(df)
n_train_rows / n_total_rows

0.723939611790079

In [10]:
# Class balance in the training set: share of rows labelled 'Bach'.
bach_train_rows = train[train['Composer'] == 'Bach']
len(bach_train_rows) / len(train)

0.5034756703078451

In [11]:
# Class balance in the test set: share of rows labelled 'Bach'.
bach_test_rows = test[test['Composer'] == 'Bach']
len(bach_test_rows) / len(test)

0.4791666666666667

In [12]:
# Attributes that would leak the answer; they are passed to the imlearn helpers
# as `omit_mask` so they stay hidden during training.
# NOTE(review): 'Snippet' is not among the columns visible in the df.head()
# preview -- confirm the helpers tolerate a name that is absent from the frame.
omit_mask = [
    'Collection',
    'Song',
    'Snippet',
]

In [13]:
# Logistic-regression pipeline: standardise the features, then fit the classifier.
#
# The grid below searches both L1 and L2 penalties. scikit-learn's default
# solver since version 0.22 ('lbfgs') rejects penalty='l1', so the solver is
# pinned to 'liblinear', which supports both penalties and was the default
# before 0.22.
log_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(solver='liblinear')),
])

# 2 penalties x 50 log-spaced values of C (1e-4 .. 1e4) = 100 candidates.
parameters = [
    {
        'logistic__penalty': ["l1", "l2"],
        'logistic__C': np.logspace(-4, 4, 50),
    },
]

In [14]:
# CustomGridCV is defined in imlearn.py.
# standard_scale=False because scaling is already a step in the pipeline.
# NOTE: the saved output of this cell is a KeyboardInterrupt; the cells that
# read results from `log_grid` need this fit to run to completion.

log_grid = imlearn.CustomGridCV(
    clf=log_pipeline,
    params=parameters,
    fold_on='Song',
    standard_scale=False,
)

log_grid.fit_score(
    train,
    y_feat='Composer',
    omit_mask=omit_mask,
    display=False,
    random_seed=42,
)

KeyboardInterrupt: 

In [None]:
# TODO: Work out which composer label belongs to which position in the plot.
# TODO: Fix numbering
composers = ['Bach', 'Mozart']
log_best_pair = log_grid.best_predictions_actuals_
predictions, actuals = log_best_pair
imlearn.draw_confusion(predictions, actuals, composers)

In [None]:
# Same predictions drawn with the label order reversed, for comparison with the
# previous plot.
mozart_first = ['Mozart', 'Bach']
imlearn.draw_confusion(predictions, actuals, mozart_first)


In [None]:
# Maybe an SVM can do better.
#
# "Phat" (wide) dataset: we'll use a linear kernel.
#
# Tuning note from the original author: the dual formulation doesn't seem to
# support a choice of regularization penalty, and there are too many features
# for the primal one, so there isn't much tuning to do. A grid over
# 'svm__penalty' (['l1', 'l2']) through imlearn.CustomGridCV was drafted in
# this cell and left disabled; C is simply fixed at 1.

linear_svm = svm.SVC(kernel='linear', C=1)
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', linear_svm),
])

# standard_scale=False because the pipeline already contains a StandardScaler.
svm_custom_cv = imlearn.CustomCV(
    clf=svm_pipeline,
    fold_on='Song',
    scorer=accuracy_score,
    standard_scale=False,
)

svm_custom_cv.fit_score(
    train,
    y_feat='Composer',
    cv=5,
    omit_mask=omit_mask,
    display=False,
    random_seed=42,
)

In [None]:
# Score stored on the CustomCV object by the fit_score call above
# (accuracy_score was passed in as the scorer).
svm_custom_cv.score_

In [None]:
# Again, not very discriminating.
svm_pair = svm_custom_cv.predictions_actuals_
predictions, actuals = svm_pair
imlearn.draw_confusion(predictions, actuals, composers)

In [None]:
# Lastly, try a PCA decomposition to cut down the width of the data before the SVM.

pca_200 = decomposition.PCA(n_components=200)
svm_slightly_more_elaborate_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', pca_200),
    ('svm', svm.SVC(kernel='linear', C=1)),
])

# standard_scale=False because the pipeline already contains a StandardScaler.
svm_slightly_more_elaborate_CV = imlearn.CustomCV(
    clf=svm_slightly_more_elaborate_pipe,
    fold_on='Song',
    scorer=accuracy_score,
    standard_scale=False,
)

svm_slightly_more_elaborate_CV.fit_score(
    train,
    y_feat='Composer',
    cv=5,
    omit_mask=omit_mask,
    display=False,
    random_seed=42,
)

In [None]:
# Confusion plot for the scaler + PCA + linear-SVM pipeline.
pca_svm_pair = svm_slightly_more_elaborate_CV.predictions_actuals_
predictions, actuals = pca_svm_pair
imlearn.draw_confusion(predictions, actuals, composers)

In [None]:
# Score stored on the CustomCV object for the PCA + SVM pipeline
# (accuracy_score was passed in as the scorer).
svm_slightly_more_elaborate_CV.score_