In [None]:
#Import statements
import itertools
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Adapted from https://stackoverflow.com/a/50386871

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix of counts. Rows are plotted on the 'True label' axis
        and columns on the 'Predicted label' axis.
    target_names : sequence or None
        Tick labels, one per class, in the same order as the rows of ``cm``.
        When None the default numeric ticks are kept.
    title : str
        Text shown above the plot.
    cmap : matplotlib colormap or None
        Colormap for the cells; defaults to 'Blues'.
    normalize : bool
        If True, every row is divided by its sum so each cell shows the
        fraction of that row; if False the values are shown as given.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Accept lists / nested sequences as well as ndarrays.
    cm = np.asarray(cm)

    # Accuracy is always computed from the values passed in, before any normalisation.
    total = cm.sum()
    accuracy = np.trace(cm) / float(total) if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any samples stay at 0 instead of turning into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    # Draw the same matrix that is annotated below, so cell colour, colour bar
    # and the text-contrast threshold all refer to the same values.
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text, the rest black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out only after the labels exist so the two-line x label is not clipped.
    plt.tight_layout()
    plt.show()

In [None]:
# Directory (relative to this notebook) that holds the pickled, pre-processed datasets.
PROCESSED_DATA_DIR = '../../data/processed'


def _load_pickle(name):
    """Return the object stored in ``<PROCESSED_DATA_DIR>/<name>.pkl``."""
    # NOTE: unpickling can execute arbitrary code - only load files produced by this project.
    with open('{}/{}.pkl'.format(PROCESSED_DATA_DIR, name), 'rb') as f:
        return pickle.load(f)


full_set = _load_pickle('full_set')
train_set = _load_pickle('train_set')
test_set = _load_pickle('test_set')
train_set_30 = _load_pickle('train_set_30')
test_set_30 = _load_pickle('test_set_30')
train_set_70 = _load_pickle('train_set_70')
test_set_70 = _load_pickle('test_set_70')

In [None]:
# Peek at the first rows of test_set_70 to sanity-check that it loaded as expected.
test_set_70.head()

In [None]:
# Split into X, y format: every column except the last is a feature, the last column is the label.
# Columns are sliced directly rather than via a double transpose, which copies the frame twice
# and collapses a mixed-dtype frame to object dtype.
X_train = train_set.iloc[:, :-1]
y_train = train_set.iloc[:, -1]

X_test = test_set.iloc[:, :-1]
y_test = test_set.iloc[:, -1]


# Linear Classifier

In [None]:
# The linear-kernel SVC may be more machinery than this problem needs.
# A multilayer perceptron with no hidden layers is also a linear classifier, should we want an alternative.
from sklearn.svm import SVC
# fit() returns the estimator itself, so construction and training can be chained.
linclf = SVC(kernel="linear").fit(X_train, y_train)
linclf.score(X_test, y_test)

# Multilayer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
# API reference: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# Only the random seed is set; every other hyper-parameter is left at its default.
mlp = MLPClassifier(random_state=42
                    # TODO: set more non-default parameters? E.g. only stochastic gradient descent has been covered in lectures.
                    # NOTE: with the current settings training does not reach convergence before the timeout.
                   )
# Direct fit/score on the raw split is left disabled; the estimator stays unfitted in this cell.
# mlp.fit(X_train, y_train)
# mlp.score(X_test, y_test)

## 10-fold cross validation

In [None]:
# todo run this
def cross_validate(classifier, X, y, cv=10):
    '''
    Score a classifier with k-fold cross-validation and return the mean score.

    The classifier is placed in a pipeline behind a StandardScaler and that
    pipeline is what gets cross-validated, so standardisation is part of the
    model being evaluated.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn style classifier.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    cv : int or cross-validation splitter, default 10
        Passed straight to ``cross_val_score``; the default gives 10-fold CV.

    Returns
    -------
    float
        Average of the per-fold scores. What the score means is for the
        caller to interpret.
    '''
    standardising_classifier = make_pipeline(preprocessing.StandardScaler(), classifier)
    # Bind the per-fold scores so they can be averaged below.
    scores = cross_val_score(standardising_classifier, X, y, cv=cv)
    return np.mean(scores)

# Running An Experiment

In [None]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline



def run_experiment(classifier, X_train_data, y_train_data, X_test_data, y_test_data):
    '''
    Train a classifier on one split, evaluate it on another and display the results.

    Given a classifier, training data, and test data:
    * Wrap the classifier in a pipeline behind a StandardScaler
    * Train the pipeline on the training data
    * Predict on the test data
    * Print the classification report and plot the confusion matrix

    Parameters
    ----------
    classifier : estimator
        scikit-learn style classifier; it is fitted as the last pipeline step.
    X_train_data, y_train_data : array-like
        Features and labels used for fitting.
    X_test_data, y_test_data : array-like
        Features and labels used for evaluation.

    Returns
    -------
    None
        Results are printed and plotted.
    '''
    classifier_pipe = make_pipeline(preprocessing.StandardScaler(), classifier)

    classifier_pipe.fit(X_train_data, y_train_data)
    y_pred = classifier_pipe.predict(X_test_data)

    # Sorted union of true and predicted classes. The same label list drives both the
    # matrix rows/columns and the tick labels, so they always line up - even when the
    # model predicts a class that is absent from the test labels.
    labels = np.union1d(y_test_data, y_pred)

    conf_mat = confusion_matrix(y_test_data, y_pred, labels=labels)
    print(classification_report(y_test_data, y_pred))
    plot_confusion_matrix(conf_mat, target_names=labels)


In [None]:
# Experiment on the original train/test split.
# Features are every column but the last; labels are read from the 'target' column.
run_experiment(
    classifier=mlp,
    X_train_data=train_set.iloc[:, :-1],
    y_train_data=train_set['target'],
    X_test_data=test_set.iloc[:, :-1],
    y_test_data=test_set['target'],
)

# Visualize

In [None]:
# TODO: add the visualisation code for this section.