# Machine Learning in Python - Predictive Modelling

### Lots of Imports

To build predictive models in Python we use a set of libraries that are imported here. In particular **pandas** and **sklearn** are particularly important.

In [2]:
import os
import subprocess
from IPython.display import display, HTML, Image
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py

from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neighbors

%matplotlib inline

### Load & Partition Data

In [3]:
dataset = pd.read_csv('fashionmnist/fashion-mnist_train.csv')
dataset.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Examine the distribution of the two classes

In [4]:
dataset["label"].value_counts()

9    6000
8    6000
7    6000
6    6000
5    6000
4    6000
3    6000
2    6000
1    6000
0    6000
Name: label, dtype: int64

Isolate the descriptive features we are interested in

Split the data into a **training set**, a **vaidation set**, and a **test set**

In [5]:
data_matrix = dataset.as_matrix()
X = data_matrix[:,1:]
Y = data_matrix[:,0]
X_train_plus_valid, X_test, y_train_plus_valid, y_test \
    = train_test_split(X, Y, random_state=0, \
                                    train_size = 0.6)

X_train, X_valid, y_train, y_valid \
    = train_test_split(X_train_plus_valid, \
                                        y_train_plus_valid, \
                                        random_state=0, \
                                        train_size = 0.4/0.7)

### A Very Simple Decision Tree

Train a decision tree

In [6]:
my_tree = tree.DecisionTreeClassifier(criterion="entropy")
my_tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

Visualise the decision tree so we can see what it is doing!

### Evaluating Model Performance

Assess the performance of the decision tree on the training set

In [7]:
# Make a set of predictions for the training data
y_pred = my_tree.predict(X_train)

# Print performance details
accuracy = metrics.accuracy_score(y_train, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_train, y_pred))

# Print confusion matrix
print(metrics.confusion_matrix(y_train, y_pred))

# Print nicer homemade confusion matrix
print("Confusion Matrix")
pd.crosstab(y_train, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)


Accuracy: 1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2059
          1       1.00      1.00      1.00      2055
          2       1.00      1.00      1.00      2016
          3       1.00      1.00      1.00      2020
          4       1.00      1.00      1.00      2052
          5       1.00      1.00      1.00      2116
          6       1.00      1.00      1.00      2092
          7       1.00      1.00      1.00      2037
          8       1.00      1.00      1.00      2069
          9       1.00      1.00      1.00      2055

avg / total       1.00      1.00      1.00     20571

[[2059    0    0    0    0    0    0    0    0    0]
 [   0 2055    0    0    0    0    0    0    0    0]
 [   0    0 2016    0    0    0    0    0    0    0]
 [   0    0    0 2020    0    0    0    0    0    0]
 [   0    0    0    0 2052    0    0    0    0    0]
 [   0    0    0    0    0 2116    0    0    0    0]
 [   0    0    0    0    0   

Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2059,0,0,0,0,0,0,0,0,0,2059
1,0,2055,0,0,0,0,0,0,0,0,2055
2,0,0,2016,0,0,0,0,0,0,0,2016
3,0,0,0,2020,0,0,0,0,0,0,2020
4,0,0,0,0,2052,0,0,0,0,0,2052
5,0,0,0,0,0,2116,0,0,0,0,2116
6,0,0,0,0,0,0,2092,0,0,0,2092
7,0,0,0,0,0,0,0,2037,0,0,2037
8,0,0,0,0,0,0,0,0,2069,0,2069
9,0,0,0,0,0,0,0,0,0,2055,2055


Assess the performance of the tree on the validation dataset

In [8]:
# Make a set of predictions for the test data
y_pred = my_tree.predict(X_valid)

# Print performance details
accuracy = metrics.accuracy_score(y_valid, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_valid, y_pred))

# Print confusion matrix
print(metrics.confusion_matrix(y_valid, y_pred))

# Print nicer confusion matrix
print("Confusion Matrix")
pd.crosstab(y_valid, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.785339296131
             precision    recall  f1-score   support

          0       0.77      0.74      0.76      1545
          1       0.94      0.93      0.93      1522
          2       0.67      0.67      0.67      1560
          3       0.79      0.78      0.79      1534
          4       0.66      0.65      0.66      1590
          5       0.88      0.87      0.87      1511
          6       0.52      0.56      0.54      1510
          7       0.85      0.87      0.86      1607
          8       0.91      0.92      0.92      1537
          9       0.88      0.87      0.87      1513

avg / total       0.79      0.79      0.79     15429

[[1151    9   32   63   16    3  247    0   22    2]
 [   6 1409    8   71   11    1   15    0    1    0]
 [  34    3 1045   24  228    0  206    0   20    0]
 [  76   45   31 1195  102    3   67    0   11    4]
 [  11    5  230   75 1039    0  211    0   19    0]
 [   2    5    2    5    0 1313    2  100   13   69]
 [ 204   12  200  

Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1151,9,32,63,16,3,247,0,22,2,1545
1,6,1409,8,71,11,1,15,0,1,0,1522
2,34,3,1045,24,228,0,206,0,20,0,1560
3,76,45,31,1195,102,3,67,0,11,4,1534
4,11,5,230,75,1039,0,211,0,19,0,1590
5,2,5,2,5,0,1313,2,100,13,69,1511
6,204,12,200,59,149,3,841,1,37,4,1510
7,0,0,0,0,0,106,1,1398,7,95,1607
8,11,4,14,15,20,17,31,9,1414,2,1537
9,2,0,3,3,1,52,0,135,5,1312,1513


## Choosing Parameters Using a Grid Search

Use a cross validation to perfrom an evaluation

An alternative to using post pruning explicitly is to use a grid search through a large set of possible parameters. Here we try depths between 3 and 20 and different limits on the minimum number of samples per split.

In [9]:
# Set up the parameter grid to seaerch
param_grid ={'criterion': ['gini', "entropy"], \
             'max_depth': list(range(3, 10, 3)), \
             'min_samples_split': [50] }

# Perform the search
my_tuned_tree = GridSearchCV(tree.DecisionTreeClassifier(), \
                                param_grid, cv=2, verbose = 0, \
                            return_train_score=True)
my_tuned_tree.fit(X_train_plus_valid, y_train_plus_valid)

# Print details
print("Best parameters set found on development set:")
display(my_tuned_tree.best_params_)
display(my_tuned_tree.best_score_)
display(my_tuned_tree.cv_results_)

Best parameters set found on development set:


{'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 50}

0.7830555555555555

{'mean_fit_time': array([ 1.85387695,  2.85321486,  4.42225504,  1.64397347,  3.56941438,
         6.9267354 ]),
 'mean_score_time': array([ 0.09985805,  0.02991343,  0.03196537,  0.03034902,  0.03348505,
         0.18091607]),
 'mean_test_score': array([ 0.49938889,  0.72611111,  0.78130556,  0.53019444,  0.73580556,
         0.78305556]),
 'mean_train_score': array([ 0.50250008,  0.73922231,  0.82269436,  0.53080471,  0.75216633,
         0.8279166 ]),
 'param_criterion': masked_array(data = ['gini' 'gini' 'gini' 'entropy' 'entropy' 'entropy'],
              mask = [False False False False False False],
        fill_value = ?),
 'param_max_depth': masked_array(data = [3 6 9 3 6 9],
              mask = [False False False False False False],
        fill_value = ?),
 'param_min_samples_split': masked_array(data = [50 50 50 50 50 50],
              mask = [False False False False False False],
        fill_value = ?),
 'params': ({'criterion': 'gini', 'max_depth': 3, 'min_samples_split

In [10]:
# Make a set of predictions for the test data
y_pred = my_tuned_tree.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix")
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.792125
             precision    recall  f1-score   support

          0       0.72      0.81      0.76      2396
          1       0.96      0.93      0.94      2423
          2       0.68      0.67      0.67      2424
          3       0.78      0.82      0.80      2446
          4       0.60      0.76      0.67      2358
          5       0.89      0.87      0.88      2373
          6       0.65      0.41      0.50      2398
          7       0.83      0.89      0.86      2356
          8       0.92      0.90      0.91      2394
          9       0.91      0.88      0.89      2432

avg / total       0.79      0.79      0.79     24000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1931,14,52,125,14,6,202,1,50,1,2396
1,27,2245,20,102,10,2,12,0,5,0,2423
2,41,4,1613,35,600,2,113,0,15,1,2424
3,133,64,38,2009,136,11,46,0,8,1,2446
4,17,1,223,184,1791,4,120,0,18,0,2358
5,10,8,2,5,3,2062,1,205,19,58,2373
6,470,8,370,92,409,3,972,2,69,3,2398
7,0,0,0,0,0,105,0,2101,3,147,2356
8,30,1,39,37,32,27,38,24,2157,9,2394
9,8,0,4,0,0,82,1,204,3,2130,2432


### Final Evaluation on Test Set

Evaluate the model on a stratified test set

In [11]:
test = pd.read_csv('fashionmnist/fashion-mnist_test.csv')
test_set = test.as_matrix()
X_test = test_set[:,1:]
y_test = test_set[:,0]
# Make a set of predictions for the test data
y_pred = my_tuned_tree.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Model accuracy on test data: " +  str(accuracy))
# Print performance details
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix")
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Model accuracy on test data: 0.7948
             precision    recall  f1-score   support

          0       0.70      0.80      0.75      1000
          1       0.95      0.93      0.94      1000
          2       0.71      0.67      0.69      1000
          3       0.80      0.83      0.81      1000
          4       0.63      0.80      0.70      1000
          5       0.89      0.85      0.87      1000
          6       0.66      0.42      0.52      1000
          7       0.81      0.88      0.85      1000
          8       0.92      0.91      0.91      1000
          9       0.89      0.85      0.87      1000

avg / total       0.80      0.79      0.79     10000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,801,8,21,50,5,2,86,1,25,1,1000
1,17,930,6,36,7,0,2,0,2,0,1000
2,22,1,668,9,248,0,43,0,7,2,1000
3,56,30,16,832,45,4,14,0,3,0,1000
4,5,1,76,62,796,0,58,0,2,0,1000
5,10,0,1,3,2,850,1,96,9,28,1000
6,203,4,139,45,154,0,425,0,29,1,1000
7,0,0,0,0,0,48,0,884,2,66,1000
8,21,0,13,9,7,13,13,14,907,3,1000
9,2,1,1,0,1,38,1,97,4,855,1000
