In [1]:

# hack to allow importing from sibling directories
#https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from mlviz.dimensionality_reduction import HDVis
from mlviz.data_visualisation import DraughtPlot

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold


# required bokeh imports
from bokeh.io import output_notebook
output_notebook()


# Load a pre-process the data

In [6]:
HDPlot.show_example()

TypeError: a bytes-like object is required, not '_io.FileIO'

TypeError: a bytes-like object is required, not '_io.FileIO'

<IPython.core.display.Image object>

In [2]:
data = pd.read_csv('data/breast_cancer/wdbc.data', header=None)
data = data.iloc[:,0:12]
columns = ['id', 'target', 'radius', 'texture','perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension']
data.columns = columns

data.loc[data.target=='M','target'] = 0
data.loc[data.target=='B','target'] = 1
data.drop('id', inplace=True, axis=1)

target = data['target'].values
data.drop('target', inplace=True, axis=1)

In [3]:
data_arr = StandardScaler().fit_transform(data)
data_df = pd.DataFrame(data_arr, columns=data.columns)

# Perform Exploratory analysis

In [4]:
HDPlot = HDVis(data_df, target)

In [6]:
X, y = HDPlot.get_brushed_data()

ValueError: need at least one array to concatenate

In [13]:
DMPlot = DraughtPlot(X, y, features=columns[6:])

In [4]:
# features which are highly correlated to other features.
features_to_drop = ['perimeter', 'radius', 'concave points']

# Build  a model 

Having performed some exploratory data analysis with the tool we can now attempt to build a model. Our exploratory analysis has taught us:

- The data clusters well with UMAP and the classes are well seperated in the embedded space. We therefore expect to be able to develop an extremely strong classifier (>90 % accuracy). 
- The Draughtsman plot tool has identified some highly linearly correlated features, we are therefore able to drop these columns from our following analysis.
- It has helped us identify some of the most useful features (area and fractal dimension look the most promising).

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# drop the features we learned were correlated.
fit_df = data_df.drop(features_to_drop, axis=1)

### Seach for optimal_parameters

We can now perform a grid search of the hyperparameter space to find the optimal parameters of our model.

In [7]:
# generate a hold out test set to not allow the training algorithm to see. 

X_train, X_test, y_train, y_test = train_test_split(fit_df.to_numpy(), target)

In [8]:
param_grid = {'C':[0.5,1,2,3], 'degree':[2,3,4,5,10], 'kernel':['poly']}

In [24]:
svm = SVC(gamma=True)
grid_search = GridSearchCV(svm, param_grid, cv=5)
_ = grid_search.fit(X_train, y_train)

svm_best = grid_search.best_estimator_

In [25]:
_ = svm_best.fit(X_train, y_train)

In [27]:
y_train_pred = svm_best.predict(X_train)
y_test_pred = svm_best.predict(X_test)

In [29]:
print('Training score accuracy: {0:.3f}'.format(100*accuracy_score(y_train, y_train_pred)))

print('Test score accuracy: {0:.3f}'.format(100*accuracy_score(y_test, y_test_pred)))


Training score accuracy: 96.948
Test score accuracy: 95.105


### The model

We can see we have built a fairly successful model, achieving 95.1% accuracy on the test set and 96.7 % accuracy on the training set. This shows the model overfits slightly, however by removing correlated columns we actually reduced the level of overfitting. 