In [1]:
# hack to allow importing from sibling directories
#https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from mlviz.dimensionality_reduction import HDVis
from mlviz.data_visualisation import DraughtPlot

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold


# required bokeh imports
from bokeh.io import output_notebook
output_notebook()


# Load a pre-process the data

In [2]:
data = pd.read_csv('data/breast_cancer/wdbc.data', header=None)
data = data.iloc[:,0:12]
columns = ['id', 'target', 'radius', 'texture','perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension']
data.columns = columns

data.loc[data.target=='M','target'] = 0
data.loc[data.target=='B','target'] = 1
data.drop('id', inplace=True, axis=1)

target = data['target'].values
data.drop('target', inplace=True, axis=1)

In [3]:
data_arr = StandardScaler().fit_transform(data)
data_df = pd.DataFrame(data_arr, columns=data.columns)

# Perform Exploratory analysis

In [4]:
HDPlot = HDVis(data_df, target)

In [7]:
X, y = HDPlot.get_brushed_data()

In [12]:
DMPlot = DraughtPlot(X, y, features=columns[2:6])

In [14]:
# features which are highly correlated to other features.
features_to_drop = ['perimeter', 'radius', 'concave points']

# Build  a model 

Having performed some exploratory data analysis with the tool we can now attempt to build a model. Our exploratory analysis has taught us:

- The data clusters well with UMAP and the classes are well seperated in the embedded space. We therefore expect to be able to develop an extremely strong classifier (>90 % accuracy). 
- The Draughtsman plot tool has identified some highly linearly correlated features, we are therefore able to drop these columns from our following analysis.
- It has helped us identify some of the most useful features (area and fractal dimension look the most promising).

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# drop the features we learned were correlated.
fit_df = data_df.drop(features_to_drop, axis=1)

### Seach for optimal_parameters

We can now perform a grid search of the hyperparameter space to find the optimal parameters of our model.

In [17]:
# generate a hold out test set to not allow the training algorithm to see. 

X_train, X_test, y_train, y_test = train_test_split(fit_df.to_numpy(), target)

In [18]:
param_grid = {'C':[0.5,1,2,3], 'degree':[2,3,4,5,10], 'kernel':['poly']}

In [19]:
svm = SVC(gamma=True)
grid_search = GridSearchCV(svm, param_grid, cv=5)
_ = grid_search.fit(X_train, y_train)

svm_best = grid_search.best_estimator_



In [20]:
_ = svm_best.fit(X_train, y_train)

In [21]:
y_train_pred = svm_best.predict(X_train)
y_test_pred = svm_best.predict(X_test)

In [22]:
print('Training score accuracy: {0:.3f}'.format(100*accuracy_score(y_train, y_train_pred)))

print('Test score accuracy: {0:.3f}'.format(100*accuracy_score(y_test, y_test_pred)))


Training score accuracy: 96.714
Test score accuracy: 93.007


### The model

We can see we have built a fairly successful model, achieving ~95% accuracy on the test set and ~96.7 % accuracy on the training set. This shows the model overfits slightly, however by removing correlated columns we actually reduced the level of overfitting. 

In [10]:
my_cat = Persian(12)

In [14]:
my_cat.claws

10

In [15]:
f'{[1,2,3]}'

'[1, 2, 3]'

In [23]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
X, y = load_iris(return_X_y=True)

 
linear_clf = LogisticRegression(random_state=0).fit(X, y)
svm_clf = SVC(probability=True).fit(X, y)
                 




In [25]:
svm_preds = svm_clf.predict_proba(X)
log_preds = linear_clf.predict_proba(X)

In [35]:
np.dstack([svm_preds, log_preds]).mean(axis=2)

array([[0.92496164, 0.06786523, 0.00717313],
       [0.88272164, 0.10962562, 0.00765274],
       [0.91129704, 0.08066888, 0.00803409],
       [0.89406169, 0.09702742, 0.00891089],
       [0.93384157, 0.05864598, 0.00751245],
       [0.94248824, 0.04887878, 0.00863298],
       [0.93047293, 0.06048517, 0.00904191],
       [0.9151027 , 0.07756824, 0.00732907],
       [0.88081282, 0.1088151 , 0.01037208],
       [0.88006741, 0.11202238, 0.00791021],
       [0.92881153, 0.06343281, 0.00775566],
       [0.91397199, 0.07767339, 0.00835462],
       [0.87685955, 0.11507626, 0.00806418],
       [0.8962295 , 0.09150599, 0.0122645 ],
       [0.9402203 , 0.04815877, 0.01162093],
       [0.95512361, 0.03242464, 0.01245175],
       [0.95349543, 0.03824681, 0.00825777],
       [0.93095733, 0.06197304, 0.00706963],
       [0.92278249, 0.0675395 , 0.00967801],
       [0.94582026, 0.04635152, 0.00782822],
       [0.89262718, 0.09890706, 0.00846576],
       [0.94480482, 0.04761861, 0.00757657],
       [0.

In [28]:
import numpy as np