# Imports

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Load Data

In [2]:
columnNames = ['ID', 'Diagnosis', 'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness', 
               'Compactness', 'Concavity', 'ConcavePoints', 'Symmetry', 'FractalDimension', 
               'seRadius', 'seTexture', 'sePerimeter', 'seArea', 'seSmoothness', 'seCompactness', 'seConcavity', 
               'seConcavePoints', 'seSymmetry', 'seFractalDimension', 'worstRadius', 'worstTexture', 
               'worstPerimeter', 'worstArea', 'worstSmoothness', 'worstCompactness', 'worstConcavity', 
               'worstConcavePoints', 'worstSymmetry', 'worstFractalDimension']

dataURL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

df = pd.read_csv(dataURL, header=None, names=columnNames)

In [3]:
df.head()

Unnamed: 0,ID,Diagnosis,Radius,Texture,Perimeter,Area,Smoothness,Compactness,Concavity,ConcavePoints,...,worstRadius,worstTexture,worstPerimeter,worstArea,worstSmoothness,worstCompactness,worstConcavity,worstConcavePoints,worstSymmetry,worstFractalDimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Visualisations 

In [4]:
#df.plot.scatter(df['Radius'], df["Perimeter"])

# Part 1: Mean, Median and Variance of Perimeter 

In [5]:
n_samples = df.shape[0]

In [6]:
n_samples

569

##  Mean

In [7]:
perimeters = df['Perimeter'].values
meanOfPerimeters = perimeters.sum() / n_samples

In [8]:
meanOfPerimeters

91.96903339191564

In [9]:
df['Perimeter'].mean()

91.96903339191564

## Variance

In [10]:
varianceOfPerimeters = np.sum (np.square(perimeters - meanOfPerimeters)) / (n_samples - 1)
varianceOfPerimeters

590.4404795217704

In [11]:
df['Perimeter'].var()

590.4404795217704

## Median 

In [12]:
perimeters.sort()
medianOfPerimeters = perimeters[284]

In [13]:
medianOfPerimeters

86.24

In [14]:
df['Perimeter'].median()

86.24

# Part 2: Distribution of Radius

## Histogram 

In [15]:
radii = df['Radius'].values
minRadius, maxRadius = np.min(radii) , np.max(radii)
maxRadius - minRadius

21.128999999999998

In [16]:
df['Radius'].plot.hist(44)

<matplotlib.axes._subplots.AxesSubplot at 0x7ff9c0a057b8>

The distibution looks skewed. A normal distribution should be symmetric.

## Skewness 

In [17]:
df['Radius'].skew()

0.9423795716730992

In [18]:
meanOfRadius = df['Radius'].mean()
varianceOfRadius = df['Radius'].var()
medianOfRadius = df['Radius'].median()

In [19]:
meanOfRadius, varianceOfRadius, medianOfRadius

(14.127291739894552, 12.418920129526722, 13.37)

Median <> Mean - another common indicator of skewness (doesn't always hold though)

## Formal test of fit 

TBC

SciPy also has a built in function to test normality (D’Agostino and Pearson’s test):

In [20]:
stats.normaltest(radii)

NormaltestResult(statistic=73.17938185797058, pvalue=1.286172249506454e-16)

## Alternative distributions 

Could try: log-normal distrbution, gamma distribution

# Part 3: Train Classifier

## Data preparation 

Shuffle the data and split into training and test sets

In [21]:
n_features = df.shape[1]
train_test_ratio = 0.7
n_training_samples =int(n_samples * 0.7)
n_test_samples = n_samples - n_training_samples

In [22]:
n_training_samples + n_test_samples == n_samples

True

In [23]:
dfShuffled = df.sample(frac=1)
X_train = dfShuffled.values[0:n_training_samples:,2:n_features].astype(float)
X_test = dfShuffled.values[n_training_samples:n_samples,2:n_features].astype(float)
y_train = (dfShuffled['Diagnosis'].values[0:n_training_samples] == 'M') * 1
y_test = (dfShuffled['Diagnosis'].values[n_training_samples:n_samples] == 'M') * 1

Do some checks to confirm that data is split correctly..

In [24]:
abs(X_train[:,0].sum() + X_test[:,0].sum() - df['Radius'].sum()) < 0.0000000001

True

In [25]:
y_train.sum() + y_test.sum() == (df['Diagnosis'] == 'M').sum()

True

## Dumb classifier 

In [26]:
from sklearn import metrics

In [27]:
y_pred_dumb = np.random.randint(2, size=y_test.shape[0])

In [28]:
metrics.confusion_matrix(y_test, y_pred_dumb)

array([[56, 47],
       [37, 31]])

In [29]:
metrics.f1_score(y_test, y_pred_dumb)

0.4246575342465754

## Logistic Regression 

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
logReg = LogisticRegression(random_state=42)
logReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
y_pred_log_reg = logReg.predict(X_test)

In [33]:
metrics.confusion_matrix(y_test, y_pred_log_reg)

array([[101,   2],
       [  4,  64]])

In [34]:
metrics.f1_score(y_test, y_pred_log_reg)

0.955223880597015

## Linear SVM 

In [35]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [57]:
linearSVMClassifier = Pipeline([("scaler", StandardScaler()),('linearSVC',SVC(kernel="linear", C=1.0))])
linearSVMClassifier.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('linearSVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [58]:
y_pred_linear_SVM = linearSVMClassifier.predict(X_test)

In [59]:
metrics.confusion_matrix(y_test, y_pred_linear_SVM)

array([[101,   2],
       [  6,  62]])

In [60]:
metrics.f1_score(y_test, y_pred_linear_SVM)

0.9393939393939394

## Kernel SVM 

In [61]:
from sklearn.preprocessing import PolynomialFeatures

In [66]:
kernelSVMClassifier = Pipeline([("scaler", StandardScaler())
                                ,('KernelSVC',SVC(kernel="poly", C=1.0, degree=3))])
kernelSVMClassifier.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('KernelSVC', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [67]:
y_pred_kernel_SVM = kernelSVMClassifier.predict(X_test)

In [68]:
metrics.confusion_matrix(y_test, y_pred_kernel_SVM)

array([[103,   0],
       [ 16,  52]])

In [69]:
metrics.f1_score(y_test, y_pred_kernel_SVM)

0.8666666666666666

In [70]:
metrics.roc_curve(y_test, y_pred_kernel_SVM)

(array([0., 1.]), array([0.76470588, 1.        ]), array([1, 0]))