## Breast Cancer Wisconsin dataset

In [1]:
import pandas as pd

# Download the raw WDBC table from the UCI repository. header=None tells
# pandas that the file has no header row, so columns get integer labels.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases'
    '/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [2]:
# Size of the loaded frame as (rows, columns).
df.shape

(569, 32)

### Create class response and relabel

In [3]:
from sklearn.preprocessing import LabelEncoder

# Columns labelled 2 and up become the feature matrix; column 1 carries the
# class letter, which the encoder turns into an integer label.
le = LabelEncoder()
X = df.loc[:, 2:].values
y = le.fit_transform(df.loc[:, 1].values)

# Show which letters the encoder learned (their position is the integer code).
le.classes_

array(['B', 'M'], dtype=object)

In [4]:
# Check which integer the fitted encoder assigns to each class letter.
le.transform(['M', 'B'])

array([1, 0])

### Split into training and test

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

# Pipeline
Import building blocks and make_pipeline.
Create simple pipeline.

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC, SVR, SVC

# Simple training data pipeline: every step hands its output to the next one.
# The classifier is a linear SVM; the variable name pipe_lr is kept because
# the following cells refer to it.
pipe_lr = make_pipeline(
    StandardScaler(),     # Scaling of input features
    PCA(n_components=2),  # Compression into two principal components
    LinearSVC(),          # Linear SVM classifier with default parameters
)
pipe_lr

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

### Fit with Wisconsin training data and predict outcome on test data

In [8]:
# fit() hands back the fitted pipeline, so training and prediction can be
# chained into a single statement.
y_pred = pipe_lr.fit(X_train, y_train).predict(X_test)

# score() reports the accuracy on the held-out test split.
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

Test Accuracy: 0.956


In [9]:
# Look up the final pipeline step by the name make_pipeline generated for it.
pipe_lr.named_steps['linearsvc']

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

### Try: Copy, paste and:
- switch LR with SVM
- change/remove decomposition
- remove scaling

In [None]:
from sklearn.svm import SVC
#
#
#
#
#
#

Back to presentation

# K-fold cross-validation

In [10]:
import numpy as np
from sklearn.model_selection import KFold

# Ten dummy samples are enough to show which indices land in which fold.
# Note: this rebinds X; the Wisconsin data stays available as X_train/X_test.
X = np.zeros((10, 1))
kfold = KFold(n_splits=3)

# Each split yields the training indices followed by the test indices.
for train, test in kfold.split(X):
    print(train, test, sep='\n')

[4 5 6 7 8 9]
[0 1 2 3]
[0 1 2 3 7 8 9]
[4 5 6]
[0 1 2 3 4 5 6]
[7 8 9]


## Stratified cross-validation

In [11]:
from sklearn.model_selection import StratifiedKFold

# Toy labels for the ten dummy samples in X: four of class 0 followed by
# six of class 1. Note: this rebinds y; the real labels stay in y_train/y_test.
y = np.hstack([np.zeros(4), np.ones(6)])

# No random_state is passed: shuffling is off by default, so a seed has no
# effect, and recent scikit-learn releases raise a ValueError when
# random_state is combined with shuffle=False.
kfold = StratifiedKFold(n_splits=4).split(X, y)

# Each split yields the training indices followed by the test indices.
for train, test in kfold:
    print(train)
    print(test)

[1 2 3 6 7 8 9]
[0 4 5]
[0 2 3 4 5 8 9]
[1 6 7]
[0 1 3 4 5 6 7 9]
[2 8]
[0 1 2 4 5 6 7 8]
[3 9]


### Applied on Wisconsin data

In [12]:
from sklearn.model_selection import StratifiedKFold

# Ten stratified folds over the training data. No random_state is passed:
# shuffling is off by default, so a seed has no effect, and recent
# scikit-learn releases raise a ValueError when random_state is combined
# with shuffle=False.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold, start=1):
    pipe_lr.fit(X_train[train], y_train[train])          # Refit for each training set
    score = pipe_lr.score(X_train[test], y_train[test])  # Score each held-out fold
    scores.append(score)
    # bincount shows how many samples of each class went into this training set
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold:  1, Class dist.: [256 153], Acc: 0.935
Fold:  2, Class dist.: [256 153], Acc: 0.935
Fold:  3, Class dist.: [256 153], Acc: 0.957
Fold:  4, Class dist.: [256 153], Acc: 0.978
Fold:  5, Class dist.: [256 153], Acc: 0.935
Fold:  6, Class dist.: [257 153], Acc: 0.956
Fold:  7, Class dist.: [257 153], Acc: 0.978
Fold:  8, Class dist.: [257 153], Acc: 0.933
Fold:  9, Class dist.: [257 153], Acc: 0.956
Fold: 10, Class dist.: [257 153], Acc: 0.956

CV accuracy: 0.952 +/- 0.016


### ... using the cross-validation scorer

In [23]:
from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import LeaveOneOut

# 10-fold cross-validation of the pipeline on the training data.
# n_jobs=1 keeps everything in one process; raise it to distribute the
# folds across processors/processes.
scores = cross_val_score(pipe_lr, X_train, y=y_train, cv=10, n_jobs=1)

print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
# scikit-learn evaluates the score per fold rather than across all samples
# pooled together. Be careful with leave-one-out, or use cross_val_predict!

CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]
CV accuracy: 0.950 +/- 0.014


### ... using cross-validation prediction

### Try: Copy, paste and:
- Exchange cross_val_score with cross_val_predict
- Calculate accuracy of all predictions
- Compare with previous result

In [24]:
from sklearn.model_selection import cross_val_predict
# from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score

# One out-of-fold prediction per training sample; n_jobs=1 keeps the work in
# a single process (raise it to distribute across processors/processes).
predictions = cross_val_predict(pipe_lr, X_train, y=y_train, cv=10, n_jobs=1)

# Prints the number of per-fold scores left over from the cross_val_score
# cell next to the number of training labels.
print('CV accuracy scores: %d %s' % (len(scores), len(y_train)))

# Accuracy computed once over all out-of-fold predictions.
print('CV accuracy: %.3f' % accuracy_score(y_train, predictions))


CV accuracy scores: 10 455
CV accuracy: 0.952


Back to presentation

# Learning curves

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Pipeline: feature scaling followed by L2-penalized logistic regression.
# The L2 penalty is governed by C, the inverse of the regularization strength.
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', random_state=1),
)

# Estimate learning curves on ten training-set fractions from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,  # Stratified KFold by default
    n_jobs=1,
)

# Average and spread of the scores along axis 1 for every training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)  # Scored on training data
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)      # Scored on held-out folds

# Training accuracy: line plus a +/- one standard deviation band.
plt.plot(train_sizes, train_mean,
         color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy: dashed line plus its band.
plt.plot(train_sizes, test_mean,
         color='green', linestyle='--', marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

# Axes decoration.
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
plt.show()

## Validation curves

In [None]:
from sklearn.model_selection import validation_curve

# Same pipeline as before, but now a model parameter is varied instead of
# the number of training samples.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',  # The parameter to vary
    param_range=param_range,             # ... and its values
    cv=10,                               # Stratified KFold by default
)
print(pipe_lr)

In [None]:
# Average and spread of the scores along axis 1 for every value of C.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Training accuracy: line plus a +/- one standard deviation band.
plt.plot(param_range, train_mean,
         color='blue', marker='o', markersize=5,
         label='training accuracy')
plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy: dashed line plus its band.
plt.plot(param_range, test_mean,
         color='green', linestyle='--', marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

# Axes decoration; C spans several orders of magnitude, hence the log axis.
plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
plt.show()

### Try: Copy, paste and:
- Switch to Iris data
- Fit pipeline
- Create learning and validation curves
- Assess parameter range and overfit

In [None]:
from sklearn import datasets
# Load the Iris data bundled with scikit-learn and unpack features and labels.
iris = datasets.load_iris()
Xi, yi = iris.data, iris.target
#
#
#
#
#