In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Location of the raw WDBC data file in the UCI machine-learning repository.
wdbc_url = ('https://archive.ics.uci.edu/ml/machine-learning-databases'
            '/breast-cancer-wisconsin/wdbc.data')

# header=None: no row is treated as a header, so columns get integer labels.
df = pd.read_csv(wdbc_url, header=None)

In [18]:
from sklearn.preprocessing import LabelEncoder

# Preview the first rows of the frame together with its shape.
df.head(), df.shape

(         0  1      2      3       4       5        6        7       8   \
 0    842302  M  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.3001   
 1    842517  M  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.0869   
 2  84300903  M  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.1974   
 3  84348301  M  11.42  20.38   77.58   386.1  0.14250  0.28390  0.2414   
 4  84358402  M  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.1980   
 
         9    ...        22     23      24      25      26      27      28  \
 0  0.14710   ...     25.38  17.33  184.60  2019.0  0.1622  0.6656  0.7119   
 1  0.07017   ...     24.99  23.41  158.80  1956.0  0.1238  0.1866  0.2416   
 2  0.12790   ...     23.57  25.53  152.50  1709.0  0.1444  0.4245  0.4504   
 3  0.10520   ...     14.91  26.50   98.87   567.7  0.2098  0.8663  0.6869   
 4  0.10430   ...     22.54  16.67  152.20  1575.0  0.1374  0.2050  0.4000   
 
        29      30       31  
 0  0.2654  0.4601  0.11890  
 1  0.1860  0.2750

In [19]:
# Distinct values found in column 1 (the diagnosis label column).
# Select the column directly; slicing the whole frame first (`df[:]`) only
# adds a redundant intermediate object.
np.unique(df[1])

array(['B', 'M'], dtype=object)

In [20]:
# Column 1 holds the class label; every column from 2 onward is a feature.
# Column 0 is left out of both arrays (presumably a sample ID - TODO confirm).
label_col, first_feature_col = 1, 2
y = df.iloc[:, label_col].values
X = df.iloc[:, first_feature_col:].values

In [21]:
# Encode the string class labels as integers.
# The encoder is fitted on the raw label column of `df` instead of on `y`
# itself, so re-running this cell never re-fits on already-encoded integers
# (which would make a later `le.transform(['M', 'B'])` fail).
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)

In [22]:
# Check which integer code the fitted encoder assigns to each class letter.
le.transform(['M', 'B'])

array([1, 0])

In [23]:
# Original labels learned by the encoder; a label's position is its integer code.
le.classes_

array(['B', 'M'], dtype=object)

In [24]:
# Hold out 20% of the samples as a test set; random_state makes the split
# repeatable.
# `sklearn.cross_validation` was removed in scikit-learn 0.20, so import from
# `sklearn.model_selection` and only fall back on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1)

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [26]:
# Chain standardisation, a 2-component PCA projection and logistic regression
# so that all three steps are fitted on the training data in a single call.
lr_steps = [
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
]
pipe_lr = Pipeline(lr_steps)
pipe_lr.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and report it.
test_acc = pipe_lr.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_acc)

Test Accuracy: 0.947


In [27]:
# Manual stratified 10-fold cross-validation of the pipeline on the training set.
# `sklearn.cross_validation` was removed in scikit-learn 0.20. The replacement
# class in `sklearn.model_selection` takes `n_splits` and yields the index
# pairs from `.split(X, y)` instead of being iterated directly.
try:
    from sklearn.model_selection import StratifiedKFold
except ImportError:  # scikit-learn < 0.18: legacy API
    from sklearn.cross_validation import StratifiedKFold
    kfold = StratifiedKFold(y=y_train, n_folds=10, random_state=1)
else:
    # random_state is omitted on purpose: without shuffling it has no effect,
    # and current releases raise an error when it is set with shuffle=False.
    kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    # Refit on the training part of the fold, then score on the held-out part.
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows how many samples of each class the fold trained on.
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Fold: 1, Class dist.: [256 153], Acc: 0.891
Fold: 2, Class dist.: [256 153], Acc: 0.978
Fold: 3, Class dist.: [256 153], Acc: 0.978
Fold: 4, Class dist.: [256 153], Acc: 0.913
Fold: 5, Class dist.: [256 153], Acc: 0.935
Fold: 6, Class dist.: [257 153], Acc: 0.978
Fold: 7, Class dist.: [257 153], Acc: 0.933
Fold: 8, Class dist.: [257 153], Acc: 0.956
Fold: 9, Class dist.: [257 153], Acc: 0.978
Fold: 10, Class dist.: [257 153], Acc: 0.956

CV accuracy: 0.950 +/- 0.029


In [28]:
# Same 10-fold evaluation as the manual loop, delegated to cross_val_score.
# `sklearn.cross_validation` was removed in scikit-learn 0.20, so import from
# `sklearn.model_selection` and only fall back on very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,      # number of folds
                         n_jobs=1)   # run the folds sequentially
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [ 0.89130435  0.97826087  0.97826087  0.91304348  0.93478261  0.97777778
  0.93333333  0.95555556  0.97777778  0.95555556]
CV accuracy: 0.950 +/- 0.029


In [29]:
# `sklearn.learning_curve` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection`. Fall back only on very old installations.
try:
    from sklearn.model_selection import learning_curve
except ImportError:  # scikit-learn < 0.18
    from sklearn.learning_curve import learning_curve

In [36]:
# Rebuild the pipeline without the PCA step: scaling followed by an
# L2-penalised logistic regression.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# Evaluate the pipeline at ten evenly spaced training-set fractions
# (0.1 through 1.0), cross-validating at each size.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,  # number of cross-validation folds (k)
    n_jobs=1,
)

# Reduce along axis 1 to get one mean and one standard deviation per
# training-set size, for both the training and the validation scores.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [37]:
# Each entry: (mean curve, std band, colour, extra line styling).
# Training curve first, then validation, so the draw order is unchanged.
curves = [
    (train_mean, train_std, 'blue',
     dict(marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std, 'green',
     dict(linestyle='--', marker='s', markersize=5,
          label='validation accuracy')),
]

for mean, std, color, line_kw in curves:
    # Mean accuracy as a line ...
    plt.plot(train_sizes, mean, color=color, **line_kw)
    # ... with a translucent band of +/- one standard deviation around it.
    plt.fill_between(train_sizes,
                     mean + std,
                     mean - std,
                     alpha=0.15, color=color)

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
# Optional export, disabled by default:
# plt.tight_layout()
# plt.savefig('./figures/learning_curve.png', dpi=300)
plt.show()

In [38]:
# Show the absolute training-set sizes returned by learning_curve.
train_sizes

array([ 40,  81, 122, 163, 204, 245, 286, 327, 368, 409])