Import Python Libraries


In [30]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

Read in Wisconsin Breast Cancer Dataset

In [13]:
# Load the WDBC data directly from the UCI repository.
# header=None: the first line of the file is treated as data and the columns
# receive integer labels (they are renamed in a later cell).
df = pd.read_csv(
    filepath_or_buffer="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Split Dataset Into Data Features and Target

In [14]:
# Column layout: an identifier, the diagnosis label, then the same ten
# measurements repeated for three groups (mean_, sd_, worst_), in that order.
# NOTE(review): confirm that the "sd_" prefix matches what the dataset
# documentation calls the middle group of ten columns.
measurement_names = ["radius", "texture", "perimeter", "area", "smoothness",
                     "compactness", "concavity", "concave_points",
                     "symmetry", "fractal_dimension"]
df.columns = ["ID", "Diagnosis"] + [
    "{}_{}".format(group, measurement)
    for group in ("mean", "sd", "worst")
    for measurement in measurement_names
]

In [15]:
# Target: the diagnosis label column.
df_Target = df["Diagnosis"]

# Features: every column except the identifier and the label, i.e. the 30
# measurement columns (mean_*, sd_*, worst_*) in their existing order.
df_Features = df.drop(["ID", "Diagnosis"], axis=1)

Build a logistic regression (logit) model and fit it

In [38]:
from sklearn import linear_model

# Feature matrix and label vector used to fit the model.
X, y = df_Features, df_Target

# L2-penalised logistic regression solved in its primal form (dual=False).
lg = linear_model.LogisticRegression(penalty="l2", dual=False).fit(X, y)

# Accuracy on the very same rows the model was fitted on (training accuracy),
# not an estimate of performance on unseen data.
lg.score(X, y)

0.95957820738137078

Set up stratified 3-fold cross-validation and evaluate a k-nearest-neighbors classifier

In [32]:
# k-nearest-neighbours classifier: 3 neighbours, unweighted ("uniform") votes,
# Minkowski distance with p=2.
knn = KNeighborsClassifier(n_neighbors=3,
                           weights="uniform",
                           p=2,
                           metric="minkowski")

# Plain arrays so that each fold can be selected with positional index arrays.
# Note that this rebinds X, which an earlier cell set to the feature DataFrame.
Y = df_Target.values
X = df_Features.values

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection is its replacement.  The splitter no longer
# takes the labels in its constructor: .split(X, Y) yields one
# (train_indices, test_indices) pair per fold.  The pairs are collected into a
# list so cv_indices can be iterated more than once.
from sklearn.model_selection import StratifiedKFold
cv_indices = list(StratifiedKFold(n_splits=3).split(X, Y))

def accuracy_crossvalidator(X, Y, knn, cv_indices):
    """Fit and score a classifier on each cross-validation fold and print the results.

    For every fold the classifier is refitted on the training rows, scored on
    the held-out rows, and the predicted class probabilities for the held-out
    rows are printed. After the last fold the mean of the fold scores is
    printed.

    Parameters
    ----------
    X : array indexable as ``X[rows, :]``
        Feature matrix, one row per sample.
    Y : array indexable as ``Y[rows]``
        Labels aligned with the rows of ``X``.
    knn : object providing ``fit``, ``score`` and ``predict_proba``
        The classifier to evaluate; it is refitted in place on every fold.
    cv_indices : iterable of ``(train_indices, test_indices)`` pairs
        Row indices defining each fold.

    Returns
    -------
    None
        Results are only printed.
    """
    scores = []
    for train_i, test_i in cv_indices:
        X_train, X_test = X[train_i, :], X[test_i, :]
        Y_train, Y_test = Y[train_i], Y[test_i]

        knn.fit(X_train, Y_train)

        acc = knn.score(X_test, Y_test)
        scores.append(acc)
        pred = knn.predict_proba(X_test)

        # Single-argument print calls behave the same on Python 2 and 3:
        # a multi-argument print(...) shows a tuple on Python 2, and the
        # statement form `print pred` is a syntax error on Python 3.
        print('Fold accuracy: {}'.format(acc))
        print(pred)
    print('Mean CV accuracy: {}'.format(np.mean(scores)))

In [33]:
# Run the per-fold evaluation; each fold prints its accuracy followed by the
# predicted class probabilities for the held-out rows.
accuracy_crossvalidator(X=X, Y=Y, knn=knn, cv_indices=cv_indices)

('Fold accuracy:', 0.89473684210526316)
[[ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 1.          0.        ]
 [ 0.          1.        ]
 [ 0.33333333  0.66666667]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.66666667  0.33333333]
 [ 0.33333333  0.66666667]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.66666667  0.33333333]
 [ 1.          0.        ]
 [ 0.33333333  0.66666667]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 1.          0.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 1.          0.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          

In [34]:
# Predict with the feature DataFrame the logistic regression was fitted on,
# instead of whatever the global `X` currently refers to (the KNN cell rebinds
# it to a bare array), so the result does not depend on cell execution order.
lg.predict(df_Features)

array(['M', 'M', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'B', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'M', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B',
       'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M',
       'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',
       'M', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'M',
       'B', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M

In [35]:
from sklearn.metrics import confusion_matrix

In [36]:
# Confusion matrix of the logistic regression on the data it was fitted on
# (a training-set result).  Predictions use df_Features directly rather than
# the global `X`, which other cells rebind.
y_pred = lg.predict(df_Features)
y_true = df_Target
# Rows are true classes and columns are predicted classes, both in the order
# given by `labels` ("B" first, then "M").
confusion_matrix(y_true, y_pred, labels=["B", "M"])

array([[348,   9],
       [ 14, 198]])