In [3]:
import numpy as np
import pandas as pd

# Data (Breast Cancer Wisconsin Diagnostic)

## Attribute Information

1. ID number
2. Diagnosis (M = malignant, B = benign)
3. Ten real-valued features are computed for each cell nucleus:<br>
```
a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)
```

## Pre Processing

In [108]:
data = pd.read_csv('wdbc.csv')

In [109]:
data = data.drop(axis = 1, columns = ['ID'])
data.head()

Unnamed: 0,Class,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [113]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data.Class = encoder.fit_transform(data.Class)

In [121]:
X = data.iloc[:, 1:].values
y = data.Class.values

# Bagging

In [115]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import f1_score

## 10-Fold Cross Validation

In [125]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
StratifiedKFold(n_splits=2, random_state=None, shuffle=False)

StratifiedKFold(n_splits=2, random_state=None, shuffle=False)

### Decision Tree

In [126]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # All data points
    bg_100 = BaggingClassifier(DecisionTreeClassifier(),
                             max_samples = 1.0,
                             max_features = 1.0,
                             n_estimators = 100,
                             random_state = 42)
    bg_100.fit(X_train, y_train)
    print(roc_auc_score(y_test, bg_100.predict_proba(X_test)[:,1]),
          bg_100.score(X_test, y_test),
          geometric_mean_score(y_test, bg_100.predict(X_test)),
          f1_score(y_test, bg_100.predict(X_test))
         )

#     # 90% of data points
#     scores_90 = BaggingClassifier(DecisionTreeClassifier(),
#                              max_samples = 0.9,
#                              max_features = 1.0,
#                              n_estimators = 100,
#                              random_state = 42)

#     # 80% of data points
#     scores_80 = BaggingClassifier(DecisionTreeClassifier(),
#                              max_samples = 0.8,
#                              max_features = 1.0,
#                              n_estimators = 100,
#                              random_state = 42)

#     # 70% of data points
#     scores_70 = BaggingClassifier(DecisionTreeClassifier(),
#                              max_samples = 0.7,
#                              max_features = 1.0,
#                              n_estimators = 100,
#                              random_state = 42)

#     # 60% of data points
#     scores_60 = BaggingClassifier(DecisionTreeClassifier(),
#                              max_samples = 0.6,
#                              max_features = 1.0,
#                              n_estimators = 100,
#                              random_state = 42)

#     # 50% of data points
#     scores_50 = BaggingClassifier(DecisionTreeClassifier(),
#                              max_samples = 0.5,
#                              max_features = 1.0,
#                              n_estimators = 100,
#                              random_state = 42)
    

0.9936868686868687 0.9482758620689655 0.9494815172056544 0.9333333333333332
0.9513888888888888 0.9137931034482759 0.9031370691409836 0.8837209302325582
0.988095238095238 0.9473684210526315 0.9378857231185629 0.9268292682926829
0.960978835978836 0.9473684210526315 0.9258200997725514 0.923076923076923
1.0 0.9824561403508771 0.9759000729485332 0.975609756097561
0.9973544973544974 0.9824561403508771 0.9860132971832694 0.9767441860465117
0.968915343915344 0.9649122807017544 0.9511897312113419 0.9500000000000001
0.9959183673469387 0.9821428571428571 0.9759000729485332 0.975609756097561
0.9979591836734694 0.9821428571428571 0.9759000729485332 0.975609756097561
1.0 1.0 1.0 1.0


### Perceptron

In [124]:
from sklearn.linear_model import Perceptron