## Importing Packages

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

## Reading CSV

In [2]:
cancer = pd.read_csv("cancer.data.txt", header=None)

In [3]:
cancer.shape

(699, 11)

In [4]:
cancer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
cancer.columns = ["v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10"]

In [6]:
cancer.v6.head(50)

0      1
1     10
2      2
3      4
4      1
5     10
6     10
7      1
8      1
9      1
10     1
11     1
12     3
13     3
14     9
15     1
16     1
17     1
18    10
19     1
20    10
21     7
22     1
23     ?
24     1
25     7
26     1
27     1
28     1
29     1
30     1
31     1
32     5
33     1
34     1
35     1
36     1
37     1
38    10
39     7
40     ?
41     3
42    10
43     1
44     1
45     1
46     9
47     1
48     1
49     8
Name: v6, dtype: object

In [7]:
cancer.v6 = cancer.v6.replace("?", np.nan).astype(float)

### Putting Target in 0-1 class

In [8]:
cancer.v10 = cancer.v10/2-1

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = cancer[["v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9"]]
y = cancer.v10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [11]:
print X_train.shape

(489, 9)


In [12]:
print X_test.shape

(210, 9)


## Missings

In [13]:
for i in X.columns:
    print i + '   :    ' + str(sum(X_train[i].isnull()))

v1   :    0
v2   :    0
v3   :    0
v4   :    0
v5   :    0
v6   :    8
v7   :    0
v8   :    0
v9   :    0


In [14]:
mediana = X_train.v6.median()
print mediana
X_train.loc[X_train.v6.isnull(), 'v6'] = mediana
X_test.loc[X_test.v6.isnull(), 'v6'] = mediana

1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
for i in X.columns:
    print i + '   :    ' + str(sum(X_train[i].isnull()))

v1   :    0
v2   :    0
v3   :    0
v4   :    0
v5   :    0
v6   :    0
v7   :    0
v8   :    0
v9   :    0


## Scale

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Neural Network

In [18]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

In [19]:
mlp.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [20]:
predictions = mlp.predict(X_test)

In [21]:
probs = mlp.predict_proba(X_test)

## AUC

In [22]:
from sklearn.metrics import roc_auc_score
auc_ = roc_auc_score(y_test, probs[:,1])
print("AUC: %.4f" % auc_)

AUC: 0.9870


## Accuracy

In [23]:
from sklearn.metrics import accuracy_score
print("acurácia: %.4f" % accuracy_score(y_test, predictions))

acurácia: 0.9571


In [24]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print classification_report(y_test,predictions)

[[132   6]
 [  3  69]]
             precision    recall  f1-score   support

        0.0       0.98      0.96      0.97       138
        1.0       0.92      0.96      0.94        72

avg / total       0.96      0.96      0.96       210



#### Optimizing Hyper-parameters

In [26]:
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

In [27]:
tuned_parameters = [{'hidden_layer_sizes': [(1,), (10,), (5,5,)],
                     'activation' : ['logistic', 'relu'],
                     'learning_rate': ['constant', 'adaptive'],
                     'alpha': [0.01, 0.1]}]

In [28]:
clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=3)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0.01, 0.1], 'activation': ['logistic', 'relu'], 'learning_rate': ['constant', 'adaptive'], 'hidden_layer_sizes': [(1,), (10,), (5, 5)]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

## Performance

In [29]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'alpha': 0.1, 'activation': 'relu', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,)}
()
Grid scores on development set:
()
0.838 (+/-0.188) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'constant', 'hidden_layer_sizes': (1,)}
0.622 (+/-0.426) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (1,)}
0.953 (+/-0.020) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,)}
0.965 (+/-0.029) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (10,)}
0.904 (+/-0.119) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'constant', 'hidden_layer_sizes': (5, 5)}
0.896 (+/-0.061) for {'alpha': 0.01, 'activation': 'logistic', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (5, 5)}
0.654 (+/-0.003) for {'alpha': 0.1, 'activation': 'logistic', 'learning_rate': 'constant', '

## Boosting

In [30]:
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.94      0.97      0.96       138
        1.0       0.94      0.89      0.91        72

avg / total       0.94      0.94      0.94       210



#### Tunning

In [31]:
tuned_parameters = [{'n_estimators': [10, 100],
                     'max_depth' : [3, 5],
                     'min_samples_split': [2],
                     'learning_rate': [0.001, 0.1], 
                     'subsample': [0.5]}]

In [32]:
clf = GridSearchCV(ensemble.GradientBoostingClassifier(), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [10, 100], 'min_samples_split': [2], 'learning_rate': [0.001, 0.1], 'max_depth': [3, 5], 'subsample': [0.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

#### Performance

In [33]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5}
()
Grid scores on development set:
()
0.654 (+/-0.005) for {'min_samples_split': 2, 'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 0.5}
0.654 (+/-0.005) for {'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 0.5}
0.654 (+/-0.005) for {'min_samples_split': 2, 'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5}
0.654 (+/-0.005) for {'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.001, 'max_depth': 5, 'subsample': 0.5}
0.957 (+/-0.033) for {'min_samples_split': 2, 'n_estimators': 10, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5}
0.967 (+/-0.008) for {'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5}
0.957 (+/-0.040) for {'min_samples_split': 2, 'n_estimators': 10, 'l

## SVM

#### Fitting simple model

In [34]:
from sklearn.svm import SVC

In [35]:
clf = SVC(C=1.0)

In [36]:
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

        0.0       0.97      0.96      0.96       138
        1.0       0.92      0.94      0.93        72

avg / total       0.95      0.95      0.95       210



#### Tunning

In [38]:
tuned_parameters = [{'kernel': ['rbf', 'linear'],
                     'C': [1, 10, 1000]}]

In [39]:
clf = GridSearchCV(SVC(), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf', 'linear'], 'C': [1, 10, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [40]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'kernel': 'rbf', 'C': 1}
()
Grid scores on development set:
()
0.978 (+/-0.035) for {'kernel': 'rbf', 'C': 1}
0.978 (+/-0.020) for {'kernel': 'linear', 'C': 1}
0.965 (+/-0.030) for {'kernel': 'rbf', 'C': 10}
0.973 (+/-0.016) for {'kernel': 'linear', 'C': 10}
0.947 (+/-0.047) for {'kernel': 'rbf', 'C': 1000}
0.973 (+/-0.016) for {'kernel': 'linear', 'C': 1000}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
             precision    recall  f1-score   support

        0.0       0.97      0.96      0.96       138
        1.0       0.92      0.94      0.93        72

avg / total       0.95      0.95      0.95       210

()
