In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spam = pd.read_csv("spambase.data", header=None)

In [3]:
spam.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
spam.shape

(4601, 58)

In [5]:
from sklearn.model_selection import train_test_split
X = spam.loc[:,0:56]
y = spam.loc[:,57]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
print(X_train.shape)
print(X_test.shape)

(3220, 57)
(1381, 57)


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))

In [25]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
predictions = mlp.predict(X_test)

In [27]:
predictions

array([0, 1, 0, ..., 0, 0, 0])

In [28]:
probs = mlp.predict_proba(X_test)

In [29]:
probs

array([[  9.99870530e-01,   1.29469668e-04],
       [  7.33596485e-04,   9.99266404e-01],
       [  9.82620106e-01,   1.73798942e-02],
       ..., 
       [  1.00000000e+00,   1.43884135e-10],
       [  8.37001585e-01,   1.62998415e-01],
       [  9.99999956e-01,   4.36059147e-08]])

In [34]:
from sklearn.metrics import roc_auc_score
auc_ = roc_auc_score(y_test, probs[:,1])
print("AUC: %.4f" % auc_)

AUC: 0.9795


In [36]:
from sklearn.metrics import accuracy_score
print("acurácia: %.4f" % accuracy_score(y_test, predictions))

acurácia: 0.9457


In [41]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print classification_report(y_test,predictions)

[[787  29]
 [ 46 519]]
             precision    recall  f1-score   support

          0       0.94      0.96      0.95       816
          1       0.95      0.92      0.93       565

avg / total       0.95      0.95      0.95      1381



In [9]:
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

In [43]:
tuned_parameters = [{'hidden_layer_sizes': [(1,), (5,), (10,), (5,5,)],
                     'activation' : ['identity', 'logistic', 'tanh', 'relu'],
                     'learning_rate': ['constant', 'adaptive'],
                     'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}]

In [44]:
clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=3)
clf.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1, 1], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'learning_rate': ['constant', 'adaptive'], 'hidden_layer_sizes': [(1,), (5,), (10,), (5, 5)]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [45]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'alpha': 0.1, 'activation': 'relu', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (10,)}
()
Grid scores on development set:
()
0.920 (+/-0.004) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'constant', 'hidden_layer_sizes': (1,)}
0.919 (+/-0.004) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (1,)}
0.922 (+/-0.007) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'constant', 'hidden_layer_sizes': (5,)}
0.922 (+/-0.005) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (5,)}
0.921 (+/-0.005) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'constant', 'hidden_layer_sizes': (10,)}
0.921 (+/-0.006) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (10,)}
0.921 (+/-0.007) for {'alpha': 0.0001, 'activation': 'identity', 'learning_rate': 'c

## Boosting

In [11]:
from sklearn import ensemble
from sklearn.metrics import classification_report

In [15]:
params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.93      0.97      0.95       842
          1       0.95      0.89      0.92       539

avg / total       0.94      0.94      0.94      1381



In [20]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test,clf.predict_proba(X_test)[:,1]))

0.977016027746


In [13]:
tuned_parameters = [{'n_estimators': [10, 100, 300],
                     'max_depth' : [3, 10],
                     'min_samples_split': [2, 10],
                     'learning_rate': [0.001, 0.1], 
                     'subsample': [0.5, 1]}]

In [14]:
clf = GridSearchCV(ensemble.GradientBoostingClassifier(), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [10, 100, 300], 'max_depth': [3, 10], 'min_samples_split': [2, 10], 'learning_rate': [0.001, 0.1], 'subsample': [0.5, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [52]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'min_samples_split': 10, 'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 1}
()
Grid scores on development set:
()
0.612 (+/-0.001) for {'min_samples_split': 2, 'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 0.5}
0.612 (+/-0.001) for {'min_samples_split': 2, 'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 1}
0.612 (+/-0.001) for {'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 0.5}
0.612 (+/-0.001) for {'min_samples_split': 2, 'n_estimators': 100, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 1}
0.837 (+/-0.019) for {'min_samples_split': 2, 'n_estimators': 300, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 0.5}
0.839 (+/-0.024) for {'min_samples_split': 2, 'n_estimators': 300, 'learning_rate': 0.001, 'max_depth': 3, 'subsample': 1}
0.612 (+/-0.001) for {'min_samples_split': 10, 'n_estimators': 10, 'le

## SVM

In [53]:
from sklearn.svm import SVC

In [56]:
clf = SVC(C=1.0)

In [57]:
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [58]:
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.93      0.96      0.95       816
          1       0.94      0.90      0.92       565

avg / total       0.94      0.94      0.94      1381



In [59]:
tuned_parameters = [{'kernel': ['rbf', 'linear', 'poly'],
                     'C': [1, 10, 100, 1000]}]

In [60]:
clf = GridSearchCV(SVC(), tuned_parameters, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf', 'linear', 'poly'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [61]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
print(classification_report(y_test, clf.predict(X_test)))
print()

Best parameters set found on development set:
()
{'kernel': 'rbf', 'C': 1}
()
Grid scores on development set:
()
0.927 (+/-0.015) for {'kernel': 'rbf', 'C': 1}
0.922 (+/-0.007) for {'kernel': 'linear', 'C': 1}
0.759 (+/-0.013) for {'kernel': 'poly', 'C': 1}
0.927 (+/-0.012) for {'kernel': 'rbf', 'C': 10}
0.923 (+/-0.006) for {'kernel': 'linear', 'C': 10}
0.845 (+/-0.010) for {'kernel': 'poly', 'C': 10}
0.914 (+/-0.006) for {'kernel': 'rbf', 'C': 100}
0.923 (+/-0.007) for {'kernel': 'linear', 'C': 100}
0.913 (+/-0.013) for {'kernel': 'poly', 'C': 100}
0.907 (+/-0.014) for {'kernel': 'rbf', 'C': 1000}
0.924 (+/-0.006) for {'kernel': 'linear', 'C': 1000}
0.915 (+/-0.031) for {'kernel': 'poly', 'C': 1000}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
             precision    recall  f1-score   support

          0       0.93      0.96      0.95       816
          1       0.94      0.90      0