# Regressor

## Gastos Cartao

In [5]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
gastos = pd.read_csv("base_gastos_cartao.csv")

In [3]:
gastos.shape

(150, 5)

In [4]:
gastos.head()

Unnamed: 0,Gastos_Cartao,Idade,Renda,Impostos,Segmento
0,510,35,1120,60,C
1,490,30,1120,60,C
2,470,32,1040,60,C
3,460,31,1200,60,C
4,500,36,1120,60,C


## Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [17]:
X = pd.concat([gastos[['Idade', 'Renda', 'Impostos']], pd.get_dummies(gastos.Segmento)], axis=1)
y = gastos.Gastos_Cartao
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
print X_train.shape
print X_test.shape

(105, 6)
(45, 6)


## Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
lr = LinearRegression()

In [21]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
predictions = lr.predict(X_test)

In [23]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)

MSE: 923.5416


In [24]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

R2: 0.8612


## Ridge Regression

In [25]:
from sklearn.linear_model import Ridge

In [26]:
ri = Ridge(alpha=10)

In [27]:
ri.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [28]:
predictions = ri.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

MSE: 969.0087
R2: 0.8544


In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
tuned_parameters = [{'alpha': [0,0.01,0.1,0.5,1,10,100]}]

In [31]:
clf = GridSearchCV(Ridge(), tuned_parameters, cv=3, scoring='r2')
clf.fit(X_train, y_train)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 5.93053267189e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 1.14488535379e-18
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.83652234369e-18


GridSearchCV(cv=3, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': [0, 0.01, 0.1, 0.5, 1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=0)

In [32]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
print()

Best parameters set found on development set:
()
{'alpha': 0.1}
()
Grid scores on development set:
()
0.844 (+/-0.052) for {'alpha': 0}
0.844 (+/-0.052) for {'alpha': 0.01}
0.844 (+/-0.054) for {'alpha': 0.1}
0.844 (+/-0.059) for {'alpha': 0.5}
0.843 (+/-0.062) for {'alpha': 1}
0.840 (+/-0.066) for {'alpha': 10}
0.841 (+/-0.074) for {'alpha': 100}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
MSE: 923.4028
()


## TREE

In [33]:
from sklearn.tree import DecisionTreeRegressor

In [34]:
dt = DecisionTreeRegressor(max_depth=3, min_samples_split=5)

In [35]:
dt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [36]:
predictions = dt.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

MSE: 1316.5579
R2: 0.8022


In [40]:
tuned_parameters = [{'max_depth': [1,2,3,4,5],
                     'min_samples_split': [2,3,5]}]

In [41]:
clf = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=3, scoring='r2')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'min_samples_split': [2, 3, 5], 'max_depth': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=0)

In [42]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
print()

Best parameters set found on development set:
()
{'min_samples_split': 5, 'max_depth': 4}
()
Grid scores on development set:
()
0.415 (+/-0.104) for {'min_samples_split': 2, 'max_depth': 1}
0.415 (+/-0.104) for {'min_samples_split': 3, 'max_depth': 1}
0.415 (+/-0.104) for {'min_samples_split': 5, 'max_depth': 1}
0.700 (+/-0.178) for {'min_samples_split': 2, 'max_depth': 2}
0.700 (+/-0.178) for {'min_samples_split': 3, 'max_depth': 2}
0.700 (+/-0.178) for {'min_samples_split': 5, 'max_depth': 2}
0.735 (+/-0.102) for {'min_samples_split': 2, 'max_depth': 3}
0.735 (+/-0.102) for {'min_samples_split': 3, 'max_depth': 3}
0.735 (+/-0.102) for {'min_samples_split': 5, 'max_depth': 3}
0.756 (+/-0.064) for {'min_samples_split': 2, 'max_depth': 4}
0.745 (+/-0.079) for {'min_samples_split': 3, 'max_depth': 4}
0.773 (+/-0.065) for {'min_samples_split': 5, 'max_depth': 4}
0.743 (+/-0.109) for {'min_samples_split': 2, 'max_depth': 5}
0.736 (+/-0.143) for {'min_samples_split': 3, 'max_depth': 5}
0.75

## Random Forest

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
rf = RandomForestRegressor(n_estimators=100, max_depth=3, max_features=1)

In [45]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features=1, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [46]:
predictions = rf.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE: %.4f" % mse)
r2 = r2_score(y_test, predictions)
print("R2: %.4f" % r2)

MSE: 1355.0300
R2: 0.7964


In [62]:
tuned_parameters = [{'max_depth': [1,2,3,4,5],
                     'max_features': [2,4,6]}]

clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=3, scoring='r2')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_features': [2, 4, 6], 'max_depth': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=0)

In [63]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
print()

Best parameters set found on development set:
()
{'max_features': 6, 'max_depth': 5}
()
Grid scores on development set:
()
0.226 (+/-0.050) for {'max_features': 2, 'max_depth': 1}
0.280 (+/-0.027) for {'max_features': 4, 'max_depth': 1}
0.342 (+/-0.019) for {'max_features': 6, 'max_depth': 1}
0.369 (+/-0.084) for {'max_features': 2, 'max_depth': 2}
0.481 (+/-0.035) for {'max_features': 4, 'max_depth': 2}
0.509 (+/-0.021) for {'max_features': 6, 'max_depth': 2}
0.450 (+/-0.079) for {'max_features': 2, 'max_depth': 3}
0.539 (+/-0.065) for {'max_features': 4, 'max_depth': 3}
0.578 (+/-0.045) for {'max_features': 6, 'max_depth': 3}
0.525 (+/-0.029) for {'max_features': 2, 'max_depth': 4}
0.584 (+/-0.057) for {'max_features': 4, 'max_depth': 4}
0.641 (+/-0.058) for {'max_features': 6, 'max_depth': 4}
0.568 (+/-0.031) for {'max_features': 2, 'max_depth': 5}
0.647 (+/-0.047) for {'max_features': 4, 'max_depth': 5}
0.665 (+/-0.042) for {'max_features': 6, 'max_depth': 5}
()
Detailed classifica

# Classifier

## SPAM

In [49]:
df = pd.read_csv("spambase.data", header=None)

In [50]:
df.shape

(4601, 58)

In [51]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [52]:
from sklearn.model_selection import train_test_split

In [56]:
X = df.iloc[:,0:56]
y = df.iloc[:,57]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [57]:
print X_train.shape
print X_test.shape

(3220, 56)
(1381, 56)


## Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
lo = LogisticRegression(C=10)
lo.fit(X_train, y_train)
predictions = lo.predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, predictions)
print("accuracy_score: %.4f" % acc)

accuracy_score: 0.9254


In [72]:
tuned_parameters = [{'C': [0.1,1,10]}]

clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid=[{'C': [0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [73]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
print()

Best parameters set found on development set:
()
{'C': 10}
()
Grid scores on development set:
()
0.916 (+/-0.016) for {'C': 0.1}
0.920 (+/-0.016) for {'C': 1}
0.921 (+/-0.017) for {'C': 10}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
Accuracy: 0.9254
()


## Decision Tree

In [69]:
from sklearn.tree import DecisionTreeClassifier

In [75]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)
acc = roc_auc_score(y_test, predictions)
print("accuracy_score: %.4f" % acc)

accuracy_score: 0.8401


In [77]:
tuned_parameters = [{'max_depth': [1,3,5,10,20],
                     'min_samples_split': [3,5,10]}]

clf = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'min_samples_split': [3, 5, 10], 'max_depth': [1, 3, 5, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [78]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
print()

Best parameters set found on development set:
()
{'min_samples_split': 10, 'max_depth': 10}
()
Grid scores on development set:
()
0.798 (+/-0.023) for {'min_samples_split': 3, 'max_depth': 1}
0.798 (+/-0.023) for {'min_samples_split': 5, 'max_depth': 1}
0.798 (+/-0.023) for {'min_samples_split': 10, 'max_depth': 1}
0.884 (+/-0.012) for {'min_samples_split': 3, 'max_depth': 3}
0.884 (+/-0.012) for {'min_samples_split': 5, 'max_depth': 3}
0.884 (+/-0.012) for {'min_samples_split': 10, 'max_depth': 3}
0.907 (+/-0.012) for {'min_samples_split': 3, 'max_depth': 5}
0.907 (+/-0.012) for {'min_samples_split': 5, 'max_depth': 5}
0.908 (+/-0.011) for {'min_samples_split': 10, 'max_depth': 5}
0.913 (+/-0.017) for {'min_samples_split': 3, 'max_depth': 10}
0.914 (+/-0.015) for {'min_samples_split': 5, 'max_depth': 10}
0.917 (+/-0.017) for {'min_samples_split': 10, 'max_depth': 10}
0.912 (+/-0.015) for {'min_samples_split': 3, 'max_depth': 20}
0.907 (+/-0.004) for {'min_samples_split': 5, 'max_depth

## Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier

In [80]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, max_features=5)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
acc = roc_auc_score(y_test, predictions)
print("accuracy_score: %.4f" % acc)

accuracy_score: 0.8909


In [81]:
tuned_parameters = [{'n_estimators': [100,300],
                     'max_depth': [1,3,5,10,20],
                     'max_features': [1,5,10,30,56]}]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 300], 'max_features': [1, 5, 10, 30, 56], 'max_depth': [1, 3, 5, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [82]:
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: %.4f" % acc)
print()

Best parameters set found on development set:
()
{'max_features': 5, 'n_estimators': 300, 'max_depth': 20}
()
Grid scores on development set:
()
0.687 (+/-0.034) for {'max_features': 1, 'n_estimators': 100, 'max_depth': 1}
0.675 (+/-0.019) for {'max_features': 1, 'n_estimators': 300, 'max_depth': 1}
0.823 (+/-0.012) for {'max_features': 5, 'n_estimators': 100, 'max_depth': 1}
0.819 (+/-0.024) for {'max_features': 5, 'n_estimators': 300, 'max_depth': 1}
0.849 (+/-0.005) for {'max_features': 10, 'n_estimators': 100, 'max_depth': 1}
0.844 (+/-0.017) for {'max_features': 10, 'n_estimators': 300, 'max_depth': 1}
0.828 (+/-0.059) for {'max_features': 30, 'n_estimators': 100, 'max_depth': 1}
0.829 (+/-0.044) for {'max_features': 30, 'n_estimators': 300, 'max_depth': 1}
0.802 (+/-0.032) for {'max_features': 56, 'n_estimators': 100, 'max_depth': 1}
0.802 (+/-0.032) for {'max_features': 56, 'n_estimators': 300, 'max_depth': 1}
0.822 (+/-0.027) for {'max_features': 1, 'n_estimators': 100, 'max_de