# Aula teste SENAC

Vamos usar a extensão jupiternotebook da linguagem Python para treinar nosso modelo. Essa extensão é uma linguagem de marcação que permite misturar HTML + código fonte para gerar relatórios científicos com fácil **reprodução**. Essa extensão funciona subindo um servidor web simples em background, responsável por executar o código python aqui presente. Além disso disponibiliza o output em HTML com possibilidade de exportação para pdf.

# Decision tree

In [13]:
# data science libs
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# remove useless warning message, is it a good idea? Why (why not)?
# import warnings
# warnings.filterwarnings('ignore')

df = pd.read_csv('../data/ad.data',
                 header=None)

explanatory_variable_columns = set(df.columns.values)

#Remove the target variable
explanatory_variable_columns.remove(len(df.columns.values)-1)

#Create a pandas series to hold the target
response_variable_column = df[len(df.columns.values)-1] # The last column describes the classes

# transform the target in a numerical value
y = [1 if e == 'ad.' else 0 for e in response_variable_column]

#copy the trainning data before any transformation
X = df[list(explanatory_variable_columns)].copy()

#Some fields has missing values this is why we have some warnings
X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)

#Split the data in: test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

#  Create a sequence of tranformations (before or after the 
#predict [pre(pos)-process steps]) and predictor.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

#look the best parameters
grid_search = GridSearchCV(pipeline, 
                           parameters, 
                           n_jobs=-1, 
                           verbose=1,
                           scoring='f1')

#fit the model with the best parameters
grid_search.fit(X_train, y_train)

#get the best parameters
best_parameters = grid_search.best_estimator_.get_params()

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name, best_parameters[param_name]))

# apply the trained model in the test data 
predictions = grid_search.predict(X_test)

#Report of the model in the test data
print(classification_report(y_test, predictions))


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    6.2s finished


Best score: 0.896
Best parameters set:
tclf__max_depth: 155
tclf__min_samples_leaf: 1
tclf__min_samples_split: 3
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       705
           1       0.87      0.88      0.87       115

   micro avg       0.96      0.96      0.96       820
   macro avg       0.93      0.93      0.93       820
weighted avg       0.96      0.96      0.96       820



## Parameters

1. max_depth: This indicates how deep the tree can be
2. min_samples_split: Represents the minimum number of samples required to split an internal node
3. min_samples_leaf: The minimum number of samples required to be at a leaf node

# Xgboost

In [28]:
# data science libs
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from scipy.stats import uniform, randint

# remove useless warning message, is it a good idea? Why (why not)?
# import warnings
# warnings.filterwarnings('ignore')

df = pd.read_csv('../data/ad.data',
                 header=None)

explanatory_variable_columns = set(df.columns.values)

#Remove the target variable
explanatory_variable_columns.remove(len(df.columns.values)-1)

#Create a pandas series to hold the target
response_variable_column = df[len(df.columns.values)-1] # The last column describes the classes

# transform the target in a numerical value
y = [1 if e == 'ad.' else 0 for e in response_variable_column]

#copy the trainning data before any transformation
X = df[list(explanatory_variable_columns)].copy()

#Some fields has missing values this is why we have some warnings
X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)

#Split the data in: test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

#  Create a sequence of tranformations (before or after the 
#predict [pre(pos)-process steps]) and predictor.
pipeline = Pipeline([
    ('clf', XGBClassifier())
])

parameters = {
   "clf__colsample_bytree": (0.2, 0.3),
   "clf__gamma": [0.02],
   "clf__learning_rate": [0.02], # default 0.1 
   "clf__max_depth": [1], # default 3
   "clf__n_estimators": [1000], # default 100
   "clf__subsample": [0.4]
}

# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, criterion='entropy', gamma=0, learning_rate=0.1,
#        max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
#        n_estimators=100, n_jobs=1, nthread=None,
#        objective='binary:logistic', random_state=0, reg_alpha=0,
#        reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
#        subsample=1

#look the best parameters
grid_search = GridSearchCV(pipeline, 
                           parameters, 
                           n_jobs=None, 
                           verbose=1,
                           scoring='f1')

#fit the model with the best parameters
grid_search.fit(X_train, y_train)

#get the best parameters
best_parameters = grid_search.best_estimator_.get_params()

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name, best_parameters[param_name]))

# apply the trained model in the test data 
predictions = grid_search.predict(X_test)

#Report of the model in the test data
print(classification_report(y_test, predictions))


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   49.6s finished


Best score: 0.808
Best parameters set:
tclf__colsample_bytree: 0.3
tclf__gamma: 0.02
tclf__learning_rate: 0.02
tclf__max_depth: 1
tclf__n_estimators: 1000
tclf__subsample: 0.4
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       711
           1       0.99      0.78      0.87       109

   micro avg       0.97      0.97      0.97       820
   macro avg       0.98      0.89      0.93       820
weighted avg       0.97      0.97      0.97       820



## Parameters

Homework: Search the parameters definition at the internet

# Random Forest

Homework: Train a Random Forest Classifier