# Exercicio
* Monte uma classificacao utilizando a base da VEL ou SEN
* Monte testes de avaliacao de diferentes classificadores considerando:
* Busca por hiperparametros (Considere testar parametros de regularizacao)
* Busca por features
* Utilize um metodo de validacao cruzada

In [70]:
import warnings
warnings.filterwarnings("ignore")

In [22]:
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold

from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score

from sklearn.feature_selection import SelectKBest, chi2

In [64]:
# ler dados

df = pd.read_csv('dataset/evasao.csv')
df = df.drop(['DT_INGRESSO_CURSO'], axis=1)

X = df.drop(['TP_SITUACAO'], axis=1)
y = df['TP_SITUACAO']


# reduzir o número de features mantendo apenas as k mais relevantes

X = SelectKBest(chi2, k=5).fit_transform(X, y)


# testar alguns modelos de forma exploratória

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

pipe_sgd = Pipeline([('scl', StandardScaler()), ('clf', SGDClassifier())])
pipe_lr = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression())])
pipe_knn = Pipeline([('scl', StandardScaler()), ('clf', KNeighborsClassifier())])
pipe_dt = Pipeline([('scl', StandardScaler()), ('clf', DecisionTreeClassifier())])

pipelines = [pipe_sgd, pipe_lr, pipe_knn, pipe_dt]
pipe_dict = {0: 'SGD Classifier', 1: 'Logistic Regression', 2: 'KNN', 3: 'Decision Tree'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)
    
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(X_test, y_test)))


# selecionar um modelo para otimizar
    
pipetree = pipelines[3]
pipe = [pipetree]    

param_range = [3, 5]
grid_params = [{'clf__criterion': ['gini', 'entropy'],
                'clf__max_depth': param_range,
                'clf__min_samples_leaf': param_range,
                'clf__min_samples_split': param_range[1:]
              }]


scores = ['accuracy', 'recall_macro', 'precision_macro']
for score in scores:
    
    kfolds = StratifiedKFold(n_splits=10, shuffle=True)
    cv = kfolds.split(X_train, y_train)

    print("\n\n# Decision Tree - Tuning hyper-parameters for %s" % score)
    gs = GridSearchCV(estimator=pipetree, param_grid=grid_params, scoring=score, cv=cv)
    gs.fit(X_train, y_train)
    print('\nBest accuracy: %.3f' % gs.best_score_)
    print('\nBest params:\n', gs.best_params_)

    print("\nGrid scores on development set:")
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    #for mean, std, params in zip(means, stds, gs.cv_results_['params']):
    #    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nClassification report:")
    print()
    y_true, y_pred = y_test, gs.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()    

SGD Classifier pipeline test accuracy: 0.872
Logistic Regression pipeline test accuracy: 0.867
KNN pipeline test accuracy: 0.873
Decision Tree pipeline test accuracy: 0.846


# Decision Tree - Tuning hyper-parameters for accuracy

Best accuracy: 0.893

Best params:
 {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 5}

Grid scores on development set:

Classification report:

              precision    recall  f1-score   support

           0       0.54      0.36      0.44        85
           1       0.91      0.95      0.93       577

    accuracy                           0.88       662
   macro avg       0.73      0.66      0.68       662
weighted avg       0.86      0.88      0.87       662




# Decision Tree - Tuning hyper-parameters for recall_macro

Best accuracy: 0.693

Best params:
 {'clf__criterion': 'entropy', 'clf__max_depth': 3, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 5}

Grid scores on development set:

Cl

In [63]:
pipelines[0].get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scl', 'clf', 'scl__copy', 'scl__with_mean', 'scl__with_std', 'clf__alpha', 'clf__average', 'clf__class_weight', 'clf__early_stopping', 'clf__epsilon', 'clf__eta0', 'clf__fit_intercept', 'clf__l1_ratio', 'clf__learning_rate', 'clf__loss', 'clf__max_iter', 'clf__n_iter_no_change', 'clf__n_jobs', 'clf__penalty', 'clf__power_t', 'clf__random_state', 'clf__shuffle', 'clf__tol', 'clf__validation_fraction', 'clf__verbose', 'clf__warm_start'])

In [78]:
pipetree = pipelines[0] #sgd
pipe = [pipetree]    

param_range = [3, 5]
alphas = [0.0001, 0.01, 1]

grid_params = [{'clf__penalty': ['l1', 'l2'],
                'clf__loss': ['hinge', 'modified_huber', 'log']#,
                #'clf__alpha': alphas
              }]


scores = ['accuracy', 'recall_macro', 'precision_macro']
for score in scores:
    
    kfolds = StratifiedKFold(n_splits=2, shuffle=True)
    cv = kfolds.split(X_train, y_train)

    print("\n\n# SGD - Tuning hyper-parameters for %s" % score)
    gs = GridSearchCV(estimator=pipetree, param_grid=grid_params, scoring=score, cv=cv)
    gs.fit(X_train, y_train)
    print('\nBest accuracy: %.3f' % gs.best_score_)
    print('\nBest params:\n', gs.best_params_)

    print("\nGrid scores on development set:")
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    #for mean, std, params in zip(means, stds, gs.cv_results_['params']):
    #    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    
    print("\nClassification report:")
    print()
    y_true, y_pred = y_test, gs.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()    



# SGD - Tuning hyper-parameters for accuracy

Best accuracy: 0.871

Best params:
 {'clf__loss': 'modified_huber', 'clf__penalty': 'l1'}

Grid scores on development set:

Classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        85
           1       0.87      1.00      0.93       577

    accuracy                           0.87       662
   macro avg       0.44      0.50      0.47       662
weighted avg       0.76      0.87      0.81       662




# SGD - Tuning hyper-parameters for recall_macro

Best accuracy: 0.887

Best params:
 {'clf__loss': 'modified_huber', 'clf__penalty': 'l1'}

Grid scores on development set:

Classification report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        85
           1       0.87      1.00      0.93       577

    accuracy                           0.87       662
   macro avg       0.44      0.50      0.47       662
weig