In [1]:
## IMPORTS

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, auc, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# import custom modules
import sys
sys.path.insert(1, '../utils/')
import model_utils

In [2]:
# Load the dataset CSV into the notebook-level DataFrame used by later cells.
df = pd.read_csv(
    "../../pre_processing/datasets/no_filtered_analysis.csv"
)

In [3]:
# Drop the RA column when present; the membership guard keeps this cell
# safe to re-run after the column has already been removed.
if "RA" in df.columns:
    print("Removing column RA")
    df.drop(columns=["RA"], inplace=True)

Removing column RA


In [4]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,EVADIDO,RESID_ARARAS,BOLSISTA,GRADE_CORRENTE,NOTA_MEDIA,PONTUACAO_PS,TOT_MAT_CURSADAS,TOT_APROVACOES,TOT_REPROVACOES,TOT_REPROV_FREQ,...,SIF029,SIF030,SIF038,SIF039,SIF070,NCS037,NCS040,SIF006,SIF072,SIF073
0,0,1,1,2019,7.0,47.0,10,9,1,0,...,4,0,0,0,0,0,0,0,1,1
1,1,0,1,2016,5.5,47.0,18,15,3,3,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,2014,3.0,46.0,12,6,6,0,...,0,0,0,0,0,0,0,4,0,0
3,1,0,1,2014,5.1,47.0,49,37,12,10,...,1,1,5,4,1,1,1,1,0,0
4,0,0,1,2019,5.4,30.0,10,8,2,2,...,0,0,0,0,0,0,0,0,2,1
5,1,1,1,2019,6.4,68.0,5,4,1,0,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,2016,2.2,40.0,6,1,5,4,...,0,0,0,0,0,0,0,0,0,0
7,1,0,1,2017,6.3,47.0,16,14,2,1,...,0,0,0,0,0,0,0,1,0,0
8,1,1,1,2014,6.4,45.3,12,10,2,2,...,0,0,0,0,0,0,0,1,0,0
9,0,0,1,2016,6.2,47.0,48,46,2,1,...,1,1,1,1,1,1,1,1,0,0


In [9]:
def run_prediction(X_train, X_test, y_train, y_test):
    """Grid-search an SVC on one train/test fold and summarise the test results.

    A ``GridSearchCV`` over ``SVC`` is fitted on ``(X_train, y_train)`` while
    tracking recall, ROC AUC and accuracy; the estimator refitted on the best
    accuracy is then used to predict ``X_test``.

    Returns a dict with the keys ``model_algorithm``, ``confusion_matrix``,
    ``classification_report``, ``model_accuracy`` and ``epoch_params`` (the
    best parameter combination found by the search).
    """
    # Metrics recorded for every candidate; 'Accuracy' drives the final refit.
    scoring = {
        'REC': 'recall',
        'AUC': 'roc_auc',
        'Accuracy': 'accuracy',
    }

    # Hyper-parameter grid explored by the search.
    # NOTE(review): scikit-learn documents `decision_function_shape` as ignored
    # for binary classification and `gamma` is not used by the linear kernel,
    # so part of this grid likely re-evaluates identical models - confirm
    # before trimming, since it changes the reported best params.
    param_grid = {
        'kernel': ['linear', 'rbf', 'sigmoid'],
        'C': [0.3, 0.5, 1, 2, 3, 10],
        'gamma': [0.3, 0.5, 1],
        'decision_function_shape': ['ovo', 'ovr'],
        'shrinking': [True, False],
    }

    search = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=scoring,
        refit='Accuracy',
    )
    search.fit(X_train, y_train)

    # Predictions come from the estimator refitted with the best parameters.
    y_pred = search.predict(X_test)

    return {
        "model_algorithm": 'svm',
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred),
        "model_accuracy": accuracy_score(y_test, y_pred),
        "epoch_params": search.best_params_,
    }

In [7]:
def main(n_splits: int = 4) -> None:
    """Run the stratified k-fold SVM evaluation over the notebook-level ``df``.

    Every column except ``EVADIDO`` is used as a feature and ``EVADIDO`` is the
    target. For each fold the features are standardised with statistics fitted
    on the training part only, ``run_prediction`` is executed on the fold, and
    the results gathered so far are passed to ``model_utils.generate_output``.

    Parameters
    ----------
    n_splits : int, default 4
        Number of folds given to ``StratifiedKFold``.
    """
    # Use plain NumPy arrays for both features and target so the fold indices
    # from ``skf.split`` are always applied positionally. Indexing the pandas
    # Series directly (``y[train_index]``) is label-based and only lines up
    # with ``X`` while ``df`` keeps its default 0..n-1 index.
    X = df.drop(columns="EVADIDO").to_numpy()
    y = df["EVADIDO"].to_numpy()

    skf = StratifiedKFold(n_splits=n_splits)
    scaler = StandardScaler()
    print(f"StratifiedKFold config: {skf} \n")

    all_predictions = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the scaler on the training fold only, then apply the same
        # transformation to the held-out fold.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Best grid-search model for this fold plus its test metrics.
        all_predictions.append(run_prediction(X_train, X_test, y_train, y_test))

        # General model outputs.
        # NOTE(review): this runs after every fold with the results gathered
        # so far; if a single final report is intended, dedent it out of the
        # loop - confirm against model_utils.generate_output.
        model_utils.generate_output(all_predictions)

In [10]:
# Entry point: run the full cross-validated evaluation when this cell executes.
if __name__ == "__main__":
    main()

StratifiedKFold config: StratifiedKFold(n_splits=4, random_state=None, shuffle=False) 

