# Decision Tree

In [None]:
import csv
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
def mapping(df):
    """Assign consecutive integer codes to the distinct values of ``df``.

    Values are numbered from 0 in order of first appearance.

    Args:
        df: Object whose ``.values`` attribute iterates over hashable items
            (a pandas Series works; presumably a single column is intended).

    Returns:
        dict mapping each distinct value to its integer code.
    """
    codes = {}
    for value in df.values:
        # setdefault only inserts unseen values, and len(codes) is always
        # the next unused code.
        codes.setdefault(value, len(codes))
    return codes

In [None]:
# Numeric encoding applied to the 'Grau de Risco' column:
# low risk -> 0, high risk -> 1.
map_grau_de_risco = {'Baixo Risco': 0, 'Alto Risco': 1}

In [None]:
# Numeric encoding applied to the 'SEGMENTO DA ATIVIDADE ECONÔMICA' column:
# services -> 0, products -> 1.
map_segmento = {'SERVIÇOS': 0, 'PRODUTOS': 1}

In [None]:
# Numeric encoding of the licensing status used as the target label.
# Codes follow the order of the tuple: not licensed (0), in process (1),
# licensed (2).
map_licenciados = {
    status: code
    for code, status in enumerate(
        ('Não Licenciados', 'Em processo', 'Licenciados')
    )
}

In [None]:
# Load the processed dataset. The path is relative, so it is resolved against
# the directory the notebook is executed from.
db = pd.read_csv('../data/processed/db_final.csv')

In [None]:
# Columns fed to the models as-is (after label encoding); the one-hot
# columns are appended to this list further down.
features = [
    'Grau de Risco',
    'SEGMENTO DA ATIVIDADE ECONÔMICA',
]
# Kept as a one-element list, so db[target] selects a single-column DataFrame
# rather than a Series.
target = [
    'label_licenciados',
]

In [None]:
# One-hot encode the two categorical columns. Each call yields one indicator
# column per distinct category, named '<prefix>_<category>'.
output_regiao = pd.get_dummies(db['Classificação Manual'], prefix='regiao')
output_subgrupo = pd.get_dummies(db['SUBGRUPO'], prefix='subgrupo')

In [None]:
# Attach both sets of indicator columns to the main frame, column-wise,
# region indicators first and subgroup indicators second.
db = pd.concat([db, output_regiao, output_subgrupo], axis=1)

In [None]:
# Register every indicator column as a model feature, keeping the
# region columns ahead of the subgroup columns.
features.extend(output_regiao.columns)
features.extend(output_subgrupo.columns)

In [None]:
# Display the complete feature list (base columns plus indicator columns).
features

In [None]:
# Feature matrix and target frame. Both are explicit copies because the
# columns are re-assigned afterwards (label encoding); writing into a plain
# selection of `db` triggers pandas' SettingWithCopyWarning.
X = db[features].copy()
# `target` is a one-element list, so `y` is a single-column DataFrame.
y = db[target].copy()

In [None]:
# Replace the text categories with their numeric codes, column by column.
# Series.map leaves NaN for any value missing from the corresponding dict.
for frame, column, codes in (
    (X, 'Grau de Risco', map_grau_de_risco),
    (X, 'SEGMENTO DA ATIVIDADE ECONÔMICA', map_segmento),
    (y, 'label_licenciados', map_licenciados),
):
    frame[column] = frame[column].map(codes)

In [None]:
# First hold out 10% of the rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)
# Then carve the validation set out of the remaining 90%:
# 0.167 * 0.9 is roughly 15% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.167, random_state=1
)

In [None]:
# Candidate values for the decision-tree grid search.
# max_depth: every integer from 2 to 10 inclusive.
dt_hyper_param_max_depth = list(range(2,11))
# min_samples_split: every integer from 2 to 999 inclusive.
dt_hyper_param_min_samples_split = list(range(2,1000))
# Disabled alternative, kept for reference: a coarser grid whose split sizes
# are fractions of the training-set size (len(X_train) / 10, / 20, ..., / 90).
# i = 10
# while(i < 100):
#     dt_hyper_param_min_samples_split.append(int(len(X_train)/i))
#     i = i + 10

In [None]:
# Running best for the decision-tree grid search. Everything starts at 0, so
# the first candidate with a positive validation score becomes the best.
best_score = best_max_depth = best_min_samples_split = 0

In [None]:
# Grid search over (max_depth, min_samples_split): fit one tree per pair on
# the training split and keep the one with the highest validation score.
# The comparison is strict, so on ties the earliest candidate is kept.
for depth in dt_hyper_param_max_depth:
    for min_split in dt_hyper_param_min_samples_split:
        print("Testing: ")
        print(depth)
        print(min_split)
        # Fixed seed (same value as the data splits): scikit-learn permutes
        # the features at every split, so without it equally good splits can
        # be resolved differently between runs and change the selected tree.
        dtree = DecisionTreeClassifier(
            max_depth=depth, min_samples_split=min_split, random_state=1
        )
        dtree = dtree.fit(X_train, y_train)
        # Evaluate the validation split once per candidate and reuse the value.
        val_score = dtree.score(X_val, y_val)
        if val_score > best_score:
            best_score = val_score
            best_max_depth = depth
            best_min_samples_split = min_split
            best_dtree = dtree
# Render the selected tree to a PNG. class_names are given in ascending order
# of the encoded target (0, 1, 2).
data = tree.export_graphviz(best_dtree, out_file=None, class_names=['Não Licenciados','Em processo','Licenciados'])
graph = pydotplus.graph_from_dot_data(data)
graph.write_png('../reports/figures/decision_tree.png')

In [None]:
# Baseline for comparison: a tree with default hyper-parameters, trained on
# the same split and scored on the validation split (fit returns the
# estimator itself, so the calls can be chained).
dtree = DecisionTreeClassifier().fit(X_train, y_train)
dtree.score(X_val, y_val)

In [None]:

# Export the tree selected by the grid search as a PNG image.
# Labels are listed in ascending order of the encoded target (0, 1, 2).
class_labels = ['Não Licenciados', 'Em processo', 'Licenciados']
data = tree.export_graphviz(
    best_dtree,
    out_file=None,  # return the DOT source as a string instead of writing a file
    class_names=class_labels,
)
graph = pydotplus.graph_from_dot_data(data)
graph.write_png('../reports/figures/decision_tree.png')

In [None]:
# Score the tree selected on the validation split against the held-out test
# split (for scikit-learn classifiers, `score` is the mean accuracy).
best_dtree.score(X_test, y_test)

# Regressão Logística

In [None]:
# Optimisation algorithms tried in the logistic-regression grid search.
solvers = [
    'newton-cg',
    'lbfgs',
    'liblinear',
    'sag',
    'saga',
]

In [None]:
# Iteration caps tried in the grid search: 100 to 1000 in steps of 100.
max_iter = list(range(100, 1001, 100))

In [None]:
# Values tried for LogisticRegression's `multi_class` argument.
multi_class_params = [
    'auto',
    'ovr',
    'multinomial',
]

In [None]:
# Single multinomial logistic regression fitted with the lbfgs solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# y_train is a single-column DataFrame; LogisticRegression expects a 1-D
# target and emits a DataConversionWarning for a column vector, so the
# values are flattened before fitting.
fitting = model.fit(X_train, y_train.values.ravel())

In [None]:
# Grid search over solver x max_iter x multi_class for logistic regression.
# The model with the highest validation score is kept in `best_fitting`;
# the comparison is strict, so on ties the earliest candidate is kept.
best_solver = ""
best_iter = 0
best_multi_class = ""
best_score = 0
# y_train is a single-column DataFrame; flatten it once so every fit gets the
# 1-D target LogisticRegression expects (avoids a DataConversionWarning per fit).
y_train_1d = y_train.values.ravel()
for solver in solvers:
    for iterations in max_iter:
        for multi_class in multi_class_params:
            try:
                # Fixed seed (same value as the data splits) so the
                # solvers that shuffle the data give repeatable scores.
                model = LogisticRegression(
                    multi_class=multi_class,
                    max_iter=iterations,
                    solver=solver,
                    random_state=1,
                )
                fitting = model.fit(X_train, y_train_1d)
                # Evaluate the validation split once and reuse the value.
                val_score = fitting.score(X_val, y_val)
            except Exception as e:
                # Best-effort search: a combination scikit-learn rejects is
                # reported and skipped instead of aborting the whole grid.
                print(e)
                continue
            if val_score > best_score:
                best_fitting = fitting
                best_score = val_score
                best_multi_class = multi_class
                best_iter = iterations
                best_solver = solver
print("Best solver: "+best_solver)
print("with max_iter: "+str(best_iter))
print("with multi_class: "+best_multi_class)
print("Achieved a score of: "+str(best_score))

Utilizar o melhor modelo na base de teste

In [None]:
# Score the logistic regression selected on the validation split against the
# held-out test split (for scikit-learn classifiers, `score` is the mean accuracy).
best_fitting.score(X_test, y_test)

In [None]:
accurracy = sum(true_pred)/len(true_pred)
recall = 