# Aplicações de Algoritmos de aprendizado de máquina

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score, classification_report,
    confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from tensorflow import keras
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras_tuner.tuners import RandomSearch
import services.function as sf

warnings.filterwarnings('ignore')


Using TensorFlow backend


In [2]:
dataset = pd.read_csv('dados/df_poscomp.csv')

In [3]:
X_dataset = dataset.loc[:,['IDADE','SEXO','REGIAO','ESTADO','matematica', 'fund_computacao','tec_computacao','area_concentration']]

In [4]:
# X_dataset.drop('area_concentration',axis=1, inplace =True)
# X_dataset = pd.get_dummies(X_dataset.iloc[:,:-1]).astype('int64')

In [5]:
X_dataset['area_concentration'] = dataset.area_concentration
X_dataset['area_concentration'] = X_dataset['area_concentration'].astype('category')

In [6]:
# colunas_a_padronizar = ['matematica', 'fund_computacao', 'tec_computacao']
# scaler = StandardScaler()
# X_dataset[colunas_a_padronizar] = scaler.fit_transform(X_dataset[colunas_a_padronizar])

In [7]:
def encode_categorical_columns(df, columns):
    le = LabelEncoder()
    for col in columns:
        df[col] = le.fit_transform(df[col])
    return df

categorical_columns = ["area_concentration",'SEXO','REGIAO','ESTADO']
X_dataset = encode_categorical_columns(X_dataset, categorical_columns)


In [8]:
X_dataset

Unnamed: 0,IDADE,SEXO,REGIAO,ESTADO,matematica,fund_computacao,tec_computacao,area_concentration
0,27,1,1,5,14.0,21.0,10.0,1
1,21,1,4,22,16.0,22.0,12.0,1
2,25,1,3,25,4.0,16.0,8.0,1
3,30,1,2,13,4.0,13.0,7.0,1
4,19,0,2,2,6.0,15.0,11.0,0
...,...,...,...,...,...,...,...,...
10916,22,0,3,10,14.0,16.0,8.0,0
10917,48,1,1,14,10.0,16.0,9.0,0
10918,24,1,3,25,9.0,18.0,8.0,1
10919,25,1,3,7,10.0,16.0,6.0,0


In [9]:
X = X_dataset.drop(['area_concentration'], axis=1)
y = X_dataset['area_concentration']

### Dividir a base

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

In [11]:
print("Shape of x_train : ",x_train.shape)
print("Shape of x_test  : ",x_test.shape)
print("Shape of y_train : ",y_train.shape)
print("Shape of y_test  : ",y_test.shape)

Shape of x_train :  (7644, 7)
Shape of x_test  :  (3277, 7)
Shape of y_train :  (7644,)
Shape of y_test  :  (3277,)


In [12]:
from collections import Counter
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# enumerate the splits and summarize the distributions
for train_ix, test_ix in skf.split(X, y):
# select rows
    train_x, test_x = X.iloc[train_ix], X.loc[test_ix]
    train_y, test_y = y.iloc[train_ix], y.iloc[test_ix]
# summarize train and test composition
    train, train1 = len(train_y[train_y==0]), len(train_y[train_y==1])
    test, test1 = len(test_y[test_y==0]), len(test_y[test_y==1])
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train, train1, test, test1))

>Train: 0=6185, 1=3643, Test: 0=688, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6186, 1=3643, Test: 0=687, 1=405
>Train: 0=6185, 1=3644, Test: 0=688, 1=404
>Train: 0=6185, 1=3644, Test: 0=688, 1=404


In [13]:
def print_score(clf, x_train, y_train, x_test, y_test, train=True):
    if train:
        pred = clf.predict(x_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n==========================================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("__________________________________________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("__________________________________________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(x_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n==========================================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("__________________________________________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("__________________________________________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

### Árvore de Decisão

In [330]:
parametros = {'criterion': ['entropy'],
              'splitter': ['best', 'random'],
              'min_samples_leaf': [22],
             'max_depth':[7],
             'max_features': ['sqrt'],
              'max_leaf_nodes': [None, 5, 10, 20],
             'min_impurity_decrease': [0.0, 0.1, 0.2],
    'ccp_alpha': [0.0, 0.1, 0.2]}

In [14]:
parametros = {'criterion': ["entropy"],
    'splitter': ["best", "random"],
    'max_depth': [None, 10, 20, 30, 40, 50],  # Ou qualquer outro intervalo apropriado
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'max_features': [None, "auto", "sqrt", "log2"],
    'random_state': [None, 42],  # Use um valor fixo se desejar repetibilidade nos resultados
    'max_leaf_nodes': [None, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'ccp_alpha': [0.0, 0.1, 0.2],
}

In [None]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(estimator=tree_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
tree_cv.fit(x_train, y_train)
melhores_parametros = tree_cv.best_params_
melhor_resultado = tree_cv.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(tree_cv.best_estimator_)

tree_clf = DecisionTreeClassifier(**melhores_parametros)
tree_clf.fit(x_train, y_train)
print_score(tree_clf, x_train, y_train, x_test, y_test, train=True)
print_score(tree_clf, x_train, y_train, x_test, y_test, train=False)

Fitting 10 folds for each of 93312 candidates, totalling 933120 fits


In [None]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(tree_clf, parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
tree_cv.fit(train_x, train_y)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")
print(tree_cv.best_estimator_)

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(train_x, train_y)
print_score(tree_clf, train_x, train_y, test_x, test_y, train=True)
print_score(tree_clf, train_x, train_y, test_x, test_y, train=False)

### Rede Neural

In [266]:
parametros = {'activation': ['relu', 'logistic', 'tahn'],
              'solver': ['adam', 'sgd'],
              'batch_size': [2,3,4,5,6]}

In [267]:
neural_clf = MLPClassifier(random_state=42)
neural_cv = GridSearchCV(estimator=neural_clf, param_grid=parametros)
neural_cv.fit(x_train, y_train)
melhores_parametros = neural_cv.best_params_
melhor_resultado = neural_cv.best_score_
print(melhores_parametros)
print(melhor_resultado)

neural_clf = MLPClassifier(**melhores_parametros)
neural_clf.fit(x_train, y_train)
print_score(neural_clf, x_train, y_train, x_test, y_test, train=True)
print_score(neural_clf, x_train, y_train, x_test, y_test, train=False)

{'activation': 'relu', 'batch_size': 3, 'solver': 'adam'}
0.6542386462082119
Train Result:
Accuracy Score: 65.57%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.658473     0.630689  0.655678     0.644581      0.648176
recall        0.940969     0.171197  0.655678     0.556083      0.655678
f1-score      0.774773     0.269295  0.655678     0.522034      0.587434
support    4811.000000  2833.000000  0.655678  7644.000000   7644.000000
__________________________________________________________________________
Confusion Matrix: 
 [[4527  284]
 [2348  485]]

Test Result:
Accuracy Score: 65.06%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.655688     0.605422  0.650595     0.630555      0.637051
recall        0.9364

In [268]:
neural_clf = MLPClassifier(random_state=42)
neural_cv = GridSearchCV(estimator=neural_clf, param_grid=parametros)
neural_cv.fit(train_x, train_y)
melhores_parametros = neural_cv.best_params_
melhor_resultado = neural_cv.best_score_
print(melhores_parametros)
print(melhor_resultado)

neural_clf = MLPClassifier(**melhores_parametros)
neural_clf.fit(train_x, train_y)
print_score(neural_clf, train_x, train_y, test_x, test_y, train=True)
print_score(neural_clf, train_x, train_y, test_x, test_y, train=False)

{'activation': 'logistic', 'batch_size': 5, 'solver': 'adam'}
0.6516441075898416
Train Result:
Accuracy Score: 65.24%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.668084     0.571160  0.652355     0.619622      0.632150
recall        0.889410     0.250000  0.652355     0.569705      0.652355
f1-score      0.763021     0.347776  0.652355     0.555399      0.609073
support    6185.000000  3644.000000  0.652355  9829.000000   9829.000000
__________________________________________________________________________
Confusion Matrix: 
 [[5501  684]
 [2733  911]]

Test Result:
Accuracy Score: 66.12%
__________________________________________________________________________
CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.671336    0.603659  0.661172     0.637497      0.646298
recall       0.90552

### Random Forest

In [260]:
parametros = {
    'criterion': ['entropy'],
    'max_depth': [3,5,7, 8, 9],
    # 'max_features': np.arange(2,3),
    # 'min_samples_leaf': [1, 2, 5],
    # 'min_samples_split': [2,3,4,5],
    'n_estimators': [500,700,900,1000]}

In [261]:
forest_clf = RandomForestClassifier(random_state=42)
forest_cv = GridSearchCV(estimator=forest_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
forest_cv.fit(x_train, y_train)
melhores_parametros = forest_cv.best_params_
melhor_resultado = forest_cv.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(forest_cv.best_estimator_)

forest_clf = RandomForestClassifier(**melhores_parametros)
forest_clf.fit(x_train, y_train)
print_score(forest_clf, x_train, y_train, x_test, y_test, train=True)
print_score(forest_clf, x_train, y_train, x_test, y_test, train=False)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 700}
0.6598639455782314
RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=700,
                       random_state=42)
Train Result:
Accuracy Score: 72.20%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.702748     0.847059  0.722004     0.774903      0.756232
recall        0.967574     0.304977  0.722004     0.636276      0.722004
f1-score      0.814167     0.448482  0.722004     0.631324      0.678638
support    4811.000000  2833.000000  0.722004  7644.000000   7644.000000
__________________________________________________________________________
Confusion Matrix: 
 [[4655  156]
 [1969  864]]

Test Result:
Accuracy Score: 65.21%
__________________________________________________________________________
CLASSIFIC

In [262]:
forest_clf = RandomForestClassifier(random_state=42)
forest_cv = GridSearchCV(estimator=forest_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
forest_cv.fit(train_x, train_y)
melhores_parametros = forest_cv.best_params_
melhor_resultado = forest_cv.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(forest_cv.best_estimator_)

forest_clf = RandomForestClassifier(**melhores_parametros)
forest_clf.fit(train_x, train_y)
print_score(forest_clf, train_x, train_y, test_x, test_y, train=True)
print_score(forest_clf, train_x, train_y, test_x, test_y, train=False)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 500}
0.6525607864484885
RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=500,
                       random_state=42)
Train Result:
Accuracy Score: 66.28%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.658847     0.708333  0.662834     0.683590      0.677194
recall        0.962652     0.153952  0.662834     0.558302      0.662834
f1-score      0.782289     0.252931  0.662834     0.517610      0.586035
support    6185.000000  3644.000000  0.662834  9829.000000   9829.000000
__________________________________________________________________________
Confusion Matrix: 
 [[5954  231]
 [3083  561]]

Test Result:
Accuracy Score: 66.12%
__________________________________________________________________________
CLASSIFIC

### SVM

In [263]:
parametros = {'C':[0.1,1,100,1000],
              'kernel':['rbf'],
              'degree':[1,2,3,4,5,6],
             }

In [264]:
svm_clf = SVC(random_state=42)
svm_clf = GridSearchCV(estimator=svm_clf, param_grid=parametros, scoring='accuracy', cv=10)
svm_clf.fit(train_x, train_y)
melhores_parametros = svm_clf.best_params_
melhor_resultado = svm_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)

svm_clf = SVC(**melhores_parametros)
svm_clf.fit(train_x, train_y)
print_score(svm_clf, train_x, train_y, test_x, test_y, train=True)
print_score(svm_clf, train_x, train_y, test_x, test_y, train=False)

{'C': 1000, 'degree': 1, 'kernel': 'rbf'}
0.6526620574201341
Train Result:
Accuracy Score: 66.13%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.662531     0.651007  0.661308     0.656769      0.658259
recall        0.941148     0.186334  0.661308     0.563741      0.661308
f1-score      0.777637     0.289738  0.661308     0.533687      0.596753
support    6185.000000  3644.000000  0.661308  9829.000000   9829.000000
__________________________________________________________________________
Confusion Matrix: 
 [[5821  364]
 [2965  679]]

Test Result:
Accuracy Score: 65.84%
__________________________________________________________________________
CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.662539    0.626016  0.658425     0.644277      0.649027
recall       0.933140    0.190594  0.658

In [265]:
svm_clf = SVC(random_state=42)
svm_clf = GridSearchCV(estimator=svm_clf, param_grid=parametros, scoring='accuracy', cv=10)
svm_clf.fit(x_train, y_train)
melhores_parametros = svm_clf.best_params_
melhor_resultado = svm_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)

svm_clf = SVC(**melhores_parametros)
svm_clf.fit(x_train, y_train)
print_score(svm_clf, x_train, y_train, x_test, y_test, train=True)
print_score(svm_clf, x_train, y_train, x_test, y_test, train=False)

{'C': 1000, 'degree': 1, 'kernel': 'rbf'}
0.6529312527803441
Train Result:
Accuracy Score: 66.43%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.665292     0.656506  0.664312     0.660899      0.662036
recall        0.939098     0.197670  0.664312     0.568384      0.664312
f1-score      0.778831     0.303852  0.664312     0.541342      0.602796
support    4811.000000  2833.000000  0.664312  7644.000000   7644.000000
__________________________________________________________________________
Confusion Matrix: 
 [[4518  293]
 [2273  560]]

Test Result:
Accuracy Score: 64.30%
__________________________________________________________________________
CLASSIFICATION REPORT:
                     0            1  accuracy    macro avg  weighted avg
precision     0.652635     0.563380  0.642966     0.608008      0.619543
recall        0.924830     0.164609 

### Validação Cruzada

In [None]:
x_poscomp = np.concatenate((x_train, x_test), axis = 0)
y_poscomp = np.concatenate((y_train, y_test), axis = 0)

In [None]:
resultados_arvore = []
resultados_random_forest = []
resultados_knn = []
resultados_logistica = []
resultados_svm = []
resultados_rede_neural = []

for i in range(30):
  print(i)
  kfold = KFold(n_splits=10, shuffle=True, random_state=i)

  arvore = DecisionTreeClassifier(max_depth=3, min_samples_leaf=7)
  scores = cross_val_score(arvore, x_poscomp, y_poscomp, cv = kfold)
  #print(scores)
  #print(scores.mean())
  resultados_arvore.append(scores.mean())

  random_forest = RandomForestClassifier(criterion='entropy', max_depth=6, max_features=10,
                       min_samples_split=5, n_estimators=70)
  scores = cross_val_score(random_forest, x_poscomp, y_poscomp, cv = kfold)
  resultados_random_forest.append(scores.mean())

  svm = SVC(kernel = 'rbf', C= 100, degree= 1, gamma= 0.001)
  scores = cross_val_score(svm, x_poscomp, y_poscomp, cv = kfold)
  resultados_svm.append(scores.mean())

  rede_neural = MLPClassifier(activation = 'relu', batch_size = 56, solver = 'adam')
  scores = cross_val_score(rede_neural, x_poscomp, y_poscomp, cv = kfold)
  resultados_rede_neural.append(scores.mean())


In [None]:
resultados = pd.DataFrame({'Arvore': resultados_arvore, 'Random forest': resultados_random_forest,
                            'SVM': resultados_svm, 'Rede neural': resultados_rede_neural})
resultados

In [None]:
resultados.describe()

In [None]:
resultados.var()

In [None]:
(resultados.std() / resultados.mean()) * 100

#### Teste de normalidade

In [None]:
alpha = 0.05

In [None]:
from scipy.stats import shapiro

In [None]:
shapiro(resultados_arvore), shapiro(resultados_random_forest), shapiro(resultados_svm), shapiro(resultados_rede_neural)

#### Teste de hipótese com ANOVA e Tukey

In [None]:
from scipy.stats import f_oneway

In [None]:
_, p = f_oneway(resultados_arvore, resultados_random_forest, resultados_svm, resultados_rede_neural)
p

In [None]:
alpha = 0.05
if p <= alpha:
  print('Hipótese nula rejeitada. Dados são diferentes')
else:
  print('Hipótese alternativa rejeitada. Resultados são iguais')

In [None]:
resultados_algoritmos = {'accuracy': np.concatenate([resultados_arvore, resultados_random_forest, resultados_svm, resultados_rede_neural]),
                         'algoritmo': ['arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore','arvore',
                          'random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest','random_forest',
                          'svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm','svm',
                          'rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural','rede_neural']}

In [None]:
resultados_df = pd.DataFrame(resultados_algoritmos)
resultados_df

In [None]:
from statsmodels.stats.multicomp import MultiComparison

In [None]:
compara_algoritmos = MultiComparison(resultados_df['accuracy'], resultados_df['algoritmo'])

In [None]:
teste_estatistico = compara_algoritmos.tukeyhsd()
print(teste_estatistico)

In [None]:
resultados.mean()

In [None]:
teste_estatistico.plot_simultaneous();

#### Salvar o classificador treinado

In [None]:
parametros = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'dual': [True, False],
    'tol': [1e-4, 1e-3, 1e-2],
    'C': [0.01, 0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    'max_iter': [100, 200, 300],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'verbose': [0, 1, 2],
    'n_jobs': [1, -1],
    'l1_ratio': [0.0, 0.5, 1.0]
}

In [None]:
logistica_clf = LogisticRegression(random_state=42)
logistica_clf = GridSearchCV(estimator=logistica_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
logistica_clf.fit(x_train, y_train)
melhores_parametros = logistica_clf.best_params_
melhor_resultado = logistica_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(logistica_clf.best_estimator_)

logistica_clf = LogisticRegression(**melhores_parametros)
logistica_clf.fit(x_train, y_train)
print_score(logistica_clf, x_train, y_train, x_test, y_test, train=True)
print_score(logistica_clf, x_train, y_train, x_test, y_test, train=False)

In [None]:
logistica_clf = LogisticRegression(random_state=42)
logistica_clf = GridSearchCV(estimator=logistica_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
logistica_clf.fit(train_x, train_y)
melhores_parametros = logistica_clf.best_params_
melhor_resultado = logistica_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(logistica_clf.best_estimator_)

logistica_clf = LogisticRegression(**melhores_parametros)
logistica_clf.fit(train_x, train_y)
print_score(logistica_clf, train_x, train_y, test_x, test_y, train=True)
print_score(logistica_clf, train_x, train_y, test_x, test_y, train=False)

In [None]:
parametros = {
    'priors': [None, [0.2, 0.8], [0.5, 0.5]],  # Exemplos de valores para priors
    'var_smoothing': [1e-9, 1e-8, 1e-10]  # Valores para var_smoothing
}


In [None]:
naive_clf = GaussianNB()
naive_clf = GridSearchCV(estimator=naive_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
naive_clf.fit(x_train, y_train)
melhores_parametros = naive_clf.best_params_
melhor_resultado = naive_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(naive_clf.best_estimator_)

naive_clf = GaussianNB(**melhores_parametros)
naive_clf.fit(x_train, y_train)
print_score(naive_clf, x_train, y_train, x_test, y_test, train=True)
print_score(naive_clf, x_train, y_train, x_test, y_test, train=False)

In [None]:
naive_clf = GaussianNB()
naive_clf = GridSearchCV(estimator=naive_clf, param_grid=parametros, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)
naive_clf.fit(train_x, train_y)
melhores_parametros = naive_clf.best_params_
melhor_resultado = naive_clf.best_score_
print(melhores_parametros)
print(melhor_resultado)
print(naive_clf.best_estimator_)

naive_clf = GaussianNB(**melhores_parametros)
naive_clf.fit(train_x, train_y)
print_score(naive_clf, train_x, train_y, test_x, test_y, train=True)
print_score(naive_clf, train_x, train_y, test_x, test_y, train=False)

### Os algoritmos

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [288]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

algorithms = [
    ('Árvore de Decisão (C4.5)', DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=22,
                       random_state=42, splitter='random')),
    ('Random Forest', RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=700,
                       random_state=42)),
    ('SVM', SVC(C= 1000, degree= 1, kernel= 'rbf',random_state=42)),
    ('Rede Neural', MLPClassifier(activation= 'relu', batch_size= 3, solver= 'adam', random_state=42)),
]

for name, classifier in algorithms:
    classifier.fit(x_train, y_train)
    train_predictions = classifier.predict(x_train)
    test_predictions = classifier.predict(x_test)
    
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    
    precision = precision_score(y_test, test_predictions)
    recall = recall_score(y_test, test_predictions)
    f1 = f1_score(y_test, test_predictions)
    
    conf_matrix = confusion_matrix(y_test, test_predictions)
    
    print(f'{name} - Acurácia no Conjunto de Treinamento: {train_accuracy * 100:.2f}%')
    print(f'{name} - Acurácia no Conjunto de Teste: {test_accuracy * 100:.2f}%')
    print(f'{name} - Precisão: {precision * 100:.2f}%')
    print(f'{name} - Recall: {recall * 100:.2f}%')
    print(f'{name} - F1-Score: {f1 * 100:.2f}%')
    print(f'{name} - Matriz de Confusão:\n{conf_matrix}\n')


Árvore de Decisão (C4.5) - Acurácia no Conjunto de Treinamento: 66.03%
Árvore de Decisão (C4.5) - Acurácia no Conjunto de Teste: 65.46%
Árvore de Decisão (C4.5) - Precisão: 64.77%
Árvore de Decisão (C4.5) - Recall: 14.98%
Árvore de Decisão (C4.5) - F1-Score: 24.33%
Árvore de Decisão (C4.5) - Matriz de Confusão:
[[1963   99]
 [1033  182]]

Random Forest - Acurácia no Conjunto de Treinamento: 71.98%
Random Forest - Acurácia no Conjunto de Teste: 65.55%
Random Forest - Precisão: 61.20%
Random Forest - Recall: 19.34%
Random Forest - F1-Score: 29.39%
Random Forest - Matriz de Confusão:
[[1913  149]
 [ 980  235]]

SVM - Acurácia no Conjunto de Treinamento: 66.43%
SVM - Acurácia no Conjunto de Teste: 64.30%
SVM - Precisão: 56.34%
SVM - Recall: 16.46%
SVM - F1-Score: 25.48%
SVM - Matriz de Confusão:
[[1907  155]
 [1015  200]]

Rede Neural - Acurácia no Conjunto de Treinamento: 65.80%
Rede Neural - Acurácia no Conjunto de Teste: 64.78%
Rede Neural - Precisão: 63.32%
Rede Neural - Recall: 11.93%

In [289]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

algorithms = [
    ('Árvore de Decisão (C4.5)', DecisionTreeClassifier(max_depth=11, min_samples_leaf=30, random_state=42,
                       splitter='random')),
    ('Random Forest', RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=500,
                       random_state=42)),
    ('SVM', SVC(C= 1000, degree= 1, kernel= 'rbf',random_state=42)),
    ('Rede Neural', MLPClassifier(activation= 'logistic', batch_size= 5, solver= 'adam',random_state=42)),
]

for name, classifier in algorithms:
    classifier.fit(train_x, train_y)
    train_predictions = classifier.predict(train_x)
    test_predictions = classifier.predict(test_x)
    
    train_accuracy = accuracy_score(train_y, train_predictions)
    test_accuracy = accuracy_score(test_y, test_predictions)
    
    precision = precision_score(test_y, test_predictions)
    recall = recall_score(test_y, test_predictions)
    f1 = f1_score(test_y, test_predictions)
    
    conf_matrix = confusion_matrix(test_y, test_predictions)
    
    print(f'{name} - Acurácia no Conjunto de Treinamento: {train_accuracy * 100:.2f}%')
    print(f'{name} - Acurácia no Conjunto de Teste: {test_accuracy * 100:.2f}%')
    print(f'{name} - Precisão: {precision * 100:.2f}%')
    print(f'{name} - Recall: {recall * 100:.2f}%')
    print(f'{name} - F1-Score: {f1 * 100:.2f}%')
    print(f'{name} - Matriz de Confusão:\n{conf_matrix}\n')

Árvore de Decisão (C4.5) - Acurácia no Conjunto de Treinamento: 66.89%
Árvore de Decisão (C4.5) - Acurácia no Conjunto de Teste: 66.21%
Árvore de Decisão (C4.5) - Precisão: 60.61%
Árvore de Decisão (C4.5) - Recall: 24.75%
Árvore de Decisão (C4.5) - F1-Score: 35.15%
Árvore de Decisão (C4.5) - Matriz de Confusão:
[[623  65]
 [304 100]]

Random Forest - Acurácia no Conjunto de Treinamento: 66.23%
Random Forest - Acurácia no Conjunto de Teste: 66.39%
Random Forest - Precisão: 69.07%
Random Forest - Recall: 16.58%
Random Forest - F1-Score: 26.75%
Random Forest - Matriz de Confusão:
[[658  30]
 [337  67]]

SVM - Acurácia no Conjunto de Treinamento: 66.13%
SVM - Acurácia no Conjunto de Teste: 65.84%
SVM - Precisão: 62.60%
SVM - Recall: 19.06%
SVM - F1-Score: 29.22%
SVM - Matriz de Confusão:
[[642  46]
 [327  77]]

Rede Neural - Acurácia no Conjunto de Treinamento: 65.56%
Rede Neural - Acurácia no Conjunto de Teste: 65.11%
Rede Neural - Precisão: 63.22%
Rede Neural - Recall: 13.61%
Rede Neural

# testar o algoritmos

In [290]:
classifier = RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=700,
                       random_state=42)
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Acurácia: {accuracy * 100:.2f}%')

# print_score(classifier, x_train, y_train, x_test, y_test, train=True)
# print_score(classifier, x_train, y_train, x_test, y_test, train=False)

Acurácia: 65.55%


In [292]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.66      0.93      0.77      2062
           1       0.61      0.19      0.29      1215

    accuracy                           0.66      3277
   macro avg       0.64      0.56      0.53      3277
weighted avg       0.64      0.66      0.59      3277



In [301]:
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

In [302]:
print(f'Metricas = {precision * 100:.2f}%')
print(f'Metricas = {recall * 100:.2f}%')
print(f'Metricas = {f1 * 100:.2f}%')

Metricas = 63.66%
Metricas = 56.06%
Metricas = 53.30%


In [286]:
valores_unicos, contagem = np.unique(predictions, return_counts=True)
valores_unicos, contagem

(array([0, 1], dtype=int64), array([2893,  384], dtype=int64))

In [None]:
f1_score(y_test, predictions)

In [276]:
from sklearn.preprocessing import StandardScaler

# Dados do usuário
dados_usuario = [[1, 10, 37, 2, 6.0, 14.0, 5.0]]  # Substitua esses valores pelos dados reais do usuário
previsao_usuario = classifier.predict(dados_usuario)

print(f'Especialidade prevista para o usuário: {previsao_usuario[0]}')  # Use [0] para acessar o valor único na matriz de previsão


Especialidade prevista para o usuário: 0


In [None]:
df = X_dataset[X_dataset['ESPECIALIDADE'] == 'ciência de dados']

In [None]:
df = df.groupby(['ESPECIALIDADE']).agg({'matematica': ['mean', 'median', 'max', 'std'],
                                                            'fund_computacao': ['mean', 'median', 'max', 'std'],
                                                            'tec_computacao': ['mean', 'median', 'max', 'std']})
# df = df.sort_values(by=('total', 'mean'), ascending=False)
df

In [None]:
def train_model(X, y):
    smtom = SMOTETomek(random_state=139)
    X_resampled, y_resampled = smtom.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)
    
    classifier = RandomForestClassifier(criterion='entropy', max_depth=90, max_features='sqrt',
                    min_samples_leaf=4, min_samples_split=2, n_estimators=195, random_state=42)
    classifier.fit(X_train, y_train)
    
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return classifier, accuracy

In [None]:
if previsao_usuario == "informatica" or previsao_usuario == "computacao":
    # Filtrar os dados para a área de concentração correspondente
    area = X_dataset[X_dataset['area_concentration'] == previsao_usuario[0]]
    X_area = area.drop(['ESPECIALIDADE', 'area_concentration'], axis=1)
    y_area = area['ESPECIALIDADE']

    # Treinar o modelo
    classifier, accuracy = train_model(X_area, y_area)

    print(f'Acurácia: {accuracy * 100:.2f}%')
    print(f'Linha de Pesquisa Prevista: {classifier.predict(X_area.iloc[[0]])[0]}')
else:
    print("Área de concentração não reconhecida")
