# 4. Data Mining

In [2]:
import pandas as pd

In [3]:
poscomp = pd.read_csv('../data/processed/poscomp_transformed.csv')

X = poscomp.drop(columns='area')
y = poscomp['area']

### 4.1 Divisão do Conjunto de Dados em Treino, Validação e Teste.

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2,stratify=y)

X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, random_state=42, test_size=0.2,stratify=y_train)


### 4.2 Criação de um Preprocessor

Ultilização da classe ColumnTransformer para a aplicação da normalização apenas nas features 'idade',  'matematica', 'fund_computacao' e 'tec_computacao'.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

num_var = ['idade', 'matematica', 'fund_computacao', 'tec_computacao']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_var)
    ],
    remainder='passthrough'
)

### 4.3 Pipelines

Criação dos pipelines dos modelos escolhidos. 

- Foi escolhido os modelos de random forest, svc, um modelo de rede neural chamado MLPClassifier e o gradient boosting. 
- Tais pipelines foram agrupados em um único dicionário, chamado "pipelines".

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

pipelines = {
    'forest': Pipeline(steps=[
        ('clf', RandomForestClassifier(random_state=42))
    ]),
    'svc': Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('clf', SVC(probability=True, random_state=42))
    ]),
    'nn': Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('clf', MLPClassifier(random_state=42))
    ]),
    'boosting': Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('clf', GradientBoostingClassifier(random_state=42))
    ])
}

### 4.4 Random Search

Uso da técnica de random search para a otimização dos hiperparâmetros dos modelos.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

params_grid = {
    'params_forest': {
        'clf__n_estimators': [200, 500, 1000],
        'clf__max_depth': [60, 100, None],
        'clf__min_samples_leaf': [10, 20, None],
        'clf__min_samples_split': [20, 40, 60, None],
        'clf__bootstrap': [True, False]
    },
    'params_svc': {
        'clf__C': np.random.uniform(0.1, 10, 3), 
        'clf__gamma': ['scale', 'auto'],
        'clf__kernel': ['rbf', 'poly',]
    },
    'params_nn': {
        'clf__hidden_layer_sizes': [(50, 25), (100,), (100, 50, 25)],
        'clf__alpha': [0.001, 0.01, 0.1], 
        'clf__activation': ['relu', 'tanh'],
        'clf__learning_rate_init': np.random.uniform(0.0005, 0.01, 3)
    },
    'params_boosting': {
        'clf__n_estimators': [250, 500, 1000],
        'clf__learning_rate': [0.09, 0.1, 0.11],
        'clf__max_depth': [3, 10, 30, None],
        'clf__max_leaf_nodes': [5, 10, 20, None]
    }
}

best_models = {}

for model, params_model in zip(pipelines.keys(), params_grid.keys()):
    
    random_search = RandomizedSearchCV(
       estimator=pipelines[model],
       param_distributions=params_grid[params_model],
       cv=3,
       scoring='accuracy',
       n_jobs=-1,
    )

    random_search.fit(X_train, y_train)

    best_models[model] = random_search

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/home/koheiseko/Documents/projects/poscomp-classification/.venv/lib64/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/koheiseko/Documents/projects/poscomp-classification/.venv/lib64/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/koheiseko/Documents/projects/poscomp-classification/.venv/lib64/python3.13/site-packages/sklearn/pipeline.p

### 4.5 Criação de um Modelo de Ensemble

Foi usado uma técnica de ensemble, votting classifier, que combina a predição dos 4 modelos em apenas uma.

In [96]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('forest', best_models['forest'].best_estimator_),
        ('svc', best_models['svc'].best_estimator_),
        ('nn', best_models['nn'].best_estimator_),
        ('boosting', best_models['boosting'].best_estimator_),
    ], voting='hard',
    weights=[3, 2, 1, 2]
)

voting_clf.fit(X_train, y_train)



0,1,2
,estimators,"[('forest', ...), ('svc', ...), ...]"
,voting,'hard'
,weights,"[3, 2, ...]"
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,40
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,np.float64(1.6982560571154641)
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,transformers,"[('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'tanh'
,solver,'adam'
,alpha,0.001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,np.float64(0....6951599453247)
,power_t,0.5
,max_iter,200
,shuffle,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'log_loss'
,learning_rate,0.11
,n_estimators,250
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


### 4.6 Métricas com os Dados de Validação

In [101]:
from sklearn.metrics import accuracy_score

y_pred = voting_clf.predict(X_val)

print(f'Acurácia do Modelo de Ensemble com o Conjunto de Dados de Validação: {accuracy_score(y_val, y_pred) * 100:.5f}')

for model in best_models.keys():
    y_pred = best_models[model].predict(X_val)
    accuracy_model = accuracy_score(y_val, y_pred)

    print(f'Acurácia do Modelo de {model.capitalize()} com o Conjunto de Dados de Validação: {accuracy_model * 100:.5f}%')

Acurácia do Modelo de Ensemble com o Conjunto de Dados de Validação: 40.06276
Acurácia do Modelo de Forest com o Conjunto de Dados de Validação: 40.27197%
Acurácia do Modelo de Svc com o Conjunto de Dados de Validação: 40.48117%
Acurácia do Modelo de Nn com o Conjunto de Dados de Validação: 36.71548%
Acurácia do Modelo de Boosting com o Conjunto de Dados de Validação: 39.85356%


### 4.7 Salvamento do Modelo de Ensemble

In [93]:
import joblib

joblib.dump(value=voting_clf, filename='../models/ensemble2.joblib')

['../models/ensemble2.joblib']