# Model Training

In [33]:
import pandas as pd

# Directory that holds the `*_FE.csv` files read below.
DATA_DIR = '../data'


def _read_fe_csv(name):
    """Read `<DATA_DIR>/<name>_FE.csv` (';'-separated, first column used as the index)."""
    return pd.read_csv(f'{DATA_DIR}/{name}_FE.csv', sep=';', index_col=0)


train_predictors = _read_fe_csv('train_predictors')
test_predictors = _read_fe_csv('test_predictors')

# Collapse only the column axis so a single-column label frame becomes a Series.
# A bare squeeze() would also collapse the row axis and hand back a scalar
# if a label file ever contained a single row.
train_label = _read_fe_csv('train_label').squeeze('columns')
test_label = _read_fe_csv('test_label').squeeze('columns')

# Sanity check: shape and type of the training labels after squeezing.
print("Formato atual de y:", train_label.shape)
print("Tipo de y:", type(train_label))


Formato atual de y: (734,)
Tipo de y: <class 'pandas.core.series.Series'>


#### Define os experimentos

In [34]:
import sys
import os
# Adiciona o diretório acima ao PATH do Python
sys.path.append(os.path.abspath(os.path.join('..')))


from machine_learning.decorator.BaseParamGrid import BaseParamGrid
from machine_learning.decorator.CommonParamsDecorator import CommonParamsDecorator
from machine_learning.decorator.RandomForestDecorator import RandomForestDecorator
from machine_learning.decorator.SVCDecorator import SVCDecorator
from machine_learning.decorator.LogisticRegressionDecorator import LogisticRegressionDecorator


base_grid = BaseParamGrid()

# Wrap the base grid with one decorator per classifier, innermost first
# (LogisticRegression -> RandomForest -> SVC).
classifier_grid = base_grid
for decorator_cls in (LogisticRegressionDecorator, RandomForestDecorator, SVCDecorator):
    classifier_grid = decorator_cls(classifier_grid)

# The outermost wrapper is given the random_state values to include.
decorated_grid = CommonParamsDecorator(classifier_grid, random_states=[42, 101])

# Resolve the final param_grid and show it.
param_grid = decorated_grid.get_params()
display(param_grid)

[{'classifier': [LogisticRegression()],
  'classifier__C': [0.1, 1, 10],
  'classifier__solver': ['liblinear'],
  'classifier__random_state': [42, 101]},
 {'classifier': [RandomForestClassifier()],
  'classifier__n_estimators': [50, 100],
  'classifier__max_depth': [None, 5, 10],
  'classifier__random_state': [42, 101]},
 {'classifier': [SVC()],
  'classifier__kernel': ['linear', 'rbf'],
  'classifier__C': [0.1, 1, 10],
  'classifier__random_state': [42, 101]}]

#### Ajusta os experimentos

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import accuracy_score, precision_score, f1_score, make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import StratifiedKFold, cross_val_score


# Work on copies of the training predictors and labels.
predictors, label = train_predictors.copy(), train_label.copy()

print(predictors.shape)
print(label.shape)

# Stratified 5-fold splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Single-step pipeline; the DummyClassifier is only a placeholder for the
# 'classifier' step that param_grid swaps out during the search.
pipe = Pipeline(steps=[('classifier', DummyClassifier())])

# Metrics evaluated for every candidate; 'f1' is the one used for refit below.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score, average='weighted'),
    f1=make_scorer(f1_score, average='weighted'),
)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=skf,
)
grid.fit(predictors, label)

(734, 20)
(734,)


### Model Test

In [36]:
from sklearn.metrics import classification_report


# best_params_ holds the whole winning combination (classifier included),
# not only C, so the label says "parameters" rather than "C".
print("Melhores parâmetros:", grid.best_params_)
print("Melhor F1 (validação):", grid.best_score_)

# Evaluate the refitted best pipeline on the test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(test_predictors)
print(classification_report(test_label, y_pred))

Melhor C: {'classifier': RandomForestClassifier(), 'classifier__max_depth': 5, 'classifier__n_estimators': 100, 'classifier__random_state': 42}
Melhor F1 (validação): 0.8699323001877011
              precision    recall  f1-score   support

           0       0.93      0.79      0.85        81
           1       0.85      0.95      0.90       103

    accuracy                           0.88       184
   macro avg       0.89      0.87      0.88       184
weighted avg       0.89      0.88      0.88       184



### Model Interpretability

In [37]:
# import lime
# import lime.lime_tabular


# feature_names = predictors.columns

# # predictors = pd.concat([hearth_train_preprocessed_todf, 
# #                         hearth_dev_preprocessed_todf], 
# #                         axis = 0)

# # label = pd.concat([train_label, 
# #                     dev_label], 
# #                     axis = 0)

# # Criar explainer
# explainer = lime.lime_tabular.LimeTabularExplainer(
#     training_data=predictors.values,
#     feature_names=feature_names,
#     class_names=['Healthy', 'Hearth Attack'],  # Nomes das suas classes
#     # categorical_features=range(len(numeric_features), len(feature_names)),
#     # categorical_names={i: ['Não', 'Sim'] for i in range(len(numeric_features), len(feature_names))},
#     discretize_continuous=True,
#     verbose=True,
#     mode='classification'
# )

# idx = 0  # índice do exemplo que quer explicar
# instance = hearth_test_preprocessed_todf.iloc[idx].values

# # Gerar explicação
# exp = explainer.explain_instance(
#     data_row=instance,
#     predict_fn=best_model.predict_proba,
#     num_features=5,  # Número de features a mostrar
#     top_labels=2  # Número de classes a explicar
# )

# # Visualizar a explicação
# exp.show_in_notebook(show_table=True, show_all=True)


In [38]:
# import shap


# mod = best_model.named_steps['classifier']
# shap_values = explainer(hearth_test_preprocessed_todf)


# # use Kernel SHAP to explain test set predictions
# explainer = shap.KernelExplainer(mod.predict_proba, predictors, link="logit")
# shap_values = explainer.shap_values(hearth_test_preprocessed_todf, nsamples=100)

# print(predictors.shape)
# print(hearth_test_preprocessed_todf.shape)


# # plot the SHAP values for the class-0 output of the first instance
# # shap.force_plot(explainer.expected_value[0], 
# #                 shap_values[0][0,:], 
# #                 hearth_test_preprocessed_todf.iloc[0,:], link="logit")

# # # visualize the first prediction's explanation
# # shap.plots.bar(shap_values)


In [39]:
# print(shap_values.shape)
# print(hearth_test_preprocessed_todf.shape)

# # plot the SHAP values for the class-0 output of the first instance
# shap.force_plot(explainer.expected_value[0], 
#                 shap_values[0][0,:], 
#                 hearth_test_preprocessed_todf.iloc[0,:], link="logit")

# # visualize the first prediction's explanation
# shap.plots.bar(shap_values)