# **TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS - PART 4: FEATURE SELECTION AND MODEL ASSESSMENT**

## 1. Imports and Initial Transformation

In [2]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from imblearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [3]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=np.inf)

In [4]:
df_train = pd.read_csv('../Data/train_data_preproc_OpenEnded.csv', sep=',').set_index('Unnamed: 0')
df_val = pd.read_csv('../Data/validation_data_preproc_OpenEnded.csv', sep=',').set_index('Unnamed: 0')

In [5]:
X_train = df_train.drop(columns=['Agreement Reached'])
y_train = df_train['Agreement Reached']

X_val = df_val.drop(columns=['Agreement Reached'])
y_val = df_val['Agreement Reached']

In [6]:
df_train2 = pd.read_csv('../Data/train_data_preproc.csv', sep=',').set_index('Unnamed: 0')
df_val2 = pd.read_csv('../Data/validation_data_preproc.csv', sep=',').set_index('Unnamed: 0')
df_test2 = pd.read_csv('../Data/test_data_preproc.csv', sep=',').set_index('Unnamed: 0')

In [7]:
X_train2 = df_train2.drop(columns=['Claim Injury Type'])
y_train2 = df_train2['Claim Injury Type']

X_val2 = df_val2.drop(columns=['Claim Injury Type'])
y_val2 = df_val2['Claim Injury Type']

## 2. Feature selection

First we are going to define the model. We chose to use XGBoost for now because we believe it is a good model to deal with complex relationships and noisy data and after testing some models like Random Forest we were happier with the results this one provided. We are defining the number of trees as 200 to try to reduce the overfit and the same logic applies to the depth of each tree being equal to 6  <br>

We are going to use a Wrapper Method, mor specifically Recursive Feature Elimination, to find the optimum number of features for the model we defined. We also use Stratified K-Fold to guarantee that each fold contains the same class distribution of the training dataset. <br>

To evaluate model performance we use macro F1-Score, which is the simple average of the F1-scores for each class in a multiclass problem. This way, we are also consistent with Kaggle.

In [20]:
# defining the moedel

# model = XGBClassifier(
#     n_estimators=200,
#     max_depth=8,
#     learning_rate=0.2,
#     scale_pos_weight=19,  # Mantendo o ajuste para desbalanceamento
#     min_child_weight=10,
#     colsample_bytree=1.0,
#     subsample=0.8,
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )

model = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.15,  # Ajuste para tentar uma taxa de aprendizado ligeiramente menor
    scale_pos_weight=18,  # Ajuste para o desbalanceamento
    min_child_weight=8,  # Reduzido para permitir divisões mais leves
    colsample_bytree=0.9,  # Redução para testar se menos colunas melhoram a generalização
    subsample=0.85,  # Aumento ligeiro da amostragem para testar a robustez
    reg_alpha=0.1,  # Regularização L1 para maior controle de sparsity
    reg_lambda=1.0,  # Regularização L2 para penalizar pesos elevados
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

best_f1_score = 0
best_num_features = 0
best_selected_features = []

results = []


Now we are going to perform a for loop to test different numbers of features and determine the optimal count. Due to the fact that this code is very computationally expensive, we run different feature ranges in smaller batches, rather than testing all at once. The code below is an example of one range, and we will later present the results obtained. We are keeping not only the score on the validation, but also on the training so that we can evaluate if the model overfits.

In [21]:
for n_features in range(16, 30):

    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    
    fold_val_scores = []
    fold_train_scores = []
    
    for train_index, val_index in kf.split(X_train_rfe, y_train):
        X_fold_train, X_fold_val = X_train_rfe[train_index], X_train_rfe[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        
        y_train_pred = model.predict(X_fold_train)
        y_val_pred = model.predict(X_fold_val)
        
        train_f1 = f1_score(y_fold_train, y_train_pred, average='macro')
        val_f1 = f1_score(y_fold_val, y_val_pred, average='macro')
        
        fold_train_scores.append(train_f1)
        fold_val_scores.append(val_f1)
    
    avg_train_f1 = np.mean(fold_train_scores)
    avg_val_f1 = np.mean(fold_val_scores)
    overfit_percentage = ((avg_train_f1 - avg_val_f1) / avg_train_f1) * 100
    
    print(f"Number of features: {n_features} | Avg Train F1-Score: {avg_train_f1:.4f} | Avg Val F1-Score: {avg_val_f1:.4f} | Overfit %: {overfit_percentage:.2f}%")
    
    results.append({
        'Number of features': n_features,
        'Average Training F1-Score Macro': avg_train_f1,
        'Average Validation F1-Score Macro': avg_val_f1,
        'Overfit Percentage': overfit_percentage
    })
    
    if avg_val_f1 > best_f1_score:
        best_f1_score = avg_val_f1
        best_num_features = n_features
        best_selected_features = X_train.columns[rfe.support_]

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 16 | Avg Train F1-Score: 0.6549 | Avg Val F1-Score: 0.6339 | Overfit %: 3.20%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 17 | Avg Train F1-Score: 0.6858 | Avg Val F1-Score: 0.6543 | Overfit %: 4.60%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 18 | Avg Train F1-Score: 0.6957 | Avg Val F1-Score: 0.6594 | Overfit %: 5.22%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 19 | Avg Train F1-Score: 0.6968 | Avg Val F1-Score: 0.6606 | Overfit %: 5.19%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 20 | Avg Train F1-Score: 0.6989 | Avg Val F1-Score: 0.6622 | Overfit %: 5.24%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 21 | Avg Train F1-Score: 0.6982 | Avg Val F1-Score: 0.6618 | Overfit %: 5.21%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 22 | Avg Train F1-Score: 0.6995 | Avg Val F1-Score: 0.6622 | Overfit %: 5.34%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 23 | Avg Train F1-Score: 0.7041 | Avg Val F1-Score: 0.6655 | Overfit %: 5.49%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 24 | Avg Train F1-Score: 0.7063 | Avg Val F1-Score: 0.6660 | Overfit %: 5.70%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 25 | Avg Train F1-Score: 0.7074 | Avg Val F1-Score: 0.6672 | Overfit %: 5.68%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 26 | Avg Train F1-Score: 0.7079 | Avg Val F1-Score: 0.6671 | Overfit %: 5.77%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 27 | Avg Train F1-Score: 0.7069 | Avg Val F1-Score: 0.6665 | Overfit %: 5.71%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 28 | Avg Train F1-Score: 0.7096 | Avg Val F1-Score: 0.6679 | Overfit %: 5.88%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Number of features: 29 | Avg Train F1-Score: 0.7099 | Avg Val F1-Score: 0.6681 | Overfit %: 5.89%


Finally, we see our best results and convert our results to a dataframe to keep them on a csv

In [22]:
results_df = pd.DataFrame(results)

print(f"\nBest number of features: {best_num_features}")
print(f"Best F1-Score Macro on validation: {best_f1_score:.4f}")
print("Selected features:", best_selected_features.tolist())

results_df.to_csv("../Data/feature_selection_results_(16-29)_AgreReach.csv", index=False)


Best number of features: 29
Best F1-Score Macro on validation: 0.6681
Selected features: ['Attorney/Representative', 'Average Weekly Wage', 'Carrier Name', 'COVID-19 Indicator', 'IME-4 Count', 'Industry Code', 'C-3 Missed Timing', 'Days Difference', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report', 'Accident Date_year', 'Assembly Date_year', 'C-2 Date_year', 'C-3 Date_year', 'First Hearing Date_year', 'Carrier Type_3A. SELF PUBLIC', 'Carrier Type_4A. SELF PRIVATE', 'Carrier Type_5D. SPECIAL FUND - UNKNOWN', 'Carrier Type_UNKNOWN', 'District Name_BINGHAMTON', 'District Name_BUFFALO', 'District Name_NYC', 'District Name_ROCHESTER', 'District Name_STATEWIDE', 'District Name_SYRACUSE', 'Medical Fee Region_IV', 'Medical Fee Region_UK']


In [24]:
#Importing the results we got before

file_0 = pd.read_csv("../Data/feature_selection_results_(16-29)_AgreReach.csv", sep=',')
# file_1 = pd.read_csv("../Data/feature_selection_results_(20-24).csv", sep=',')
# file_2 = pd.read_csv("../Data/feature_selection_results_(25-31).csv", sep=',')
# file_3 = pd.read_csv("../Data/feature_selection_results_(32-48).csv", sep=',')

combined_results = pd.concat([file_0], ignore_index=True)

combined_results

Unnamed: 0,Number of features,Average Training F1-Score Macro,Average Validation F1-Score Macro,Overfit Percentage
0,16,0.654864,0.6339,3.201182
1,17,0.685816,0.654291,4.596604
2,18,0.695679,0.659381,5.217517
3,19,0.696768,0.660629,5.18668
4,20,0.698866,0.662247,5.2397
5,21,0.698191,0.661782,5.214751
6,22,0.699533,0.662195,5.337486
7,23,0.704137,0.66551,5.485772
8,24,0.706262,0.666011,5.699222
9,25,0.707372,0.667176,5.682359


Analysing our results we understand that the less number of features the less the model overfits and the F1 Macro Score doesn't change much. However, we are aware that we are still ocuring in substantial overfitting and that the model isn't generalysing as well as we'd like, we'll choose the number of features 17, since it seems to be the best trade off in overfit and score on validation.

## MODELO PARA AGREEMENT REACHED

### 3.1. XGBoost

Now that we chose to keep 17 features, we run RFE again just to select these x best features for the model. We will define a pipeline to facilitate the process in case we want to apply techniques to deal with class imbalance like SMOTE.

##### Modelo com RFE para selecionar features

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# num_features = 16

# model = XGBClassifier(
#     n_estimators=200,
#     max_depth=6,
#     learning_rate=0.1,
#     scale_pos_weight=1,
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )


# model = XGBClassifier(
#     n_estimators=200,
#     max_depth=8,
#     learning_rate=0.15,  # Ajuste para tentar uma taxa de aprendizado ligeiramente menor
#     scale_pos_weight=18,  # Ajuste para o desbalanceamento
#     min_child_weight=8,  # Reduzido para permitir divisões mais leves
#     colsample_bytree=0.9,  # Redução para testar se menos colunas melhoram a generalização
#     subsample=0.85,  # Aumento ligeiro da amostragem para testar a robustez
#     reg_alpha=0.1,  # Regularização L1 para maior controle de sparsity
#     reg_lambda=1.0,  # Regularização L2 para penalizar pesos elevados
#     random_state=42,
#     use_label_encoder=False,
#     eval_metric='mlogloss'
# )

# from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=1000,                  # Número elevado de iterações para permitir ajustes finos
    depth=8,                          # Profundidade moderada para capturar relações complexas sem overfitting
    learning_rate=0.05,               # Taxa de aprendizado menor para ajustes mais cuidadosos
    l2_leaf_reg=4,                    # Regularização L2 para evitar overfitting
    random_strength=1.5,              # Introduz ruído no modelo para aumentar a generalização
    bagging_temperature=1.0,          # Controle do bootstrap para reduzir overfitting
    border_count=254,                 # Aumenta a quantidade de divisões de bin para capturar mais nuances nos dados
    scale_pos_weight=20,              # Ajuste para lidar com desbalanceamento (baseado na relação de classes)
    eval_metric="F1",                 # Focado em otimizar F1, ideal para problemas desbalanceados
    random_seed=42,
    early_stopping_rounds=50,         # Parada precoce para evitar overfitting e reduzir o tempo de treino
    verbose=100                       # Saída periódica para monitoramento
)


pipeline = Pipeline([
    ('classifier', model)
])



# rfe = RFE(estimator=model, n_features_to_select=num_features, step=5)
# X_train_rfe = rfe.fit_transform(X_train, y_train)

# all_features = X_train.columns
# selected_features = all_features[rfe.support_]
# removed_features = all_features[~rfe.support_]

# X_train_sample_selected = X_train[selected_features]
# X_val_sample_selected = X_val[selected_features]
# print("Features selecionadas:", selected_features.tolist())


We'll use the Stratified K-Fold to evaluate the model in each of the folders and to optimize the amount of data we have. Relatively to the score we are planning to use F1 Macro, Precision Macro and Recall Macro. The preference for Macro comes from the fact that it attributes equal weight to all classes, regardless of how many instances each class has, ensuring the model performance is assessed fairly. <br> 

Using Precision and Recall alongside F1 Macro allows us to gain a deeper understanding of our model's behavior in the context of an imbalanced dataset, offering insights into different aspects of its performance across classes

#### 3.1.1. F1-Macro SEM RFE

In [61]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_fold_scores = []
val_fold_scores = []

for train_index, val_index in kf.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    pipeline.fit(X_fold_train, y_fold_train)
    
    y_train_pred = pipeline.predict(X_fold_train)
    y_val_pred = pipeline.predict(X_fold_val)
    
    train_f1 = f1_score(y_fold_train, y_train_pred, average='macro')
    val_f1 = f1_score(y_fold_val, y_val_pred, average='macro')
    
    train_fold_scores.append(train_f1)
    val_fold_scores.append(val_f1)

    print(f"Fold - F1-Score Macro (Train): {train_f1:.4f} | F1-Score Macro (Validation): {val_f1:.4f}")

average_train_f1 = np.mean(train_fold_scores)
average_val_f1 = np.mean(val_fold_scores)

print(f"\nAverage F1-Score Macro on Train: {average_train_f1:.4f}")
print(f"Average F1-Score Macro on Validation: {average_val_f1:.4f}")

0:	learn: 0.8916663	total: 51.2ms	remaining: 51.2s
100:	learn: 0.8941335	total: 4.01s	remaining: 35.7s
200:	learn: 0.8988636	total: 8.22s	remaining: 32.7s
300:	learn: 0.9044324	total: 12.7s	remaining: 29.6s
400:	learn: 0.9101564	total: 17.3s	remaining: 25.8s
500:	learn: 0.9152026	total: 21.7s	remaining: 21.7s
600:	learn: 0.9196044	total: 26.2s	remaining: 17.4s
700:	learn: 0.9243697	total: 30.6s	remaining: 13s
800:	learn: 0.9284861	total: 35.1s	remaining: 8.71s
900:	learn: 0.9311925	total: 39.5s	remaining: 4.34s
999:	learn: 0.9339855	total: 44s	remaining: 0us
Fold - F1-Score Macro (Train): 0.6952 | F1-Score Macro (Validation): 0.6660
0:	learn: 0.8904660	total: 49ms	remaining: 48.9s
100:	learn: 0.8932828	total: 4.5s	remaining: 40.1s
200:	learn: 0.8976046	total: 9.34s	remaining: 37.1s
300:	learn: 0.9034039	total: 15.3s	remaining: 35.6s
400:	learn: 0.9088254	total: 20.4s	remaining: 30.5s
500:	learn: 0.9140982	total: 24.9s	remaining: 24.8s
600:	learn: 0.9185871	total: 29.3s	remaining: 19.4s

#### 3.1.2. F1-Macro COM RFE

In [60]:
# Definir StratifiedKFold para o conjunto de treino com features selecionadas
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Inicializar listas para armazenar os resultados de cada fold
train_fold_scores = []
val_fold_scores = []

# Realizar o cross-validation com as features selecionadas
for train_index, val_index in kf.split(X_train_sample_selected, y_train):
    X_fold_train, X_fold_val = X_train_sample_selected.iloc[train_index], X_train_sample_selected.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Treinar o pipeline no fold atual
    pipeline.fit(X_fold_train, y_fold_train)
    
    # Previsões para treino e validação
    y_train_pred = pipeline.predict(X_fold_train)
    y_val_pred = pipeline.predict(X_fold_val)
    
    # Calcular o F1-Score Macro para treino e validação
    train_f1 = f1_score(y_fold_train, y_train_pred, average='macro')
    val_f1 = f1_score(y_fold_val, y_val_pred, average='macro')
    
    # Armazenar as pontuações de treino e validação
    train_fold_scores.append(train_f1)
    val_fold_scores.append(val_f1)

    print(f"Fold - F1-Score Macro (Train): {train_f1:.4f} | F1-Score Macro (Validation): {val_f1:.4f}")

# Calcular as médias das pontuações de treino e validação
average_train_f1 = np.mean(train_fold_scores)
average_val_f1 = np.mean(val_fold_scores)

print(f"\nAverage F1-Score Macro on Train: {average_train_f1:.4f}")
print(f"Average F1-Score Macro on Validation: {average_val_f1:.4f}")

0:	learn: 0.8914912	total: 27.4ms	remaining: 27.3s
100:	learn: 0.8929902	total: 3.14s	remaining: 28s
200:	learn: 0.8947132	total: 6.23s	remaining: 24.8s
300:	learn: 0.8966406	total: 9.45s	remaining: 21.9s
400:	learn: 0.8986079	total: 12.7s	remaining: 19s
500:	learn: 0.9004723	total: 16s	remaining: 15.9s
600:	learn: 0.9019955	total: 19.2s	remaining: 12.8s
700:	learn: 0.9037792	total: 22.5s	remaining: 9.58s
800:	learn: 0.9049263	total: 25.7s	remaining: 6.39s
900:	learn: 0.9065736	total: 29.1s	remaining: 3.19s
999:	learn: 0.9078393	total: 32.4s	remaining: 0us
Fold - F1-Score Macro (Train): 0.6537 | F1-Score Macro (Validation): 0.6448
0:	learn: 0.8903154	total: 32.9ms	remaining: 32.9s
100:	learn: 0.8922039	total: 3.57s	remaining: 31.8s
200:	learn: 0.8935664	total: 7.13s	remaining: 28.4s
300:	learn: 0.8953130	total: 10.8s	remaining: 25.1s
400:	learn: 0.8975566	total: 14.6s	remaining: 21.8s
500:	learn: 0.8992595	total: 18.3s	remaining: 18.3s
600:	learn: 0.9008380	total: 22.1s	remaining: 14.7

KeyboardInterrupt: 

## Modelo CLAIM INJURY TYPE SEM AGREEMENT REACHED

In [35]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Listas para armazenar os resultados de treino e validação para a abordagem sem a nova feature
train_fold_scores_all = []
val_fold_scores_all = []

# Primeira abordagem: Todas as features iniciais (sem predicted_agreement_reached)
for train_index, val_index in kf.split(X_train2, y_train2):
    X_fold_train, X_fold_val = X_train2.iloc[train_index], X_train2.iloc[val_index]
    y_fold_train, y_fold_val = y_train2.iloc[train_index], y_train2.iloc[val_index]
    
    pipeline.fit(X_fold_train, y_fold_train)
    
    y_train_pred_all = pipeline.predict(X_fold_train)
    y_val_pred_all = pipeline.predict(X_fold_val)
    
    # Calcular F1-Score Macro para treino e validação
    train_f1_all = f1_score(y_fold_train, y_train_pred_all, average='macro')
    val_f1_all = f1_score(y_fold_val, y_val_pred_all, average='macro')
    
    # Armazenar os F1-Scores para esta abordagem
    train_fold_scores_all.append(train_f1_all)  
    val_fold_scores_all.append(val_f1_all)

# Calcular médias dos F1-Scores para esta abordagem
average_train_f1_all = np.mean(train_fold_scores_all)
average_val_f1_all = np.mean(val_fold_scores_all)

# Imprimir os resultados
print(f"\nAverage F1-Score Macro on Train (All Features): {average_train_f1_all:.4f}")
print(f"Average F1-Score Macro on Validation (All Features): {average_val_f1_all:.4f}")


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.




Average F1-Score Macro on Train (All Features): 0.6348
Average F1-Score Macro on Validation (All Features): 0.4354


## Modelo CLAIM INJURY TYPE COM E SEM AGREEMENT REACHED SEM RFE

In [58]:
# Treinar o pipeline para prever Agreement Reached
pipeline.fit(X_train, y_train)

# Gerar previsões para o conjunto de treino e validação para a variável Agreement Reached
y_train_agreement_pred = pipeline.predict(X_train)
y_val_agreement_pred = pipeline.predict(X_val)


Parameters: { "use_label_encoder" } are not used.



In [59]:
# Adicionar as previsões de Agreement Reached como nova coluna em X_train2 e X_val2
X_train_with_agreement = X_train2.copy()
X_train_with_agreement['predicted_agreement_reached'] = y_train_agreement_pred

X_val_with_agreement = X_val2.copy()
X_val_with_agreement['predicted_agreement_reached'] = y_val_agreement_pred


In [60]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Listas para armazenar os resultados de treino e validação para a abordagem com a nova feature
train_fold_scores_with_agreement = []
val_fold_scores_with_agreement = []

# Segunda abordagem: Todas as features iniciais + predicted_agreement_reached
for train_index, val_index in kf.split(X_train_with_agreement, y_train2):
    X_fold_train, X_fold_val = X_train_with_agreement.iloc[train_index], X_train_with_agreement.iloc[val_index]
    y_fold_train, y_fold_val = y_train2.iloc[train_index], y_train2.iloc[val_index]
    
    # Treinar o pipeline
    pipeline.fit(X_fold_train, y_fold_train)
    
    # Previsões para treino e validação
    y_train_pred_with_agreement = pipeline.predict(X_fold_train)
    y_val_pred_with_agreement = pipeline.predict(X_fold_val)
    
    # Calcular F1-Score Macro para treino e validação
    train_f1_with_agreement = f1_score(y_fold_train, y_train_pred_with_agreement, average='macro')
    val_f1_with_agreement = f1_score(y_fold_val, y_val_pred_with_agreement, average='macro')
    
    # Armazenar as pontuações
    train_fold_scores_with_agreement.append(train_f1_with_agreement)
    val_fold_scores_with_agreement.append(val_f1_with_agreement)

# Calcular as médias das pontuações
average_train_f1_with_agreement = np.mean(train_fold_scores_with_agreement)
average_val_f1_with_agreement = np.mean(val_fold_scores_with_agreement)

# Imprimir os resultados
print(f"\nAverage F1-Score Macro on Train (All Features + Agreement Reached): {average_train_f1_with_agreement:.4f}")
print(f"Average F1-Score Macro on Validation (All Features + Agreement Reached): {average_val_f1_with_agreement:.4f}")


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.




Average F1-Score Macro on Train (All Features + Agreement Reached): 0.7649
Average F1-Score Macro on Validation (All Features + Agreement Reached): 0.4457


## Modelo CLAIM INJURY TYPE COM AGREEMENT REACHED PÓS RFE

Crio modelo para agreement reached, faço feature selection no agreement reached e fico com X_train_sample_selected e X_val_sample_selected. Pelo facto de que as linhas vão ser as mesmas que no modelo de claim injury type vou incluir as previsões logo no X_train_sample_selected e X_val_..., depois meto essa coluna no X_train2 e X_val2 que preveem claim injury type, ou seja, como se transferrisemos a coluna?

In [36]:
# Treinar o modelo de 'Agreement Reached' com as features selecionadas
pipeline.fit(X_train_sample_selected, y_train)

# Gerar previsões para 'Agreement Reached' no conjunto de treino e validação
y_train_agreement_pred = pipeline.predict(X_train_sample_selected)
y_val_agreement_pred = pipeline.predict(X_val_sample_selected)


Parameters: { "use_label_encoder" } are not used.



In [38]:
# Adicionar a previsão de Agreement Reached como uma nova coluna em X_train2 e X_val2
X_train2['predicted_agreement_reached'] = y_train_agreement_pred
X_val2['predicted_agreement_reached'] = y_val_agreement_pred

In [39]:
# Definir y_train2 e y_val2 para o modelo de 'Claim Injury Type'
y_train_claim = y_train2  # target para 'Claim Injury Type'
y_val_claim = y_val2

# Treina o modelo com todas as features, incluindo 'predicted_agreement_reached'
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_fold_scores = []
val_fold_scores = []

# Usar X_train2 e y_train_claim com a nova feature 'predicted_agreement_reached'
for train_index, val_index in kf.split(X_train2, y_train_claim):
    X_fold_train, X_fold_val = X_train2.iloc[train_index], X_train2.iloc[val_index]
    y_fold_train, y_fold_val = y_train_claim.iloc[train_index], y_train_claim.iloc[val_index]
    
    # Treinar o pipeline
    pipeline.fit(X_fold_train, y_fold_train)
    
    # Fazer previsões
    y_train_pred = pipeline.predict(X_fold_train)
    y_val_pred = pipeline.predict(X_fold_val)
    
    # Calcular F1-Score Macro
    train_f1 = f1_score(y_fold_train, y_train_pred, average='macro')
    val_f1 = f1_score(y_fold_val, y_val_pred, average='macro')
    
    # Armazenar as pontuações
    train_fold_scores.append(train_f1)
    val_fold_scores.append(val_f1)

# Calcular médias
average_train_f1 = np.mean(train_fold_scores)
average_val_f1 = np.mean(val_fold_scores)

print(f"\nAverage F1-Score Macro on Train: {average_train_f1:.4f}")
print(f"Average F1-Score Macro on Validation: {average_val_f1:.4f}")


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.




Average F1-Score Macro on Train: 0.7164
Average F1-Score Macro on Validation: 0.4457


#### RFE ao modelo final

Avaliar importancia das features no modelo XGBoost

In [44]:
# Treinar o modelo com todas as features, incluindo a nova variável
pipeline.fit(X_train2, y_train2)

# Obter a importância das features
feature_importances = pipeline.named_steps['classifier'].feature_importances_

# Associar importâncias às features completas
feature_importance_df = pd.DataFrame({
    'Feature': X_train2.columns,  # Usando todas as colunas do X_train2
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



                                                     Feature  Importance
19                                               C-2 Missing    0.230062
3                                        Average Weekly Wage    0.135489
2                                    Attorney/Representative    0.093330
1                             Alternative Dispute Resolution    0.090315
22                                          Has IME-4 Report    0.083018
21                                               Has Hearing    0.041699
44                                      Carrier Type_UNKNOWN    0.029794
7                                         COVID-19 Indicator    0.025554
29                                             C-2 Date_year    0.016048
20                                               C-3 Missing    0.014451
9                                                IME-4 Count    0.012592
43                   Carrier Type_5D. SPECIAL FUND - UNKNOWN    0.011847
16                                         C-2 Miss

#### 3.1.4. Training the Model

In [19]:
#Training the model in the entire dataset before making a prediction on the test data

pipeline.fit(X_train_sample_selected, y_train)

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



In [None]:
X_test_selected = df_test[selected_features]

y_test_pred = pipeline.predict(X_test_selected)

class_mapping = {
    0: "1. CANCELLED",
    1: "2. NON-COMP",
    2: "3. MED ONLY",
    3: "4. TEMPORARY",
    4: "5. PPD SCH LOSS",
    5: "6. PPD NSL",
    6: "7. PTD",
    7: "8. DEATH"
}

df_submission = pd.DataFrame({
    'Claim Identifier': df_test.index,
    'Claim Injury Type': y_test_pred
})

df_submission['Claim Injury Type'] = df_submission['Claim Injury Type'].map(class_mapping)

df_submission.to_csv("../Data/Group43_Version10.csv", index=False)