# Import the Dataset and Explore the Data <a name="import-the-dataset-and-explore-the-data"></a>


## 1.1 Importing Libraries <a name="11-importing-libraries"></a>


In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from kmodes.kmodes import KModes
from math import ceil

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from scipy.stats import chi2_contingency

#wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# embedded methods
from sklearn.linear_model import LassoCV

import warnings
# NOTE(review): this hides every warning for the rest of the notebook, including
# scikit-learn convergence and undefined-metric warnings — consider narrowing it.
warnings.filterwarnings('ignore')


# for better resolution plots
%config InlineBackend.figure_format = 'retina' # high-DPI raster output; 'svg' is the vector alternative

# Setting seaborn style
sns.set()

## Loading the variables from the previous notebook

In [9]:
# Feature lists saved by the previous notebook in the local `variables` module.
from variables import (
    feature_selection,
    selected_features_rf,
    selected_features_dt,
    selected_features_rfe,
    # selected_features_svm,
)

# Display the general-purpose feature list
feature_selection

['Age at Injury',
 'Alternative Dispute Resolution',
 'Attorney/Representative',
 'Average Weekly Wage',
 'Carrier Name',
 'County of Injury',
 'COVID-19 Indicator',
 'Gender',
 'IME-4 Count',
 'Industry Code',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code',
 'Number of Dependents',
 'First Hearing',
 'C-3 Delivery',
 'C-2 Delivery',
 'Days from Accident to C-2',
 'Forms Delivered Count',
 'Claim Antiguity',
 'Valid Full Claim',
 'C-3 under Deadline_no info',
 'C-3 under Deadline_yes',
 'Carrier Type_2']

## Loading the previous Datasets <a name="12-loading-and-reading-the-dataset"></a>


In [18]:
# Load the processed train / validation / test feature matrices; the first CSV
# column is restored as the row index.
X_train_processed, X_val_processed, X_test_processed = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'X_train_processed.csv',
        'X_val_processed.csv',
        'X_test_processed.csv',
    )
)

In [27]:
# Load the "_copy" variants of the processed feature matrices; the first CSV
# column is restored as the row index.
X_train_processed_copy, X_val_processed_copy, X_test_processed_copy = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'X_train_processed_copy.csv',
        'X_val_processed_copy.csv',
        'X_test_processed_copy.csv',
    )
)

In [29]:
# Read each target as a one-column DataFrame, restoring the saved index so the
# rows can be aligned with the feature matrices.
y_train, y_val = (
    pd.read_csv(target_path, index_col=0)
    for target_path in ("y_train.csv", "y_val.csv")
)

In [31]:
# Collapse the one-column target DataFrames into pandas Series (needed for
# operations such as the Chi-Square independence test).
# The isinstance guard keeps the cell idempotent: a Series cannot be indexed
# with `.iloc[:, 0]`, so re-running the unguarded version raises.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.iloc[:, 0]
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.iloc[:, 0]

## Prepare datasets with selected features for each model

In [36]:
# Training columns chosen by the Random Forest feature selection
X_train_rf = X_train_processed_copy.loc[:, selected_features_rf]

In [40]:
# Validation columns chosen by the Random Forest feature selection
X_val_rf = X_val_processed_copy.loc[:, selected_features_rf]

In [48]:
# Training matrix restricted to the general `feature_selection` columns
X_train = X_train_processed.loc[:, feature_selection]

In [52]:
# Validation matrix restricted to the general `feature_selection` columns
X_val = X_val_processed.loc[:, feature_selection]

In [59]:
# Show the columns that X_train / X_val were restricted to
feature_selection

['Age at Injury',
 'Alternative Dispute Resolution',
 'Attorney/Representative',
 'Average Weekly Wage',
 'Carrier Name',
 'County of Injury',
 'COVID-19 Indicator',
 'Gender',
 'IME-4 Count',
 'Industry Code',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code',
 'Number of Dependents',
 'First Hearing',
 'C-3 Delivery',
 'C-2 Delivery',
 'Days from Accident to C-2',
 'Forms Delivered Count',
 'Claim Antiguity',
 'Valid Full Claim',
 'C-3 under Deadline_no info',
 'C-3 under Deadline_yes',
 'Carrier Type_2']

# 6. Model Assessment and Selection

## Random Forest

In [54]:
# Random Forest trained on the `feature_selection` columns.
model = RandomForestClassifier(
    # ensemble size and reproducibility
    n_estimators=300,         # more trees than the library default
    random_state=42,
    # limits on how far each tree may grow
    max_depth=20,             # cap on tree depth
    min_samples_split=5,      # minimum samples required to split a node
    min_samples_leaf=2,       # minimum samples required in a leaf
    max_features='sqrt',      # number of features considered at each split
    # class imbalance
    class_weight='balanced',  # adjust class weights to handle imbalanced classes
)
model.fit(X=X_train, y=y_train)

# Validation predictions and the metrics derived from them
y_pred = model.predict(X=X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_val, y_pred=y_pred)
class_report = classification_report(y_true=y_val, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.71
Confusion Matrix:
[[ 1729  1507   132   311    34     0     0    30]
 [  894 77213  1358  7454   343     0     0    62]
 [   38  9698  1266  7495  2072    23     0    80]
 [   32  3342   668 31758  8320   230     0   202]
 [    0   130    71  4726  9394    95     0    68]
 [    0     1     2   774   471    13     0     2]
 [    0     0     0    19     9     0     0     1]
 [    0    11     9    46     8     0     0    67]]

Classification Report:
              precision    recall  f1-score   support

           1       0.64      0.46      0.54      3743
           2       0.84      0.88      0.86     87324
           3       0.36      0.06      0.10     20672
           4       0.60      0.71      0.65     44552
           5       0.45      0.65      0.53     14484
           6       0.04      0.01      0.02      1263
           7       0.00      0.00      0.00        29
           8       0.13      0.48      0.21       141

    accuracy                           0.71   

In [55]:
# Compare how often each class occurs in the true validation labels with how
# often the model predicts it; a class absent from the second list is never predicted.
print("Class distribution in y_val:", y_val.value_counts())
print("Class distribution in y_pred:", pd.Series(y_pred).value_counts())

Class distribution in y_val: Claim Injury Type
2    87324
4    44552
3    20672
5    14484
1     3743
6     1263
8      141
7       29
Name: count, dtype: int64
Class distribution in y_pred: 2    91902
4    52583
5    20651
3     3506
1     2693
8      512
6      361
Name: count, dtype: int64


In [771]:
# Training-set predictions of the Random Forest.
# Stored in `y_pred_train`: the previous version assigned them to `y_train`, which
# replaced the true training labels with predictions for every later cell.
# `X_train` is the matrix `model` was fitted on.
y_pred_train = model.predict(X_train)

In [772]:
# Rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_val, y_pred=y_pred)

array([[ 1066,  1254,    90,    80,     6,     0,     0,     0],
       [  626, 54223,   806,  2522,    34,     0,     0,     5],
       [   15,  6828,  1326,  4786,   806,    15,     0,     5],
       [   16,  1019,   586, 24875,  2903,   273,     0,    30],
       [    0,    14,    49,  3703,  5759,   120,     0,    11],
       [    0,     1,    11,   555,   233,    41,     0,     1],
       [    0,     0,     0,    14,     5,     0,     0,     0],
       [    2,    14,     9,    31,     1,     0,     0,    37]],
      dtype=int64)

In [773]:
# Share of validation rows predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred)

0.7606483981673432

In [774]:
# Macro average: unweighted mean of the per-class precision
precision_score(y_true=y_val, y_pred=y_pred, average='macro')  # or 'micro' or 'weighted'


0.464115529053868

In [775]:
# Macro average: unweighted mean of the per-class recall
recall_score(y_true=y_val, y_pred=y_pred, average='macro')  # or 'micro' or 'weighted'

0.4163658011919117

In [776]:
# Macro average: unweighted mean of the per-class F1
f1_score(y_true=y_val, y_pred=y_pred, average='macro')

0.42107458018466154

## KNN

In [789]:
# k-nearest-neighbours classifier; n_neighbors=5 is the library default, spelled out
modelKNN = KNeighborsClassifier(n_neighbors=5)

In [791]:
# NOTE(review): X_train_scaled_fs is not defined in the visible part of this
# notebook — confirm which cell creates it.
modelKNN.fit(X_train_scaled_fs, y_train)

In [None]:
# KNN class predictions for the training and validation matrices
y_pred_train = modelKNN.predict(X=X_train_scaled_fs)
y_pred_val = modelKNN.predict(X=X_val_scaled_fs)

In [None]:
# Per-class probability estimates for each validation row
modelKNN.predict_proba(X=X_val_scaled_fs)

In [None]:
# Training accuracy: score against the true labels. Scoring against the model's
# own predictions (as before) always yields 1.0.
print(modelKNN.score(X_train_scaled_fs, y_train))

In [None]:
# Validation accuracy: score against the true labels. Scoring against the model's
# own predictions (as before) always yields 1.0.
print(modelKNN.score(X_val_scaled_fs, y_val))

In [None]:
# Confusion matrix of the KNN validation predictions (rows: true, columns: predicted).
# `labels_val` was never defined; the KNN validation predictions live in `y_pred_val`.
confusion_matrix(y_val, y_pred_val)

In [None]:
# Check class distribution: true validation labels vs. the KNN validation
# predictions stored in y_pred_val
print("True class distribution in y_val:", pd.Series(y_val).value_counts())
print("Predicted class distribution in y_pred:", pd.Series(y_pred_val).value_counts())

In [None]:
# Share of validation rows KNN predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred_val)

In [None]:
# Macro average: unweighted mean of the per-class precision
precision_score(y_true=y_val, y_pred=y_pred_val, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class recall
recall_score(y_true=y_val, y_pred=y_pred_val, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class F1
f1_score(y_true=y_val, y_pred=y_pred_val, average='macro')

## Logistic Regression

In [None]:
# Multinomial logistic regression with class re-weighting.
log_model = LogisticRegression(
    # model form
    multi_class='multinomial',  # one joint multi-class model
    penalty='l2',               # L2 regularization
    C=1.0,                      # default regularization strength
    class_weight='balanced',    # handle class imbalance
    # optimisation
    solver='saga',              # solver suited to large datasets
    max_iter=500,               # raised iteration cap
    n_jobs=-1,                  # use all CPU cores
    # bookkeeping
    random_state=42,            # reproducibility
    verbose=1,                  # print training progress
)

In [None]:
# Fit the logistic regression on the training matrix
log_model.fit(X=X_train_scaled_fs, y=y_train)

In [None]:
# Logistic-regression predictions for the training and validation matrices
y_pred_train = log_model.predict(X=X_train_scaled_fs)
y_pred = log_model.predict(X=X_val_scaled_fs)

# Display the validation predictions
y_pred

In [None]:
# Check class distribution: true validation labels vs. the logistic-regression
# validation predictions stored in y_pred
print("True class distribution in y_val:", pd.Series(y_val).value_counts())
print("Predicted class distribution in y_pred:", pd.Series(y_pred).value_counts())

In [None]:
# Mean accuracy of the fitted logistic regression on each split
train_score = log_model.score(X=X_train_scaled_fs, y=y_train)
val_score = log_model.score(X=X_val_scaled_fs, y=y_val)

# Report both so the train/validation gap is visible side by side
print("Logistic Regression Training Accuracy:", train_score)
print("Logistic Regression Validation Accuracy:", val_score)

In [None]:
# Share of validation rows predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Macro average: unweighted mean of the per-class precision
precision_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class recall
recall_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class F1
f1_score(y_true=y_val, y_pred=y_pred, average='macro')

### Using RFE's Feature Selection

In [None]:
# Re-fit the same `log_model` object on the RFE feature matrix; this overwrites
# the coefficients learned in the previous fit.
# NOTE(review): X_train_scaled_lr is not defined in the visible part of this
# notebook — confirm which cell creates it.
log_model.fit(X=X_train_scaled_lr, y=y_train)

In [None]:
# Predictions of the re-fitted logistic regression on the RFE feature matrices
y_pred_train = log_model.predict(X=X_train_scaled_lr)
y_pred = log_model.predict(X=X_val_scaled_lr)

# Display the validation predictions
y_pred

In [None]:
# Check class distribution: true validation labels vs. the validation
# predictions stored in y_pred
print("True class distribution in y_val:", pd.Series(y_val).value_counts())
print("Predicted class distribution in y_pred:", pd.Series(y_pred).value_counts())

In [None]:
# Mean accuracy of the re-fitted logistic regression on each split
train_score = log_model.score(X=X_train_scaled_lr, y=y_train)
val_score = log_model.score(X=X_val_scaled_lr, y=y_val)

# Report both so the train/validation gap is visible side by side
print("Logistic Regression Training Accuracy:", train_score)
print("Logistic Regression Validation Accuracy:", val_score)

In [None]:
# Share of validation rows predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Macro average: unweighted mean of the per-class precision
precision_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class recall
recall_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class F1
f1_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# NOTE(review): `metrics` is neither imported nor defined in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel — confirm where
# the helper comes from and import it in the first cell.
metrics(y_train = y_train, pred_train = y_pred_train, y_val = y_val, pred_val = y_pred)

## Support Vector Machines

In [None]:
# Linear-kernel support vector classifier.
SVC_model = SVC(kernel='linear')

# Fit before the following cells call predict()/score(); the estimator was
# previously used unfitted, which raises NotFittedError.
# NOTE(review): kernel-SVC training can be very slow on large training sets —
# consider LinearSVC or a subsample if this cell does not finish.
SVC_model.fit(X_train_scaled_fs, y_train)

In [None]:
# SVM predictions for the training and validation matrices
y_pred_train = SVC_model.predict(X=X_train_scaled_fs)
y_pred = SVC_model.predict(X=X_val_scaled_fs)

# Display the validation predictions
y_pred

In [None]:
# Check class distribution: true validation labels vs. the SVM validation
# predictions stored in y_pred
print("True class distribution in y_val:", pd.Series(y_val).value_counts())
print("Predicted class distribution in y_pred:", pd.Series(y_pred).value_counts())

In [None]:
# Compute accuracy score for training data
train_score = SVC_model.score(X_train_scaled_fs, y_train)

# Compute accuracy score for validation data
val_score = SVC_model.score(X_val_scaled_fs, y_val)

# Print the scores. The labels previously read "Logistic Regression" although
# the scores come from SVC_model.
print("SVM Training Accuracy:", train_score)
print("SVM Validation Accuracy:", val_score)

In [None]:
# Share of validation rows predicted correctly
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Macro average: unweighted mean of the per-class precision
precision_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class recall
recall_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# Macro average: unweighted mean of the per-class F1
f1_score(y_true=y_val, y_pred=y_pred, average='macro')

In [None]:
# NOTE(review): `metrics` is neither imported nor defined in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel — confirm where
# the helper comes from and import it in the first cell.
metrics(y_train = y_train, pred_train = y_pred_train, y_val = y_val, pred_val = y_pred)

## Naive Bayes

In [None]:
# Stratified k-fold cross-validation of Gaussian Naive Bayes, scored with macro F1.
#
# Fixes relative to the previous version:
#   * the model is fitted on the fold's training part only (it used to be fitted
#     on the whole pool, leaking the held-out rows into training);
#   * each prediction is scored against the labels of the same rows;
#   * the held-out fold is used for validation instead of X_test / y_test;
#   * f1_score uses average='macro' (the binary default fails on a multi-class target);
#   * fold data is kept in fold-local names so the notebook-level
#     X_train / y_train / X_val / y_val are not overwritten.
#
# NOTE(review): X_train_val / y_train_val are not defined in the visible part of
# this notebook — confirm which cell creates them.

# Per-fold macro-F1 scores
f1_scores_train = []
f1_scores_val = []

# Set up StratifiedKFold
k = 10
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# For each fold
for train_index, val_index in skf.split(X_train_val, y_train_val):

    # Split the pool into this fold's training and held-out parts
    X_fold_train, y_fold_train = X_train_val.iloc[train_index], y_train_val.iloc[train_index]
    X_fold_val, y_fold_val = X_train_val.iloc[val_index], y_train_val.iloc[val_index]

    # Create a Naive Bayes model and fit it on the fold's training part only
    nb_model = GaussianNB()
    nb_model.fit(X_fold_train, y_fold_train)

    # Predictions on the fold's training part
    y_train_pred = nb_model.predict(X_fold_train)
    f1_train = f1_score(y_fold_train, y_train_pred, average='macro')
    f1_scores_train.append(f1_train)

    # Predictions on the held-out part
    y_val_pred = nb_model.predict(X_fold_val)
    f1_val = f1_score(y_fold_val, y_val_pred, average='macro')
    f1_scores_val.append(f1_val)

# Output the mean F1 scores
print("Mean F1 Score on Training Set:", np.mean(f1_scores_train))
print("Mean F1 Score on Validation Set:", np.mean(f1_scores_val))

# 7. Final Prediction

In [726]:
# Final model: a Random Forest with the same hyperparameters as the one assessed above.
model_rf = RandomForestClassifier(
    # ensemble size and reproducibility
    n_estimators=300,
    random_state=42,
    # limits on how far each tree may grow
    max_depth=20,  # TODO: choose the depth with cross-validation instead of fixing it
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    # class imbalance
    class_weight='balanced',
)
model_rf.fit(X=X_train_scaled_fs, y=y_train)

# Predictions for the test feature matrix
y_pred_test = model_rf.predict(X=X_test_scaled_fs)

In [735]:
# Sanity check on the predicted class mix. y_val holds validation labels while
# y_pred_test holds test-set predictions, so the two come from different row sets:
# compare proportions, not absolute counts.
print("Class distribution in y_val:", y_val.value_counts())
print("Class distribution in y_pred_test:", pd.Series(y_pred_test).value_counts())

Class distribution in y_val: Claim Injury Type
2    58216
4    29702
3    13781
5     9656
1     2496
6      842
8       94
7       19
Name: count, dtype: int64
Class distribution in y_pred_test: 2    282235
4     62713
3     29440
1      9711
5      3812
8        56
6         8
Name: count, dtype: int64


In [767]:
# Turn the predicted integer class codes into their label strings; codes without
# an entry in the mapping are left unchanged by `replace`.
y_pred_test_series = (
    pd.Series(y_pred_test)
    .astype(int)
    .replace({
        1: '1. CANCELLED',
        2: '2. NON-COMP',
        3: '3. MED ONLY',
        4: '4. TEMPORARY',
        5: '5. PPD SCH LOSS',
        6: '6. PPD NSL',
        7: '7. PTD',
        8: '8. DEATH',
    })
)

y_pred_test_series

0          2. NON-COMP
1         4. TEMPORARY
2          2. NON-COMP
3          2. NON-COMP
4          2. NON-COMP
              ...     
387970     3. MED ONLY
387971    1. CANCELLED
387972    1. CANCELLED
387973    1. CANCELLED
387974    1. CANCELLED
Length: 387975, dtype: object

In [769]:
# Convert y_pred_test to a pandas Series if it's not already
y_pred_test_series = pd.Series(y_pred_test_series, index=X_test.index)

# Combine 'Claim Identifier' from X_test and the predictions from y_pred_test_series
submission2 = pd.DataFrame({
    'Claim Identifier': X_test_copy['Claim Identifier'],  # 'Claim Identifier' from X_test
    'Claim Injury Type': y_pred_test_series  # Your predictions
})

# Export the combined DataFrame to CSV
submission2.to_csv("submission2.csv", index=False)

# Print confirmation message
print("Arquivo exportado como submission2.csv")

Arquivo exportado como submission2.csv


In [755]:
# Number of predictions; should equal the number of rows in the test set
y_pred_test_series.shape

(387975,)