In [334]:
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Standard libraries
import time
import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import (
    GridSearchCV, PredefinedSplit, KFold, StratifiedKFold, train_test_split, cross_val_score
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, make_scorer
)
from imblearn.over_sampling import SMOTE

from Preprocessing_functions import *

import importlib
imported_module = importlib.import_module("Preprocessing_functions")
importlib.reload(imported_module)

# pandas max columns display
pd.set_option('display.max_columns', None)

## Import Dataset

In [2]:
train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')
test_data = pd.read_csv('test_data.csv', index_col='Claim Identifier')

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [3]:
train_data = train_data[~(train_data.drop(columns=['Assembly Date']).isna().all(axis=1) & train_data['Assembly Date'].notna())]

In [4]:
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached','OIICS Nature of Injury Description'])
y = train_data['Claim Injury Type']

test_data = test_data.drop(columns=['OIICS Nature of Injury Description'])

____

# Auxiliary Functions


In [5]:
def create_predifined_split_with_features(X, y, preprocess_steps,selected_features, n_splits = 5):
    """
    Creates a PredefinedSplit object to be used in cross-validation, more specifically in GridSearchCV.

    Steps:
    - Defines the number of splits
    - Splits the data into training and validation sets
    - Applies the preprocessing steps to the training and validation sets
    - Returns the PredefinedSplit object and the preprocessed data
    """

    X_combined_list = []
    y_combined_list = []

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    test_data = np.zeros(len(X), dtype=int) - 1

    for fold_idx, (_, test_idx) in enumerate(kf.split(X, y)):
        test_data[test_idx] = fold_idx

    ps = PredefinedSplit(test_fold=test_data)

    for train_index, test_index in ps.split():

        # Get fold
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Preprocess and encode data    
        X_train, X_val = preprocess_steps(X_train, X_val)
        y_train, y_val, le = encoding_label(y_train, y_val)

        X_combined_list.append(X_train[selected_features])
        y_combined_list.append(y_train)

    X_combined = pd.concat(X_combined_list, axis=0)
    y_combined = np.concatenate(y_combined_list, axis=0)

    return ps, X_combined, y_combined

In [6]:
def create_predifined_split(X, y, preprocess_steps, n_splits = 5):
    """
    Creates a PredefinedSplit object to be used in cross-validation, more specifically in GridSearchCV.

    Steps:
    - Defines the number of splits
    - Splits the data into training and validation sets
    - Applies the preprocessing steps to the training and validation sets
    - Returns the PredefinedSplit object and the preprocessed data
    """

    X_combined_list = []
    y_combined_list = []

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    test_data = np.zeros(len(X), dtype=int) - 1

    for fold_idx, (_, test_idx) in enumerate(kf.split(X, y)):
        test_data[test_idx] = fold_idx

    ps = PredefinedSplit(test_fold=test_data)

    for train_index, test_index in ps.split():

        # Get fold
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Preprocess and encode data    
        X_train, X_val = preprocess_steps(X_train, X_val)
        y_train, y_val, le = encoding_label(y_train, y_val)

        X_combined_list.append(X_train)
        y_combined_list.append(y_train)

    X_combined = pd.concat(X_combined_list, axis=0)
    y_combined = np.concatenate(y_combined_list, axis=0)

    return ps, X_combined, y_combined

In [7]:
from sklearn.feature_selection import chi2
import numpy as np
import pandas as pd

def average_chi2_across_folds_with_predefined_split(X, y, ps):
    """
    Computes the average Chi-Squared score and p-value across folds using a PredefinedSplit.
    """
    feature_scores = []

    for train_idx, test_idx in ps.split():

        X_train, y_train = X.iloc[train_idx], y[train_idx]
        
        chi2_scores, p_values = chi2(X_train, y_train)
        
        feature_scores.append((chi2_scores, p_values))

    chi2_scores_all = np.array([scores[0] for scores in feature_scores])
    p_values_all = np.array([scores[1] for scores in feature_scores])
    
    avg_chi2_scores = np.mean(chi2_scores_all, axis=0)
    avg_p_values = np.mean(p_values_all, axis=0)
    
    results = pd.DataFrame({
        'Feature': X.columns,
        'Average_Chi2_Score': avg_chi2_scores,
        'Average_P_Value': avg_p_values
    }).sort_values(by='Average_Chi2_Score', ascending=False)
    
    return results


In [19]:
from sklearn.feature_selection import f_classif
import numpy as np
import pandas as pd

def average_anova_across_folds_with_predefined_split(X, y, ps):
    """
    Computes the average ANOVA F-test score and p-value across folds using a PredefinedSplit.
    """
    feature_scores = []
    
    for train_idx, test_idx in ps.split():
        X_train, y_train = X.iloc[train_idx], y[train_idx]
        
        f_scores, p_values = f_classif(X_train, y_train)
        
        feature_scores.append((f_scores, p_values))

    f_scores_all = np.array([scores[0] for scores in feature_scores])
    p_values_all = np.array([scores[1] for scores in feature_scores])
    
    avg_f_scores = np.mean(f_scores_all, axis=0)
    avg_p_values = np.mean(p_values_all, axis=0)
    
    results = pd.DataFrame({
        'Feature': X.columns,
        'Average_F_Score': avg_f_scores,
        'Average_P_Value': avg_p_values
    }).sort_values(by='Average_F_Score', ascending=False)
    
    return results


## List creation for preprocessing

In [8]:
CODE_COLUMNS = ['Industry Code', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']

DESCRIPTION_COLUMNS = ['WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description','Industry Code Description']

BOOLEAN_COLUMNS = ['Alternative Dispute Resolution', 'Attorney/Representative','COVID-19 Indicator']

date_order = ['Accident Date', 'C-2 Date','C-3 Date','Assembly Date', 'First Hearing Date']

numerical_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'C-2 Date', 
    'C-3 Date', 
    'First Hearing Date', 
    'IME-4 Count', 
]

outliers_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year',
    'IME-4 Count', 
]

categorical_features = ['Alternative Dispute Resolution',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Type',
 'County of Injury',
 'COVID-19 Indicator',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']


columns_to_scale = ['Accident Date',
                'Assembly Date',
                'Average Weekly Wage',
                'Age at Injury',
                'Birth Year', 
                'Number of Dependents',
                'IME-4 Count']

date_columns = ['Accident Date', 'Assembly Date']

outliers_iqr_specific = ['Age at Injury', 'Birth Year']

columns_to_drop = ['C-2 Date', 'C-3 Date', 'First Hearing Date']

low_cardinality_cols = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_features if X[col].nunique() > 10]

## Preprocess Functions

In [9]:
binning2_columns = ['Age at Injury', 'Age at Injury', 'Birth Year', 'Average Weekly Wage', 'IME-4 Count']
date_columns = ['Accident Date', 'Assembly Date']

def create_groupingFeatures(X_train, X_val):

    X_train, X_val= newFeature_binnedGroups(X_train, X_val, binning2_columns, 6)

    X_train, X_val = newFeature_month(X_train, X_val, date_columns)



    X_train, X_val = newFeature_daysBetween(X_train, X_val, firstDate='Accident Date', secondDate='Assembly Date')
    date_columns.append('Days Between Accident Date and Assembly Date')
    
    return X_train, X_val

In [10]:


def preprocessing_scaling_encoding_dum(X_train, X_val):
    X_train, X_val = type_conversion_categorical(X_train, X_val,categorical_features)
    X_train, X_val = drop_description_columns(X_train, X_val)
    X_train, X_val = convert_to_timestamp(X_train, X_val, date_order)
    X_train, X_val = convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical_columns)
    X_train, X_val = fill_missing_with_mode(X_train, X_val)
    X_train, X_val = feature_creation_has_Cdate(X_train, X_val)
    X_train, X_val = drop_unwanted_columns(X_train, X_val, columns_to_drop)
    X_train, X_val = log_transform(X_train, X_val)
    X_train, X_val = outliers_specific2(X_train, X_val, outliers_iqr_specific[0], 14)
    X_train, X_val = outliers_specific2(X_train, X_val, outliers_iqr_specific[1], 1934)
    X_train, X_val = scaling_robust(X_train, X_val, columns_to_scale)
    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)

    return X_train, X_val

In [11]:
def preprocessing_newFeatures_advanced(X_train, X_val):

    # Type conversion
    X_train, X_val = type_conversion_categorical(X_train, X_val, categorical_features)
    X_train, X_val = convert_to_timestamp(X_train, X_val, date_order)
    X_train, X_val = convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)

    

    # Knowledge-based imputation of features
    X_train, X_val = fill_missing_codes_description_based(X_train, X_val)
    X_train, X_val = fillna_zip_code(X_train, X_val)
    X_train, X_val = fillnan_accident_date(X_train, X_val)
    X_train, X_val = fillnan_birth_year(X_train, X_val)
    X_train, X_val = impute_weekly_wage_with_zipIndustryCode(X_train, X_val)
    X_train, X_val = fillnan_IME4_count(X_train, X_val)

    # Impute still missing values
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical_columns)
    X_train, X_val = fill_missing_with_mode(X_train, X_val)

    

    # Feature creation
    X_train, X_val = feature_creation_has_Cdate(X_train, X_val)
    X_train, X_val = drop_unwanted_columns(X_train, X_val, columns_to_drop)
    X_train, X_val = newFeature_hasIME4(X_train, X_val)
    X_train, X_val = drop_description_columns(X_train, X_val)
    X_train, X_val = convert_to_datetime(X_train, X_val, date_columns)

    # Grouping features
    X_train, X_val= create_groupingFeatures(X_train, X_val)

    # Treating outliers
    X_train, X_val = log_transform(X_train, X_val)
    X_train, X_val = outliers_specific2(X_train, X_val, outliers_iqr_specific[0], 14)
    X_train, X_val = outliers_specific2(X_train, X_val, outliers_iqr_specific[1], 1934)

    # Scaling
    X_train, X_val = scaling_robust(X_train, X_val, binning2_columns)
    X_train, X_val = scaling_robust(X_train, X_val, date_columns)

    low_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() < 10]
    high_cardinality_cols = [col for col in categorical_features if X_train[col].nunique() > 10]

    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)

    return X_train, X_val

# Feature Selection


## Predefined Split

In [21]:
X_combined 

Unnamed: 0_level_0,Accident Date,Age at Injury,Assembly Date,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Has C-3 Date,Has C-2 Date,Has First Hearing Date,Alternative Dispute Resolution_False,Alternative Dispute Resolution_True,Alternative Dispute Resolution_nan,Attorney/Representative_False,Attorney/Representative_True,Carrier Type_1A. PRIVATE,Carrier Type_2A. SIF,Carrier Type_3A. SELF PUBLIC,Carrier Type_4A. SELF PRIVATE,Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A),Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS,Carrier Type_5D. SPECIAL FUND - UNKNOWN,Carrier Type_UNKNOWN,COVID-19 Indicator_False,COVID-19 Indicator_True,District Name_ALBANY,District Name_BINGHAMTON,District Name_BUFFALO,District Name_HAUPPAUGE,District Name_NYC,District Name_ROCHESTER,District Name_STATEWIDE,District Name_SYRACUSE,Gender_F,Gender_M,Gender_U,Gender_X,Medical Fee Region_I,Medical Fee Region_II,Medical Fee Region_III,Medical Fee Region_IV,Medical Fee Region_UK,Carrier Name,County of Injury,Industry Code,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
5393875,-0.984980,-0.478261,-1.066667,0.000000,0.48,0.000000,-0.50,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.022253,0.005825,0.076072,0.017830,0.192838,0.001986,0.000956
5393091,-1.207101,0.173913,-1.066667,1.117229,-0.12,0.791516,0.25,1,1,1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.004987,0.001322,0.053698,0.020404,0.097254,0.065849,0.000366
5393889,-1.028675,-0.086957,-1.066667,1.087877,0.12,0.000000,0.75,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.015849,0.030317,0.036619,0.021558,0.009551,0.013889,0.000880
5393887,-0.984980,0.826087,-1.066667,0.929071,-0.72,0.000000,-0.50,1,1,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.193528,0.020017,0.198926,0.017190,0.030240,0.063000,0.003088
5393848,-0.988621,0.260870,-1.066667,0.000000,-0.20,0.000000,-0.50,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.021373,0.027033,0.045963,0.025741,0.082126,0.063000,0.000416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6165240,1.001821,-0.478261,0.961039,0.000000,0.60,0.000000,0.00,1,1,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000353,0.005098,0.160792,0.046254,0.267133,0.090185,0.000196
6165331,1.007286,-0.565217,0.961039,0.000000,-1.68,0.000000,-0.50,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.024291,0.068912,0.075841,0.025079,0.009013,0.054658,0.003092
6165285,0.983607,-0.391304,0.962894,0.000000,0.52,0.000000,0.75,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.004048,0.075926,0.199259,0.064873,0.021656,0.054658,0.002591
6165339,0.981785,-0.782609,0.962894,0.000000,0.88,0.000000,0.50,1,1,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.004384,0.052219,0.199259,0.005087,0.062576,0.043508,0.002648


In [12]:
ps, X_combined, y_combined = create_predifined_split(X, y, preprocessing_scaling_encoding_dum, n_splits=5) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[coulmns] = X_train[coulmns].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[coulmns] = X_val[coulmns].astype(str)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object

## Chi square 

In [23]:
#for categorigal features
chi_features = [
    'Has C-3 Date',
    'Has C-2 Date',
    'Has First Hearing Date',
    'Carrier Name',
    'County of Injury',
    'Industry Code',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Zip Code',
    'Alternative Dispute Resolution_False',
    'Alternative Dispute Resolution_True',
    'Alternative Dispute Resolution_nan',
    'Attorney/Representative_False',
    'Attorney/Representative_True',
    'Carrier Type_1A. PRIVATE',
    'Carrier Type_2A. SIF',
    'Carrier Type_3A. SELF PUBLIC',
    'Carrier Type_4A. SELF PRIVATE',
    'Carrier Type_5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A)',
    'Carrier Type_5C. SPECIAL FUND - POI CARRIER WCB MENANDS',
    'Carrier Type_5D. SPECIAL FUND - UNKNOWN',
    'Carrier Type_UNKNOWN',
    'COVID-19 Indicator_False',
    'COVID-19 Indicator_True',
    'District Name_NYC',
    'District Name_BUFFALO',
    'District Name_ALBANY',
    'District Name_HAUPPAUGE',
    'District Name_STATEWIDE',
    'District Name_SYRACUSE',
    'District Name_BINGHAMTON',
    'District Name_ROCHESTER',
    'Gender_F',
    'Gender_M',
    'Gender_U',
    'Gender_X',
    'Medical Fee Region_I',
    'Medical Fee Region_II',
    'Medical Fee Region_III',
    'Medical Fee Region_IV',
    'Medical Fee Region_UK',
]
chi2_results = average_chi2_across_folds_with_predefined_split(X_combined[chi_features], y_combined, ps)

In [18]:
chi2_results

Unnamed: 0,Feature,Average_Chi2_Score,Average_P_Value
11,Attorney/Representative_True,118446.506054,0.0
10,Attorney/Representative_False,55953.817184,0.0
14,Carrier Type_3A. SELF PUBLIC,7177.575251,0.0
13,Carrier Type_2A. SIF,5212.932245,0.0
21,COVID-19 Indicator_True,4274.147869,0.0
12,Carrier Type_1A. PRIVATE,2726.280824,0.0
30,Gender_F,2374.563487,0.0
8,Alternative Dispute Resolution_True,2003.355011,0.0
31,Gender_M,1813.222665,0.0
19,Carrier Type_UNKNOWN,1451.044386,3.452917e-297


Only Zip code, Alternative Dispute Resolution_nan and Alternative Dispute Resolution_False

In [25]:
anova_features =[
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'IME-4 Count', 
]
anova_result = average_anova_across_folds_with_predefined_split(X_combined[anova_features], y_combined, ps)

In [26]:
anova_result

Unnamed: 0,Feature,Average_F_Score,Average_P_Value
3,Average Weekly Wage,174843.294364,0.0
5,IME-4 Count,2955.534057,0.0
1,Age at Injury,1041.869837,0.0
2,Assembly Date,639.697662,0.0
0,Accident Date,541.534905,0.0
4,Birth Year,527.851524,0.0


## Lasso Regression

In [None]:
logreg_cv = LogisticRegressionCV(
    penalty='l1',
    solver='saga', # Use SAGA solver for large datasets 
    Cs=5,
    cv=ps,
    random_state=42,
    class_weight='balanced', # Balance class weights
    n_jobs=-1,
    scoring='f1_macro', # Use macro F1 score as scoring metric
    max_iter=2000
)
logreg_cv.fit(X_combined, y_combined)

# Identify selected and unselected features
if len(logreg_cv.coef_.shape) > 1:
    coefs = np.abs(logreg_cv.coef_).mean(axis=0)
else:
    coefs = logreg_cv.coef_.flatten()

selected_features = X_combined.columns[coefs != 0].tolist()
unselected_features = X_combined.columns[coefs == 0].tolist()

sorted_idx = np.argsort(np.abs(coefs))
sorted_features = X_combined.columns[sorted_idx]
sorted_coefs = coefs[sorted_idx]

plt.figure(figsize=(20, 15))
plt.barh(sorted_features, sorted_coefs)
plt.xlabel("Coefficient Value")
plt.ylabel("Features")
plt.title("Feature Importance via Logistic Regression with L1 Penalty (Sorted)")
plt.axvline(0, color="black", linewidth=0.8, linestyle="--")
plt.tight_layout()
plt.show()

print("Selected Features:", selected_features)
print("Unselected Features:", unselected_features)


In [None]:
# put features together with there coefficients
feature_importance = pd.DataFrame({'Feature': sorted_features, 'Coefficient': sorted_coefs})
feature_importance

| Feature                                | Coefficient |
|----------------------------------------|-------------|
| Alternative Dispute Resolution_nan     | 0.091208    |
| Medical Fee Region_III                 | 0.104723    |
| Accident Date                          | 0.156352    |
| Birth Year                             | 0.161018    |
| Number of Dependents                   | 0.162521    |
| Medical Fee Region_UK                  | 0.199784    |
| Medical Fee Region_I                   | 0.211184    |
| Medical Fee Region_II                  | 0.218863    |
| Medical Fee Region_IV                  | 0.251852    |
| Has C-3 Date                           | 0.339323    |
| Has C-2 Date                           | 0.339323    |
| Has First Hearing Date                 | 0.339323    |
| Attorney/Representative_True           | 0.344442    |
| Carrier Type_5A. SPECIAL FUND - CONS. COMM. (S... | 0.354133    |
| Carrier Type_4A. SELF PRIVATE          | 0.426216    |
| Carrier Type_3A. SELF PUBLIC           | 0.427759    |
| Age at Injury                          | 0.448413    |
| Gender_X                               | 0.459640    |
| COVID-19 Indicator_False               | 0.469610    |
| Carrier Type_1A. PRIVATE               | 0.495387    |
| Carrier Type_5C. SPECIAL FUND - POI CARRIER WC... | 0.506768    |
| Assembly Date                          | 0.543958    |
| District Name_HAUPPAUGE               | 0.592683    |
| IME-4 Count                            | 0.615296    |
| District Name_BUFFALO                 | 0.658296    |
| District Name_SYRACUSE                | 0.679366    |
| District Name_NYC                     | 0.706079    |
| District Name_ALBANY                  | 0.709348    |
| Carrier Type_5D. SPECIAL FUND - UNKNOWN | 0.721796    |
| District Name_BINGHAMTON              | 0.733356    |
| District Name_STATEWIDE               | 0.814429    |
| Alternative Dispute Resolution_False  | 0.955822    |
| Gender_F                               | 1.027072    |
| County of Injury                       | 1.331568    |
| Alternative Dispute Resolution_True   | 1.348975    |
| COVID-19 Indicator_True               | 1.349827    |
| Gender_M                               | 1.498185    |
| Attorney/Representative_False         | 1.740615    |
| Carrier Type_2A. SIF                  | 2.027552    |
| Industry Code                          | 2.075674    |
| Carrier Type_UNKNOWN                  | 2.330205    |
| Gender_U                               | 2.846922    |
| Average Weekly Wage                    | 3.204603    |
| Zip Code                               | 3.625286    |
| WCIO Nature of Injury Code            | 4.186389    |
| District Name_ROCHESTER               | 4.952202    |
| WCIO Cause of Injury Code             | 6.970056    |
| WCIO Part Of Body Code                | 8.713728    |
| Carrier Name                           | 9.174048    |

All feature are selected by using this method

## RFECV with preprocessing_scaling_encoding_dum (All features)

In [None]:
 # Initialize RandomForest model
rf_model = RandomForestClassifier(
    n_estimators=100,            # Number of trees
    max_depth=15,                # Limit tree depth
    min_samples_split=50,        # Minimum samples for a split
    min_samples_leaf=20,         # Minimum samples per leaf
    max_features='sqrt',         # Features to consider per split
    class_weight='balanced',     # Handle class imbalance
    bootstrap=True,              # Use bootstrapping
    random_state=42,             # Ensure reproducibility
    n_jobs=-1                    # Use all CPU cores
)

# Set up RFECV with RandomForest and cross-validation
rfecv = RFECV(estimator=rf_model, step=1, cv=ps, scoring='f1_macro') 

# Fit RFECV
rfecv.fit(X_combined, y_combined)

#Get the selected features
selected_features_RFE_basic = X_combined.columns[rfecv.support_].tolist()
optimal_num_features = rfecv.n_features_
feature_ranking = rfecv.ranking_

print("Optimal number of features:", optimal_num_features)
print("Selected Features:", selected_features_RFE_basic)

In [None]:
feature_ranking = pd.DataFrame({'Feature': X_combined.columns, 'Ranking': feature_ranking})
Optimal_number_of_features = 25

Optimal number of features: 25

| Feature                                      | Ranking |
|----------------------------------------------|---------|
| Accident Date                                | 1       |
| Age at Injury                                | 1       |
| Assembly Date                                | 1       |
| Average Weekly Wage                          | 1       |
| Birth Year                                   | 1       |
| IME-4 Count                                  | 1       |
| Number of Dependents                         | 1       |
| Attorney/Representative_False               | 1       |
| Attorney/Representative_True                | 1       |
| Carrier Type_1A. PRIVATE                     | 1       |
| Carrier Type_2A. SIF                         | 1       |
| Carrier Type_3A. SELF PUBLIC                 | 1       |
| COVID-19 Indicator_False                     | 1       |
| COVID-19 Indicator_True                      | 1       |
| District Name_NYC                            | 1       |
| Gender_F                                     | 1       |
| Gender_M                                     | 1       |
| Medical Fee Region_IV                        | 1       |
| Carrier Name                                 | 1       |
| County of Injury                             | 1       |
| Industry Code                                | 1       |
| WCIO Cause of Injury Code                    | 1       |
| WCIO Nature of Injury Code                   | 1       |
| WCIO Part Of Body Code                       | 1       |
| Zip Code                                     | 1       |
| Medical Fee Region_I                         | 2       |
| District Name_BUFFALO                        | 3       |
| Medical Fee Region_II                        | 4       |
| District Name_ALBANY                         | 5       |
| District Name_HAUPPAUGE                      | 6       |
| Medical Fee Region_UK                        | 7       |
| Medical Fee Region_III                       | 8       |
| District Name_STATEWIDE                      | 9       |
| District Name_SYRACUSE                       | 10      |
| Carrier Type_4A. SELF PRIVATE                | 11      |
| Alternative Dispute Resolution_False         | 12      |
| District Name_BINGHAMTON                     | 13      |
| District Name_ROCHESTER                      | 14      |
| Alternative Dispute Resolution_True          | 15      |
| Carrier Type_5D. SPECIAL FUND - UNKNOWN      | 16      |
| Carrier Type_UNKNOWN                         | 17      |
| Gender_U                                     | 18      |
| Carrier Type_5A. SPECIAL FUND - CONS. COMM...| 19      |
| Has C-2 Date                                 | 20      |
| Carrier Type_5C. SPECIAL FUND - POI CARRIE...| 21      |
| Has First Hearing Date                       | 22      |
| Has C-3 Date                                 | 23      |
| Alternative Dispute Resolution_nan           | 24      |
| Gender_X                                     | 25      |


## Feature selection report (simple preprocesing)

| Feature                                      | Lasso       | RFE | Chi-Square AND Anova |
|----------------------------------------------|-------------|-----|----------------------|
| Accident Date                                | 0.156352    | 1   | Yes                  |
| Age at Injury                                | 0.448413    | 1   | Yes                  |
| Assembly Date                                | 0.543958    | 1   | Yes                  |
| Average Weekly Wage                          | 3.204603    | 1   | Yes                  |
| Birth Year                                   | 0.161018    | 1   | Yes                  |
| IME-4 Count                                  | 0.615296    | 1   | Yes                  |
| Number of Dependents                         | 0.162521    | 1   | Yes                  |
| Attorney/Representative_False               | 1.740615    | 1   | Yes                  |
| Attorney/Representative_True                | 0.344442    | 1   | Yes                  |
| Carrier Type_1A. PRIVATE                     | 0.495387    | 1   | Yes                  |
| Carrier Type_2A. SIF                         | 2.027552    | 1   | Yes                  |
| Carrier Type_3A. SELF PUBLIC                 | 0.427759    | 1   | Yes                  |
| COVID-19 Indicator_False                     | 0.469610    | 1   | Yes                  |
| COVID-19 Indicator_True                      | 1.349827    | 1   | Yes                  |
| District Name_NYC                            | 0.706079    | 1   | Yes                  |
| Gender_F                                     | 1.027072    | 1   | Yes                  |
| Gender_M                                     | 1.498185    | 1   | Yes                  |
| Medical Fee Region_IV                        | 0.251852    | 1   | Yes                  |
| Carrier Name                                 | 9.174048    | 1   | Yes                  |
| County of Injury                             | 1.331568    | 1   | Yes                  |
| Industry Code                                | 2.075674    | 1   | Yes                  |
| WCIO Cause of Injury Code                    | 6.970056    | 1   | Yes                  |
| WCIO Nature of Injury Code                   | 4.186389    | 1   | Yes                  |
| WCIO Part Of Body Code                       | 8.713728    | 1   | Yes                  |
| Zip Code                                     | 3.625286    | 1   | No                   |
| Medical Fee Region_I                         | 0.211184    | 2   | Yes                  |
| District Name_BUFFALO                        | 0.658296    | 3   | Yes                  |
| Medical Fee Region_II                        | 0.218863    | 4   | Yes                  |
| District Name_ALBANY                         | 0.709348    | 5   | Yes                  |
| District Name_HAUPPAUGE                      | 0.592683    | 6   | Yes                  |
| Medical Fee Region_UK                        | 0.199784    | 7   | Yes                  |
| Medical Fee Region_III                       | 0.104723    | 8   | Yes                  |
| District Name_STATEWIDE                      | 0.814429    | 9   | Yes                  |
| District Name_SYRACUSE                       | 0.679366    | 10  | Yes                  |
| Carrier Type_4A. SELF PRIVATE                | 0.426216    | 11  | Yes                  |
| Alternative Dispute Resolution_False         | 0.955822    | 12  | No                   |
| District Name_BINGHAMTON                     | 0.733356    | 13  | Yes                  |
| District Name_ROCHESTER                      | 4.952202    | 14  | Yes                  |
| Alternative Dispute Resolution_True          | 1.348975    | 15  | Yes                  |
| Carrier Type_5D. SPECIAL FUND - UNKNOWN      | 0.721796    | 16  | Yes                  |
| Carrier Type_UNKNOWN                         | 2.330205    | 17  | Yes                  |
| Gender_U                                     | 2.846922    | 18  | Yes                  |
| Carrier Type_5A. SPECIAL FUND - CONS. COMM...| 0.354133    | 19  | Yes                  |
| Has C-2 Date                                 | 0.339323    | 20  | Yes                  |
| Carrier Type_5C. SPECIAL FUND - POI CARRIE...| 0.506768    | 21  | Yes                  |
| Has First Hearing Date                       | 0.339323    | 22  | Yes                  |
| Has C-3 Date                                 | 0.339323    | 23  | Yes                  |
| Alternative Dispute Resolution_nan           | 0.091208    | 24  | No                   |
| Gender_X                                     | 0.459640    | 25  | Yes                  |

## RFECV with preprocessing_newFeatures_advanced (All features)

In [None]:
 # Initialize RandomForest model
rf_model = RandomForestClassifier(
    n_estimators=100,            # Number of trees
    max_depth=15,                # Limit tree depth
    min_samples_split=50,        # Minimum samples for a split
    min_samples_leaf=20,         # Minimum samples per leaf
    max_features='sqrt',         # Features to consider per split
    class_weight='balanced',     # Handle class imbalance
    bootstrap=True,              # Use bootstrapping
    random_state=42,             # Ensure reproducibility
    n_jobs=-1                    # Use all CPU cores
)

# Set up RFECV with RandomForest and cross-validation
rfecv = RFECV(estimator=rf_model, step=1, cv=ps, scoring='f1_macro', n_jobs=-1) 

# Fit RFECV
rfecv.fit(X_combined, y_combined)

#Get the selected features
selected_features_RF = X_combined.columns[rfecv.support_].tolist()
feature_ranking = rfecv.ranking_
optimal_num_features = rfecv.n_features_

print("Optimal number of features:", optimal_num_features)
print("Feature Ranking:", feature_ranking)
print("Selected Features:", selected_features_RF)

Optimal number of features: 17<br>
Selected Features: <br>['Accident Date', 'Age at Injury', 'Assembly Date', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'IME-4 Count 1', 'Attorney/Representative_False', 'Attorney/Representative_True', 'COVID-19 Indicator_True', 'Carrier Name', 'County of Injury', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Zip Code']

In [None]:
#create the feature ranking wee the features and the ranking are together
feature_ranking = pd.DataFrame({'Feature': X_combined.columns, 'Ranking': feature_ranking})
optimal_num_features_advanced = 17

In [None]:
selected_features_advanced_rf = ['Accident Date', 'Age at Injury', 'Assembly Date', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'IME-4 Count 1', 'Attorney/Representative_False', 'Attorney/Representative_True', 'COVID-19 Indicator_True', 'Carrier Name', 'County of Injury', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Zip Code']

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Feature</th>
      <th>Ranking</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Accident Date</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Age at Injury</td>
      <td>1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Assembly Date</td>
      <td>1</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Average Weekly Wage</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Birth Year</td>
      <td>1</td>
    </tr>
    <tr>
      <th>5</th>
      <td>IME-4 Count</td>
      <td>1</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Number of Dependents</td>
      <td>1</td>
    </tr>
    <tr>
      <th>7</th>
      <td>Has C-3 Date</td>
      <td>23</td>
    </tr>
    <tr>
      <th>8</th>
      <td>Has C-2 Date</td>
      <td>20</td>
    </tr>
    <tr>
      <th>9</th>
      <td>Has First Hearing Date</td>
      <td>22</td>
    </tr>
    <tr>
      <th>10</th>
      <td>Alternative Dispute Resolution_False</td>
      <td>12</td>
    </tr>
    <tr>
      <th>11</th>
      <td>Alternative Dispute Resolution_True</td>
      <td>15</td>
    </tr>
    <tr>
      <th>12</th>
      <td>Alternative Dispute Resolution_nan</td>
      <td>24</td>
    </tr>
    <tr>
      <th>13</th>
      <td>Attorney/Representative_False</td>
      <td>1</td>
    </tr>
    <tr>
      <th>14</th>
      <td>Attorney/Representative_True</td>
      <td>1</td>
    </tr>
    <tr>
      <th>15</th>
      <td>Carrier Type_1A. PRIVATE</td>
      <td>1</td>
    </tr>
    <tr>
      <th>16</th>
      <td>Carrier Type_2A. SIF</td>
      <td>1</td>
    </tr>
    <tr>
      <th>17</th>
      <td>Carrier Type_3A. SELF PUBLIC</td>
      <td>1</td>
    </tr>
    <tr>
      <th>18</th>
      <td>Carrier Type_4A. SELF PRIVATE</td>
      <td>11</td>
    </tr>
    <tr>
      <th>19</th>
      <td>Carrier Type_5A. SPECIAL FUND - CONS. COMM. (S...</td>
      <td>19</td>
    </tr>
    <tr>
      <th>20</th>
      <td>Carrier Type_5C. SPECIAL FUND - POI CARRIER WC...</td>
      <td>21</td>
    </tr>
    <tr>
      <th>21</th>
      <td>Carrier Type_5D. SPECIAL FUND - UNKNOWN</td>
      <td>16</td>
    </tr>
    <tr>
      <th>22</th>
      <td>Carrier Type_UNKNOWN</td>
      <td>17</td>
    </tr>
    <tr>
      <th>23</th>
      <td>COVID-19 Indicator_False</td>
      <td>1</td>
    </tr>
    <tr>
      <th>24</th>
      <td>COVID-19 Indicator_True</td>
      <td>1</td>
    </tr>
    <tr>
      <th>25</th>
      <td>District Name_ALBANY</td>
      <td>5</td>
    </tr>
    <tr>
      <th>26</th>
      <td>District Name_BINGHAMTON</td>
      <td>13</td>
    </tr>
    <tr>
      <th>27</th>
      <td>District Name_BUFFALO</td>
      <td>3</td>
    </tr>
    <tr>
      <th>28</th>
      <td>District Name_HAUPPAUGE</td>
      <td>6</td>
    </tr>
    <tr>
      <th>29</th>
      <td>District Name_NYC</td>
      <td>1</td>
    </tr>
    <tr>
      <th>30</th>
      <td>District Name_ROCHESTER</td>
      <td>14</td>
    </tr>
    <tr>
      <th>31</th>
      <td>District Name_STATEWIDE</td>
      <td>9</td>
    </tr>
    <tr>
      <th>32</th>
      <td>District Name_SYRACUSE</td>
      <td>10</td>
    </tr>
    <tr>
      <th>33</th>
      <td>Gender_F</td>
      <td>1</td>
    </tr>
    <tr>
      <th>34</th>
      <td>Gender_M</td>
      <td>1</td>
    </tr>
    <tr>
      <th>35</th>
      <td>Gender_U</td>
      <td>18</td>
    </tr>
    <tr>
      <th>36</th>
      <td>Gender_X</td>
      <td>25</td>
    </tr>
    <tr>
      <th>37</th>
      <td>Medical Fee Region_I</td>
      <td>2</td>
    </tr>
    <tr>
      <th>38</th>
      <td>Medical Fee Region_II</td>
      <td>4</td>
    </tr>
    <tr>
      <th>39</th>
      <td>Medical Fee Region_III</td>
      <td>8</td>
    </tr>
    <tr>
      <th>40</th>
      <td>Medical Fee Region_IV</td>
      <td>1</td>
    </tr>
    <tr>
      <th>41</th>
      <td>Medical Fee Region_UK</td>
      <td>7</td>
    </tr>
    <tr>
      <th>42</th>
      <td>Carrier Name</td>
      <td>1</td>
    </tr>
    <tr>
      <th>43</th>
      <td>County of Injury</td>
      <td>1</td>
    </tr>
    <tr>
      <th>44</th>
      <td>Industry Code</td>
      <td>1</td>
    </tr>
    <tr>
      <th>45</th>
      <td>WCIO Cause of Injury Code</td>
      <td>1</td>
    </tr>
    <tr>
      <th>46</th>
      <td>WCIO Nature of Injury Code</td>
      <td>1</td>
    </tr>
    <tr>
      <th>47</th>
      <td>WCIO Part Of Body Code</td>
      <td>1</td>
    </tr>
    <tr>
      <th>48</th>
      <td>Zip Code</td>
      <td>1</td>
    </tr>
  </tbody>
</table>
</div>

# Performance Evaluation


## GridSearchCV

In [None]:
def get_best_parameters(X, y, model, param_grid, preprocess_steps, n_splits=5):
    """
    Finds the best hyperparameters for a given model using GridSearchCV.

    Steps:
    - Creates a PredefinedSplit object
    - Creates a GridSearchCV object
    - Fits the GridSearchCV object
    - Returns the best hyperparameters and the best score
    """

    scoring = make_scorer(f1_score, average='macro')

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=ps,
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_combined, y_combined)

    print("Best Parameters:", grid_search.best_params_)
    print("Best F1-macro Score:", grid_search.best_score_)

    return grid_search.best_params_, grid_search.best_score_

______

# Model Assessment

in this section we are comparing model the same preprocessing technique
- the feature selected are the one selected with the RFECV (preprocessing_scaling_encoding_dum)
- we will selecte the model that has the highest f1-score macro

In [None]:
## Feature selected using RFECV With simple preprocessing
selected_features = ['Accident Date',
                     'Age at Injury', 
                     'Assembly Date', 
                     'Average Weekly Wage', 
                     'Birth Year', 
                     'IME-4 Count', 
                     'Number of Dependents',
                     'Attorney/Representative_False', 
                     'Attorney/Representative_True', 
                     'Carrier Type_1A. PRIVATE', 
                     'Carrier Type_2A. SIF',
                     'Carrier Type_3A. SELF PUBLIC', 
                     'COVID-19 Indicator_False', 
                     'COVID-19 Indicator_True', 
                     'District Name_NYC', 
                     'Gender_F', 
                     'Gender_M',
                     'Medical Fee Region_IV',
                     'Carrier Name', 
                     'County of Injury', 
                     'Industry Code', 
                     'WCIO Cause of Injury Code', 
                     'WCIO Nature of Injury Code', 
                     'WCIO Part Of Body Code', 
                     'Zip Code']

## Logistic Regression

In [None]:
param_grid = {
    'penalty': ['l2', 'elasticnet'],         
    'C': [0.01, 0.1, 1, 10],              
    'solver': ['lbfgs', 'saga'],            
    'class_weight': ['balanced', None],      
    'l1_ratio': [0.5]                   
}

model = LogisticRegression(max_iter=1000)

In [None]:
logisticregression_best_param, logisticregression_best_score = get_best_parameters(X, y, model, param_grid, preprocessing_scaling_encoding_dum, n_splits=5)

Best Parameters: 
- 'penalty': l2
- 'C': 10
- 'solver': 'lbfgs'
- 'class_weight': 'balanced' 
- 'l1_ratio': 0.5 <br>

Best F1-macro Score: 0.29291927882029445

## Random Forest

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}
model=RandomForestClassifier()

In [None]:
RandomForestClassifier_best_param, RandomForestClassifier_best_score = get_best_parameters(X, y, model, param_grid, preprocessing_scaling_encoding_dum, n_splits=5)

Best Parameters: 
- 'bootstrap': False
- 'max_depth': None
- 'max_features': 'sqrt'
- 'min_samples_leaf': 1
- 'min_samples_split': 2
-  'n_estimators': 200 <br>

Best_NN F1-macro Score: 0.3706281959869002

## NN

In [None]:
model = MLPClassifier(
        solver='adam',
        max_iter=1000,  # Increase if needed
        random_state=42,
    )

In [None]:
param_grid = {
    'hidden_layer_sizes': [
        (int(0.75 * len(selected_features)), int(0.5 * len(selected_features))),  # Original configuration
        (int(0.5 * len(selected_features)), int(0.25 * len(selected_features)), int(0.125 * len(selected_features))),  # Three layers
    ],
    'learning_rate_init': [0.01, 0.1],  # Test lower and higher learning rates
    'activation': ['relu', 'tanh'],  # Compare relu and tanh
    'alpha': [0.001, 0.01],  # Regularization strength
    'batch_size': ['auto', 64, 128],  # Test different batch sizes
}

In [None]:
nnGS_best_params, nnGS_best_score = get_best_parameters(X, y, model, param_grid, preprocessing_scaling_encoding_dum, selected_features, n_splits=5)

Best Parameters:
- 'activation': 'tanh'
- 'alpha': 0.001
- 'batch_size': 'auto' 
- 'hidden_layer_sizes': (34, 23)
- 'learning_rate_init': 0.01<br>

Best_NN F1-macro Score: 0.30394956984435917

## XGBoost

In [None]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.3],
    'gamma': [0, 2],
    'reg_alpha': [0, 5],
    'reg_lambda': [1, 10],
    'min_child_weight': [1, 5] 
}
model=XGBClassifier(random_state=42, n_jobs=-1)

In [None]:
xgb_best_params, xgb_best_score = get_best_parameters(X, y, model, param_grid, preprocessing_scaling_encoding_dum, selected_features, n_splits=5)

Best Parameters: 
- 'gamma': 0
- 'learning_rate': 0.3
- 'max_depth': 6
-  'min_child_weight': 1
-  'n_estimators': 100
-  'reg_alpha': 0
-  'reg_lambda': 1<br>

Best_Xgboost F1-macro Score: 0.44258821978115226<br>

## Knn

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform'], # Weight function
    'metric': ['minkowski'],
    'p': [1, 2], # Power parameter for Minkowski distance
    'algorithm': ['auto', 'ball_tree'],  # Algorithm for nearest neighbor search
    'leaf_size': [20, 30, 40, 50] 
}

model=KNeighborsClassifier()

In [None]:
knn_best_param, knn_best_score = get_best_parameters(X, y, model, param_grid, preprocessing_scaling_encoding_dum, selected_features, n_splits=5)

Knn_best_parameter:
- 'algorithm': 'auto'
- 'leaf_size': 20
- 'metric': 'minkowski'
- 'n_neighbors': 5
- 'p': 1
- 'weights': 'uniform' <br>

Knn_Best F1-macro Score:: 0.3001599658723925

## Ensamble Models
look for last year project


# Final Model

## Optimising the final selected model: Xgboost classifier

in this section we try to optimise the performance of the classifier
- feature selection varies across different try (some with none)
- we use different preprocessing techniques
- again we select the model with the highest f1_score macro

In [None]:
kf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)

accuracies = []
f1_scores = []
fold_predictions = []  # Store predictions for test data 

for train_index, test_index in kf.split(X,y):

    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    y_train, y_val , Label_Encoder= encoding_label(y_train, y_val)
    X_train_model, X_val = preprocessing_scaling_encoding_dum(X_train, X_val)
    
    # Define the model
    model = XGBClassifier(
        gamma=0,
        learning_rate=0.3,
        max_depth=6,
        min_child_weight=1,
        n_estimators=100,
        reg_alpha=0,
        reg_lambda=1
    )
    
    smote = SMOTE()
    X_train_model, y_train = smote.fit_resample(X_train_model, y_train)

    
    model.fit(X_train_model, y_train)
    
    y_pred = model.predict(X_val)

    accuracies.append(accuracy_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred, average='macro') )

    _, X_test_preprocessed = preprocessing_scaling_encoding_dum(X_train, test_data)
    fold_predictions.append(model.predict_proba(X_test_preprocessed))


fold_predictions = np.mean(fold_predictions, axis=0)
y_test_pred = np.argmax(fold_predictions, axis=1)

accuracies_mean = np.mean(accuracies)
f1_scores_mean = np.mean(f1_scores)
y_test_predictions_xgboost_ = y_test_pred

print(f"Mean Accuracy: {accuracies_mean:.2f}")
print(f"Mean F1 Score: {f1_scores_mean:.2f}")


In [None]:
fold_predictions

In [None]:
y_test_predictions_xgboost_ = Label_Encoder.inverse_transform(y_test_predictions_xgboost_)
y_test_predictions_xgboost_ = pd.DataFrame(y_test_predictions_xgboost_, columns=['Claim Injury Type'], index=test_data.index)
y_test_predictions_xgboost_.to_csv('y_test_predictions_xgboost.csv')