<div style="background-color: darkslategray; color: white; padding: 15px; border-radius: 8px;">
    <center><h1 style="font-family: Arial, sans-serif;">TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS</h1></center>
    <center><h3 style="font-family: Arial, sans-serif;">Machine Learning Project</h3></center>
</div>

<div style="background-color: white; color: white; padding: 15px; border-radius: 8px;">
    <center><a href="https://github.com/isabella-fc/to-grant-or-not-to-grant/tree/main/ml_project/wcb" target="_blank" style="font-size: 26px; color: #0000FF; text-decoration: none;">Github Repository (Web Application)</a></center>
</div>

# Open Ended Section

This notebook is divided into two parts.<br>
The first is a short demonstration of the process of transforming the model and features in ´to an applicable form for an web application (the whole process and files are accesible through the link to the github repository)<br>
The second is an assesment of models and creation of a final model to predict the "Agreement Reached".

In [7]:
# Standard Libraries
import time
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib

import os
import pickle

# Sklearn: Model Selection
from sklearn.model_selection import (
    GridSearchCV, PredefinedSplit, StratifiedKFold, train_test_split, cross_val_score
)

# Sklearn: Feature Selection
from sklearn.feature_selection import RFE

# Sklearn: Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Sklearn: Metrics
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report, make_scorer
)


# Custom Preprocessing Functions
from Preprocessing_functions import *

# Reload Preprocessing Functions (for updates during runtime)
imported_module = importlib.import_module("Preprocessing_functions")
importlib.reload(imported_module)

# Pandas Display Settings
pd.set_option('display.max_columns', None)

# Plotting Settings
sns.set()  # Apply Seaborn style globally for plots
plt.rcParams["figure.figsize"] = (10, 6)  # Set default plot size


# Model Transformation

In [13]:
def preprocess_form(form_data):
    """
    Preprocess form.cleaned_data for XGBoost prediction:
    - Converts all string inputs into appropriate numeric types.
    - Converts categorical variables into numerical values.
    - Ensures strict column order according to FEATURE_ORDER.
    """
    # Helper functions
    def parse_boolean(value):
        return 1 if str(value).lower() == 'true' else 0 if str(value).lower() == 'false' else np.nan

    # Initialize processed_data with default values (0 or np.nan)
    processed_data = {feature: 0 for feature in FEATURE_ORDER}

    # Numeric fields
    processed_data['Age at Injury'] = form_data.get('age_at_injury', np.nan)
    processed_data['Average Weekly Wage'] = float(form_data.get('average_weekly_wage', np.nan))
    processed_data['Birth Year'] = form_data.get('birth_year', np.nan)
    ime4count = form_data.get('ime4_count', 0)
    processed_data['IME-4 Count'] = ime4count != 0
    processed_data['Number of Dependents'] = form_data.get('number_of_dependents', 0)

    # Date fields
    for field, prefix in {
        'accident_date': 'Accident Date_',
        'c2_date': 'C-2 Date_',
        'assembly_date': 'Assembly Date_',
    }.items():
        date_value = form_data.get(field, None)
        print(f"Processing date field: {field}, Value: {date_value}, Type: {type(date_value)}")  # Debugging
        processed_data = add_datetime_features(processed_data, date_value, prefix)

    # Boolean fields
    boolean_fields = {
    'covid_indicator': 'COVID-19 Indicator',
    'c3_form_submitted': 'Has C-3 Date',
    'first_hearing_date': 'Has First Hearing Date',
}

    # Process boolean fields (idk why)
    for key, prefix in boolean_fields.items():
        field_value = form_data.get(key)
        processed_data = add_boolean_one_hot(processed_data, field_value, prefix)


    # Carrier Type mapping
    carrier_types = [
        '1A. PRIVATE', '2A. SIF', '3A. SELF PUBLIC', '4A. SELF PRIVATE',
        '5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A)', 'UNKNOWN'
    ]
    carrier = form_data.get('carrier_type', 'UNKNOWN')
    for ct in carrier_types:
        processed_data[f'Carrier Type_{ct}'] = int(carrier == ct)

    
    # Gender mapping
    gender_mapping = ['F', 'M', 'U', 'X']
    gender = form_data.get('gender', 'U')
    for g in gender_mapping:
        processed_data[f'Gender_{g}'] = int(gender == g)

    #Alternative dispute resolution mapping
    adr_options ={'False': 'N', 'True': 'Y', 'None': 'U'}
    for key, value in adr_options.items():
        processed_data[f'Alternative Dispute Resolution_{value}'] = int(form_data.get('alternative_dispute_resolution', 'U') == key)

    # representative and attorney mapping
    attorney_options ={'False': 'N', 'True': 'Y',}
    for key, value in attorney_options.items():
        processed_data[f'Attorney/Representative_{value}'] = int(form_data.get('attorney_representative', 'False') == key)

    #covid 19 indicator 
    covid_options ={'False': 'N', 'True': 'Y',}
    for key, value in covid_options.items():
        processed_data[f'COVID-19 Indicator_{value}'] = int(form_data.get('covid_indicator', 'False') == key)

    medical_options = ['Medical Fee Region_I', "Medical Fee Region_II", "Medical Fee Region_III", "Medical Fee Region_IV""Medical Fee Region_UK"]
    for option in medical_options:
        processed_data[option] = int(form_data.get('medical_fee_region', 'Medical Fee Region_UK') == option)

    # County mapping
    processed_data = match_one_hot_encoding(
        processed_data, COUNTIES, form_data.get('county_of_injury', 'UNKNOWN'), 'County of Injury_'
    )

    processed_data = match_one_hot_encoding(
        processed_data, INDUSTRY_CODES, form_data.get('industry_code', 'UNKNOWN'), 'Industry Code_', float_conversion=True
    )

    processed_data = match_one_hot_encoding(
        processed_data, CAUSE_OF_INJURY_CODES, form_data.get('wcio_cause_of_injury_code', 'UNKNOWN'), 'WCIO Cause of Injury Code_', float_conversion=True
    )

    processed_data = match_one_hot_encoding(
        processed_data, NATURE_OF_INJURY_CODES, form_data.get('wcio_nature_of_injury_code', 'UNKNOWN'), 'WCIO Nature of Injury Code_', float_conversion=True
    )

    processed_data = match_one_hot_encoding(
        processed_data, PART_OF_BODY_CODES, form_data.get('wcio_part_of_body_code', 'UNKNOWN'), 'WCIO Part Of Body Code_', float_conversion=True
    )

    district_options = ['District Name_ALBANY', 'District Name_BINGHAMTON', 'District Name_BUFFALO','District Name_HAUPPAUGE','District Name_NYC','District Name_ROCHESTER','District Name_STATEWIDE','District Name_SYRACUSE',]
    processed_data = match_one_hot_encoding(processed_data, district_options, form_data.get('district_name', 'UNKNOWN'), 'District Name_')

    processed_data['Carrier Name'] = float(form_data.get('encoded_value_carrier', 0))
    processed_data['Zip Code'] = float(form_data.get('encoded_value', 0))

    # Create DataFrame and strictly enforce FEATURE_ORDER
    processed_df = pd.DataFrame([processed_data])

    # Drop unexpected columns
    processed_df = processed_df[[col for col in FEATURE_ORDER if col in processed_df.columns]]

    return processed_df


def match_one_hot_encoding(df, features, value, prefix, float_conversion=False):
    """
    Matches the first value in a one-hot encoded feature set
    Leaves the rest 0
    """
    if float_conversion:
        value = float(value)

    for feature in features:
        # Check if the feature starts with the given prefix
        if feature.startswith(prefix) :
            # Set the column to 1 if it matches the target value, otherwise 0
            df[feature] = 1 if feature == f"{prefix}{value}" else 0

    return df


def add_datetime_features(df, date_value, prefix):
    """
    Adds year, month, day, and day of the week features for a given date to the DataFrame.

    Args:
        df (dict): The dictionary to update with new features.
        date_value (datetime.date or str): The date value to extract features from.
        prefix (str): The prefix for the new feature names.

    Returns:
        dict: The updated dictionary with the new date-related features.
    """
    # Convert the date_value to pandas datetime if it's not None
    date_value = pd.to_datetime(date_value, errors='coerce')  # Handles None and invalid dates
    if pd.notnull(date_value):
        df[f'{prefix}Year'] = date_value.year
        df[f'{prefix}Month'] = date_value.month
        df[f'{prefix}Day'] = date_value.day
        df[f'{prefix}DayOfWeek'] = date_value.dayofweek
    else:
        # Assign NaN if the date is missing or invalid
        df[f'{prefix}Year'] = np.nan
        df[f'{prefix}Month'] = np.nan
        df[f'{prefix}Day'] = np.nan
        df[f'{prefix}DayOfWeek'] = np.nan

    return df

def add_boolean_one_hot(df, field_value, prefix):
    """
    Adds one-hot encoded columns for a boolean field to the DataFrame.

    Args:
        df (dict): The dictionary to update with new features.
        field_value (str or bool): The field value ('True', 'False', or equivalent).
        prefix (str): The prefix for the new one-hot encoded column names.

    Returns:
        dict: The updated dictionary with one-hot encoded columns.
    """
    # Parse the field value to determine boolean state
    parsed_value = 1 if str(field_value).lower() == 'true' else 0
    
    # Add one-hot encoded columns
    df[f'{prefix}_1'] = parsed_value
    df[f'{prefix}_0'] = 1 - parsed_value

    return df

___
# Agreement Reached Prediction

## Data preprocessing

In [14]:
train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')
test_data = pd.read_csv('test_data.csv', index_col='Claim Identifier')

  train_data = pd.read_csv('train_data.csv', index_col='Claim Identifier')


In [15]:
train_data = train_data[~(train_data.drop(columns=['Assembly Date']).isna().all(axis=1) & train_data['Assembly Date'].notna())]

In [16]:
X = train_data.drop(columns=['Claim Injury Type', 'WCB Decision', 'Agreement Reached','OIICS Nature of Injury Description'])
y = train_data['Agreement Reached']

test_data = test_data.drop(columns=['OIICS Nature of Injury Description'])

In [17]:
#in percent
y.value_counts()/len(y)*100

Agreement Reached
0.0    95.333487
1.0     4.666513
Name: count, dtype: float64

In [18]:
CODE_COLUMNS = ['Industry Code', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']

DESCRIPTION_COLUMNS = ['WCIO Cause of Injury Description','WCIO Nature of Injury Description','WCIO Part Of Body Description','Industry Code Description']

BOOLEAN_COLUMNS = ['Alternative Dispute Resolution', 'Attorney/Representative','COVID-19 Indicator']

date_order = ['Accident Date', 'C-2 Date','C-3 Date','Assembly Date', 'First Hearing Date']

numerical_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year', 
    'C-2 Date', 
    'C-3 Date', 
    'First Hearing Date', 
    'IME-4 Count', 
]

outliers_columns = [
    'Accident Date', 
    'Age at Injury', 
    'Assembly Date', 
    'Average Weekly Wage', 
    'Birth Year',
    'IME-4 Count', 
]

categorical_features = ['Alternative Dispute Resolution',
 'Attorney/Representative',
 'Carrier Name',
 'Carrier Type',
 'County of Injury',
 'COVID-19 Indicator',
 'District Name',
 'Gender',
 'Industry Code',
 'Medical Fee Region',
 'WCIO Cause of Injury Code',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'Zip Code']


columns_to_scale = ['Accident Date',
                'Assembly Date',
                'Average Weekly Wage',
                'Age at Injury',
                'Birth Year', 
                'Number of Dependents',
                'IME-4 Count']

date_columns = ['Accident Date', 'Assembly Date']

outliers_iqr_specific = ['Age at Injury', 'Birth Year']

columns_to_drop = ['C-2 Date', 'C-3 Date', 'First Hearing Date']

low_cardinality_cols = [col for col in categorical_features if X[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_features if X[col].nunique() > 10]

In [19]:
def preprocessing_scaling_encoding_dum(X_train, X_val):
    X_train, X_val = type_conversion_categorical(X_train, X_val,categorical_features)
    X_train, X_val = drop_description_columns(X_train, X_val)
    X_train, X_val = convert_to_timestamp(X_train, X_val, date_order)
    X_train, X_val = convert_to_bool(X_train, X_val, col_names=BOOLEAN_COLUMNS)
    X_train, X_val = impute_mean_numerical(X_train, X_val, numerical_columns)
    X_train, X_val = fill_missing_with_mode(X_train, X_val)
    X_train, X_val = feature_creation_has_Cdate(X_train, X_val)
    X_train, X_val = drop_unwanted_columns(X_train, X_val, columns_to_drop)
    X_train, X_val = scaling_robust(X_train, X_val, columns_to_scale)
    X_train, X_val = encoding_onehot(X_train, X_val, low_cardinality_cols)
    X_train, X_val = encoding_frequency1(X_train, X_val, high_cardinality_cols)

    return X_train, X_val

In [20]:
def create_predifined_split(X, y, preprocess_steps, n_splits = 5):
    """
    Creates a PredefinedSplit object to be used in cross-validation, more specifically in GridSearchCV.

    Steps:
    - Defines the number of splits
    - Splits the data into training and validation sets
    - Applies the preprocessing steps to the training and validation sets
    - Returns the PredefinedSplit object and the preprocessed data
    """

    X_combined_list = []
    y_combined_list = []

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    test_data = np.zeros(len(X), dtype=int) - 1

    for fold_idx, (_, test_idx) in enumerate(kf.split(X, y)):
        test_data[test_idx] = fold_idx

    ps = PredefinedSplit(test_fold=test_data)

    for train_index, test_index in ps.split():

        # Get fold
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        # Preprocess and encode data    
        X_train, X_val = preprocess_steps(X_train, X_val)
        y_train, y_val, le = encoding_label(y_train, y_val)

        X_combined_list.append(X_train)
        y_combined_list.append(y_train)

    X_combined = pd.concat(X_combined_list, axis=0)
    y_combined = np.concatenate(y_combined_list, axis=0)

    return ps, X_combined, y_combined

In [21]:
ps, X_combined, y_combined = create_predifined_split(X, y, preprocessing_scaling_encoding_dum, n_splits=5) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[coulmns] = X_train[coulmns].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[coulmns] = X_val[coulmns].astype(str)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object

## Feature selection (RFECV)

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=100,            # Number of trees
    max_depth=15,                # Limit tree depth
    min_samples_split=50,        # Minimum samples for a split
    min_samples_leaf=20,         # Minimum samples per leaf
    max_features='sqrt',         # Features to consider per split
    class_weight='balanced',     # Handle class imbalance
    bootstrap=True,              # Use bootstrapping
    random_state=42,             # Ensure reproducibility
    n_jobs=-1                    # Use all CPU cores
)

# Set up RFECV with RandomForest and cross-validation
rfecv = RFECV(estimator=rf_model, step=1, cv=ps, scoring='f1_macro') 

# Fit RFECV
rfecv.fit(X_combined, y_combined)

#Get the selected features
selected_features_RFE_basic = X_combined.columns[rfecv.support_].tolist()
optimal_num_features = rfecv.n_features_
feature_ranking = rfecv.ranking_

print("Optimal number of features:", optimal_num_features)

In [18]:
selected_features_RFE_basic = ['Accident Date', 'Age at Injury', 'Assembly Date', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'Number of Dependents', 'Attorney/Representative_False', 'Attorney/Representative_True', 'Carrier Type_1A. PRIVATE', 'Carrier Type_2A. SIF', 'Carrier Type_3A. SELF PUBLIC', 'COVID-19 Indicator_False', 'COVID-19 Indicator_True', 'District Name_NYC', 'Gender_F', 'Gender_M', 'Medical Fee Region_I', 'Medical Fee Region_IV', 'Carrier Name', 'County of Injury', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Zip Code']

## Model assessment

In [19]:
def evaluate_models_with_overfitting_check(X, y, predefined_split, selected_features, models):
    """
    Evaluates multiple models on a dataset with a predefined split and calculates train-test metric differences.
    
    Parameters:
    - X (pd.DataFrame): Feature dataset.
    - y (pd.Series or np.array): Target variable.
    - predefined_split (PredefinedSplit): Predefined split object for cross-validation.
    - models (dict): Dictionary of models, where keys are model names and values are model instances.
    
    Returns:
    - pd.DataFrame: Summary table with mean, variance, and train-test differences for evaluation metrics.
    """
    results = []

    for model_name, model in models.items():
        fold_metrics = {
            'accuracy_train': [],
            'accuracy_test': [],
            'precision_macro_train': [],
            'precision_macro_test': [],
            'recall_macro_train': [],
            'recall_macro_test': [],
            'f1_macro_train': [],
            'f1_macro_test': []
        }
        
        # Loop through predefined splits
        for train_idx, test_idx in predefined_split.split():
            # Split data
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            # Train model
            model.fit(X_train[selected_features], y_train)
            
            # Predict on train and test data
            y_train_pred = model.predict(X_train[selected_features])
            y_test_pred = model.predict(X_test[selected_features])
            
            # Calculate metrics for train and test data
            fold_metrics['accuracy_train'].append(accuracy_score(y_train, y_train_pred))
            fold_metrics['accuracy_test'].append(accuracy_score(y_test, y_test_pred))
            fold_metrics['precision_macro_train'].append(precision_score(y_train, y_train_pred, average='macro', zero_division=0))
            fold_metrics['precision_macro_test'].append(precision_score(y_test, y_test_pred, average='macro', zero_division=0))
            fold_metrics['recall_macro_train'].append(recall_score(y_train, y_train_pred, average='macro', zero_division=0))
            fold_metrics['recall_macro_test'].append(recall_score(y_test, y_test_pred, average='macro', zero_division=0))
            fold_metrics['f1_macro_train'].append(f1_score(y_train, y_train_pred, average='macro', zero_division=0))
            fold_metrics['f1_macro_test'].append(f1_score(y_test, y_test_pred, average='macro', zero_division=0))
        
        # Calculate mean, variance, and train-test differences for each metric
        for metric_name in ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']:
            train_metric = np.array(fold_metrics[f'{metric_name}_train'])
            test_metric = np.array(fold_metrics[f'{metric_name}_test'])
            
            mean_train = np.mean(train_metric)
            mean_test = np.mean(test_metric)
            variance_train = np.var(train_metric)
            variance_test = np.var(test_metric)
            mean_difference = mean_train - mean_test
            
            results.append({
                'Model': model_name,
                'Metric': metric_name,
                'Mean_Train': mean_train,
                'Mean_Test': mean_test,
                'Variance_Train': variance_train,
                'Variance_Test': variance_test,
                'Train-Test_Difference': mean_difference
            })
        
        print(results)
    
    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [20]:
models = {
    'Logistic Regression': LogisticRegression(penalty='l2', C=10, solver='lbfgs', class_weight='balanced', l1_ratio=0.5, max_iter=1000, n_jobs=-1),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', bootstrap=False, random_state=42, n_jobs=-1),
    'XGBoost': XGBClassifier(gamma = 0, learning_rate = 0.3, max_depth = 6, min_child_weight = 1, n_estimators = 200, reg_alpha = 0, reg_lambda = 1,random_state=42, n_jobs=-1)   
}
performance_results = evaluate_models_with_overfitting_check(X_combined, y_combined, ps, selected_features_RFE_basic, models)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[{'Model': 'Logistic Regression', 'Metric': 'accuracy', 'Mean_Train': np.float64(0.8066172087520608), 'Mean_Test': np.float64(0.8066219997465719), 'Variance_Train': np.float64(1.0844921610587807e-07), 'Variance_Test': np.float64(5.268755205832088e-07), 'Train-Test_Difference': np.float64(-4.790994511116864e-06)}, {'Model': 'Logistic Regression', 'Metric': 'precision_macro', 'Mean_Train': np.float64(0.5960936865786358), 'Mean_Test': np.float64(0.5960826080028989), 'Variance_Train': np.float64(3.376579292659026e-08), 'Variance_Test': np.float64(3.3484526617922406e-07), 'Train-Test_Difference': np.float64(1.1078575736900298e-05)}, {'Model': 'Logistic Regression', 'Metric': 'recall_macro', 'Mean_Train': np.float64(0.8591195834395972), 'Mean_Test': np.float64(0.8590653947350285), 'Variance_Train': np.float64(1.108756709818786e-07), 'Variance_Test': np.float64(1.3942206427630029e-06), 'Train-Test_Difference': np.float64(5.4188704568636226e-05)}, {'Model': 'Logistic Regression', 'Metric': 'f1

## Final Model Config

In [24]:
model_AR = XGBClassifier(
        n_estimators=500,
        scale_pos_weight = 20,  
        max_delta_step = 5,     
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        random_state=42,
        n_jobs=-1
    )