In [161]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from matplotlib import pyplot as plt
import seaborn as sns
import re
import math
import datetime

In [162]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)
plt.style.use('seaborn-v0_8-dark')

In [163]:
# importing the training and test data
df = pd.read_csv('train_data_initial_inspection.csv', sep=',')
df_test = pd.read_csv('test_data.csv', sep=',')

In [164]:
df.head()

Unnamed: 0.1,Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents,C-2 Missed Timing,C-3 Missed Timing,Days Difference,C-2 Missing,C-3 Missing,Has Hearing,Has IME-4 Report
0,0,2019-12-30,31,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662,0,Not Work Related,1,0,0,2.0,0,1,1,0
1,1,2019-08-30,46,N,2020-01-14,Y,1745.93,1973.0,2020-01-01,2020-01-01,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,94.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569,1,Not Work Related,4,1,0,137.0,0,0,0,1
2,2,2019-12-06,40,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMENT AND REMEDIAT,II,,17.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589,0,Not Work Related,6,1,0,26.0,0,1,1,0
3,4,2019-12-30,61,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603,0,Not Work Related,1,0,0,2.0,0,1,1,0
4,5,2019-12-26,67,N,2020-01-01,N,0.0,1952.0,2019-12-31,,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,3. MED ONLY,SUFFOLK,N,HAUPPAUGE,,M,,44.0,RETAIL TRADE,IV,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,38.0,SHOULDER(S),11772,0,Not Work Related,5,0,0,6.0,0,1,1,0


In [165]:
# List variables with less than 3 unique values
binary_candidates = [col for col in df.columns if df[col].nunique() <= 3]

print("Features with less than 3 values:", binary_candidates)

Features with less than 3 values: ['Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator', 'Gender', 'OIICS Nature of Injury Description', 'Agreement Reached', 'WCB Decision', 'C-2 Missed Timing', 'C-3 Missed Timing', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report']


In [166]:
# Display unique values for each variable
for variable in binary_candidates:
    unique_values = df[variable].unique()
    print(f"Feature - '{variable}': {unique_values}")

Feature - 'Alternative Dispute Resolution': ['N' 'Y' 'U']
Feature - 'Attorney/Representative': ['N' 'Y']
Feature - 'COVID-19 Indicator': ['N' 'Y']
Feature - 'Gender': ['M' 'F' 'U']
Feature - 'OIICS Nature of Injury Description': [nan]
Feature - 'Agreement Reached': [0 1]
Feature - 'WCB Decision': ['Not Work Related']
Feature - 'C-2 Missed Timing': [0 1]
Feature - 'C-3 Missed Timing': [0 1]
Feature - 'C-2 Missing': [0 1]
Feature - 'C-3 Missing': [1 0]
Feature - 'Has Hearing': [1 0]
Feature - 'Has IME-4 Report': [0 1]


In [167]:
# split binary features into 0/1 and Y/N
yes_no_features = ['Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator', 'Gender']
binary_features = ['Agreement Reached', 'C-2 Missed Timing', 'C-3 Missed Timing', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report']

In [168]:
# Convert Yes/No features from N/Y to 0/1
for feature in yes_no_features:
    df[feature] = df[feature].map({'Y': 1, 'N': 0, 'U': np.nan})  # Converts 'Y' to 1, 'N' to 0, and 'U' to NaN

# Add the converted Yes/No features to binary_features
binary_features.extend(yes_no_features)
print("Final binary features list:", binary_features)

Final binary features list: ['Agreement Reached', 'C-2 Missed Timing', 'C-3 Missed Timing', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report', 'Alternative Dispute Resolution', 'Attorney/Representative', 'COVID-19 Indicator', 'Gender']


In [169]:
# List of selected features
features = df.select_dtypes(include=[np.number]).columns.tolist()

print("Current features list:", features)

Current features list: ['Unnamed: 0', 'Age at Injury', 'Alternative Dispute Resolution', 'Attorney/Representative', 'Average Weekly Wage', 'Birth Year', 'COVID-19 Indicator', 'Gender', 'IME-4 Count', 'Industry Code', 'OIICS Nature of Injury Description', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Agreement Reached', 'Number of Dependents', 'C-2 Missed Timing', 'C-3 Missed Timing', 'Days Difference', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report']


In [170]:
# Remaining variables that are not yet in the features list
remaining_features = [col for col in df.columns if col not in features]

print("Variables that still need processing:", remaining_features)

Variables that still need processing: ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type', 'Claim Injury Type', 'County of Injury', 'District Name', 'First Hearing Date', 'Industry Code Description', 'Medical Fee Region', 'WCIO Cause of Injury Description', 'WCIO Nature of Injury Description', 'WCIO Part Of Body Description', 'Zip Code', 'WCB Decision']


In [171]:
remaining_features = ['Zip Code', 'Medical Fee Region', 'District Name', 
                      'County of Injury', 'Claim Injury Type', 'Carrier Type', 'Carrier Name']

# Count unique values for each feature
unique_counts = {feature: df[feature].nunique() for feature in remaining_features}

print("Unique value counts for remaining features:")

for feature, count in unique_counts.items():
    print(f"{feature}: {count} unique values")

Unique value counts for remaining features:
Zip Code: 9989 unique values
Medical Fee Region: 5 unique values
District Name: 8 unique values
County of Injury: 63 unique values
Claim Injury Type: 8 unique values
Carrier Type: 8 unique values
Carrier Name: 2039 unique values


In [172]:
# Remove Zip Code and Carrier Name due to high redunctant values
remaining_features = [feature for feature in remaining_features if feature not in ['Zip Code', 'Carrier Name']]

add_features = ['Claim Injury Type']
features.extend(add_features)

In [173]:
from sklearn.preprocessing import LabelEncoder

# Apply Label Encoding to Claim Injury Type (target)
label_encoder = LabelEncoder()
df['Claim Injury Type'] = label_encoder.fit_transform(df['Claim Injury Type'])

variable = 'Claim Injury Type'
unique_values = df[variable].unique()
print(f"Feature - '{variable}': {unique_values}")

Feature - 'Claim Injury Type': [1 3 2 4 5 0 7 6]


In [174]:
from sklearn.preprocessing import OrdinalEncoder

# Apply OrdinalEncoder to Medical Fee Region
df['Medical Fee Region'] = OrdinalEncoder().fit_transform(df[['Medical Fee Region']])
# Leave just the number from Carrier Type
df['Carrier Type'] = df['Carrier Type'].str.extract('(\d+)').astype(float)

# unique values
for variable in remaining_features:
    unique_values = df[variable].unique()
    print(f"Feature - '{variable}': {unique_values}")

add_features = ['Medical Fee Region', 'Carrier Type']
features.extend(add_features)

Feature - 'Medical Fee Region': [0. 1. 3. 4. 2.]
Feature - 'District Name': ['SYRACUSE' 'ROCHESTER' 'ALBANY' 'HAUPPAUGE' 'NYC' 'BUFFALO' 'BINGHAMTON'
 'STATEWIDE']
Feature - 'County of Injury': ['ST. LAWRENCE' 'WYOMING' 'ORANGE' 'DUTCHESS' 'SUFFOLK' 'ONONDAGA'
 'RICHMOND' 'MONROE' 'KINGS' 'NEW YORK' 'QUEENS' 'WESTCHESTER' 'GREENE'
 'NASSAU' 'ALBANY' 'ERIE' 'BRONX' 'CAYUGA' 'NIAGARA' 'LIVINGSTON'
 'WASHINGTON' 'MADISON' 'WARREN' 'SENECA' 'GENESEE' 'SARATOGA'
 'CHAUTAUQUA' 'COLUMBIA' 'RENSSELAER' 'CATTARAUGUS' 'ROCKLAND' 'SCHUYLER'
 'BROOME' 'ULSTER' 'CLINTON' 'ONEIDA' 'UNKNOWN' 'MONTGOMERY' 'ONTARIO'
 'SCHENECTADY' 'CHEMUNG' 'YATES' 'HERKIMER' 'ALLEGANY' 'TIOGA' 'FULTON'
 'DELAWARE' 'TOMPKINS' 'OSWEGO' 'PUTNAM' 'LEWIS' 'ESSEX' 'OTSEGO'
 'CORTLAND' 'ORLEANS' 'SULLIVAN' 'CHENANGO' 'FRANKLIN' 'WAYNE' 'JEFFERSON'
 'STEUBEN' 'SCHOHARIE' 'HAMILTON']
Feature - 'Claim Injury Type': [1 3 2 4 5 0 7 6]
Feature - 'Carrier Type': [ 1.  2.  4.  3. nan  5.]


In [175]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

target = df['Claim Injury Type']

# apply Label Encoding to District Name and County of Injury
df['District Name Encoded'] = LabelEncoder().fit_transform(df['District Name'])
df['County of Injury Encoded'] = LabelEncoder().fit_transform(df['County of Injury'])

# chi2 to categorical features and target
chi2_stats, p_values = chi2(df[['District Name Encoded', 'County of Injury Encoded']], target)

# show significance
print("p-value for District Name:", p_values[0])
print("p-value for County of Injury:", p_values[1])


p-value for District Name: 1.7221683375956468e-127
p-value for County of Injury: 0.0


In [176]:
print("Since they have high significance with the target feature, we will keep them")

from sklearn.preprocessing import LabelEncoder

# apply label Encoding for District Name and County of Injury
label_encoder_district = LabelEncoder()
label_encoder_county = LabelEncoder()

df['District Name Encoded'] = label_encoder_district.fit_transform(df['District Name'])
df['County of Injury Encoded'] = label_encoder_county.fit_transform(df['County of Injury'])

add_features = ['District Name Encoded', 'County of Injury Encoded']
features.extend(add_features)

Since they have high significance with the target feature, we will keep them


In [177]:
print("Final features:", features)

Final features: ['Unnamed: 0', 'Age at Injury', 'Alternative Dispute Resolution', 'Attorney/Representative', 'Average Weekly Wage', 'Birth Year', 'COVID-19 Indicator', 'Gender', 'IME-4 Count', 'Industry Code', 'OIICS Nature of Injury Description', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Agreement Reached', 'Number of Dependents', 'C-2 Missed Timing', 'C-3 Missed Timing', 'Days Difference', 'C-2 Missing', 'C-3 Missing', 'Has Hearing', 'Has IME-4 Report', 'Claim Injury Type', 'Medical Fee Region', 'Carrier Type', 'District Name Encoded', 'County of Injury Encoded']


In [178]:
df_rfe = df[features].copy()  # new DataFrame with only selected features and the target

df_rfe.head()

Unnamed: 0.1,Unnamed: 0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,COVID-19 Indicator,Gender,IME-4 Count,Industry Code,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Agreement Reached,Number of Dependents,C-2 Missed Timing,C-3 Missed Timing,Days Difference,C-2 Missing,C-3 Missing,Has Hearing,Has IME-4 Report,Claim Injury Type,Medical Fee Region,Carrier Type,District Name Encoded,County of Injury Encoded
0,0,31,0.0,0.0,0.0,1988.0,0.0,,,44.0,,27.0,10.0,62.0,0,1,0,0,2.0,0,1,1,0,1,0.0,1.0,7,49
1,1,46,0.0,1.0,1745.93,1973.0,0.0,,4.0,23.0,,94.0,49.0,38.0,1,4,1,0,137.0,0,0,0,1,3,0.0,1.0,5,61
2,2,40,0.0,0.0,1434.8,1979.0,0.0,,,56.0,,17.0,7.0,10.0,0,6,1,0,26.0,0,1,1,0,3,1.0,1.0,0,35
3,4,61,0.0,0.0,,1958.0,0.0,,,62.0,,16.0,43.0,36.0,0,1,0,0,2.0,0,1,1,0,1,1.0,2.0,0,13
4,5,67,0.0,0.0,0.0,1952.0,0.0,,,44.0,,31.0,10.0,38.0,0,5,0,0,6.0,0,1,1,0,2,3.0,1.0,3,51


In [181]:
# Define X and y
X = df_rfe.drop(columns=['Claim Injury Type'])
y = df_rfe['Claim Injury Type']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

model = RandomForestClassifier(random_state=55)

cv = StratifiedKFold(n_splits=3)

rfe = RFECV(estimator=model, step=5, cv=cv, scoring='accuracy', n_jobs=-1)

rfe.fit(X, y)

print("Número ideal de features selecionadas:", rfe.n_features_)

selected_features = X.columns[rfe.support_]
print("Features Selecionadas:", selected_features)

print("Pontuação média com as features selecionadas:", rfe.grid_scores_.mean())

In [None]:
# Definir X e y com base no dataset e na variável alvo
X = df_rfe.drop(columns=['Claim Injury Type'])
y = df_rfe['Claim Injury Type']

# Importar o modelo base e a função de seleção de features
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Instanciar o modelo base
base_model = RandomForestClassifier(random_state=42)

# Configurar a validação cruzada
cv = StratifiedKFold(n_splits=5)

# Configurar o RFECV com o modelo base e validação cruzada
rfe = RFECV(estimator=base_model, step=1, cv=cv, scoring='accuracy', n_jobs=-1)

# Executar o RFECV no conjunto de dados de treino
rfe.fit(X, y)

# Número ideal de features selecionadas
print("Número ideal de features selecionadas:", rfe.n_features_)

# Features selecionadas
selected_features = X.columns[rfe.support_]
print("Features Selecionadas:", selected_features)

# Exibir a pontuação média do modelo com as features selecionadas
print("Pontuação média com as features selecionadas:", rfe.grid_scores_.mean())


# 25 minutos a correr