# PART 2

## 1. Imports and initial transformations

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from matplotlib import pyplot as plt
import seaborn as sns
import re
import math
import datetime

In [2]:
# Notebook-wide display configuration.
# Lift every pandas display limit (columns, column width, rows).
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)
# Print NumPy arrays in full instead of summarising them.
np.set_printoptions(threshold=np.inf)
plt.style.use('seaborn-v0_8-dark')

In [None]:
# Load the training and test sets (comma-separated, the read_csv default).
# NOTE(review): these are absolute paths on one machine; a configurable data
# directory would make the notebook portable.
train_csv_path = '/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data.csv'
test_csv_path = '/home/shadybea/OneDrive/General/Machine Learning/Project/Data/test_data.csv'
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)


In [None]:
# (rows, columns) of the raw training frame
df.shape

In [5]:
# Keep the first row per 'Claim Identifier', then use it as an unnamed index.
df = (
    df.drop_duplicates(subset='Claim Identifier', keep='first')
    .set_index('Claim Identifier')
    .rename_axis(None)
)

In [6]:
# Cast 'Zip Code' to string. Missing values become the literal 'nan' here;
# a later cell turns them back into NaN.
df = df.astype({'Zip Code': str})

In [7]:
# Columns to be stored as nullable integers.
features_to_cast = [
    'Age at Injury',
    'Birth Year',
    'IME-4 Count',
    'Industry Code',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Number of Dependents',
    'Agreement Reached',
]

In [8]:
# Coerce each listed column to numeric (unparseable entries become missing)
# and store it with the nullable 'Int64' dtype.
df[features_to_cast] = (
    df[features_to_cast]
    .apply(pd.to_numeric, errors='coerce')
    .astype('Int64')
)

In [9]:
date_columns = [
    'Accident Date',
    'Assembly Date',
    'C-2 Date',
    'C-3 Date',
    'First Hearing Date',
]

# Parse every date column to datetime, column by column.
df[date_columns] = df[date_columns].apply(pd.to_datetime)

In [10]:
# Turn the literal 'nan' strings produced by the string cast back into NaN.
df['Zip Code'] = df['Zip Code'].mask(df['Zip Code'].eq('nan'))

In [11]:
# Remove fully duplicated rows (keeps the first occurrence).
df = df.drop_duplicates()

In [12]:
# Discard rows that have no 'Claim Injury Type'.
df = df.dropna(subset=['Claim Injury Type'])

## 2. Data Preprocessing

### 2.1. Weird values

As we previously mentioned, there are some columns with weird values:
- **Age at Injury**: multiple values below 14, which is the minimum legal age to work in the USA
- **Birth Year**: multiple 0 values
- **WCIO Part Of Body Code**: has a negative value

Before we go any further, let's try to tackle these issues.

#### 2.1.1. Age at Injury

In [None]:
# Number of rows whose recorded age at injury is below 14
len(df[df['Age at Injury'] < 14])

In [None]:
# Preview the first rows with an age at injury below 14
df[df['Age at Injury'] < 14].head()

In [None]:
# Under-14 rows that also have a non-zero, non-missing birth year and a known
# accident date, i.e. rows whose age can be recomputed
len(df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & ~(df['Accident Date'].isna()) & ~(df['Birth Year'].isna())])

We can manually calculate these 122 values, since we have information about 'Birth Year' and 'Accident Date'

In [None]:
# Preview of the rows whose age can be recomputed (same filter as the count above)
df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & ~(df['Accident Date'].isna()) & ~(df['Birth Year'].isna())].head()

In [17]:
# Rows whose age can be recomputed: age below 14, with a non-zero, non-missing
# birth year and a known accident date. Comparisons on nullable columns can
# yield <NA>; those rows are treated as "do not touch".
age_fixable = (
    (df['Age at Injury'] < 14)
    & (df['Birth Year'] != 0)
    & df['Accident Date'].notna()
    & df['Birth Year'].notna()
).fillna(False)

# Recompute the age as accident year minus birth year for those rows only.
# Assigning through .loc (instead of np.where) keeps the column's nullable
# 'Int64' dtype rather than degrading it to an object array.
df.loc[age_fixable, 'Age at Injury'] = (
    df.loc[age_fixable, 'Accident Date'].dt.year
    - df.loc[age_fixable, 'Birth Year']
).astype('Int64')

In [None]:
# Re-count the rows that still match the correction filter after the fix
len(df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & (df['Accident Date'].notna()) & (df['Birth Year'].notna())])

Apparently we still have some inconsistent data, but we were able to correct around 100 rows.

#### 2.1.2. Birth Year

In [None]:
# Number of rows with a birth year of 0
len(df[df['Birth Year'] == 0])

In [None]:
# Rows with birth year 0 that have a known accident date and an age of at
# least 14, i.e. rows whose birth year can be derived
len(df[(df['Birth Year'] == 0) & (df['Accident Date'].notna()) & (df['Age at Injury'] >= 14)])

In [None]:
# Preview the rows whose birth year can be derived: birth year 0, a known
# accident date and an age of at least 14 (same filter as the count above).
# The accident-date test was previously negated, which displayed the rows
# that cannot be fixed.
df[(df['Birth Year'] == 0) & (df['Accident Date'].notna()) & (df['Age at Injury'] >= 14)].head()

We can manually impute most of the weird values just by doing some simple math.

In [22]:
# Rows whose birth year can be derived: recorded as 0, with a known accident
# date and an age of at least 14. Comparisons on nullable columns can yield
# <NA>; those rows are treated as "do not touch".
birth_year_fixable = (
    (df['Birth Year'] == 0)
    & df['Accident Date'].notna()
    & (df['Age at Injury'] >= 14)
).fillna(False)

# Derive the birth year as accident year minus age for those rows only.
# Assigning through .loc (instead of np.where) keeps the column's nullable
# 'Int64' dtype rather than degrading it to an object array.
df.loc[birth_year_fixable, 'Birth Year'] = (
    df.loc[birth_year_fixable, 'Accident Date'].dt.year
    - df.loc[birth_year_fixable, 'Age at Injury'].astype('Int64')
).astype('Int64')

In [None]:
# Rows that still have a birth year of 0 after the correction
len(df[df['Birth Year'] == 0])

We were able to significantly reduce the amount of weird values

#### 2.1.3. WCIO Part Of Body Code

In [None]:
# Number of rows with a negative 'WCIO Part Of Body Code'
len(df[df['WCIO Part Of Body Code'] < 0])

In [None]:
# Distinct negative values present in 'WCIO Part Of Body Code'
df[df['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

All negative values are the same value. Let's check whether any rows already use the code '9'; if none do, we can simply convert these negative values to their absolute value.

In [None]:
# Number of rows that already use the code 9
len(df[df['WCIO Part Of Body Code'] == 9])

As there are no rows with the value '9', we will convert the negative values to their absolute value.

In [27]:
# Replace negative codes by their absolute value. abs() leaves non-negative
# and missing entries unchanged, so no row selection is needed, and it keeps
# the column's nullable 'Int64' dtype (np.where returned an object array).
df['WCIO Part Of Body Code'] = df['WCIO Part Of Body Code'].abs()

In [None]:
# Rows that still have a negative code after the correction
len(df[df['WCIO Part Of Body Code'] < 0])

### 2.2. Missing values

In [None]:
# Missing-value count per column and the total number of rows
nan_counts = df.isna().sum()
total_rows = len(df)

# Share of missing values per column, rendered as text such as "12.34%"
percentage_nans = (nan_counts / total_rows * 100).map('{:.2f}%'.format)

# One row per column: count, total and formatted percentage
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': total_rows,  # scalar, broadcast to every row
    'Percentage NaN': percentage_nans,
})

print("Summary of NaN values per column:\n")
print(nan_summary)

In [15]:
# Columns whose missing values will be imputed. Columns left out are listed
# as comments together with the reason for not imputing them.
missing_values = [
    'Accident Date',
    'Average Weekly Wage',
    'Birth Year',
    # 'C-2 Date'  # missing form could have relationship with the target
    # 'C-3 Date'  # missing form could have relationship with the target
    # 'First Hearing Date'  # missing values means no hearing has held
    # 'IME-4 Count'  # missing form could have relationship with the target
    'Industry Code',
    # 'Industry Code Description'  # we will only use the numeric form of this variable
    # 'OIICS Nature of Injury Description'  # only missing values, so we will not use this feature
    'WCIO Cause of Injury Code',
    # 'WCIO Cause of Injury Description'  # we will only use the numeric form of this variable
    'WCIO Nature of Injury Code',
    # 'WCIO Nature of Injury Description'  # we will only use the numeric form of this variable
    'WCIO Part Of Body Code',
    # 'WCIO Part Of Body Description'  # we will only use the numeric form of this variable
    'Zip Code',
]

We will not impute missing values in the commented-out columns, for the reasons given in the comments.

### 2.3. Feature Encoding

In [31]:
# Drop the descriptive columns and the unary column ('WCB Decision').
df = df.drop(columns=[
    'WCIO Part Of Body Description',
    'Industry Code Description',
    'WCIO Nature of Injury Description',
    'WCIO Cause of Injury Description',
    'OIICS Nature of Injury Description',
    'WCB Decision',
])

In [32]:
# Encoder (default settings) used below to turn the categorical columns into numeric codes
ordinal_encoder = OrdinalEncoder()

In [33]:
# Columns to be ordinal-encoded.
# NOTE(review): 'Claim Injury Type' is presumably the prediction target; it is
# encoded here together with the features - confirm this is intended.
categorical_columns = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'Carrier Type',
    'Carrier Name',
    'Claim Injury Type',
    'County of Injury',
    'COVID-19 Indicator',
    'District Name',
    'Gender',
    'Medical Fee Region',
    'Zip Code',
]

In [34]:
# Fit the encoder on the categorical columns and overwrite them with their codes
# NOTE(review): 'Zip Code' still contains NaN at this point - presumably the
# encoder passes missing values through; verify for the installed version
df[categorical_columns] = ordinal_encoder.fit_transform(df[categorical_columns])

In [35]:
def to_ordinal(df, column):
    """Replace a datetime column of `df`, in place, by its date ordinals.

    Each non-null value is converted with its own ``toordinal()`` method;
    null values stay missing. The column is stored with the nullable
    'Int64' dtype. Nothing is returned.
    """
    def _as_ordinal(value):
        # missing dates have no ordinal and remain NaN
        if pd.isnull(value):
            return np.nan
        return value.toordinal()

    ordinals = df[column].apply(_as_ordinal)
    df[column] = ordinals.astype('Int64')

In [36]:
# Make sure 'Accident Date' is datetime before converting it to an ordinal.
# NOTE(review): the date columns were already parsed with pd.to_datetime in an
# earlier cell, so this looks redundant - confirm before removing.
df['Accident Date'] = pd.to_datetime(df['Accident Date'])
# df['Assembly Date'] = pd.to_datetime(df['Assembly Date'])

In [37]:
# Date columns to be converted to ordinal numbers.
dates = [
    'Accident Date',
    'Assembly Date',
    'C-2 Date',
    'C-3 Date',
    'First Hearing Date',
]

In [38]:
# Convert each date column, in place, to its nullable-integer ordinal
for col in dates:
    to_ordinal(df, col)

In [None]:
# Column dtypes after encoding and date conversion
df.dtypes

All columns are now numeric

In [None]:
# Preview the encoded frame
df.head()

For the features where missing values carry meaning, one could leave them as NaN and try an algorithm that can take advantage of, and learn from, these missing values. We will analyse the impact of these missing values on the target later in this section.<br><br>
For the remaining columns, we will impute these values, using KNN Imputer. For that, we will take a small sample of our data, without missing values, and test the imputer to make a choice regarding the optimal number of neighbors.

### 2.5. Outliers

In [41]:
# Local Outlier Factor detector with 20 neighbours; n_jobs=-1 requests parallel execution
lof = LocalOutlierFactor(n_neighbors=20, n_jobs=-1)

In [42]:
# Random sample of 100,000 rows with a fixed seed
# NOTE(review): `test` is not referenced by any later cell visible here - confirm it is still needed
test = df.sample(n=100000, random_state=17)

In [43]:
# Temporary median-imputed copy with no missing values, used only as LOF input
imputer = SimpleImputer(strategy='median')
df_temp_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Label every row with LOF; rows labelled -1 are treated as outliers
outlier_labels = lof.fit_predict(df_temp_imputed)

# Keep the remaining rows of the original (non-imputed) frame
df_no_outliers = df.loc[outlier_labels != -1]

In [None]:
# (rows, columns) left after removing the rows flagged by LOF
df_no_outliers.shape

In [None]:
# Percentage of rows removed as outliers
100 * (1 - len(df_no_outliers) / len(df))

### 2.4. Feature Scaling

In [46]:
# Min-max scaler with default settings
min_max_scaler = MinMaxScaler()

In [47]:
# Fit the scaler on the outlier-free frame and scale every column
# NOTE(review): 'Claim Injury Type' is scaled along with the features (it shows
# fractional values in the outputs further down) - confirm this is intended
df_scaled = min_max_scaler.fit_transform(df_no_outliers)

In [48]:
# Wrap the scaled values in a DataFrame that reuses the original column names and index
df_scaled = pd.DataFrame(
    df_scaled,
    columns=df_no_outliers.columns,
    index=df_no_outliers.index,
)

In [None]:
# Preview the scaled frame
df_scaled.head()

In [50]:
# Persist the scaled frame (index included) so later cells can start from this checkpoint
# NOTE(review): absolute path on one machine
df_scaled.to_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled.csv', index=True)

### 2.5. Data Imputation

In [50]:
# numeric_feats = [
#     'Accident Date'
#     , 'Age at Injury'
#     , 'Assembly Date'
#     , 'Average Weekly Wage'
#     , 'Birth Year'
#     , 'C-2 Date'
#     , 'C-3 Date'
#     , 'First Hearing Date'
#     , 'IME-4 Count'
#     , 'Number of Dependents'
# ]

# cat_feats = [
#     'Alternative Dispute Resolution'
#     , 'Attorney/Representative'
#     , 'Carrier Name'
#     , 'Carrier Type'
#     , 'Claim Injury Type'
#     , 'County of Injury'
#     , 'COVID-19 Indicator'
#     , 'District Name'
#     , 'Gender'
#     , 'Industry Code'
#     , 'Medical Fee Region'
#     , 'WCIO Cause of Injury Code'
#     , 'WCIO Nature of Injury Code'
#     , 'WCIO Part Of Body Code'
#     , 'Zip Code'
#     , 'Agreement Reached'
# ]

In [4]:
# Reload the scaled checkpoint; the saved index comes back as the first column
df_scaled = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled.csv', sep=',')

In [5]:
# Use the first column (the saved index) as the index again
df_scaled = df_scaled.set_index(df_scaled.columns[0])

In [6]:
# Work on a copy so that df_scaled keeps its missing values
df_scaled_imputed = df_scaled.copy()

In [52]:
# missing_numeric = []

# for col in numeric_feats:
#     if len(df_scaled_imputed[df_scaled_imputed[col].isna()]) > 0:
#         missing_numeric.append(col)

In [7]:
# Iterative imputer backed by a random-forest regressor: median start values,
# at most 5 rounds, fixed seeds, verbose progress logging.
# NOTE(review): the logged fit shows a change of about 0.75 against a scaled
# tolerance of 0.001 after round 5, so it did not reach the tolerance within max_iter.
imp = IterativeImputer(
    estimator=RandomForestRegressor(random_state=17, n_jobs=-1),
    initial_strategy='median',
    max_iter=5,
    random_state=17,
    verbose=2,
)

In [8]:
# Random sample of 30,000 rows (fixed seed) used to fit the imputer
fit_sample = df_scaled_imputed.sample(n=30000, random_state=17)

In [9]:
# Fit the imputer on the sample only (the logged run took roughly 950 s for 5 rounds)
imp.fit(fit_sample)

[IterativeImputer] Completing matrix with shape (30000, 26)
[IterativeImputer] Ending imputation round 1/5, elapsed time 160.82
[IterativeImputer] Change: 1.5331491339482575, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 2/5, elapsed time 369.92
[IterativeImputer] Change: 0.697065061111261, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 3/5, elapsed time 574.19
[IterativeImputer] Change: 0.8914983660568255, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 4/5, elapsed time 767.28
[IterativeImputer] Change: 0.8996460719571997, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 5/5, elapsed time 953.98
[IterativeImputer] Change: 0.748209577870083, scaled tolerance: 0.001 




In [10]:
# Impute the full frame with the fitted imputer
# NOTE(review): presumably the result is a NumPy array rather than a DataFrame
# (the next cell re-wraps it), so this name changes type here
df_scaled_imputed = imp.transform(df_scaled_imputed)

[IterativeImputer] Completing matrix with shape (559202, 26)
[IterativeImputer] Ending imputation round 1/5, elapsed time 4.72
[IterativeImputer] Ending imputation round 2/5, elapsed time 8.20
[IterativeImputer] Ending imputation round 3/5, elapsed time 12.60
[IterativeImputer] Ending imputation round 4/5, elapsed time 15.94
[IterativeImputer] Ending imputation round 5/5, elapsed time 19.27


In [11]:
# Wrap the imputed values in a DataFrame with the columns and index of df_scaled
df_scaled_imputed = pd.DataFrame(
    df_scaled_imputed,
    columns=df_scaled.columns,
    index=df_scaled.index,
)

In [12]:
# Preview the fully imputed frame
df_scaled_imputed.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,0.194773,0.58533,0.0,0.142857,0.790323,0.0,1.0,0.062503,0.333333,0.032222,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,0.195839,0.437164,0.0,0.428571,0.564516,0.0,0.0,0.187635,0.333333,0.02875,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,0.0,0.970268,0.84842,0.187032,0.836186,0.142857,0.142857,0.209677,0.0,0.0,0.06834,0.333333,0.023056,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,0.19386,0.436675,0.0,0.285714,0.822581,0.0,0.428571,0.114792,0.333333,0.030278,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [13]:
# Remaining missing values per column after imputation
df_scaled_imputed.isna().sum()

Accident Date                     0
Age at Injury                     0
Alternative Dispute Resolution    0
Assembly Date                     0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
C-2 Date                          0
C-3 Date                          0
Carrier Name                      0
Carrier Type                      0
Claim Injury Type                 0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
First Hearing Date                0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Agreement Reached                 0
Number of Dependents              0
dtype: int64

In [16]:
# Copy the imputed values back only for the columns listed in `missing_values`;
# every other column of df_scaled keeps its missing values
# NOTE(review): code columns such as 'Industry Code' are filled by a regressor
# here, so imputed entries may not correspond to an existing code - confirm
df_scaled[missing_values] = df_scaled_imputed[missing_values]

In [17]:
# Preview: the selected columns are imputed, the remaining ones still show NaN
df_scaled.head()

Unnamed: 0_level_0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,0.58533,0.0,0.142857,0.790323,0.0,1.0,,0.333333,,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,0.437164,0.0,0.428571,0.564516,0.0,0.0,,0.333333,,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,0.0,0.970268,0.84842,,0.836186,0.142857,0.142857,0.209677,0.0,0.0,,0.333333,,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,0.436675,0.0,0.285714,0.822581,0.0,0.428571,,0.333333,,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [18]:
# Missing values per column; only the deliberately non-imputed columns should remain
df_scaled.isna().sum()

Accident Date                          0
Age at Injury                          0
Alternative Dispute Resolution         0
Assembly Date                          0
Attorney/Representative                0
Average Weekly Wage                    0
Birth Year                             0
C-2 Date                           12746
C-3 Date                          378915
Carrier Name                           0
Carrier Type                           0
Claim Injury Type                      0
County of Injury                       0
COVID-19 Indicator                     0
District Name                          0
First Hearing Date                414888
Gender                                 0
IME-4 Count                       430115
Industry Code                          0
Medical Fee Region                     0
WCIO Cause of Injury Code              0
WCIO Nature of Injury Code             0
WCIO Part Of Body Code                 0
Zip Code                               0
Agreement Reache

In [19]:
# Persist the scaled, partially imputed frame (index included) as a checkpoint
# NOTE(review): absolute path on one machine
df_scaled.to_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled_imputed.csv', index=True)

In [4]:
# Reload the scaled and imputed checkpoint
# NOTE: from here on `df` refers to this frame, not to the raw training data loaded at the top
df = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled_imputed.csv', sep=',')

In [7]:
# Use the first column (the saved index) as the index again
df = df.set_index(df.columns[0])

In [5]:
def mode(x):
    """Return the most frequent value of `x`, or None when there is none.

    `x` is expected to expose a pandas-style ``mode()`` method. When several
    values are tied, the first entry of ``x.mode()`` is returned. An empty
    result (for example an empty or all-missing Series) yields None.
    """
    # compute the modes once instead of once for the test and once for the value
    modes = x.mode()
    if modes.empty:
        return None
    return modes.iloc[0]


In [9]:
# Indicator column: 1 when 'C-3 Date' is missing, 0 otherwise
df['C-3 Date Missing'] = df['C-3 Date'].isna().astype('int64')

In [10]:
# Cross-tabulate the claim injury type against the C-3 missingness flag.
# - frequency: number of rows in each (injury type, flag) group
# - proportion: that count as a percentage of ALL rows (len(df)), not of the injury type
# - mode: the flag value of the group (constant within a group by construction)
# size() yields the group counts directly, instead of aggregating with lambdas
# that return a one-element Series from value_counts().
(
    df.groupby(['Claim Injury Type', 'C-3 Date Missing'])
    .size()
    .to_frame('frequency')
    .assign(
        proportion=lambda counts: counts['frequency'] / len(df) * 100,
        mode=lambda counts: counts.index.get_level_values('C-3 Date Missing'),
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,proportion,mode
Claim Injury Type,C-3 Date Missing,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0,5078,0.90808,0
0.0,1,5756,1.029324,1
0.142857,0,35885,6.41718,0
0.142857,1,249429,44.604454,1
0.285714,0,22572,4.036466,0
0.285714,1,43711,7.816674,1
0.428571,0,77055,13.779457,0
0.428571,1,67999,12.160007,1
0.571429,0,36285,6.488711,0
0.571429,1,10768,1.925601,1
