### Imports

In [1]:
# General Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn packages
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings("ignore")


%load_ext autoreload
%autoreload 2

In [2]:
# We want to predict Claim Injury Type

### EDA will consiste of:
    - In-depth exploration of the dataset. 
    - Feature creation, in order to better explain the information
    - Find trends, patterns or anomalities.


In [3]:
# For this study we will consider all dates in reference to 25 of Dec of 2024

# this is to convert birth date into age and how much time has passed sim the different dates

In [4]:
train_df = pd.read_csv("./data/train_data.csv", dtype={ 29 : str})

In [5]:
train_df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,...,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,...,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,...,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,...,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,...,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [6]:
train_df["Claim Injury Type"].unique()

array(['2. NON-COMP', '4. TEMPORARY', nan, '3. MED ONLY',
       '5. PPD SCH LOSS', '6. PPD NSL', '1. CANCELLED', '8. DEATH',
       '7. PTD'], dtype=object)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593471 entries, 0 to 593470
Data columns (total 33 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Identifier          

In [8]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age at Injury,574026.0,42.11427,14.25643,0.0,31.0,42.0,54.0,117.0
Average Weekly Wage,545375.0,491.0883,6092.918,0.0,0.0,0.0,841.0,2828079.0
Birth Year,544948.0,1886.768,414.6444,0.0,1965.0,1977.0,1989.0,2018.0
Claim Identifier,593471.0,23667600.0,107927100.0,5393066.0,5593414.5,5791212.0,5991000.5,999891667.0
IME-4 Count,132803.0,3.207337,2.832303,1.0,1.0,2.0,4.0,73.0
Industry Code,564068.0,58.64531,19.64417,11.0,45.0,61.0,71.0,92.0
OIICS Nature of Injury Description,0.0,,,,,,,
WCIO Cause of Injury Code,558386.0,54.38114,25.87428,1.0,31.0,56.0,75.0,99.0
WCIO Nature of Injury Code,558369.0,41.01384,22.20752,1.0,16.0,49.0,52.0,91.0
WCIO Part Of Body Code,556944.0,39.73815,22.36594,-9.0,33.0,38.0,53.0,99.0


In [9]:
train_df["WCB Decision"].unique()
# Very weird, we cannot assume that nan are Work Related cases as the decision is unknown at the start of claim

array(['Not Work Related', nan], dtype=object)

In [10]:
for column in train_df.columns:
    print(f"{column} consistes of the following values:")
    print("-------------------------------------------------------")
    print(train_df[column].value_counts())
    print("-------------------------------------------------------")
    print("\n")

Accident Date consistes of the following values:
-------------------------------------------------------
Accident Date
2020-03-01    1245
2020-12-18    1001
2022-02-07     977
2022-01-05     883
2021-02-18     851
              ... 
2023-01-09       1
2017-06-20       1
2017-11-07       1
1993-09-30       1
2008-04-29       1
Name: count, Length: 5539, dtype: int64
-------------------------------------------------------


Age at Injury consistes of the following values:
-------------------------------------------------------
Age at Injury
31.0     14041
30.0     14022
32.0     13994
29.0     13657
51.0     13486
         ...  
104.0        1
115.0        1
5.0          1
113.0        1
114.0        1
Name: count, Length: 108, dtype: int64
-------------------------------------------------------


Alternative Dispute Resolution consistes of the following values:
-------------------------------------------------------
Alternative Dispute Resolution
N    571412
Y      2609
U         5
Name

In [11]:
# Need to change most float to int for consistency 
# Change Y and N to 1 and 0
# Change Alternative Dispute Resolution to 1 and 0 and understand the meaning of "U"
# Label Encoder for must Categorical Variables

# Accident Date to days since

In [12]:
train_df[train_df["Alternative Dispute Resolution"] == "U"]

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
145799,2018-03-12,57.0,U,2020-10-21,N,0.0,1961.0,,,ELECTRICAL EMPLOYERS,...,,,,,,,10301,0.0,Not Work Related,5.0
493388,2021-11-05,56.0,U,2022-07-11,N,0.0,1965.0,,,ELECTRICAL EMPLOYERS,...,,,,,,,10595,0.0,Not Work Related,5.0
494317,2022-07-06,52.0,U,2022-07-13,Y,0.0,1969.0,,2023-03-09,ELECTRICAL EMPLOYERS,...,,,,,,,11694,0.0,Not Work Related,5.0
551004,2022-09-01,57.0,U,2022-10-17,N,,1965.0,,,ELECTRICAL EMPLOYERS,...,,,,,,,11755,0.0,Not Work Related,0.0
551394,2022-08-29,55.0,U,2022-10-17,N,0.0,1967.0,,,ELECTRICAL EMPLOYERS,...,,,,,,,11743,0.0,Not Work Related,6.0



train_df[feature]= train_df[feature].apply(lambda x: 1 if x else 0, axis=1)


In [13]:
# Gender can be M (male), F (Female), X (not specified) or U (undisclosed)
gender_dict = {
    "F" : 0,
    "M" : 1,
    "X" : 2,
    "U" : 3
}
#train_df["Gender Encoded"] = train_df["Gender"].replace(gender_dict)

In [29]:
features_to_encode = [
    "Gender", "Alternative Dispute Resolution", "Attorney/Representative", "COVID-19 Indicator", "Carrier Name",
    "Carrier Type", "Claim Injury Type", "County of Injury", "District Name", "Medical Fee Region"]

features_to_drop = [
    "OIICS Nature of Injury Description",
    "Industry Code Description", "WCIO Cause of Injury Description", "WCIO Cause of Injury Description",
    "WCIO Nature of Injury Description", "WCIO Part Of Body Description"
]

label_encoders = {}

for feature in features_to_encode:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])
    label_encoders[feature] = le

# original_values = label_encoders['Gender'].inverse_transform(train_df['Gender Encoded'])

In [30]:
train_df = train_df.drop(features_to_drop, axis = 1)

In [35]:
train_df.sample().T

Unnamed: 0,2620
Accident Date,2019-12-19
Age at Injury,35.0
Alternative Dispute Resolution,0
Assembly Date,2020-01-07
Attorney/Representative,0
Average Weekly Wage,0.0
Birth Year,1984.0
C-2 Date,2020-01-07
C-3 Date,
Carrier Name,89


In [16]:
nan_count = train_df["Claim Injury Type"].isna().sum()
print(f"We found that {nan_count/train_df.shape[0] * 100}% of the training data has NAs on the target, therefor we will drop it.")


We found that 3.276486972404717% of the training data has NAs on the target, therefor we will drop it.


In [17]:
nan_rows = train_df[train_df["Claim Injury Type"].isna()]
#nan_rows.value_counts
train_df = train_df.dropna(subset=["Claim Injury Type"])

In [18]:
dup_rows = train_df.duplicated()
dup_rows.value_counts()

False    574026
Name: count, dtype: int64

In [34]:
train_df['Hearing Occurred'] = train_df['First Hearing Date'].notna().astype(int)
train_df['C-2 Ocurred'] = train_df['C-2 Date'].notna().astype(int)
train_df['C-3 Ocurred'] = train_df['C-3 Date'].notna().astype(int)

In [37]:
train_df[train_df["Accident Date"].isna()]

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,WCB Decision,Number of Dependents,Hearing Occurred,C-2 Ocurred,C-3 Ocurred
370,,0.0,0,2020-01-02,1,0.0,1959.0,,2019-12-23,1703,...,,,,,0.0,Not Work Related,6.0,0,0,1
396,,0.0,0,2020-01-02,1,0.0,1984.0,,2019-12-27,1011,...,,,,13212,0.0,Not Work Related,4.0,0,0,1
486,,0.0,0,2020-01-02,1,0.0,0.0,2021-03-26,2019-12-23,393,...,90.0,31.0,13.0,34997,0.0,Not Work Related,2.0,1,1,1
599,,0.0,0,2020-01-02,1,0.0,1967.0,,2019-12-27,71,...,,,,12209,0.0,Not Work Related,6.0,0,0,1
760,,0.0,0,2020-01-03,1,0.0,1971.0,,2019-12-23,1197,...,,,,11231,0.0,Not Work Related,1.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592566,,0.0,0,2022-12-29,1,0.0,1967.0,,2022-12-21,1710,...,,,,11422,0.0,Not Work Related,1.0,0,0,1
592914,,0.0,0,2022-12-30,1,0.0,1961.0,2023-01-10,2022-12-27,441,...,52.0,31.0,13.0,11756,0.0,Not Work Related,0.0,1,1,1
592964,,0.0,0,2022-12-30,1,0.0,0.0,2023-01-10,2022-12-28,1291,...,99.0,78.0,-9.0,22473,0.0,Not Work Related,3.0,0,1,1
592978,,0.0,0,2022-12-30,1,0.0,1972.0,2023-01-20,2022-12-22,1011,...,60.0,78.0,39.0,11221,0.0,Not Work Related,6.0,0,1,1
