## 1. Imports and initial transformations

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from matplotlib import pyplot as plt
import seaborn as sns
import re
import math
import datetime

In [2]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)
plt.style.use('seaborn-v0_8-dark')

In [3]:
# importing the training and test data
df = pd.read_csv('../train_data.csv', sep=',')
df_test = pd.read_csv('../test_data.csv', sep=',')

  df = pd.read_csv('../train_data.csv', sep=',')


In [4]:
df.shape

(593471, 33)

In [5]:
# we check the first rows of the dataset
df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,5393875,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,5393091,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,5393889,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMENT AND REMEDIAT,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,,957648180,,,,,,,,,,,,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,5393887,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [6]:
# we check if there are any aggregation rows at the end of the dataset
df.tail()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
593466,,,,2022-12-31,,,,,,,,327160035,,,,,,,,,,,,,,,,,,,,,
593467,2022-12-13,72.0,N,2022-12-31,N,0.0,1950.0,2022-12-31,,TECHNOLOGY INSURANCE CO. INC.,1A. PRIVATE,6165075,2. NON-COMP,SULLIVAN,N,BINGHAMTON,,F,,48.0,TRANSPORTATION AND WAREHOUSING,I,,25.0,FROM DIFFERENT LEVEL (ELEVATION),90.0,MULTIPLE PHYSICAL INJURIES ONLY,-9.0,MULTIPLE,12779.0,0.0,Not Work Related,3.0
593468,,,,2022-12-31,,,,,,,,249875936,,,,,,,,,,,,,,,,,,,,,
593469,,,,2022-12-31,,,,,,,,120584215,,,,,,,,,,,,,,,,,,,,,
593470,,,,2022-12-31,,,,,,,,818961390,,,,,,,,,,,,,,,,,,,,,


### Index is now the Claim Identifier

In [7]:
df = df[~df['Claim Identifier'].duplicated()].set_index('Claim Identifier').rename_axis(None)

In [8]:
df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
5393875,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
5393091,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
5393889,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMENT AND REMEDIAT,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
957648180,,,,2020-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5393887,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [9]:
# we check the datatypes and null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 593470 entries, 5393875 to 818961390
Data columns (total 32 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593470 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Injury Type     

In [10]:
# Changing the data type of the values to string
df['Zip Code'] = df['Zip Code'].astype(str)

**Data type analysis:**

Features that should be integers:
- 'Age at Injury'
- 'Birth Year'
- 'IME-4 Count'
- 'Industry Code'
- 'WCIO Cause of Injury Code'
- 'WCIO Nature of Injury Code'
- 'WCIO Part Of Body Code'
- 'Number of Dependents'

Features that should be booleans:
- 'Agreement Reached'

In [11]:
features_to_cast = ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Number of Dependents', 'Agreement Reached']

In [12]:
for feature in features_to_cast:
    df[feature] = pd.to_numeric(df[feature], errors='coerce').astype('Int64')

In [13]:
date_columns = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']

for col in date_columns:
    # first, we convert the columns to datetime
    df[col] = pd.to_datetime(df[col])
    # then we convert it to int
    # df[col] = df[col].dt.strftime('%Y%m%d').astype('Int64')

In [14]:
df['Zip Code'] = df['Zip Code'].replace('nan', np.nan)

In [15]:
df.drop_duplicates(inplace=True)

In [16]:
df = df[~df['Claim Injury Type'].isna()]

## 2. Data Preprocessing

### 2.1. Weird values

As we previously mentioned, there are some columns with weird values:
- **Age at Injury**: multiple values below 14, which is the minimum legal age to work in the USA
- **Birth Year**: multiple 0 values
- **WCIO Part Of Body Code**: has a negative value

Before we go any further, let's try to tackle these issues.

#### 2.1.1. Age at Injury

In [17]:
len(df[df['Age at Injury'] < 14])

5510

In [18]:
df[df['Age at Injury'] < 14].head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
5393971,2019-06-26,0,N,2020-01-02,N,0.0,0,NaT,NaT,AMERICAN ZURICH INSURANCE CO,1A. PRIVATE,2. NON-COMP,NASSAU,N,NYC,NaT,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,IV,,,,,,,,11801.0,0,Not Work Related,1
5394093,2019-12-28,0,N,2020-01-02,N,0.0,0,2019-12-30,NaT,TWIN CITY FIRE INS CO.,1A. PRIVATE,2. NON-COMP,NEW YORK,N,NYC,NaT,M,,71.0,"ARTS, ENTERTAINMENT, AND RECREATION",IV,,55.0,HOLDING OR CARRYING,52.0,STRAIN OR TEAR,61.0,ABDOMEN INCLUDING GROIN,,0,Not Work Related,5
5394068,NaT,0,N,2020-01-02,Y,0.0,1959,NaT,2019-12-23,STARR INDEMNITY & LIABILITY CO,1A. PRIVATE,2. NON-COMP,BRONX,N,NYC,NaT,M,,23.0,CONSTRUCTION,IV,,,,,,,,,0,Not Work Related,6
5393794,NaT,0,N,2020-01-02,Y,0.0,1984,NaT,2019-12-27,LM INSURANCE CORP,1A. PRIVATE,2. NON-COMP,ONONDAGA,N,SYRACUSE,NaT,F,,,,II,,,,,,,,13212.0,0,Not Work Related,4
5393876,2019-12-20,0,N,2020-01-02,N,0.0,0,2019-12-30,NaT,CARMEL CENTRAL SCHOOL DISTRICT,3A. SELF PUBLIC,2. NON-COMP,PUTNAM,N,ALBANY,NaT,F,,,,III,,74.0,"FELLOW WORKER, PATIENT OR OTHER PERSON",40.0,LACERATION,35.0,HAND,10512.0,0,Not Work Related,0


In [19]:
# This code is used to determine the number of individuals in the dataset who were under 14 years old at the time of their injury, 
# whose birth year is recorded and not given as 0, and who have a recorded (non-missing) accident date and birth year. 
len(df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & ~(df['Accident Date'].isna()) & ~(df['Birth Year'].isna())])

122

We can manually calculate these 122 values, since we have information about 'Birth Year' and 'Accident Date'

In [20]:
df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & ~(df['Accident Date'].isna()) & ~(df['Birth Year'].isna())].head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
5399016,2019-08-11,0,N,2020-01-09,N,150.0,1901,2020-01-09,2019-12-27,WESCO INSURANCE COMPANY,1A. PRIVATE,3. MED ONLY,KINGS,N,NYC,NaT,M,,72,ACCOMMODATION AND FOOD SERVICES,IV,,45,COLLISION OR SIDESWIPE WITH ANOTHER VEHICLE,90,MULTIPLE PHYSICAL INJURIES ONLY,90,MULTIPLE BODY PARTS (INCLUDING BODY,11219,0,Not Work Related,5
5401957,2020-01-12,0,N,2020-01-14,N,528.71,1901,2020-01-14,NaT,AMTRUST INS CO OF KANSAS INC,1A. PRIVATE,4. TEMPORARY,ST. LAWRENCE,N,ALBANY,NaT,M,,31,MANUFACTURING,I,,19,"CUT, PUNCTURE, SCRAPE, NOC",40,LACERATION,35,HAND,13662,0,Not Work Related,5
5406866,2020-01-06,0,N,2020-01-17,N,0.0,1901,2020-01-17,NaT,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,2. NON-COMP,ERIE,N,BUFFALO,NaT,F,,54,"PROFESSIONAL, SCIENTIFIC, AND TECHNICAL SERVICES",I,,31,"FALL, SLIP OR TRIP, NOC",59,"ALL OTHER SPECIFIC INJURIES, NOC",53,KNEE,14150,0,Not Work Related,0
5412625,2020-01-09,1,N,2020-01-25,N,0.0,2018,2020-01-24,NaT,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,SUFFOLK,N,HAUPPAUGE,NaT,F,,54,"PROFESSIONAL, SCIENTIFIC, AND TECHNICAL SERVICES",IV,,31,"FALL, SLIP OR TRIP, NOC",10,CONTUSION,11,SKULL,11720,0,Not Work Related,6
5412736,2020-01-09,1,N,2020-01-25,N,,2018,2020-01-24,NaT,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,SUFFOLK,N,HAUPPAUGE,NaT,F,,54,"PROFESSIONAL, SCIENTIFIC, AND TECHNICAL SERVICES",IV,,31,"FALL, SLIP OR TRIP, NOC",10,CONTUSION,11,SKULL,11720,0,Not Work Related,4


In [21]:
df['Age at Injury'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & (df['Accident Date'].notna()) & (df['Birth Year'].notna()),
    # if conditions are met, we calculate the new value
    df['Accident Date'].dt.year - df['Birth Year'],
    # otherwise, we keep the original value
    df['Age at Injury']
)

In [22]:
len(df[(df['Age at Injury'] < 14) & (df['Birth Year'] != 0) & (df['Accident Date'].notna()) & (df['Birth Year'].notna())])

27

Apparently we still have some inconsistent data, but we were able to correct around 100 rows.

#### 2.1.2. Birth Year

In [23]:
len(df[df['Birth Year'] == 0])

25081

In [24]:
len(df[(df['Birth Year'] == 0) & (df['Accident Date'].notna()) & (df['Age at Injury'] >= 14)])

23141

In [25]:
df[(df['Birth Year'] == 0) & ~(df['Accident Date'].notna()) & (df['Age at Injury'] >= 14)].head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents


We can manually impute most of the weird values just by doing some simple math.

In [26]:
df['Birth Year'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df['Birth Year'] == 0) & (df['Birth Year'].notna()) & (df['Accident Date'].notna()) & (df['Age at Injury'] >= 14),
    # if conditions are met, we calculate the new value
    df['Accident Date'].dt.year - df['Age at Injury'],
    # otherwise, we keep the original value
    df['Birth Year']
)

In [27]:
len(df[df['Birth Year'] == 0])

1940

We were able to significantly reduce the amount of weird values

#### 2.1.3. WCIO Part Of Body Code

In [28]:
len(df[df['WCIO Part Of Body Code'] < 0])

42011

In [29]:
df[df['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

<IntegerArray>
[-9]
Length: 1, dtype: Int64

All negative values are the same value - let's check if there are any values '9' or if we can simply convert these values to its absolute value.

In [30]:
len(df[df['WCIO Part Of Body Code'] == 9])

0

As there are no values that take the value '9', we will convert the negative values to the absolute value

In [31]:
df['WCIO Part Of Body Code'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df['WCIO Part Of Body Code'] < 0) & (df['WCIO Part Of Body Code'].notna()),
    # if conditions are met, we calculate the new value
    df['WCIO Part Of Body Code'].abs(),
    # otherwise, we keep the original value
    df['WCIO Part Of Body Code']
)

In [32]:
len(df[df['WCIO Part Of Body Code'] < 0])

0

#### 2.1.4. Datetime variables

Some dates happen before the accident happened

In [33]:
# Check that the DataFrame has a multi-index with the keys as one of the levels
# If not sure, you can re-create the DataFrame with a clear multi-index as follows:
invalid_orders_df = pd.concat([
    df[df['Assembly Date'] < df['Accident Date']],
    df[df['C-2 Date'] < df['Accident Date']],
    df[df['C-3 Date'] < df['Accident Date']],
    df[df['First Hearing Date'] < df['Accident Date']]
], keys=[
    'Assembly Date before Accident Date',
    'C-2 Date before Accident Date',
    'C-3 Date before Accident Date',
    'First Hearing Date before Accident Date'
], names=['Date Issue', 'Index'])

# Specify the datetime columns
datetime_columns = [
    'Accident Date',
    'Assembly Date',
    'C-2 Date',
    'C-3 Date',
    'First Hearing Date'
]


# Display the head of the DataFrame for the datetime columns
invalid_orders_df[datetime_columns].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Accident Date,Assembly Date,C-2 Date,C-3 Date,First Hearing Date
Date Issue,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Assembly Date before Accident Date,5393953,2020-02-26,2020-01-01,2019-12-31,2020-01-31,2020-05-04
Assembly Date before Accident Date,5394099,2020-01-23,2020-01-02,2020-01-14,2019-12-27,2020-06-05
Assembly Date before Accident Date,5393830,2020-03-01,2020-01-02,2020-01-17,2019-12-27,2020-08-28
Assembly Date before Accident Date,5394613,2020-01-27,2020-01-03,2020-01-15,2019-12-30,2021-09-01
Assembly Date before Accident Date,5394770,2020-01-23,2020-01-03,2020-01-06,2019-12-26,2020-05-29


In [34]:
# Now calculate value_counts for 'Claim Injury Type' for each key
# This iterates over each key in the multi-index and performs value_counts on 'Claim Injury Type'
for key in invalid_orders_df.index.levels[0]:  # Loop through each key
    print(f"Counts for {key}:")
    # Extract the subset for the current key and perform value_counts on 'Claim Injury Type'
    claim_type_counts = invalid_orders_df.xs(key, level='Date Issue')['Claim Injury Type'].value_counts()
    print(claim_type_counts)
    print("\n")

Counts for Assembly Date before Accident Date:
Claim Injury Type
5. PPD SCH LOSS    723
3. MED ONLY        350
4. TEMPORARY       255
2. NON-COMP         54
6. PPD NSL          14
1. CANCELLED         7
7. PTD               3
8. DEATH             1
Name: count, dtype: int64


Counts for C-2 Date before Accident Date:
Claim Injury Type
5. PPD SCH LOSS    404
4. TEMPORARY       256
3. MED ONLY        252
2. NON-COMP         48
6. PPD NSL          12
1. CANCELLED         7
7. PTD               2
8. DEATH             1
Name: count, dtype: int64


Counts for C-3 Date before Accident Date:
Claim Injury Type
5. PPD SCH LOSS    700
3. MED ONLY        314
4. TEMPORARY       192
2. NON-COMP         58
6. PPD NSL          14
1. CANCELLED         9
7. PTD               2
Name: count, dtype: int64


Counts for First Hearing Date before Accident Date:
Claim Injury Type
3. MED ONLY        34
5. PPD SCH LOSS    18
4. TEMPORARY       15
2. NON-COMP         5
6. PPD NSL          2
Name: count, dtype: in

Maybe this discrepancy can affect the target variable

In [35]:
columns_to_display = datetime_columns + ['Claim Injury Type']

invalid_orders_df[columns_to_display].head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Accident Date,Assembly Date,C-2 Date,C-3 Date,First Hearing Date,Claim Injury Type
Date Issue,Index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Assembly Date before Accident Date,5393953,2020-02-26,2020-01-01,2019-12-31,2020-01-31,2020-05-04,5. PPD SCH LOSS
Assembly Date before Accident Date,5394099,2020-01-23,2020-01-02,2020-01-14,2019-12-27,2020-06-05,6. PPD NSL
Assembly Date before Accident Date,5393830,2020-03-01,2020-01-02,2020-01-17,2019-12-27,2020-08-28,5. PPD SCH LOSS
Assembly Date before Accident Date,5394613,2020-01-27,2020-01-03,2020-01-15,2019-12-30,2021-09-01,5. PPD SCH LOSS
Assembly Date before Accident Date,5394770,2020-01-23,2020-01-03,2020-01-06,2019-12-26,2020-05-29,5. PPD SCH LOSS


In [36]:
invalid_orders_df['Claim Injury Type'].value_counts().sort_values()

Claim Injury Type
8. DEATH              2
7. PTD                7
1. CANCELLED         23
6. PPD NSL           42
2. NON-COMP         165
4. TEMPORARY        718
3. MED ONLY         950
5. PPD SCH LOSS    1845
Name: count, dtype: int64

In [37]:
# List all columns, then remove those you want to exclude from checking duplicates
columns_to_check = invalid_orders_df.columns.difference([
    'Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date'
])

# Check for duplicates based on the remaining columns
duplicates = invalid_orders_df.duplicated(subset=columns_to_check, keep=False)

# Count duplicates
duplicate_count = duplicates.sum()


print(f"Number of different rows (excluding specified date columns): {duplicate_count}")

# Optional: Display rows that are duplicates
# duplicate_rows = invalid_orders_df[duplicates]
# print(duplicate_rows)

Number of different rows (excluding specified date columns): 3453


This discrepancy happens 3453 times

### 2.2. Missing values

In [38]:
# Calculate the number of NaNs for each column
nan_counts = df.isna().sum()

# Get the total number of rows (entries) in the DataFrame
total_rows = df.shape[0]

# Calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# Format the percentage with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# Combine all information into a DataFrame for better readability
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': [total_rows] * len(nan_counts),  # Ensure this column matches the length of nan_counts
    'Percentage NaN': percentage_nans
})

# Print the result
print("Summary of NaN values per column:\n")
print(nan_summary)

Summary of NaN values per column:

                                    NaN Count  Total Values Percentage NaN
Accident Date                            3688        574025          0.64%
Age at Injury                               0        574025          0.00%
Alternative Dispute Resolution              0        574025          0.00%
Assembly Date                               0        574025          0.00%
Attorney/Representative                     0        574025          0.00%
Average Weekly Wage                     28651        574025          4.99%
Birth Year                              29078        574025          5.07%
C-2 Date                                14559        574025          2.54%
C-3 Date                               386781        574025         67.38%
Carrier Name                                0        574025          0.00%
Carrier Type                                0        574025          0.00%
Claim Injury Type                           0        574025      

In [39]:
missing_values = [
    'Accident Date'
    , 'Average Weekly Wage'
    , 'Birth Year'
    #, 'C-2 Date'  # missing form could have relationship with the target
    #, 'C-3 Date'  # missing form could have relationship with the target
    #, 'First Hearing Date'  # missing values means no hearing has held
    #, 'IME-4 Count'  # missing form could have relationship with the target
    , 'Industry Code'
    #, 'Industry Code Description'  # we will only use the numeric form of this variable
    #, 'OIICS Nature of Injury Description'  # only missing values, so we will not use this feature
    , 'WCIO Cause of Injury Code'
    #, 'WCIO Cause of Injury Description'  # we will only use the numeric form of this variable
    , 'WCIO Nature of Injury Code'
    #, 'WCIO Nature of Injury Description'  # we will only use the numeric form of this variable
    , 'WCIO Part Of Body Code'
    #, 'WCIO Part Of Body Description'  # we will only use the numeric form of this variable
    , 'Zip Code'
]

We will not impute missing values in the commented columns, as per the explainations in the comments

### 2.3. Categorical Encoding

In [40]:
# we can drop the descriptive columns and the unary column ('WCB Decision')
df.drop(columns=['WCIO Part Of Body Description', 'Industry Code Description', 'WCIO Nature of Injury Description', 'WCIO Cause of Injury Description', 'OIICS Nature of Injury Description', 'WCB Decision'], inplace=True)

In [41]:
categorical_columns = [
    'Alternative Dispute Resolution'
    , 'Attorney/Representative'
    , 'Carrier Type'
    , 'Carrier Name'
    , 'Claim Injury Type'
    , 'County of Injury'
    , 'COVID-19 Indicator'
    , 'District Name'
    , 'Gender'
    , 'Medical Fee Region'
    , 'Zip Code'
]

data = df[categorical_columns]

data = data.dropna()

# Display the first few rows to verify the remaining columns
data.head()


Unnamed: 0,Alternative Dispute Resolution,Attorney/Representative,Carrier Type,Carrier Name,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,Gender,Medical Fee Region,Zip Code
5393875,N,N,1A. PRIVATE,NEW HAMPSHIRE INSURANCE CO,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,M,I,13662
5393091,N,Y,1A. PRIVATE,ZURICH AMERICAN INSURANCE CO,4. TEMPORARY,WYOMING,N,ROCHESTER,F,I,14569
5393889,N,N,1A. PRIVATE,INDEMNITY INSURANCE CO OF,4. TEMPORARY,ORANGE,N,ALBANY,M,II,12589
5393887,N,N,2A. SIF,STATE INSURANCE FUND,2. NON-COMP,DUTCHESS,N,ALBANY,M,II,12603
5393863,N,N,1A. PRIVATE,INDEMNITY INS. OF N AMERICA,3. MED ONLY,SUFFOLK,N,HAUPPAUGE,M,IV,11772


In [42]:
ordinal_encoder = OrdinalEncoder()

In [43]:
data = df.copy()
data[categorical_columns] = ordinal_encoder.fit_transform(df[categorical_columns])

In [44]:
def to_ordinal(data, column):
    data[column] = data[column].apply(lambda x: x.toordinal() if pd.notnull(x) else np.nan).astype('Int64')

In [45]:
data['Accident Date'] = pd.to_datetime(df['Accident Date'])
# df['Assembly Date'] = pd.to_datetime(df['Assembly Date'])

In [46]:
dates = [
    'Accident Date'
    , 'Assembly Date'
    , 'C-2 Date'
    , 'C-3 Date'
    , 'First Hearing Date'
]

In [47]:
data.dtypes

Accident Date                     datetime64[ns]
Age at Injury                            float64
Alternative Dispute Resolution           float64
Assembly Date                     datetime64[ns]
Attorney/Representative                  float64
Average Weekly Wage                      float64
Birth Year                               float64
C-2 Date                          datetime64[ns]
C-3 Date                          datetime64[ns]
Carrier Name                             float64
Carrier Type                             float64
Claim Injury Type                        float64
County of Injury                         float64
COVID-19 Indicator                       float64
District Name                            float64
First Hearing Date                datetime64[ns]
Gender                                   float64
IME-4 Count                                Int64
Industry Code                              Int64
Medical Fee Region                       float64
WCIO Cause of Injury

In [48]:
for col in dates:
    to_ordinal(data, col)

In [49]:
data.dtypes

Accident Date                       Int64
Age at Injury                     float64
Alternative Dispute Resolution    float64
Assembly Date                       Int64
Attorney/Representative           float64
Average Weekly Wage               float64
Birth Year                        float64
C-2 Date                            Int64
C-3 Date                            Int64
Carrier Name                      float64
Carrier Type                      float64
Claim Injury Type                 float64
County of Injury                  float64
COVID-19 Indicator                float64
District Name                     float64
First Hearing Date                  Int64
Gender                            float64
IME-4 Count                         Int64
Industry Code                       Int64
Medical Fee Region                float64
WCIO Cause of Injury Code           Int64
WCIO Nature of Injury Code          Int64
WCIO Part Of Body Code            float64
Zip Code                          

All columns are now numeric

In [50]:
data.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,737423,31.0,0.0,737425,0.0,0.0,1988.0,737424,,1197.0,0.0,1.0,49.0,0.0,7.0,,1.0,,44,0.0,27,10,62.0,3935.0,0,1
5393091,737301,46.0,0.0,737425,1.0,1745.93,1973.0,737425,737438.0,2044.0,0.0,3.0,61.0,0.0,5.0,737476.0,0.0,4.0,23,0.0,97,49,38.0,4606.0,1,4
5393889,737399,40.0,0.0,737425,0.0,1434.8,1979.0,737425,,894.0,0.0,3.0,35.0,0.0,0.0,,1.0,,56,1.0,79,7,10.0,3075.0,0,6
5393887,737423,61.0,0.0,737425,0.0,,1958.0,737424,,1710.0,1.0,1.0,13.0,0.0,0.0,,1.0,,62,1.0,16,43,36.0,3088.0,0,1
5393863,737419,67.0,0.0,737425,0.0,0.0,1952.0,737424,,893.0,0.0,2.0,51.0,0.0,3.0,,1.0,,44,3.0,31,10,38.0,2362.0,0,5


For the features where missing values carry meaning, one could leave them as NaN and try an algorithm that can take advantage and learn from these missing values. We will analyse the missing values impact on the target later in this section.</br></br>
For the remaining columns, we will impute these values, using KNN Imputer. For that, we will take a small sample of our data, without missing values, and test the imputer to make a choice regarding the optimal number of neighbors.

### 2.4. Outliers

In [51]:
lof = LocalOutlierFactor(n_neighbors=20, n_jobs=-1)

In [52]:
test = df.sample(n=100000, random_state=17)

In [53]:
# initialize the imputer
imputer = SimpleImputer(strategy='median')
# create a new df with no missing values
df_temp_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Run LOF
outlier_labels = lof.fit_predict(df_temp_imputed)

# Remove rows marked as outliers
data_no_outliers = data[outlier_labels != -1]

In [54]:
data_no_outliers.shape

(559202, 26)

In [55]:
# % lines removed
(1- (data_no_outliers.shape[0] / df.shape[0])) * 100

2.5822917120334465

### 2.5. Feature Scaling

In [56]:
min_max_scaler = MinMaxScaler()

In [57]:
df_scaled = min_max_scaler.fit_transform(data_no_outliers)

In [58]:
df_scaled = pd.DataFrame(df_scaled, columns = data_no_outliers.columns).set_index(data_no_outliers.index)

In [59]:
len(data_no_outliers['Accident Date'].unique())

5405

In [60]:
df_scaled.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,0.58533,0.0,0.142857,0.790323,0.0,1.0,,0.333333,,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,0.437164,0.0,0.428571,0.564516,0.0,0.0,,0.333333,,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,,0.970268,0.84842,,0.836186,0.142857,0.142857,0.209677,0.0,0.0,,0.333333,,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,0.436675,0.0,0.285714,0.822581,0.0,0.428571,,0.333333,,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [None]:
#df_scaled.to_csv('train_data_scaled.csv', index=True)

### 2.6. Data Imputation

In [None]:
# numeric_feats = [
#     'Accident Date'
#     , 'Age at Injury'
#     , 'Assembly Date'
#     , 'Average Weekly Wage'
#     , 'Birth Year'
#     , 'C-2 Date'
#     , 'C-3 Date'
#     , 'First Hearing Date'
#     , 'IME-4 Count'
#     , 'Number of Dependents'
# ]

# cat_feats = [
#     'Alternative Dispute Resolution'
#     , 'Attorney/Representative'
#     , 'Carrier Name'
#     , 'Carrier Type'
#     , 'Claim Injury Type'
#     , 'County of Injury'
#     , 'COVID-19 Indicator'
#     , 'District Name'
#     , 'Gender'
#     , 'Industry Code'
#     , 'Medical Fee Region'
#     , 'WCIO Cause of Injury Code'
#     , 'WCIO Nature of Injury Code'
#     , 'WCIO Part Of Body Code'
#     , 'Zip Code'
#     , 'Agreement Reached'
# ]

In [None]:
#df_scaled = pd.read_csv('train_data_scaled.csv', sep=',')

In [61]:
df_scaled.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,0.58533,0.0,0.142857,0.790323,0.0,1.0,,0.333333,,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,0.437164,0.0,0.428571,0.564516,0.0,0.0,,0.333333,,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,,0.970268,0.84842,,0.836186,0.142857,0.142857,0.209677,0.0,0.0,,0.333333,,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,0.436675,0.0,0.285714,0.822581,0.0,0.428571,,0.333333,,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [None]:
#df_scaled.set_index(df_scaled.columns[0], inplace=True)

In [62]:
df_scaled_imputed = df_scaled.copy()

In [63]:
imp = IterativeImputer(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=17)
    ,initial_strategy='median'
    ,max_iter=5
    ,random_state=17
    ,verbose=2
)

In [64]:
fit_sample = df_scaled_imputed.sample(n=30000, random_state=17)

In [65]:
imp.fit(fit_sample)

[IterativeImputer] Completing matrix with shape (30000, 26)
[IterativeImputer] Ending imputation round 1/5, elapsed time 59.71
[IterativeImputer] Change: 1.5547486545526927, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 2/5, elapsed time 128.44
[IterativeImputer] Change: 1.0361577833739262, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 3/5, elapsed time 198.44
[IterativeImputer] Change: 0.836141464142012, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 4/5, elapsed time 266.99
[IterativeImputer] Change: 0.8454298441470212, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 5/5, elapsed time 334.87
[IterativeImputer] Change: 0.699772085033756, scaled tolerance: 0.001 




In [66]:
df_scaled_imputed = imp.transform(df_scaled_imputed)

[IterativeImputer] Completing matrix with shape (559202, 26)
[IterativeImputer] Ending imputation round 1/5, elapsed time 3.90
[IterativeImputer] Ending imputation round 2/5, elapsed time 7.27
[IterativeImputer] Ending imputation round 3/5, elapsed time 11.46
[IterativeImputer] Ending imputation round 4/5, elapsed time 14.72
[IterativeImputer] Ending imputation round 5/5, elapsed time 17.15


In [67]:
df_scaled_imputed = pd.DataFrame(df_scaled_imputed, columns = df_scaled.columns).set_index(df_scaled.index)

In [68]:
df_scaled_imputed.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,0.195237,0.58533,0.0,0.142857,0.790323,0.0,1.0,0.066132,0.333333,0.032639,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,0.195558,0.437164,0.0,0.428571,0.564516,0.0,0.0,0.181535,0.333333,0.027222,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,0.0,0.970268,0.84842,0.187471,0.836186,0.142857,0.142857,0.209677,0.0,0.0,0.083623,0.333333,0.024444,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,0.190989,0.436675,0.0,0.285714,0.822581,0.0,0.428571,0.141535,0.333333,0.026806,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [69]:
df_scaled_imputed.isna().sum()

Accident Date                     0
Age at Injury                     0
Alternative Dispute Resolution    0
Assembly Date                     0
Attorney/Representative           0
Average Weekly Wage               0
Birth Year                        0
C-2 Date                          0
C-3 Date                          0
Carrier Name                      0
Carrier Type                      0
Claim Injury Type                 0
County of Injury                  0
COVID-19 Indicator                0
District Name                     0
First Hearing Date                0
Gender                            0
IME-4 Count                       0
Industry Code                     0
Medical Fee Region                0
WCIO Cause of Injury Code         0
WCIO Nature of Injury Code        0
WCIO Part Of Body Code            0
Zip Code                          0
Agreement Reached                 0
Number of Dependents              0
dtype: int64

In [70]:
df_scaled.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,0.58533,0.0,0.142857,0.790323,0.0,1.0,,0.333333,,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,0.437164,0.0,0.428571,0.564516,0.0,0.0,,0.333333,,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,,0.970268,0.84842,,0.836186,0.142857,0.142857,0.209677,0.0,0.0,,0.333333,,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,0.436675,0.0,0.285714,0.822581,0.0,0.428571,,0.333333,,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [71]:
df_scaled[missing_values] = df_scaled_imputed[missing_values]

In [72]:
df_scaled.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Medical Fee Region,WCIO Cause of Injury Code,WCIO Nature of Injury Code,WCIO Part Of Body Code,Zip Code,Agreement Reached,Number of Dependents
5393875,0.939606,0.258333,0.0,0.0,0.0,0.0,0.985134,0.84842,,0.58533,0.0,0.142857,0.790323,0.0,1.0,,0.333333,,0.407407,0.0,0.265306,0.1,0.588889,0.391192,0.0,0.166667
5393091,0.934224,0.383333,0.0,0.0,1.0,0.06279,0.977701,0.848517,0.1846,0.999511,0.0,0.428571,0.983871,0.0,0.714286,0.013836,0.0,0.041667,0.148148,0.0,0.979592,0.533333,0.322222,0.457898,1.0,0.666667
5393889,0.938548,0.333333,0.0,0.0,0.0,0.051601,0.980674,0.848517,,0.437164,0.0,0.428571,0.564516,0.0,0.0,,0.333333,,0.555556,0.25,0.795918,0.066667,0.011111,0.305696,0.0,1.0
5393887,0.939606,0.508333,0.0,0.0,0.0,0.0,0.970268,0.84842,,0.836186,0.142857,0.142857,0.209677,0.0,0.0,,0.333333,,0.62963,0.25,0.153061,0.466667,0.3,0.306989,0.0,0.166667
5393863,0.93943,0.558333,0.0,0.0,0.0,0.0,0.967294,0.84842,,0.436675,0.0,0.285714,0.822581,0.0,0.428571,,0.333333,,0.407407,0.75,0.306122,0.1,0.322222,0.234815,0.0,0.833333


In [73]:
df_scaled.isna().sum()

Accident Date                          0
Age at Injury                          0
Alternative Dispute Resolution         0
Assembly Date                          0
Attorney/Representative                0
Average Weekly Wage                    0
Birth Year                             0
C-2 Date                           12746
C-3 Date                          378915
Carrier Name                           0
Carrier Type                           0
Claim Injury Type                      0
County of Injury                       0
COVID-19 Indicator                     0
District Name                          0
First Hearing Date                414888
Gender                                 0
IME-4 Count                       430115
Industry Code                          0
Medical Fee Region                     0
WCIO Cause of Injury Code              0
WCIO Nature of Injury Code             0
WCIO Part Of Body Code                 0
Zip Code                               0
Agreement Reache

In [None]:
#df_scaled.to_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled_imputed.csv', index=True)

In [None]:
#df = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data_scaled_imputed.csv', sep=',')

In [None]:
#df.set_index(df.columns[0], inplace=True)

In [74]:
def mode(x): return x.mode().iloc[0] if not x.mode().empty else None


In [75]:
df['C-3 Date Missing'] = df['C-3 Date'].isna().map({True: 1, False: 0})

In [76]:
df[['C-3 Date Missing', 'Claim Injury Type']].groupby(['Claim Injury Type', 'C-3 Date Missing'])['C-3 Date Missing'].agg(
    frequency=lambda x: x.value_counts(),
    proportion=lambda x: x.value_counts() / len(df) * 100,
    mode=mode
)

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,proportion,mode
Claim Injury Type,C-3 Date Missing,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1. CANCELLED,0,5609,0.977135,0
1. CANCELLED,1,6867,1.196289,1
2. NON-COMP,0,38127,6.642045,0
2. NON-COMP,1,252951,44.066199,1
3. MED ONLY,0,23841,4.153303,0
3. MED ONLY,1,45065,7.850703,1
4. TEMPORARY,0,78974,13.757937,0
4. TEMPORARY,1,69533,12.113235,1
5. PPD SCH LOSS,0,37215,6.483167,0
5. PPD SCH LOSS,1,11065,1.927616,1
