# **TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS - PART 3: DATA CLEANING AND PRE-PROCESSING**

## 1. Imports and initial transformations

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [2]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=np.inf)

In [3]:
# importing the data
df = pd.read_csv('../Data/train_data_visual_exploration.csv', sep=',', index_col=0)
df_test = pd.read_csv('../Data/test_data_visual_exploration.csv', sep=',', index_col=0)

  df = pd.read_csv('../Data/train_data_visual_exploration.csv', sep=',', index_col=0)
  df_test = pd.read_csv('../Data/test_data_visual_exploration.csv', sep=',', index_col=0)


In [4]:
# defining the numeric, date and categorical features
numeric_features = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
    , 'Number of Dependents'
    , 'Days Difference'
]
date_features = [
    'Accident Date'
    , 'Assembly Date'
    , 'C-2 Date'
    , 'C-3 Date'
    , 'First Hearing Date'
]
categorical_features = [
    'Alternative Dispute Resolution'
    , 'Attorney/Representative'
    , 'Carrier Name'
    , 'Carrier Type'
    , 'Claim Injury Type'
    , 'County of Injury'
    , 'COVID-19 Indicator'
    , 'District Name'
    , 'Gender'
    , 'Industry Code'
    , 'Medical Fee Region'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Zip Code'
    , 'C-2 Missed Timing'
    , 'C-3 Missed Timing'
    , 'C-2 Missing'
    , 'C-3 Missing'
    , 'Has Hearing'
    , 'Has IME-4 Report'
]

all_features = numeric_features + date_features + categorical_features

In [5]:
# convert numeric feature to int:
for feature in list(filter(lambda feat: feat not in ['Average Weekly Wage'], numeric_features)):
    df[feature] = pd.to_numeric(df[feature], errors='coerce').astype('Int64')
    df_test[feature] = pd.to_numeric(df_test[feature], errors='coerce').astype('Int64')

In [6]:
# convert date features to datetime
for col in date_features:
    df[col] = pd.to_datetime(df[col], errors='coerce', format='%Y-%m-%d')
    df_test[col] = pd.to_datetime(df_test[col], errors='coerce', format='%Y-%m-%d')

In [7]:
df['Zip Code'] = df['Zip Code'].astype(str)
df_test['Zip Code'] = df_test['Zip Code'].astype(str)

## 2. Data Cleaning

As we previously mentioned, there are some columns with weird values:
- **Age at Injury**: multiple values below 14, which is the minimum legal age to work in the USA
- **WCIO Part Of Body Code**: has a negative value

Before we go any further, let's try to tackle these issues.

### 2.1. Age at Injury

In [8]:
len(df[df['Age at Injury'] < 14])

5388

In [9]:
len(df_test[df_test['Age at Injury'] < 14])

3123

In [10]:
len(df[(df['Age at Injury'] < 14) & ((df['Accident Date'].isna()) | (df['Birth Year'].isna()))])

5388

In [11]:
len(df_test[(df_test['Age at Injury'] < 14) & ((df_test['Accident Date'].isna()) | (df_test['Birth Year'].isna()))])

3107

As discussed in the notebook 'Part1-InitialInspection.ipynb', these values cannot yet be fixed, since we are trying to fix them using mathematical logic. Therefore, we will come back to fixing these values after we perform data imputation on the 'Accident Date' and 'Birth Year' columns. If there still are any inconsistent values in the test data, we will set them as NaN and impute them later.

### 2.2. WCIO Part Of Body Code

In [12]:
len(df[df['WCIO Part Of Body Code'] < 0])

42010

In [13]:
df[df['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

array([-9.])

In [14]:
df_test[df_test['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

array([-9.])

All negative values are the same value - let's check if there are any values '9' or if we can simply convert these values to its absolute value.

In [15]:
len(df[df['WCIO Part Of Body Code'] == 9])

0

In [16]:
len(df_test[df_test['WCIO Part Of Body Code'] == 9])

0

As there are no values that take the value '9', we will convert the negative values to the absolute value

In [17]:
df['WCIO Part Of Body Code'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df['WCIO Part Of Body Code'] < 0) & (df['WCIO Part Of Body Code'].notna()),
    # if conditions are met, we calculate the new value
    df['WCIO Part Of Body Code'].abs(),
    # otherwise, we keep the original value
    df['WCIO Part Of Body Code']
)

In [18]:
len(df[df['WCIO Part Of Body Code'] < 0])

0

In [19]:
df_test['WCIO Part Of Body Code'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df_test['WCIO Part Of Body Code'] < 0) & (df_test['WCIO Part Of Body Code'].notna()),
    # if conditions are met, we calculate the new value
    df_test['WCIO Part Of Body Code'].abs(),
    # otherwise, we keep the original value
    df_test['WCIO Part Of Body Code']
)

In [20]:
len(df_test[df_test['WCIO Part Of Body Code'] < 0])

0

## 3. Missing values

In [21]:
# calculate the number of NaNs for each column
nan_counts = df.isna().sum()

# get the total number of rows
total_rows = df.shape[0]

# calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# format the percentages with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# combine all the information into a DataFrame
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts
    , 'Total Values': [total_rows] * len(nan_counts)
    , 'Percentage NaN': percentage_nans
})

display(nan_summary)

Unnamed: 0,NaN Count,Total Values,Percentage NaN
Accident Date,3688,573995,0.64%
Age at Injury,0,573995,0.00%
Alternative Dispute Resolution,0,573995,0.00%
Assembly Date,0,573995,0.00%
Attorney/Representative,0,573995,0.00%
Average Weekly Wage,28648,573995,4.99%
Birth Year,31018,573995,5.40%
C-2 Date,14558,573995,2.54%
C-3 Date,386756,573995,67.38%
Carrier Name,0,573995,0.00%


In [22]:
missing_values = [
    'Accident Date'
    , 'Average Weekly Wage'
    , 'Birth Year'
    #, 'C-2 Date'  # missing form could have relationship with the target
    #, 'C-3 Date'  # missing form could have relationship with the target
    #, 'First Hearing Date'  # missing values means no hearing has held
    , 'Gender'
    #, 'IME-4 Count'  # missing form could have relationship with the target
    , 'Industry Code'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Zip Code'
    , 'Days Difference'
]

In [23]:
# calculate the number of NaNs for each column
nan_counts = df_test.isna().sum()

# get the total number of rows
total_rows = df_test.shape[0]

# calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# format the percentages with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# combine all the information into a DataFrame
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts
    , 'Total Values': [total_rows] * len(nan_counts)
    , 'Percentage NaN': percentage_nans
})

display(nan_summary)

Unnamed: 0,NaN Count,Total Values,Percentage NaN
Accident Date,2444,387975,0.63%
Age at Injury,0,387975,0.00%
Alternative Dispute Resolution,0,387975,0.00%
Assembly Date,0,387975,0.00%
Attorney/Representative,0,387975,0.00%
Average Weekly Wage,19204,387975,4.95%
Birth Year,20301,387975,5.23%
C-2 Date,9134,387975,2.35%
C-3 Date,302759,387975,78.04%
Carrier Name,0,387975,0.00%


The columns from the test data that contain missing values are the same as the ones from the train data, with the addition of the descriptive columns, which we had removed from the train data.

We will not impute missing values in the commented columns, as per the explainations in the comments

## 4. Simple Encoding

In [24]:
binary_columns = ['Alternative Dispute Resolution', 'Attorney/Representative', 'Gender', 'COVID-19 Indicator']

In [25]:
for col in binary_columns:
    display(df[col].value_counts())
    display(df_test[col].value_counts())

Alternative Dispute Resolution
N    571381
Y      2609
U         5
Name: count, dtype: int64

Alternative Dispute Resolution
N    386314
Y      1660
U         1
Name: count, dtype: int64

Attorney/Representative
N    392265
Y    181730
Name: count, dtype: int64

Attorney/Representative
N    306476
Y     81499
Name: count, dtype: int64

Gender
M    335204
F    234034
Name: count, dtype: int64

Gender
M    215343
F    167019
Name: count, dtype: int64

COVID-19 Indicator
N    546478
Y     27517
Name: count, dtype: int64

COVID-19 Indicator
N    385434
Y      2541
Name: count, dtype: int64

Before we proceed with the encoding of these features, let us replace the value 'U' in 'Alternative Dispute Resolution', by setting it as NaN

In [26]:
df.loc[df['Alternative Dispute Resolution'] == 'U', 'Alternative Dispute Resolution'] = np.nan
df_test.loc[df_test['Alternative Dispute Resolution'] == 'U', 'Alternative Dispute Resolution'] = np.nan

In [27]:
# Encoding binary features
for col in binary_columns:
    df[col] = df[col].map({'Y': 1, 'N': 0, 'M': 1, 'F': 0})
    df_test[col] = df_test[col].map({'Y': 1, 'N': 0, 'M': 1, 'F': 0})

In [28]:
def encode_dates(df, date_columns):
    for col in date_columns:
        df[f'{col}_year'] = df[col].apply(lambda x: x.year if pd.notnull(x) else np.nan)
        df[f'{col}_month'] = df[col].apply(lambda x: x.month if pd.notnull(x) else np.nan)
        df[f'{col}_day'] = df[col].apply(lambda x: x.day if pd.notnull(x) else np.nan)

    # Drop the original date columns
    df.drop(columns=date_columns, inplace=True)

In [29]:
# Apply the transformation
encode_dates(df, date_features)
encode_dates(df_test, date_features)

## 5. Train-Test Split

In [30]:
X = df.drop(columns='Claim Injury Type')
y = df['Claim Injury Type']

In [31]:
X_train, X_val, y_train, y_val = train_test_split(
    X
    , y
    , test_size=0.3
    , random_state=20
    , stratify=y
)

## 6. Outlier Removal

In [32]:
outliers_columns = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
]

As you may recall, we had already outlined a strategy for outliers in the previous notebook 'Part2-VisualExploration'. We will proceed with that strategy for the columns mentioned above.

### 6.1. Age at Injury

We will remove any value on the right of the right wisker; since the average retirement age in the US is between 63 and 65, keeping only values up until the upper wisker is more than reasonable.

In [33]:
Q1 = X_train['Age at Injury'].quantile(0.25)
Q3 = X_train['Age at Injury'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

X_train = X_train[X_train['Age at Injury'] <= upper_bound]
y_train = y_train.loc[X_train.index]

X_val = X_val[X_val['Age at Injury'] <= upper_bound]
y_val = y_val.loc[X_val.index]

### 6.2. Average Weekly Wage

To remove the outliers of this variable, we will use the value of the 99th percentile and remove all values above it.

In [34]:
P99 = X_train['Average Weekly Wage'].quantile(0.99)

X_train = X_train[X_train['Average Weekly Wage'] <= P99]
y_train = y_train.loc[X_train.index]

X_val = X_val[X_val['Average Weekly Wage'] <= P99]
y_val = y_val.loc[X_val.index]

### 6.3. Birth Year

Values before 1916 are a bit far from the majority of the data points.

In [35]:
X_train = X_train[X_train['Birth Year'] >= 1916]
y_train = y_train.loc[X_train.index]

X_val = X_val[X_val['Birth Year'] >= 1916]
y_val = y_val.loc[X_val.index]

### 6.4. IME-4 Count

Values after 40 can be considered extreme

In [36]:
X_train = X_train[X_train['IME-4 Count'] <= 40]
y_train = y_train.loc[X_train.index]

X_val = X_val[X_val['IME-4 Count'] <= 40]
y_val = y_val.loc[X_val.index]

## 7. Categorical Encoding

### 7.1. Encoding High Cardinality Features

In [37]:
highcard_features = [
    'Carrier Name'
    , 'County of Injury'
    , 'Zip Code'
]

In [38]:
def frequency_encoder(train, val, test, column_name):
    freq_encoding = train[column_name].value_counts(normalize=True)

    train[f'{column_name}'] = train[column_name].map(freq_encoding)
    val[f'{column_name}'] = val[column_name].map(freq_encoding)
    test[f'{column_name}'] = test[column_name].map(freq_encoding)
    
    return train, val, test

In [39]:
for feat in highcard_features:
    X_train, X_val, df_test = frequency_encoder(X_train, X_val, df_test, feat)

### 7.2. Encoding Low Cardinality Features

In [40]:
lowcard_features = [
    'Carrier Type'
    , 'District Name'
    , 'Medical Fee Region'
]

In [41]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first').set_output(transform='pandas')

In [42]:
train_enc = ohe.fit_transform(X_train[lowcard_features])
val_enc = ohe.transform(X_val[lowcard_features])
test_enc = ohe.transform(df_test[lowcard_features])



In [43]:
train_enc.index = X_train.index
val_enc.index = X_val.index
test_enc.index = df_test.index

In [44]:
X_train = pd.concat([X_train, train_enc], axis=1)
X_val = pd.concat([X_val, val_enc], axis=1)
df_test = pd.concat([df_test, test_enc], axis=1)

In [45]:
X_train.drop(columns=lowcard_features, inplace=True)
X_val.drop(columns=lowcard_features, inplace=True)
df_test.drop(columns=lowcard_features, inplace=True)

### 7.3. Encoding the Target

In [46]:
# Encode the target variable 'Claim Injury Type'
target = OrdinalEncoder()

In [49]:
y_train = target.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_val = target.transform(y_val.to_numpy().reshape(-1, 1))

Before we move on, we will order the columns in the dataframes so they all follow the same order

In [51]:
X_val = X_val[X_train.columns]
df_test = df_test[X_train.columns]

## 8. Feature Scaling

In [52]:
scaling_features = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
    , 'Industry Code'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Number of Dependents'
    , 'Days Difference'
    , 'Accident Date_year'
    , 'Accident Date_month'
    , 'Accident Date_day'
    , 'Assembly Date_year'
    , 'Assembly Date_month'
    , 'Assembly Date_day'
    , 'C-2 Date_year'
    , 'C-2 Date_month'
    , 'C-2 Date_day'
    , 'C-3 Date_year'
    , 'C-3 Date_month'
    , 'C-3 Date_day'
    , 'First Hearing Date_year'
    , 'First Hearing Date_month'
    , 'First Hearing Date_day'
]

In [53]:
scaler = MinMaxScaler()

In [54]:
X_train[scaling_features] = scaler.fit_transform(X_train[scaling_features])
X_val[scaling_features] = scaler.transform(X_val[scaling_features])
df_test[scaling_features] = scaler.transform(df_test[scaling_features])


## 9. Data imputation

In [55]:
imp = IterativeImputer(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=20),
    initial_strategy='median',
    max_iter=5,
    random_state=17,
    verbose=2
)

In [None]:
# Defining a sample to fit the imputer, since we have to much data for our computers to handle
imp_sample = X_train.sample(n=60000, random_state=20)

In [57]:
# Fitting the imputer on the sample data
imp.fit(imp_sample)

[IterativeImputer] Completing matrix with shape (600, 55)
[IterativeImputer] Ending imputation round 1/5, elapsed time 8.02
[IterativeImputer] Change: 1.7680802698183593, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 2/5, elapsed time 15.80
[IterativeImputer] Change: 0.5448333333333202, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 3/5, elapsed time 23.81
[IterativeImputer] Change: 0.3844090909090912, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 4/5, elapsed time 32.01
[IterativeImputer] Change: 0.38351515151515214, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 5/5, elapsed time 40.36
[IterativeImputer] Change: 0.3792878787878787, scaled tolerance: 0.001 




In [58]:
X_train.loc[:, :] = imp.transform(X_train)
X_val.loc[:, :] = imp.transform(X_val)
df_test.loc[:, :] = imp.transform(df_test)

[IterativeImputer] Completing matrix with shape (81936, 55)
[IterativeImputer] Ending imputation round 1/5, elapsed time 0.53
[IterativeImputer] Ending imputation round 2/5, elapsed time 1.03
[IterativeImputer] Ending imputation round 3/5, elapsed time 1.55
[IterativeImputer] Ending imputation round 4/5, elapsed time 2.05
[IterativeImputer] Ending imputation round 5/5, elapsed time 2.56
[IterativeImputer] Completing matrix with shape (35106, 55)
[IterativeImputer] Ending imputation round 1/5, elapsed time 0.37
[IterativeImputer] Ending imputation round 2/5, elapsed time 0.75
[IterativeImputer] Ending imputation round 3/5, elapsed time 1.12
[IterativeImputer] Ending imputation round 4/5, elapsed time 1.50
[IterativeImputer] Ending imputation round 5/5, elapsed time 1.87
[IterativeImputer] Completing matrix with shape (387975, 55)
[IterativeImputer] Ending imputation round 1/5, elapsed time 4.48
[IterativeImputer] Ending imputation round 2/5, elapsed time 9.03
[IterativeImputer] Ending i

In [59]:
print(f"NaN in X_train after imputation: {X_train.isnull().sum().sum()}")
print(f"NaN in X_val after imputation: {X_val.isnull().sum().sum()}")
print(f"NaN in df_test after imputation: {df_test.isnull().sum().sum()}")

NaN in X_train after imputation: 0
NaN in X_val after imputation: 0
NaN in df_test after imputation: 0


## 10. Export

In [70]:
y_train = pd.Series(y_train, index=X_train.index, name="Claim Injury Type")
df_train = pd.concat([X_train, y_train], axis=1)

y_val = pd.Series(y_val.flatten(), index=X_val.index, name="Claim Injury Type")
df_val = pd.concat([X_val, y_val], axis=1)

In [75]:
df_train.to_csv('../Data/train_data_preproc.csv', index=True)
df_val.to_csv('../Data/validation_data_preproc.csv', index=True)
df_test.to_csv('../Data/test_data_preproc.csv', index=True)