# **TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS - PART 3: DATA CLEANING AND PRE-PROCESSING**

## 1. Imports and initial transformations

In [85]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [86]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=np.inf)

In [87]:
# importing the data
df = pd.read_csv('../Data/train_data_visual_exploration_OpenEnded.csv', sep=',', index_col=0)
df_test = pd.read_csv('../Data/test_data_visual_exploration_OpenEnded.csv', sep=',', index_col=0)

  df = pd.read_csv('../Data/train_data_visual_exploration_OpenEnded.csv', sep=',', index_col=0)
  df_test = pd.read_csv('../Data/test_data_visual_exploration_OpenEnded.csv', sep=',', index_col=0)


In [88]:
missing_columns = set(df.columns) - set(df_test.columns)
print("Columns in df but not in df_test:", missing_columns)

Columns in df but not in df_test: {'Claim Injury Type', 'Agreement Reached'}


It is not necessary to remove rows since Agreement Reached has no null values

In [89]:
df['Agreement Reached'].isnull().sum()

0

In [90]:
# defining the numeric, date and categorical features
numeric_features = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
    , 'Number of Dependents'
    , 'Days Difference'
]
date_features = [
    'Accident Date'
    , 'Assembly Date'
    , 'C-2 Date'
    , 'C-3 Date'
    , 'First Hearing Date'
]
categorical_features = [
    'Alternative Dispute Resolution'
    , 'Claim Injury Type'
    , 'Attorney/Representative'
    , 'Carrier Name'
    , 'Carrier Type'
    , 'Claim Injury Type'
    , 'County of Injury'
    , 'COVID-19 Indicator'
    , 'District Name'
    , 'Gender'
    , 'Industry Code'
    , 'Medical Fee Region'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Zip Code'
    , 'C-2 Missed Timing'
    , 'C-3 Missed Timing'
    , 'C-2 Missing'
    , 'C-3 Missing'
    , 'Has Hearing'
    , 'Has IME-4 Report'
]

all_features = numeric_features + date_features + categorical_features

In [91]:
# convert numeric feature to int:
for feature in list(filter(lambda feat: feat not in ['Average Weekly Wage'], numeric_features)):
    df[feature] = pd.to_numeric(df[feature], errors='coerce').astype('Int64')
    df_test[feature] = pd.to_numeric(df_test[feature], errors='coerce').astype('Int64')

In [92]:
# convert date features to datetime
for col in date_features:
    df[col] = pd.to_datetime(df[col], errors='coerce', format='%Y-%m-%d')
    df_test[col] = pd.to_datetime(df_test[col], errors='coerce', format='%Y-%m-%d')

In [93]:
df['Zip Code'] = df['Zip Code'].astype(str)
df_test['Zip Code'] = df_test['Zip Code'].astype(str)

## 2. Data Cleaning

As we previously mentioned, there are some columns with weird values:
- **Age at Injury**: multiple values below 14, which is the minimum legal age to work in the USA
- **WCIO Part Of Body Code**: has a negative value

Before we go any further, let's try to tackle these issues.

### 2.1. Age at Injury

In [94]:
len(df[df['Age at Injury'] < 14])

5388

In [95]:
len(df_test[df_test['Age at Injury'] < 14])

3123

In [96]:
len(df[(df['Age at Injury'] < 14) & ((df['Accident Date'].isna()) | (df['Birth Year'].isna()))])

5388

In [97]:
len(df_test[(df_test['Age at Injury'] < 14) & ((df_test['Accident Date'].isna()) | (df_test['Birth Year'].isna()))])

3107

As discussed in the notebook 'Part1-InitialInspection.ipynb', these values cannot yet be fixed, since we are trying to fix them using mathematical logic. Therefore, we will come back to fixing these values after we perform data imputation on the 'Accident Date' and 'Birth Year' columns. If there still are any inconsistent values in the test data, we will set them as NaN and impute them later.

### 2.2. WCIO Part Of Body Code

In [98]:
len(df[df['WCIO Part Of Body Code'] < 0])

42010

In [99]:
df[df['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

array([-9.])

In [100]:
df_test[df_test['WCIO Part Of Body Code'] < 0]['WCIO Part Of Body Code'].unique()

array([-9.])

All negative values are the same value - let's check if there are any values '9' or if we can simply convert these values to its absolute value.

In [101]:
len(df[df['WCIO Part Of Body Code'] == 9])

0

In [102]:
len(df_test[df_test['WCIO Part Of Body Code'] == 9])

0

As there are no values that take the value '9', we will convert the negative values to the absolute value

In [103]:
df['WCIO Part Of Body Code'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df['WCIO Part Of Body Code'] < 0) & (df['WCIO Part Of Body Code'].notna()),
    # if conditions are met, we calculate the new value
    df['WCIO Part Of Body Code'].abs(),
    # otherwise, we keep the original value
    df['WCIO Part Of Body Code']
)

In [104]:
len(df[df['WCIO Part Of Body Code'] < 0])

0

In [105]:
df_test['WCIO Part Of Body Code'] = np.where(
    # we select only the rows that meet the necessary conditions
    (df_test['WCIO Part Of Body Code'] < 0) & (df_test['WCIO Part Of Body Code'].notna()),
    # if conditions are met, we calculate the new value
    df_test['WCIO Part Of Body Code'].abs(),
    # otherwise, we keep the original value
    df_test['WCIO Part Of Body Code']
)

In [106]:
len(df_test[df_test['WCIO Part Of Body Code'] < 0])

0

## 3. Missing values

In [107]:
# calculate the number of NaNs for each column
nan_counts = df.isna().sum()

# get the total number of rows
total_rows = df.shape[0]

# calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# format the percentages with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# combine all the information into a DataFrame
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts
    , 'Total Values': [total_rows] * len(nan_counts)
    , 'Percentage NaN': percentage_nans
})

display(nan_summary)

Unnamed: 0,NaN Count,Total Values,Percentage NaN
Accident Date,3688,573995,0.64%
Age at Injury,0,573995,0.00%
Alternative Dispute Resolution,0,573995,0.00%
Assembly Date,0,573995,0.00%
Attorney/Representative,0,573995,0.00%
Average Weekly Wage,28648,573995,4.99%
Birth Year,31018,573995,5.40%
C-2 Date,14558,573995,2.54%
C-3 Date,386756,573995,67.38%
Carrier Name,0,573995,0.00%


In [108]:
missing_values = [
    'Accident Date'
    , 'Average Weekly Wage'
    , 'Birth Year'
    #, 'C-2 Date'  # missing form could have relationship with the target
    #, 'C-3 Date'  # missing form could have relationship with the target
    #, 'First Hearing Date'  # missing values means no hearing has held
    , 'Gender'
    #, 'IME-4 Count'  # missing form could have relationship with the target
    , 'Industry Code'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Zip Code'
    , 'Days Difference'
]

In [109]:
# calculate the number of NaNs for each column
nan_counts = df_test.isna().sum()

# get the total number of rows
total_rows = df_test.shape[0]

# calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# format the percentages with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# combine all the information into a DataFrame
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts
    , 'Total Values': [total_rows] * len(nan_counts)
    , 'Percentage NaN': percentage_nans
})

display(nan_summary)

Unnamed: 0,NaN Count,Total Values,Percentage NaN
Accident Date,2444,387975,0.63%
Age at Injury,0,387975,0.00%
Alternative Dispute Resolution,0,387975,0.00%
Assembly Date,0,387975,0.00%
Attorney/Representative,0,387975,0.00%
Average Weekly Wage,19204,387975,4.95%
Birth Year,20301,387975,5.23%
C-2 Date,9134,387975,2.35%
C-3 Date,302759,387975,78.04%
Carrier Name,0,387975,0.00%


The columns from the test data that contain missing values are the same as the ones from the train data, with the addition of the descriptive columns, which we had removed from the train data.

We will not impute missing values in the commented columns, as per the explainations in the comments

## 4. Simple Encoding

In [110]:
binary_columns = ['Alternative Dispute Resolution', 'Attorney/Representative', 'Gender', 'COVID-19 Indicator']

In [111]:
for col in binary_columns:
    display(df[col].value_counts())
    display(df_test[col].value_counts())

Alternative Dispute Resolution
N    571381
Y      2609
U         5
Name: count, dtype: int64

Alternative Dispute Resolution
N    386314
Y      1660
U         1
Name: count, dtype: int64

Attorney/Representative
N    392265
Y    181730
Name: count, dtype: int64

Attorney/Representative
N    306476
Y     81499
Name: count, dtype: int64

Gender
M    335204
F    234034
Name: count, dtype: int64

Gender
M    215343
F    167019
Name: count, dtype: int64

COVID-19 Indicator
N    546478
Y     27517
Name: count, dtype: int64

COVID-19 Indicator
N    385434
Y      2541
Name: count, dtype: int64

Before we proceed with the encoding of these features, let us replace the value 'U' in 'Alternative Dispute Resolution', by setting it as NaN

In [112]:
df.loc[df['Alternative Dispute Resolution'] == 'U', 'Alternative Dispute Resolution'] = np.nan
df_test.loc[df_test['Alternative Dispute Resolution'] == 'U', 'Alternative Dispute Resolution'] = np.nan

In [113]:
# Encoding binary features
for col in binary_columns:
    df[col] = df[col].map({'Y': 1, 'N': 0, 'M': 1, 'F': 0})
    df_test[col] = df_test[col].map({'Y': 1, 'N': 0, 'M': 1, 'F': 0})

In [114]:
def encode_dates(df, date_columns):
    for col in date_columns:
        df[f'{col}_year'] = df[col].apply(lambda x: x.year if pd.notnull(x) else np.nan)
        df[f'{col}_month'] = df[col].apply(lambda x: x.month if pd.notnull(x) else np.nan)
        df[f'{col}_day'] = df[col].apply(lambda x: x.day if pd.notnull(x) else np.nan)

    # Drop the original date columns
    df.drop(columns=date_columns, inplace=True)

In [115]:
# Apply the transformation
encode_dates(df, date_features)
encode_dates(df_test, date_features)

## 5. Train-Test Split

In [116]:
X = df.drop(columns=['Agreement Reached', 'Claim Injury Type'])
y = df['Agreement Reached']

In [117]:
X_train, X_val, y_train, y_val = train_test_split(
    X
    , y
    , test_size=0.3
    , random_state=20
    , stratify=y
)

## 6. Outlier Removal

In [118]:
outliers_columns = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
]

As you may recall, we had already outlined a strategy for outliers in the previous notebook 'Part2-VisualExploration'. We will proceed with that strategy for the columns mentioned above.

### 6.1. Age at Injury

We will cap the more extreme values

In [119]:
lower_bound_age = X_train[['Age at Injury']].quantile(0.01)
upper_bound_age = X_train[['Age at Injury']].quantile(0.95)

X_train['Age at Injury'] = X_train['Age at Injury'].clip(lower_bound_age['Age at Injury'], 
                                                         upper_bound_age['Age at Injury'])

X_val['Age at Injury'] = X_val['Age at Injury'].clip(lower_bound_age['Age at Injury'], 
                                                     upper_bound_age['Age at Injury'])

df_test['Age at Injury'] = df_test['Age at Injury'].clip(lower_bound_age['Age at Injury'], 
                                                         upper_bound_age['Age at Injury'])

### 6.2. Average Weekly Wage

To remove the outliers of this variable, we will use the value of the 95th percentile and remove all values above it.

In [120]:
upper_bound = X_train['Average Weekly Wage'].quantile(0.95)

X_train['Average Weekly Wage'] = X_train['Average Weekly Wage'].clip(upper=upper_bound)
X_val['Average Weekly Wage'] = X_val['Average Weekly Wage'].clip(upper=upper_bound)
df_test['Average Weekly Wage'] = df_test['Average Weekly Wage'].clip(upper=upper_bound)

### 6.3. Birth Year

We will apply the same logic as we applied to Age at Injury

In [121]:
lower_bound_birth = X_train[['Birth Year']].quantile(0.01)
upper_bound_birth = X_train[['Birth Year']].quantile(0.95)

X_train['Birth Year'] = X_train['Birth Year'].clip(lower_bound_birth['Birth Year'], 
                                                   upper_bound_birth['Birth Year'])

X_val['Birth Year'] = X_val['Birth Year'].clip(lower_bound_birth['Birth Year'], 
                                               upper_bound_birth['Birth Year'])

df_test['Birth Year'] = df_test['Birth Year'].clip(lower_bound_birth['Birth Year'], 
                                                   upper_bound_birth['Birth Year'])

### 6.4. IME-4 Count

Values over 40 seem to be a bit extreme

In [122]:
upper_bound_ime4 = 40

X_train['IME-4 Count'] = X_train['IME-4 Count'].clip(upper=upper_bound_ime4)
X_val['IME-4 Count'] = X_val['IME-4 Count'].clip(upper=upper_bound_ime4)
df_test['IME-4 Count'] = df_test['IME-4 Count'].clip(upper=upper_bound_ime4)


## 7. Categorical Encoding

### 7.1. Encoding High Cardinality Features

In [123]:
highcard_features = [
    'Carrier Name'
    , 'County of Injury'
    , 'Zip Code'
]

In [124]:
def frequency_encoder(train, val, test, column_name):
    freq_encoding = train[column_name].value_counts(normalize=True)

    train[f'{column_name}'] = train[column_name].map(freq_encoding)
    val[f'{column_name}'] = val[column_name].map(freq_encoding)
    test[f'{column_name}'] = test[column_name].map(freq_encoding)
    
    return train, val, test

In [125]:
for feat in highcard_features:
    X_train, X_val, df_test = frequency_encoder(X_train, X_val, df_test, feat)

### 7.2. Encoding Low Cardinality Features

In [126]:
lowcard_features = [
    'Carrier Type'
    , 'District Name'
    , 'Medical Fee Region'
]

In [127]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first').set_output(transform='pandas')

In [128]:
train_enc = ohe.fit_transform(X_train[lowcard_features])
val_enc = ohe.transform(X_val[lowcard_features])
test_enc = ohe.transform(df_test[lowcard_features])

In [129]:
train_enc.index = X_train.index
val_enc.index = X_val.index
test_enc.index = df_test.index

In [130]:
X_train = pd.concat([X_train, train_enc], axis=1)
X_val = pd.concat([X_val, val_enc], axis=1)
df_test = pd.concat([df_test, test_enc], axis=1)

In [131]:
X_train.drop(columns=lowcard_features, inplace=True)
X_val.drop(columns=lowcard_features, inplace=True)
df_test.drop(columns=lowcard_features, inplace=True)

Before we move on, we will order the columns in the dataframes so they all follow the same order

In [132]:
X_val = X_val[X_train.columns]
df_test = df_test[X_train.columns]

## 8. Feature Scaling

In [133]:
scaling_features = [
    'Age at Injury'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'IME-4 Count'
    , 'Industry Code'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Number of Dependents'
    , 'Days Difference'
    , 'Accident Date_year'
    , 'Accident Date_month'
    , 'Accident Date_day'
    , 'Assembly Date_year'
    , 'Assembly Date_month'
    , 'Assembly Date_day'
    , 'C-2 Date_year'
    , 'C-2 Date_month'
    , 'C-2 Date_day'
    , 'C-3 Date_year'
    , 'C-3 Date_month'
    , 'C-3 Date_day'
    , 'First Hearing Date_year'
    , 'First Hearing Date_month'
    , 'First Hearing Date_day'
]

In [134]:
scaler = MinMaxScaler()

In [135]:
X_train[scaling_features] = scaler.fit_transform(X_train[scaling_features])
X_val[scaling_features] = scaler.transform(X_val[scaling_features])
df_test[scaling_features] = scaler.transform(df_test[scaling_features])


## 9. Data imputation

In [136]:
imputation_features = [
    'Alternative Dispute Resolution'
    , 'Average Weekly Wage'
    , 'Birth Year'
    , 'Carrier Name'
    , 'Gender'
    # , 'IME-4 Count'  # we will fill these with 0
    , 'Industry Code'
    , 'WCIO Cause of Injury Code'
    , 'WCIO Nature of Injury Code'
    , 'WCIO Part Of Body Code'
    , 'Zip Code'
    # , 'Days Difference'  # we will fix these once we have Accident Date
    , 'Accident Date_year'
    , 'Accident Date_month'
    , 'Accident Date_day'
    # we will fill these with 0, since their absence is meaningful
    # , 'C-2 Date_year'
    # , 'C-2 Date_month'
    # , 'C-2 Date_day'
    # , 'C-3 Date_year'
    # , 'C-3 Date_month'
    # , 'C-3 Date_day'
    # , 'First Hearing Date_year'
    # , 'First Hearing Date_month'
    # , 'First Hearing Date_day'
]

### 9.1. Iterative Imputer

In [137]:
imp = IterativeImputer(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=20),
    initial_strategy='median',
    max_iter=5,
    random_state=17,
    verbose=2
).set_output(transform='pandas')

In [138]:
# Defining a sample to fit the imputer, since we have to much data for our computers to handle
imp_sample = X_train.sample(
    n=10000
    , random_state=20
)

In [139]:
# Fitting the imputer on the sample data
imp.fit(imp_sample)

[IterativeImputer] Completing matrix with shape (10000, 56)
[IterativeImputer] Ending imputation round 1/5, elapsed time 37.33
[IterativeImputer] Change: 2.3601076688743006, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 2/5, elapsed time 128.26
[IterativeImputer] Change: 1.5369203019203077, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 3/5, elapsed time 183.11
[IterativeImputer] Change: 1.2733467302170522, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 4/5, elapsed time 377.77
[IterativeImputer] Change: 1.1921809856809815, scaled tolerance: 0.001 
[IterativeImputer] Ending imputation round 5/5, elapsed time 435.37
[IterativeImputer] Change: 1.3693061248177798, scaled tolerance: 0.001 




In [140]:
X_train.loc[:, imputation_features]  = imp.transform(X_train).loc[:, imputation_features]
X_val.loc[:, imputation_features] = imp.transform(X_val).loc[:, imputation_features]
df_test.loc[:, imputation_features] = imp.transform(df_test).loc[:, imputation_features]

[IterativeImputer] Completing matrix with shape (401796, 56)
[IterativeImputer] Ending imputation round 1/5, elapsed time 6.71
[IterativeImputer] Ending imputation round 2/5, elapsed time 14.04
[IterativeImputer] Ending imputation round 3/5, elapsed time 20.87
[IterativeImputer] Ending imputation round 4/5, elapsed time 27.92
[IterativeImputer] Ending imputation round 5/5, elapsed time 34.96
[IterativeImputer] Completing matrix with shape (172199, 56)
[IterativeImputer] Ending imputation round 1/5, elapsed time 4.57
[IterativeImputer] Ending imputation round 2/5, elapsed time 9.37
[IterativeImputer] Ending imputation round 3/5, elapsed time 14.06
[IterativeImputer] Ending imputation round 4/5, elapsed time 18.76
[IterativeImputer] Ending imputation round 5/5, elapsed time 23.10
[IterativeImputer] Completing matrix with shape (387975, 56)
[IterativeImputer] Ending imputation round 1/5, elapsed time 6.71
[IterativeImputer] Ending imputation round 2/5, elapsed time 13.70
[IterativeImputer

### 9.2. 0-fill

In [141]:
zero_fill = [
    'IME-4 Count'
    , 'C-2 Date_year'
    , 'C-2 Date_month'
    , 'C-2 Date_day'
    , 'C-3 Date_year'
    , 'C-3 Date_month'
    , 'C-3 Date_day'
    , 'First Hearing Date_year'
    , 'First Hearing Date_month'
    , 'First Hearing Date_day'
]

In [142]:
X_train.loc[:, zero_fill] = X_train.loc[:, zero_fill].fillna(0)
X_val.loc[:, zero_fill] = X_val.loc[:, zero_fill].fillna(0)
df_test.loc[:, zero_fill] = df_test.loc[:, zero_fill].fillna(0)

### 9.3. Fixing Some Features

As we mentioned earlier, we would fix Age at Injury once we had all values for Birth Year and Accident Date, which we now have. We will also fill the missing values of Days Difference using Accident and Assembly Date.

#### 9.3.1. Age at Injury

We first need to revert the scaling.

In [143]:
# Reverting the scaling
X_train.loc[:, scaling_features] = scaler.inverse_transform(X_train.loc[:, scaling_features])
X_val.loc[:, scaling_features] = scaler.inverse_transform(X_val.loc[:, scaling_features])
df_test.loc[:, scaling_features] = scaler.inverse_transform(df_test.loc[:, scaling_features])

In [144]:
# Checking for inconsistencies
print(len(X_train[X_train['Age at Injury'] < 14]))
print(len(X_val[X_val['Age at Injury'] < 14]))
print(len(df_test[df_test['Age at Injury'] < 14]))

0
0
0


In [145]:
# Computing Age at Injury
X_train.loc[X_train['Age at Injury'] < 14, 'Age at Injury'] = X_train['Accident Date_year'] - X_train['Birth Year']
X_val.loc[X_val['Age at Injury'] < 14, 'Age at Injury'] = X_val['Accident Date_year'] - X_val['Birth Year']
df_test.loc[df_test['Age at Injury'] < 14, 'Age at Injury'] = df_test['Accident Date_year'] - df_test['Birth Year']

In [146]:
# Checking if inconsistencies remained
print(len(X_train[X_train['Age at Injury'] < 14]))
print(len(X_val[X_val['Age at Injury'] < 14]))
print(len(df_test[df_test['Age at Injury'] < 14]))

0
0
0


#### 9.3.3. Dates

We will fix any date inconsistencies, i.e. if a day is not possible for that month, we will set it to the last possible one.

In [147]:
dates = [
    'Accident Date'
    , 'Assembly Date'
    , 'C-2 Date'
    , 'C-3 Date'
    , 'First Hearing Date'
]

In [148]:
def adjust_day(row, col):
    if row[f'{col}_month'].astype(int) == 2 and row[f'{col}_day'].astype(int) >= 29:
        if row[f'{col}_year'] in [2016, 2020]:
            return 29
        else:
            return 28  # Non-leap year, adjust day to 28
    elif row[f'{col}_month'].astype(int) in [4, 6, 9, 11] and row[f'{col}_day'].astype(int) > 30:
        return 30  # Month has only 30 days
    elif row[f'{col}_month'].astype(int) in [1, 3, 5, 7, 8, 10, 12] and row[f'{col}_day'].astype(int) > 31:
        return 31  # Month has only 31 days
    return row[f'{col}_day'].astype(int)  # No change if conditions are not met

In [149]:
for df in [X_train, X_val, df_test]:
    for col in dates:
        df[f'{col}_day'] = df.apply(lambda row: adjust_day(row, col), axis=1)

#### 9.3.2. Days Difference

In [150]:
X_train.loc[X_train['Days Difference'].isna(), 'Days Difference'] = X_train[X_train['Days Difference'].isna()].apply(
    lambda row: (
        pd.to_datetime(f"{int(row['Assembly Date_year'])}-{int(row['Assembly Date_month'])}-{int(row['Assembly Date_day'])}") - 
        pd.to_datetime(f"{int(row['Accident Date_year'])}-{int(row['Accident Date_month'])}-{int(row['Accident Date_day'])}")
    ).days
    , axis=1
)
X_val.loc[X_val['Days Difference'].isna(), 'Days Difference'] = X_val[X_val['Days Difference'].isna()].apply(
    lambda row: (
        pd.to_datetime(f"{int(row['Assembly Date_year'])}-{int(row['Assembly Date_month'])}-{int(row['Assembly Date_day'])}") - 
        pd.to_datetime(f"{int(row['Accident Date_year'])}-{int(row['Accident Date_month'])}-{int(row['Accident Date_day'])}")
    ).days
    , axis=1
)
df_test.loc[df_test['Days Difference'].isna(), 'Days Difference'] = df_test[df_test['Days Difference'].isna()].apply(
    lambda row: (
        pd.to_datetime(f"{int(row['Assembly Date_year'])}-{int(row['Assembly Date_month'])}-{int(row['Assembly Date_day'])}") - 
        pd.to_datetime(f"{int(row['Accident Date_year'])}-{int(row['Accident Date_month'])}-{int(row['Accident Date_day'])}")
    ).days
    , axis=1
)

In [151]:
# Scalling the features back

for dataset in [X_train, X_val, df_test]:
    dataset[scaling_features] = scaler.transform(dataset[scaling_features])


In [152]:
# Checking the missing values
print(f"NaN in X_train after imputation: {X_train.isnull().sum().sum()}")
print(f"NaN in X_val after imputation: {X_val.isnull().sum().sum()}")
print(f"NaN in df_test after imputation: {df_test.isnull().sum().sum()}")

NaN in X_train after imputation: 0
NaN in X_val after imputation: 0
NaN in df_test after imputation: 0


## 10. Export

In [159]:
y_train = pd.Series(y_train.ravel(), index=X_train.index, name="Agreement Reached")
df_train = pd.concat([X_train, y_train], axis=1)

y_val = pd.Series(y_val.ravel(), index=X_val.index, name="Agreement Reached")
df_val = pd.concat([X_val, y_val], axis=1)

  y_train = pd.Series(y_train.ravel(), index=X_train.index, name="Agreement Reached")
  y_val = pd.Series(y_val.ravel(), index=X_val.index, name="Agreement Reached")


In [160]:
df_train.to_csv('../Data/train_data_preproc_OpenEnded.csv', index=True)
df_val.to_csv('../Data/validation_data_preproc_OpenEnded.csv', index=True)
df_test.to_csv('../Data/test_data_preproc_OpenEnded.csv', index=True)

In [162]:
df_train['Agreement Reached'].value_counts()

Agreement Reached
0    383045
1     18751
Name: count, dtype: int64