# TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS

## 1. Imports, options and ingestion

In [59]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
import re
import math

In [2]:
# Display options: show every column and row, never truncate cell text,
# and print numpy arrays in full.
# NOTE(review): with 'display.max_rows' set to None, a bare DataFrame expression
# prints every row — slice with .head()/.sample() before displaying large frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)

In [None]:
# Load the training and test data from CSV.
# NOTE(review): these are absolute paths on one specific machine; the notebook will
# not run elsewhere until they point at a local copy of the two files.
df = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data.csv', sep=',')
df_test = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/test_data.csv', sep=',')


Just by importing the dataset, we get a warning saying column 29 has mixed data types - we will check this in a bit.

## 2. Initial inspection

### 2.1. Macro-inspection

In [None]:
# we check the shape of the dataset
df.shape

In [None]:
# we check the first rows of the dataset
df.head()

In [None]:
# we check if there are any aggregation rows at the end of the dataset
df.tail()

In [None]:
# we check the datatypes and null counts
df.info()

**Data type analysis:**

Features that should be integers:
- 'Age at Injury'
- 'Birth Year'
- 'IME-4 Count'
- 'Industry Code'
- 'WCIO Cause of Injury Code'
- 'WCIO Nature of Injury Code'
- 'WCIO Part Of Body Code'
- 'Number of Dependents'

Features that should be booleans:
- 'Agreement Reached'

### 2.2. Claim Identifier

This feature is the unique identifier of each claim - we will analyse this column in more depth in an attempt to assign it as the index of our dataframe.

In [None]:
# we check if there are any duplicate values for this column
df[df['Claim Identifier'].duplicated(keep=False)]

Apparently, we have one duplicated 'Claim Identifier', where all values, except for 'Assembly Date', are NaNs. <br>
We will rely on the default pandas behavior (`keep='first'`) and drop the second appearance of the repeated 'Claim Identifier'. <br>
As we saw previously, this column has no null values, so we can set it as the dataframe index.

In [9]:
# Keep only the first row for each 'Claim Identifier', then use that column as the
# (unnamed) index of the dataframe.
df = df[~df['Claim Identifier'].duplicated()].set_index('Claim Identifier').rename_axis(None)

In [None]:
df.head()

### 2.3. Data consistency

#### 2.3.1. Missing Values

In [None]:
# Per-column count of missing values
nan_counts = df.isna().sum()

# Number of rows in the DataFrame (the denominator for the percentages)
total_rows = len(df)

# Share of missing values per column, rendered as text such as "12.34%"
percentage_nans = (nan_counts / total_rows * 100).map('{:.2f}%'.format)

# One row per column: missing count, total row count, and percentage missing
nan_summary = pd.concat(
    [
        nan_counts.rename('NaN Count'),
        pd.Series(total_rows, index=nan_counts.index, name='Total Values'),
        percentage_nans.rename('Percentage NaN'),
    ],
    axis=1,
)

# Show the summary table
print("Summary of NaN values per column:\n")
print(nan_summary)

By analysing the output above, we conclude:
- **'C-3 Date'**: more than 50% of the data for this feature is missing, it can be due to process status - the employee has not yet sent its report
- **'First Hearing Date'**: around 75% of the data for this feature is missing - this means hearings have not yet been scheduled
- **'IME-4 Count'**: more than 75% of the data for this feature is missing, it can be due to process status - the independent examiner has not yet sent its report
- **'OIICS Nature of Injury Description'**: only has null values

##### 2.3.1.1. Accident Date

We have a few missing values for 'Accident Date'; however, these values can be imputed if we have information about 'Age at Injury' and 'Birth Year'

In [None]:
df['Accident Date'].isna().sum()

In [None]:
# Rows with no 'Accident Date' that still have a non-null, non-zero 'Age at Injury'
# and a non-null 'Birth Year' — the rows for which the other two fields are available.
df[(df['Accident Date'].isna()) & ~((df['Age at Injury'].isna()) | (df['Age at Injury'] == 0.0)) & ~(df['Birth Year'].isna())]

We have no way of deducing missing values for 'Accident Date'.

##### 2.3.1.2. Age at Injury

We have a few missing values for 'Age at Injury'; however, these values can be imputed if we have information about 'Accident Date' and 'Birth Year'

In [None]:
df['Age at Injury'].isna().sum()

In [None]:
# Rows with no 'Age at Injury' where both 'Accident Date' and 'Birth Year' are present
df[(df['Age at Injury'].isna()) & ~((df['Accident Date'].isna())) & ~(df['Birth Year'].isna())]

We have no way of deducing missing values for 'Age at Injury'

##### 2.3.1.3. Birth Year

We have a few missing values for 'Birth Year'; however, these values can be imputed if we have information about 'Accident Date' and 'Age at Injury'

In [None]:
df['Birth Year'].isna().sum()

In [None]:
# Rows with no 'Birth Year' where both 'Accident Date' and 'Age at Injury' are present
df[(df['Birth Year'].isna()) & ~((df['Accident Date'].isna())) & ~(df['Age at Injury'].isna())]

We will be able to deduce a good number of values for 'Birth Year'. <br>
We do this in a later section (**TODO**: replace the "TK" placeholder with the actual section number).

#### 2.3.2. Data types

As we imported the data, we got a warning saying columns had mixed data types. We shall take a look at this issue now.

In [None]:
def check_mixed_types(column):
    """Return True when the non-missing values of ``column`` have more than one Python type.

    Missing values are left out of the check: NaN is itself a float, so it would
    otherwise make any column with gaps look mixed.
    """
    present = column[column.notna()]
    distinct_types = set(present.apply(type))
    return len(distinct_types) > 1

# Apply the function to all columns and filter out the mixed-type columns
mixed_type_columns = [col for col in df.columns if check_mixed_types(df[col])]

print(mixed_type_columns)

In [None]:
# For each mixed-type column, count how many values there are of each Python type
for col in mixed_type_columns:
    print(df[col].apply(type).value_counts())

The values for 'Zip Code' are split across two data types - string and float. We shall look at the feature values in order to make a decision.

In [None]:
# List the distinct non-missing 'Zip Code' values that contain a non-digit character.
# Missing values are excluded explicitly: str(NaN) is 'nan', which would otherwise
# match the pattern and be reported as a "non-numeric" zip code.
print(df.loc[df['Zip Code'].notna() & df['Zip Code'].apply(lambda x: bool(re.search(r'\D', str(x)))), 'Zip Code'].unique())

Since we have determined that some 'Zip Code' values contain non-numeric characters, we shall store this feature as a string.

In [21]:
# Changing the data type of the values to string
# NOTE: astype(str) also turns missing values into the literal string 'nan';
# those are converted back to real NaN in section 2.3.6.2.
df['Zip Code'] = df['Zip Code'].astype(str)

In [None]:
# re-checking the data type counts
print(df['Zip Code'].apply(type).value_counts())

Let us now fix the remaining data types, as identified in [Section 2.1](#21-macro-inspection)

In [65]:
# we create a dictionary that stores the column names
type_cast = {
    'int_features': ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Number of Dependents']
    , 'bool_features': ['Agreement Reached']
}

In [66]:
def convert_datatype(df, type_cast):
    """Cast groups of columns of ``df`` to nullable pandas dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    type_cast : dict
        Maps a group name to a list of column names. Recognised groups are
        ``'int_features'`` (cast to nullable ``Int64``; values that cannot be
        parsed as numbers become missing) and ``'bool_features'`` (cast to
        nullable ``boolean``; missing values stay missing). Any other group
        name is ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns converted.
    """
    for dtype, columns in type_cast.items():
        if dtype == 'int_features':
            # Parse as numbers first so unparseable entries become NaN, then use
            # the nullable integer dtype so those gaps do not force floats.
            for col in columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

        elif dtype == 'bool_features':
            # Convert present values with bool() and keep gaps as missing; the final
            # cast gives a real nullable boolean column instead of an object column
            # holding a mix of bools and NaN.
            for col in columns:
                df[col] = df[col].apply(lambda x: x if pd.isna(x) else bool(x)).astype('boolean')

    return df

In [67]:
df = convert_datatype(df, type_cast)

In [None]:
# checking if the conversion was successful
for subset in type_cast:
    for col in type_cast[subset]:
        print(col, '\t', df[col].dtypes)

#### 2.3.4. Unique values

In [None]:
# we check the number of unique values for each column
df.nunique()

In [None]:
# we check the unique values for each column
pd.DataFrame(
    {
        'feature_type': [df[column].dtype for column in df.columns],
        'unique_values': df.apply(lambda col: sorted(pd.Series(col.dropna().unique().tolist())))  # we disregard NaN values, so we can sort the unique values
    }
)

Looking at this information, we can point out the following:
- **Age at Injury**: since this dataset concerns workplace injuries, values under 14 (legal working age in the US) are suspicious
- **Birth Year**: we have '0.0' values
- **Gender**: this is not a binary feature
- **OIICS Nature of Injury Description**: has no values
- **WCIO Part Of Body Code**: has a negative value
- **WCB Decision**: only has one value

#### 2.3.5. Date formats

In [None]:
date_columns = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']

# regex pattern for the format YYYY-MM-DD
date_pattern = r'^\d{4}-\d{2}-\d{2}$'

# check if there are any values in these date columns that do not follow this format
for column in date_columns:
    display(df[df[column].apply(lambda x: pd.notna(x) and not bool(re.match(date_pattern, str(x))))])

#### 2.3.6. Inconsistencies

##### 2.3.6.1. Codes vs. Descriptions

In this section, we will check whether the number of distinct codes matches the number of distinct descriptions.

In [None]:
len(df['Industry Code'].dropna().unique()) == len(df['Industry Code Description'].dropna().unique())

In [None]:
df.groupby(['Industry Code', 'Industry Code Description']).size().reset_index(name='count').sort_values('Industry Code Description')

The "repeated" values of 'Industry Code Description' occur for consecutive 'Industry Code' values - one could consider replacing the different values of 'Industry Code' with a single one.

In [None]:
len(df['WCIO Cause of Injury Code'].dropna().unique()) == len(df['WCIO Cause of Injury Description'].dropna().unique())

In [None]:
df.groupby(['WCIO Cause of Injury Code', 'WCIO Cause of Injury Description']).size().reset_index(name='count').sort_values(by='WCIO Cause of Injury Description')

Again, one could aggregate codes that share the same description into a single code. There are also some values that are rather similar and could be aggregated (e.g. 'MOVING PART OF MACHINE' and 'MOVING PARTS OF MACHINE')

In [None]:
len(df['WCIO Nature of Injury Code'].dropna().unique()) == len(df['WCIO Nature of Injury Description'].dropna().unique())

In [None]:
len(df['WCIO Part Of Body Code'].dropna().unique()) == len(df['WCIO Part Of Body Description'].dropna().unique())

In [None]:
df.groupby(['WCIO Part Of Body Code', 'WCIO Part Of Body Description']).size().reset_index(name='count').sort_values(by='WCIO Part Of Body Description')

Once again, we have descriptions associated to multiple codes - we can aggregate these descriptions into one single code.

##### 2.3.6.2. The wanna-be NaN

We will check if there are NaN values that are coded as string values, instead of the default np.nan

In [None]:
# we check if any column contains the string 'nan'
contains_nan_string = df.apply(lambda col: col.isin(['nan']).any())

# we get the columns that contain 'nan'
columns_with_nan_string = contains_nan_string[contains_nan_string].index.tolist()

columns_with_nan_string

Indeed we have string representations of missing values in 'Zip Code' - we shall convert these into actual NaN

In [38]:
df['Zip Code'] = df['Zip Code'].replace('nan', np.nan)

##### 2.3.6.3. Duplicated values

In [None]:
# we check how many duplicated rows we have
df.duplicated(keep=False).sum()

In [None]:
# we check how many of these are distinct
len(df[df.duplicated(keep=False)].drop_duplicates())

In fact, even though we have 19k+ duplicated rows, only about 1k of them are distinct, i.e., some rows have more than one duplicate

In [None]:
df[df.duplicated()].drop_duplicates().head(10)

We shall check if all values of all rows are NaN with the exception of 'Assembly Date'

In [None]:
# we drop duplicates from those duplicated rows
unique_duplicated_rows = df[df.duplicated(keep=False)].drop_duplicates()

# we check if all values (excluding 'Assembly Date') are NaN
nan_check = unique_duplicated_rows.drop(columns='Assembly Date').isna().all(axis=1)

# we check the rows that have other than missing values
unique_duplicated_rows[~nan_check]

This means that all duplicated rows have missing values in every column but 'Assembly Date', except for one row

In [None]:
# percentage of duplicate values
df.duplicated().sum() / df.shape[0] * 100

The duplicated (and "empty") rows we have been discussing account for around 3.1% of our data - we will drop these values.

In [44]:
df.drop_duplicates(inplace=True)

##### 2.3.6.4. The target

In [None]:
# we check how many missing values we have in our target variable
df['Claim Injury Type'].isna().sum()

In [None]:
df[df['Claim Injury Type'].isna()].head(10)

At a first glance, it looks like these rows are all NaN values with the exception of 'Assembly Date' - just like before. Let us check if this is indeed the case.

In [None]:
no_target_df = df[df['Claim Injury Type'].isna()]

# we check if all values (excluding 'Assembly Date') are NaN
nan_check = no_target_df.drop(columns='Assembly Date').isna().all(axis=1)

# we check the rows that have other than missing values
no_target_df[~nan_check]

It seems like all these rows have missing values in every column but 'Assembly Date' - we will also remove these rows.

In [48]:
df = df[~df['Claim Injury Type'].isna()]

In [None]:
df.shape

Knowing the original dataset had 593471 rows, let us check what percentage of the data we have removed so far.

In [None]:
# Percentage of rows removed so far. 593471 is the row count of the original
# dataset (hard-coded here) — update it if the input file changes.
(1 - (df.shape[0] / 593471)) * 100

We are still below the 5% threshold rule of thumb, so we are good to go!

### 2.4. Statistical pitstop

Let us recheck our count of missing values once more and look at some statistics

#### 2.4.1. Missing values (again)

In [None]:
# Same missing-value summary as in section 2.3.1, recomputed on the cleaned dataframe.

# Number of missing values in each column
nan_counts = df.isna().sum()

# Total number of rows in the DataFrame
total_rows = df.shape[0]

# Percentage of missing values in each column
percentage_nans = (nan_counts / total_rows) * 100

# Render each percentage as text with two decimals and a '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# Combine everything into one table, indexed by column name
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': [total_rows] * len(nan_counts),  # same total repeated once per column
    'Percentage NaN': percentage_nans
})

# Print the result
print("Summary of NaN values per column:\n")
print(nan_summary)

We have considerably reduced the number of missing values - nice!

#### 2.4.2. Summary statistics

In [None]:
df.describe(include='all').T

## 3. Visual inspection

We now start the visual inspection of our data

In [None]:
df.columns

In [80]:
# we start by defining our numeric and categorical features
numeric_features = df[['Age at Injury', 'Average Weekly Wage', 'IME-4 Count']].columns.tolist()
categorical_features = df[['Accident Date', 'Alternative Dispute Resolution', 'Assembly Date', 'Attorney/Representative', 
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Type', 'Claim Injury Type', 'County of Injury', 'COVID-19 Indicator',
       'District Name', 'First Hearing Date', 'Gender', 'Industry Code', 'Medical Fee Region', 'WCIO Cause of Injury Code',
       'WCIO Nature of Injury Code', 'WCIO Part Of Body Code', 'Zip Code', 'Agreement Reached', 'WCB Decision', 
       'Number of Dependents']].columns.tolist()

In [None]:
# Grid layout: two histograms per row, with enough rows for every numeric feature
num_features = len(numeric_features)
num_columns = 2
num_rows = math.ceil(num_features / num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Work with a flat sequence of axes rather than a 2-D grid
axes = axes.flatten()

# One histogram of the non-missing values per numeric feature
for ax, feature in zip(axes, numeric_features):
    ax.hist(df[feature].dropna(), bins=30, color='skyblue', alpha=0.7)  # alpha for transparency
    ax.set(title=f'Histogram of {feature}', xlabel='Value', ylabel='Frequency')
    ax.grid(True)

# Remove the axes left over when the grid has more slots than features
for i in range(num_features, len(axes)):
    fig.delaxes(axes[i])

# Adjust the layout to prevent overlap, then render
plt.tight_layout()
plt.show()

In [None]:
# Grid layout: four bar plots per row, with enough rows for every categorical feature
num_features = len(categorical_features)
num_columns = 4
num_rows = math.ceil(num_features / num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Work with a flat sequence of axes rather than a 2-D grid
axes = axes.flatten()

# One bar plot of the category frequencies per feature
for ax, feature in zip(axes, categorical_features):
    value_counts = df[feature].value_counts()
    value_counts.plot(kind='bar', ax=ax, color='skyblue', alpha=0.7)
    ax.set(title=f'Bar Plot of {feature}', xlabel=feature, ylabel='Frequency')
    ax.grid(axis='y')  # horizontal gridlines only

# Remove the axes left over when the grid has more slots than features
for i in range(num_features, len(axes)):
    fig.delaxes(axes[i])

# Adjust the layout to prevent overlap, then render
plt.tight_layout()
plt.show()

# Checkpoint 2024.10.24 12:32

In [None]:
df['Age at Injury'].isnull().sum()

We remove the 19445 rows where 'Age at Injury' is missing, and drop the empty 'OIICS Nature of Injury Description' column.

In [None]:
# Remove the rows with a missing 'Age at Injury' and the all-null
# 'OIICS Nature of Injury Description' column.
# errors='ignore' lets this cell be re-run after the column has already been dropped.
df_nulos = df[df['Age at Injury'].isnull()]
df = df.drop(df_nulos.index)
df = df.drop(columns='OIICS Nature of Injury Description', errors='ignore')

# Number of missing values in each remaining column
nan_counts = df.isna().sum()

# Total number of rows left in the DataFrame
total_rows = df.shape[0]

# Percentage of missing values in each column
percentage_nans = (nan_counts / total_rows) * 100

# Render each percentage as text with two decimals and a '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# Combine everything into one table, indexed by column name
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': [total_rows] * len(nan_counts),  # same total repeated once per column
    'Percentage NaN': percentage_nans
})

# Print the result
print("Summary of NaN values per column:")
print(nan_summary)

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

# Single Variables

In [None]:
# Numeric columns only. `df_numeric` was never defined before this cell, so it is
# built here. Nullable integer columns are cast to float so that missing values
# become plain NaN for plotting.
df_numeric = df.select_dtypes(include='number').astype('float64')

# Two histograms per row, with as many rows as the numeric columns need
n_cols = 2
n_rows = max(1, math.ceil(df_numeric.shape[1] / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), tight_layout=True)

axes = axes.flatten()

for i, column in enumerate(df_numeric):
    sns.histplot(x=df_numeric[column], bins=20, ax=axes[i])
    axes[i].set_title(f'Histogram de {column}')
    axes[i].set_ylabel('Frequency')

# Remove the axes left over when the grid has more slots than columns
for unused_ax in axes[df_numeric.shape[1]:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

### Age at Injury

In [None]:
# Frequency of each 'Age at Injury' value (the method must be called; without the
# parentheses the cell only displays the bound method object)
df['Age at Injury'].value_counts()

### Alternative Dispute Resolution

In [None]:
df['Alternative Dispute Resolution'].value_counts()

### New Feature: Days Difference

In [None]:
# Convert the columns to datetime, if they are not already
df['Accident Date'] = pd.to_datetime(df['Accident Date'])
df['Assembly Date'] = pd.to_datetime(df['Assembly Date'])

# Compute the difference in days between the two columns
df['Days Difference'] = (df['Assembly Date'] - df['Accident Date']).dt.days

# Show the two dates next to the new column
print(df[['Accident Date', 'Assembly Date', 'Days Difference']])

### Attorney/Representative

In [None]:
df['Attorney/Representative'].value_counts()

In [None]:
# Assuming df is your DataFrame and 'Attorney/Representative' is a non-numeric column
plt.figure(figsize=(12, 6))  # Setup figure size

# Create a count plot for 'Attorney/Representative'
sns.countplot(x=df['Attorney/Representative'])
plt.title('Count of Attorney/Representative')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### Average Weekly Wage

In [None]:
(df['Average Weekly Wage']).value_counts()

In [None]:
# Horizontal boxplot of 'Average Weekly Wage': the values run along the x-axis,
# so the axis label belongs on x rather than y.
plt.figure(figsize=(12, 6))  # Setup figure size

sns.boxplot(x=df['Average Weekly Wage'])
plt.title('Boxplot of Average Weekly Wage')
plt.xlabel('Average Weekly Wage')

plt.tight_layout()
plt.show()

In [None]:
# Assuming df is your DataFrame and 'Average Weekly Wage' is the column of interest
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed

# Create a histogram for the 'Average Weekly Wage'
sns.histplot(df['Average Weekly Wage'].dropna(), bins=20, kde=False, color='blue')
plt.title('Histogram of Average Weekly Wage')
plt.xlabel('Average Weekly Wage')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Frequency of each 'Average Weekly Wage' value, with NaN counted as its own entry
value_counts = df['Average Weekly Wage'].value_counts(dropna=False)

# Total number of entries (missing values included)
total_entries = df['Average Weekly Wage'].size

# Percentage of entries equal to 0.0 (0 if that value never occurs)
percent_zero = (value_counts.get(0.0, 0) / total_entries) * 100

# Percentage of missing entries, counted with isna()
percent_nan = (df['Average Weekly Wage'].isna().sum() / total_entries) * 100

# Whatever is neither zero nor missing
percent_others = 100 - (percent_zero + percent_nan)

# Print the results
print('Average Weekly Wage Percentages:')
print(f"0.0 values: {percent_zero:.2f}%")
print(f"NaN values: {percent_nan:.2f}%")
print(f"All other values: {percent_others:.2f}%")

### Birth Year

In [None]:
# Frequency of each 'Birth Year' value (this section is about 'Birth Year',
# not 'Average Weekly Wage')
(df['Birth Year']).value_counts()

In [None]:
(df['Birth Year']==0.0).sum()

In [None]:
plt.figure(figsize=(12, 8))
sns.stripplot(x=df['Birth Year'], jitter=0.1, size=5, color='purple', alpha=0.6)
plt.title('Distribution of Birth Years')
plt.xlabel('Birth Year')
plt.grid(True)
plt.show()

### Carrier Name

In [None]:
carrier_counts = df['Carrier Name'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Top 20 Carrier Names by Frequency')
plt.xlabel('Carrier Name')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
(df['Carrier Name'] == 'STATE INSURANCE FUND').value_counts(normalize=True) * 100

### Claim Injury Type

In [None]:
# Frequency of each 'Claim Injury Type' (the variable name `carrier_counts` is reused
# from the 'Carrier Name' section; here it holds claim-injury-type counts)
carrier_counts = df['Claim Injury Type'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # At most the 20 most frequent values
plt.title('Claim Injury Type Count')
plt.xlabel('Claim Injury Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### County of Injury

In [None]:
# Frequency of each 'County of Injury' (`carrier_counts` here holds county counts)
carrier_counts = df['County of Injury'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(30).plot(kind='bar', color='skyblue')  # Show the 30 most frequent counties
plt.title('Top 30 County of Injury')
plt.xlabel('County of Injury')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### COVID-19 Indicator

In [None]:
# Frequency of each 'COVID-19 Indicator' value (`carrier_counts` here holds those counts)
carrier_counts = df['COVID-19 Indicator'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # At most the 20 most frequent values
plt.title('COVID-19 Indicator by Frequency')
plt.xlabel('COVID-19 Indicator')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### District Name

In [None]:
# Frequency of each 'District Name' (`carrier_counts` here holds district counts)
carrier_counts = df['District Name'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # At most the 20 most frequent districts
plt.title('District Name by Frequency')
plt.xlabel('District Name')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
# Share of claims per district, as a pie chart (missing districts excluded)
values= df['District Name'].dropna().value_counts()

plt.pie(values, labels=values.index.astype(str), autopct='%1.1f%%')  # one labelled slice per district
plt.title('Distribution of District Name')
plt.show()

### Gender

In [None]:
# Frequency of each 'Gender' value (`carrier_counts` here holds gender counts)
carrier_counts = df['Gender'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # At most the 20 most frequent values
plt.title('Gender by Frequency')
plt.xlabel('Gender')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
df['Gender'].value_counts()

### IME-4 Count Distribution

In [None]:

# Frequency of each 'IME-4 Count' value. It is computed here explicitly: the original
# cell read a leftover `carrier_counts`, which held the counts of a different column.
ime4_counts = df['IME-4 Count'].value_counts()

# Keep the 10 most frequent values and pool everything else into one slice
top_10 = ime4_counts.head(10)
other = ime4_counts.iloc[10:].sum()  # Sum the counts beyond the top 10

# Create a new series that includes 'Other' using pd.concat
pie_data = pd.concat([top_10, pd.Series([other], index=['Other'])])

# Create a pie chart (11 colours: ten values plus 'Other')
plt.figure(figsize=(10, 8))
pie_data.plot(kind='pie', autopct='%1.1f%%', colors=['#ff9999','#66b3ff','#99ff99','#ffcc99','#c2c2f0','#ffb3e6', '#c4e17f', '#76d7c4', '#f7c6c7', '#f7b7a3', '#d4e157'])
plt.title('IME-4 Count Distribution including Other')
plt.ylabel('')  # Pie chart does not require a y-label
plt.show()

### Industry Code

In [None]:
# Frequency of each 'Industry Code' (`carrier_counts` here holds industry-code counts)
carrier_counts = df['Industry Code'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(24).plot(kind='bar', color='skyblue')  # Show the 24 most frequent codes
plt.title('Industry Code by Frequency')
plt.xlabel('Industry Code')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
df['Industry Code'].value_counts()

### Industry Code Description

In [None]:
# Frequency of each 'Industry Code Description' (`carrier_counts` here holds those counts)
carrier_counts = df['Industry Code Description'].value_counts()

plt.figure(figsize=(14, 10))  # Adjust the figure size as necessary
carrier_counts.head(24).plot(kind='bar', color='skyblue')  # Show the 24 most frequent descriptions
plt.title('Industry Code Description by Frequency')
plt.xlabel('Industry Code Description')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### Number of Dependents

In [None]:
# Count the occurrences of each number of dependents
dependent_counts = df['Number of Dependents'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
sns.pointplot(x=dependent_counts.index, y=dependent_counts.values)
plt.title('Dot Plot of Number of Dependents')
plt.xlabel('Number of Dependents')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Drop NA values and count occurrences of each number of dependents
value_counts = df['Number of Dependents'].dropna().value_counts()

# Create a pie chart
plt.pie(value_counts, labels=value_counts.index.astype(str), autopct='%1.1f%%')
plt.title('Distribution of Number of Dependents')
plt.show()

### WCIO Nature of Injury Description

In [None]:
# Keep only the rows that have a value in the injury-description column
filtered_data = df.dropna(subset=['WCIO Nature of Injury Description'])

# Count the 5 most common injuries
top_injuries = filtered_data['WCIO Nature of Injury Description'].value_counts().head(5)

# Show the 5 most common injuries
print(top_injuries)

In [None]:
# Merge 'SPRAIN OR TEAR' into 'STRAIN OR TEAR' so both are counted as one category.
# assign() builds a new frame instead of writing into `filtered_data`, which was
# derived from `df` and could otherwise trigger a chained-assignment warning.
injury_col = 'WCIO Nature of Injury Description'
filtered_data = filtered_data.assign(
    **{injury_col: filtered_data[injury_col].replace('SPRAIN OR TEAR', 'STRAIN OR TEAR')}
)

# Count the 5 most common injuries again after the merge
top_injuries_unified = filtered_data[injury_col].value_counts().head(5)

# Show the results
print(top_injuries_unified)

### Relation between WCIO Nature of Injury Description and Industry Code Description

In [None]:
# Injury descriptions to keep: only the unified 'STRAIN OR TEAR' category
common_injuries = ['STRAIN OR TEAR']

# Restrict the dataset to those injuries
filtered_data = filtered_data[filtered_data['WCIO Nature of Injury Description'].isin(common_injuries)]

# Bar chart: number of these injuries per industry
plt.figure(figsize=(12, 8))
sns.countplot(x='Industry Code Description', hue='WCIO Nature of Injury Description', data=filtered_data)
plt.title('Relation between WCIO Nature of Injury Description and Industry Code Description')
plt.xticks(rotation=45)
plt.legend(title='Tipo de Lesão', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


# Boxplot: age of the workers for each kept injury type.
# The title now describes this plot; it was previously a copy of the bar chart's title.
plt.figure(figsize=(10, 6))
sns.boxplot(x='WCIO Nature of Injury Description', y='Age at Injury', data=filtered_data)
plt.title('Age at Injury by WCIO Nature of Injury Description')
plt.xticks(rotation=45)
plt.show()


In [None]:
(df['Alternative Dispute Resolution']).value_counts()

### Relation between Attorney/Representative and Claim Injury Type

In [None]:
# Keep the rows where 'Alternative Dispute Resolution' is 'Y'
adr_yes = df[df['Alternative Dispute Resolution'] == 'Y']

# Count each 'Claim Injury Type' among the ADR == 'Y' cases
claim_injury_type_counts_adr_yes = adr_yes['Claim Injury Type'].value_counts()

# Show the counts
print("Claim Injury Type when 'Alternative Dispute Resolution' == 'Y'")
print(claim_injury_type_counts_adr_yes)

# Percentage of each 'Claim Injury Type' among the ADR == 'Y' cases
claim_injury_type_percentage_adr_yes = (claim_injury_type_counts_adr_yes / claim_injury_type_counts_adr_yes.sum()) * 100

# Show the percentages
print(claim_injury_type_percentage_adr_yes)

In [None]:
# Count plot of 'Attorney/Representative' (x-axis), split by 'Claim Injury Type' (hue).
# The legend lists the hue categories, so its title is 'Claim Injury Type'.
plt.figure(figsize=(12, 6))
sns.countplot(x='Attorney/Representative', hue='Claim Injury Type', data=df)
plt.title('Relation between Attorney/Representative and Claim Injury Type')
plt.xlabel('Attorney/Representative')
plt.ylabel('Contagem')
plt.legend(title='Claim Injury Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Cross-tabulation: rows are 'Claim Injury Type', columns are 'Attorney/Representative'
claim_injury_type_by_attorney = pd.crosstab(df['Claim Injury Type'], df['Attorney/Representative'])

# Row-wise percentages (axis=1): within each 'Claim Injury Type', the share of claims
# for each 'Attorney/Representative' value, so every row sums to 100
claim_injury_type_percentage = claim_injury_type_by_attorney.apply(lambda x: x / x.sum() * 100, axis=1)

# Put the absolute counts and the percentages side by side under two column groups
claim_injury_type_with_percentage = pd.concat([claim_injury_type_by_attorney, claim_injury_type_percentage], axis=1, keys=['Count', 'Percentage'])

# Show the result
print(claim_injury_type_with_percentage)

### Average Weekly Wage by Attorney/Representative

In [None]:
# Mean 'Average Weekly Wage' per 'Attorney/Representative' value, as a two-column frame.
mean_wage_by_lawyer = (
    df.groupby('Attorney/Representative')['Average Weekly Wage']
    .mean()
    .reset_index()
)

# One bar per category; seaborn returns the Axes, which is reused for labels and annotations.
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(data=mean_wage_by_lawyer, x='Attorney/Representative', y='Average Weekly Wage')

# Title and axis labels
bar_plot.set_title('Average Weekly Wage by Attorney/Representative')
bar_plot.set_xlabel('Attorney/Representative (Y/N)')
bar_plot.set_ylabel('Average Weekly Wage')

# Write each mean at the top of its bar; the row's position in the frame is used as the x coordinate.
for position, mean_wage in enumerate(mean_wage_by_lawyer['Average Weekly Wage']):
    bar_plot.text(position, mean_wage, f"{mean_wage:.2f}",
                  color='black', ha="center", va="bottom")

plt.show()

### Relation between Carrier Type and Claim Injury Type

In [None]:
# Count claims per 'Carrier Type', with one bar per 'Claim Injury Type' inside each carrier group.
plt.figure(figsize=(12, 6))
ax = sns.countplot(data=df, x='Carrier Type', hue='Claim Injury Type')
ax.set_title('Relation between Carrier Type and Claim Injury Type')
ax.set_xlabel('Carrier Type')
ax.set_ylabel('Contagem')
# Tilt the category labels on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Claim Injury Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Count claims per 'Carrier Type', split by whether an attorney/representative is involved.
plt.figure(figsize=(12, 6))
sns.countplot(x='Carrier Type', hue='Attorney/Representative', data=df)
# The title names the hue variable actually plotted ('Attorney/Representative').
plt.title('Relation between Carrier Type and Attorney/Representative')
plt.xlabel('Carrier Type')
plt.ylabel('Contagem')
plt.xticks(rotation=45)
plt.legend(title='Attorney/Representative', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Frequency table: rows are 'Carrier Type' values, columns are 'Attorney/Representative' values.
carrier_type_with_attorney_counts = pd.crosstab(
    index=df['Carrier Type'],
    columns=df['Attorney/Representative'],
)

# Print the table.
print(carrier_type_with_attorney_counts)

### Relation between COVID-19 Indicator and Claim Injury Type

In [None]:
# Keep only the claims whose 'COVID-19 Indicator' is 'Y'.
covid_yes = df.loc[df['COVID-19 Indicator'] == 'Y']

# Count those claims by 'Claim Injury Type' (the x axis therefore holds the single value 'Y').
plt.figure(figsize=(12, 6))
ax = sns.countplot(data=covid_yes, x='COVID-19 Indicator', hue='Claim Injury Type')
ax.set_title('Relation between COVID-19 Indicator and Claim Injury Type')
ax.set_xlabel('COVID-19 Indicator')
ax.set_ylabel('Contagem')
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Claim Injury Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Restrict the data to claims whose 'COVID-19 Indicator' is 'Y'.
covid_yes = df.loc[df['COVID-19 Indicator'] == 'Y']

# Occurrences of each 'Claim Injury Type' among those claims (most frequent first).
covid_yes_claim_injury_counts = covid_yes['Claim Injury Type'].value_counts()

# Share of each injury type, as a percentage of all counted claims.
total_claims = covid_yes_claim_injury_counts.sum()
covid_yes_claim_injury_percentage = covid_yes_claim_injury_counts.div(total_claims).mul(100)

# Running total of those percentages, following the order of the counts.
covid_yes_claim_injury_cumulative_percentage = covid_yes_claim_injury_percentage.cumsum()

# Gather counts, percentages and cumulative percentages into a single table.
covid_yes_claim_injury_df = pd.DataFrame(
    {
        'Count': covid_yes_claim_injury_counts,
        'Percentage': covid_yes_claim_injury_percentage,
        'Cumulative Percentage': covid_yes_claim_injury_cumulative_percentage,
    }
)

# Print a caption followed by the table.
print("Claim Injury Type para casos onde COVID-19 Indicator = 'Y'")
print(covid_yes_claim_injury_df)



### Relation between Age at Injury and Mean IME-4 Count

In [None]:
# Mean 'IME-4 Count' for every distinct 'Age at Injury', kept as a two-column frame.
mean_ime_by_age = df.groupby('Age at Injury', as_index=False)['IME-4 Count'].mean()

# Line plot of the mean against age, with a marker on every point.
plt.figure(figsize=(14, 8))
ax = sns.lineplot(data=mean_ime_by_age, x='Age at Injury', y='IME-4 Count', marker='o')

# Title and axis labels
ax.set_title('Mean IME-4 Count by Age at Injury')
ax.set_xlabel('Age at Injury')
ax.set_ylabel('Mean IME-4 Count')

plt.show()

### Relation between Average Weekly Wage and Attorney/Representative

In [None]:
# Keep only rows with a strictly positive 'Average Weekly Wage';
# NaN compares as False, so missing wages are dropped together with the zeros.
filtered_df = df.loc[df['Average Weekly Wage'] > 0]

# Mean 'Average Weekly Wage' per 'Attorney/Representative' value, computed on the filtered rows.
mean_wage_by_lawyer = (
    filtered_df.groupby('Attorney/Representative')['Average Weekly Wage']
    .mean()
    .reset_index()
)

print(mean_wage_by_lawyer)

# One bar per 'Attorney/Representative' value.
# NOTE(review): `palette` is passed without `hue`; recent seaborn releases reportedly
# warn about this combination - confirm against the installed version.
plt.figure(figsize=(8, 6))
ax = sns.barplot(data=mean_wage_by_lawyer, x='Attorney/Representative', y='Average Weekly Wage', palette='viridis')

# Title and axis labels
ax.set_title('Average Weekly Wage by Attorney/Representative')
ax.set_xlabel('Attorney/Representative (Y/N)')
ax.set_ylabel('Average Weekly Wage')

plt.show()

### Relation between Mean Average Weekly Wage and Age at Injury

In [None]:
# Mean 'Average Weekly Wage' for every distinct 'Age at Injury', kept as a two-column frame.
mean_wage_by_age = df.groupby('Age at Injury', as_index=False)['Average Weekly Wage'].mean()

# Line plot of the mean wage against age, with a marker on every point.
plt.figure(figsize=(14, 8))
ax = sns.lineplot(data=mean_wage_by_age, x='Age at Injury', y='Average Weekly Wage', marker='o')

# Title and axis labels
ax.set_title('Mean Average Weekly Wage by Age at Injury')
ax.set_xlabel('Age at Injury')
ax.set_ylabel('Average Weekly Wage')

plt.show()

### Relation between Mean Days Difference and Age at Injury

In [None]:
# Mean 'Days Difference' for every distinct 'Age at Injury', kept as a two-column frame.
mean_daysdif_by_age = df.groupby('Age at Injury', as_index=False)['Days Difference'].mean()

# Line plot of the mean against age, with a marker on every point.
plt.figure(figsize=(14, 8))
ax = sns.lineplot(data=mean_daysdif_by_age, x='Age at Injury', y='Days Difference', marker='o')

# Title and axis labels
ax.set_title('Mean Days Difference by Age at Injury')
ax.set_xlabel('Age at Injury')
ax.set_ylabel('Days Difference')

plt.show()

### Relation between Mean Days Difference and Average Weekly Wage

In [None]:
# Mean 'Days Difference' for every distinct 'Average Weekly Wage' value, kept as a two-column frame.
mean_daysdif_by_wage = df.groupby('Average Weekly Wage', as_index=False)['Days Difference'].mean()

# Line plot of the mean against the wage, with a marker on every point.
plt.figure(figsize=(14, 8))
ax = sns.lineplot(data=mean_daysdif_by_wage, x='Average Weekly Wage', y='Days Difference', marker='o')

# Title and axis labels
ax.set_title('Mean Days Difference by Average Weekly Wage')
ax.set_xlabel('Average Weekly Wage')
ax.set_ylabel('Days Difference')

plt.show()

### Relation between Mean Days Difference and District

In [None]:
# Mean 'Days Difference' per 'District Name', as a two-column frame.
mean_daysdif_by_district = (
    df.groupby('District Name')['Days Difference']
    .mean()
    .reset_index()
)

# One bar per district; seaborn returns the Axes, which is reused for labels and annotations.
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(data=mean_daysdif_by_district, x='District Name', y='Days Difference')

# Title and axis labels
bar_plot.set_title('Mean Days Difference by District')
bar_plot.set_xlabel('District')
bar_plot.set_ylabel('Days Difference')

# Write each mean at the top of its bar; the row's position in the frame is used as the x coordinate.
for position, mean_days in enumerate(mean_daysdif_by_district['Days Difference']):
    bar_plot.text(position, mean_days, f"{mean_days:.2f}",
                  color='black', ha="center", va="bottom")

plt.show()

### Proportion of Attorney/Representative within District Name

In [None]:
# Share of each 'Attorney/Representative' value within every 'District Name' (each row sums to 1).
cross_tab = pd.crosstab(
    index=df['District Name'],
    columns=df['Attorney/Representative'],
    normalize='index',
)

# Stacked bar chart of those proportions, one bar per district.
ax = cross_tab.plot.bar(stacked=True, figsize=(8, 6), color=['skyblue', 'salmon'])

# Title, axis labels and legend
ax.set_title('Proportion of Attorney/Representative within District Name')
ax.set_xlabel('District Name')
ax.set_ylabel('Proportion')
ax.legend(title='Attorney/Representative', loc='upper right')
plt.show()

### Mean Days Difference by Attorney/Representative

In [None]:
# Mean 'Days Difference' per 'Attorney/Representative' value, as a two-column frame.
mean_daysdif_by_lawyer = (
    df.groupby('Attorney/Representative')['Days Difference']
    .mean()
    .reset_index()
)

# One bar per category; seaborn returns the Axes, which is reused for labels and annotations.
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(data=mean_daysdif_by_lawyer, x='Attorney/Representative', y='Days Difference')

# Title and axis labels
bar_plot.set_title('Mean Days Difference by Attorney/Representative')
bar_plot.set_xlabel('Attorney/Representative (Y/N)')
bar_plot.set_ylabel('Days Difference')

# Write each mean at the top of its bar; the row's position in the frame is used as the x coordinate.
for position, mean_days in enumerate(mean_daysdif_by_lawyer['Days Difference']):
    bar_plot.text(position, mean_days, f"{mean_days:.2f}",
                  color='black', ha="center", va="bottom")

plt.show()