# TO GRANT OR NOT TO GRANT: DECIDING ON COMPENSATION BENEFITS

## 1. Imports, options and ingestion

In [41]:
# importing the libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns
import re

In [48]:
# setting the options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
np.set_printoptions(threshold=np.inf)

In [3]:
# importing the training and test data
df = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data.csv', sep=',')
df_test = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/test_data.csv', sep=',')


  df = pd.read_csv('/home/shadybea/OneDrive/General/Machine Learning/Project/Data/train_data.csv', sep=',')


Just by importing the dataset, we get a warning saying column 29 has mixed data types - we will check this in a bit.

## 2. Initial inspection

### 2.1. Macro-inspection

In [4]:
# we check the shape of the dataset
df.shape

(593471, 33)

In [5]:
# we check the first rows of the dataset
df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,5393875,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,5393091,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,5393889,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,,957648180,,,,,,,,,,,,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,5393887,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [6]:
# we check if there are any aggregation rows at the end of the dataset
df.tail()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
593466,,,,2022-12-31,,,,,,,,327160035,,,,,,,,,,,,,,,,,,,,,
593467,2022-12-13,72.0,N,2022-12-31,N,0.0,1950.0,2022-12-31,,TECHNOLOGY INSURANCE CO. INC.,1A. PRIVATE,6165075,2. NON-COMP,SULLIVAN,N,BINGHAMTON,,F,,48.0,TRANSPORTATION AND WAREHOUSING,I,,25.0,FROM DIFFERENT LEVEL (ELEVATION),90.0,MULTIPLE PHYSICAL INJURIES ONLY,-9.0,MULTIPLE,12779.0,0.0,Not Work Related,3.0
593468,,,,2022-12-31,,,,,,,,249875936,,,,,,,,,,,,,,,,,,,,,
593469,,,,2022-12-31,,,,,,,,120584215,,,,,,,,,,,,,,,,,,,,,
593470,,,,2022-12-31,,,,,,,,818961390,,,,,,,,,,,,,,,,,,,,,


In [7]:
# we check the datatypes and null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593471 entries, 0 to 593470
Data columns (total 33 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Identifier          

**Data type analysis:**

Features that should be dates:
- 'Accident Date'
- 'Assembly Date'
- 'C-2 Date'
- 'C-3 Date'
- 'First Hearing Date'

Features that should be integers:
- 'Age at Injury'
- 'Birth Year'
- 'Industry Code'
- 'WCIO Cause of Injury Code'
- 'WCIO Nature of Injury Code'
- 'WCIO Part Of Body Code'
- 'Number of Dependents'

Features that should be booleans:
- 'Agreement Reached'

### 2.2. Claim Identifier

This feature is the unique identifier of each claim - we will analyse this column in more depth in an attempt to assign it as the index of our dataframe.

In [8]:
# we check if there are any duplicate values for this column
df[df['Claim Identifier'].duplicated(keep=False)]

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
257901,,,,2021-05-21,,,,,,,,292668076,,,,,,,,,,,,,,,,,,,,,
526445,,,,2022-09-05,,,,,,,,292668076,,,,,,,,,,,,,,,,,,,,,


Apparently, we have one duplicated 'Claim Identifier', where all values, except for 'Assembly Date', are NaNs. </br>
We will use the default python behavior and drop the second appearence of the repeated 'Claim Identifier'. </br>
As we saw previously, this column has no null values, so we can set it as the dataframe index.

In [9]:
df = df[~df['Claim Identifier'].duplicated()].set_index('Claim Identifier').rename_axis(None)

In [10]:
df.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
5393875,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
5393091,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
5393889,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
957648180,,,,2020-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5393887,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


### 2.3. Data consistency

#### 2.3.1. Missing Values

In [11]:
# Calculate the number of NaNs for each column
nan_counts = df.isna().sum()

# Get the total number of rows (entries) in the DataFrame
total_rows = df.shape[0]

# Calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# Format the percentage with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# Combine all information into a DataFrame for better readability
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': [total_rows] * len(nan_counts),  # Ensure this column matches the length of nan_counts
    'Percentage NaN': percentage_nans
})

# Print the result
print("Summary of NaN values per column:\n")
print(nan_summary)

Summary of NaN values per column:

                                    NaN Count  Total Values Percentage NaN
Accident Date                           23133        593470          3.90%
Age at Injury                           19444        593470          3.28%
Alternative Dispute Resolution          19444        593470          3.28%
Assembly Date                               0        593470          0.00%
Attorney/Representative                 19444        593470          3.28%
Average Weekly Wage                     48095        593470          8.10%
Birth Year                              48522        593470          8.18%
C-2 Date                                34004        593470          5.73%
C-3 Date                               406225        593470         68.45%
Carrier Name                            19444        593470          3.28%
Carrier Type                            19444        593470          3.28%
Claim Injury Type                       19444        593470      

By analysing the output above, we conclude:
- **'C-3 Date'**: more than 50% of the data for this feature is missing, it can be due to process status - the employee has not yet sent its report
- **'First Hearing Date'**: around 75% of the data for this feature is missing - this means hearings have not yet been scheduled
- **'IME-4 Count'**: more than 75% of the data for this feature is missing, it can be due to process status - the independent examiner has not yet sent its report
- **'OIICS Nature of Injury Description'**: only has null values

#### 2.3.2. Mixed data types

As we imported the data, we got a warning saying columns had mixed data types. We shall take a look at this issue now.

In [25]:
def check_mixed_types(column):
    return len(set(column[pd.notna(column)].apply(type))) > 1  # If there are more than one unique data types, excluding NaN values (as these are considered as floats)

# Apply the function to all columns and filter out the mixed-type columns
mixed_type_columns = [col for col in df.columns if check_mixed_types(df[col])]

print(mixed_type_columns)

['Zip Code']


In [26]:
for col in mixed_type_columns:
    print(df[col].apply(type).value_counts())

Zip Code
<class 'str'>      530364
<class 'float'>     63106
Name: count, dtype: int64


The values for 'Zip Code' are split across two data types - string and float. We shall look at the feature values in order to make a decision.

In [43]:
# we check if there are any values that contain any non numeric character
print(df[df['Zip Code'].apply(lambda x: bool(re.search(r'\D', str(x))))]['Zip Code'].unique())

[nan 'L1N 5' 'T1B0P' 'L6Y 1' 'JMDMR' 'N2P 1' 'H7X3Z' 'L2N 3' 'L0R 1'
 'L1A 3' 'T4R1E' 'L7L 6' 'L2W 1' 'K9K 1' '.1605' 'M4E1S' 'L2A 5' 'UNKNO'
 'T2V' 'J0L1B' 'AB106' 'J1H2Y' 'L2A 1' 'H2T2W' 'T1J5G' '22-40' 'M3K2B'
 'M3K2C' 'T1S 0' 'N6C4E' 'H1V2L' 'V6Z3G' 'KOC 1' 'M6S5B' 'V3H1H' 'L2H0H'
 'L1C0K' 'JOL1B' 'SLV' 'BT48-' 'L5N7C' 'JMDCN' 'V6T1Z' 'T2X3P' 'L2A1R'
 'L4A0G' 'H1V3S' 'K6H2L' 'JMAKN' 'L6H 3' 'J0J1S' 'B3E1A' 'V6T5C' 'JMBTS'
 'V2B0E' 'T6R 0' 'JMDWD' 'M6K3C' 'L9W' 'S7R 0' 'F94A0' 'P7G' 'Z2E1N'
 'L0S1N' 'K2L 4' 'H1G0A' 'B3M 3' 10567.0 11213.0 11421.0 11378.0 10035.0
 11385.0 10302.0 10305.0 12550.0 10027.0 11710.0 12180.0 11365.0 11418.0
 10598.0 10465.0 18851.0 6820.0 10952.0 14557.0 72524.0 7882.0 10923.0
 12901.0 11207.0 12590.0 11580.0 11725.0 10033.0 10532.0 12804.0 14614.0
 11779.0 11224.0 11435.0 11735.0 11367.0 11772.0 11218.0 11434.0 11901.0
 11577.0 11004.0 13083.0 10037.0 11214.0 10458.0 11422.0 14590.0 7822.0
 10001.0 10306.0 11704.0 10453.0 11414.0 11206.0 11203.0 11212.0 1

Since we have determined that there are values for 'Zip Code' that contain other than numeric characters, we shall set this feature as a string.

In [44]:
# Changing the data type of the values to string
df['Zip Code'] = df['Zip Code'].astype(str)

In [45]:
# re-checking the data type counts
print(df['Zip Code'].apply(type).value_counts())

Zip Code
<class 'str'>    593470
Name: count, dtype: int64


#### 2.3.4. Unique values

In [51]:
# we check the number of unique values for each column
df.nunique()

Accident Date                           5539
Age at Injury                            108
Alternative Dispute Resolution             3
Assembly Date                           1096
Attorney/Representative                    2
Average Weekly Wage                   120024
Birth Year                               107
C-2 Date                                2475
C-3 Date                                1648
Carrier Name                            2046
Carrier Type                               8
Claim Injury Type                          8
County of Injury                          63
COVID-19 Indicator                         2
District Name                              8
First Hearing Date                      1094
Gender                                     4
IME-4 Count                               41
Industry Code                             24
Industry Code Description                 20
Medical Fee Region                         5
OIICS Nature of Injury Description         0
WCIO Cause

In [50]:
# we check the unique values for each column
pd.DataFrame(
    {
        'feature_type': [df[column].dtype for column in df.columns],
        'unique_values': df.apply(lambda col: sorted(pd.Series(col.dropna().unique().tolist())))  # we disregard NaN values, so we can sort the unique values
    }
)

Unnamed: 0,feature_type,unique_values
Accident Date,object,"[1961-09-06, 1963-10-01, 1966-06-01, 1966-09-13, 1967-01-01, 1967-04-05, 1967-10-26, 1969-04-01, 1969-05-03, 1970-12-10, 1971-03-17, 1971-04-27, 1971-05-08, 1972-05-26, 1972-09-26, 1972-11-15, 1972-12-26, 1973-01-15, 1973-02-27, 1973-04-24, 1973-05-01, 1973-05-31, 1973-12-12, 1973-12-24, 1974-02-22, 1974-06-06, 1974-06-22, 1974-07-09, 1974-10-29, 1975-05-01, 1975-05-06, 1975-06-09, 1975-06-17, 1975-07-01, 1975-08-06, 1975-10-16, 1975-12-22, 1976-07-21, 1976-08-17, 1976-10-14, 1977-03-04, 1977-04-04, 1977-06-15, 1977-06-17, 1977-06-21, 1977-07-22, 1977-09-16, 1977-12-29, 1978-01-23, 1978-02-02, 1978-04-04, 1978-05-19, 1978-07-28, 1978-08-14, 1979-03-01, 1979-08-28, 1979-10-08, 1979-11-02, 1980-01-01, 1980-01-16, 1980-03-26, 1980-03-28, 1980-04-24, 1980-04-28, 1980-05-01, 1980-05-09, 1980-05-17, 1980-10-06, 1980-12-16, 1980-12-17, 1981-01-28, 1981-02-09, 1981-02-16, 1981-04-21, 1981-06-22, 1981-09-23, 1981-11-12, 1981-12-09, 1981-12-30, 1982-01-01, 1982-02-01, 1982-03-12, 1982-03-25, 1982-05-21, 1982-06-10, 1982-06-14, 1982-08-01, 1982-10-15, 1982-12-07, 1983-01-18, 1983-02-14, 1983-02-19, 1983-05-01, 1983-06-06, 1983-07-12, 1983-07-18, 1983-07-28, 1983-08-22, 1984-01-16, 1984-01-30, ...]"
Age at Injury,float64,"[0.0, 1.0, 5.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 104.0, ...]"
Alternative Dispute Resolution,object,"[N, U, Y]"
Assembly Date,object,"[2020-01-01, 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06, 2020-01-07, 2020-01-08, 2020-01-09, 2020-01-10, 2020-01-11, 2020-01-12, 2020-01-13, 2020-01-14, 2020-01-15, 2020-01-16, 2020-01-17, 2020-01-18, 2020-01-19, 2020-01-20, 2020-01-21, 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26, 2020-01-27, 2020-01-28, 2020-01-29, 2020-01-30, 2020-01-31, 2020-02-01, 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 2020-02-06, 2020-02-07, 2020-02-08, 2020-02-09, 2020-02-10, 2020-02-11, 2020-02-12, 2020-02-13, 2020-02-14, 2020-02-15, 2020-02-16, 2020-02-17, 2020-02-18, 2020-02-19, 2020-02-20, 2020-02-21, 2020-02-22, 2020-02-23, 2020-02-24, 2020-02-25, 2020-02-26, 2020-02-27, 2020-02-28, 2020-02-29, 2020-03-01, 2020-03-02, 2020-03-03, 2020-03-04, 2020-03-05, 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10, 2020-03-11, 2020-03-12, 2020-03-13, 2020-03-14, 2020-03-15, 2020-03-16, 2020-03-17, 2020-03-18, 2020-03-19, 2020-03-20, 2020-03-21, 2020-03-22, 2020-03-23, 2020-03-24, 2020-03-25, 2020-03-26, 2020-03-27, 2020-03-28, 2020-03-29, 2020-03-30, 2020-03-31, 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-04, 2020-04-05, 2020-04-06, 2020-04-07, 2020-04-08, 2020-04-09, ...]"
Attorney/Representative,object,"[N, Y]"
Average Weekly Wage,float64,"[0.0, 13.94, 17.41, 23.27, 27.49, 30.88, 31.54, 32.0, 34.23, 35.0, 36.86, 38.13, 38.46, 40.0, 41.73, 42.92, 43.5, 44.65, 46.15, 48.39, 48.92, 50.0, 50.33, 50.53, 50.83, 51.15, 51.76, 52.24, 52.63, 55.0, 57.59, 58.37, 58.72, 59.14, 61.08, 62.43, 62.62, 62.77, 63.0, 65.95, 66.33, 66.66, 66.97, 67.03, 67.25, 69.15, 70.0, 71.33, 72.26, 72.28, 75.0, 76.0, 76.42, 77.04, 78.27, 78.31, 78.47, 80.0, 80.22, 80.34, 80.35, 81.25, 81.67, 82.0, 82.92, 83.51, 84.62, 86.05, 86.25, 87.83, 88.0, 88.4, 89.17, 90.0, 90.39, 90.75, 91.0, 91.08, 91.27, 91.92, 92.4, 92.54, 93.02, 93.19, 94.0, 94.34, 95.0, 95.88, 96.0, 96.08, 96.3, 96.95, 97.83, 98.28, 98.61, 99.5, 99.71, 99.95, 100.0, 100.2, ...]"
Birth Year,float64,"[0.0, 1900.0, 1901.0, 1902.0, 1903.0, 1907.0, 1909.0, 1910.0, 1911.0, 1916.0, 1917.0, 1920.0, 1921.0, 1922.0, 1923.0, 1924.0, 1925.0, 1926.0, 1927.0, 1928.0, 1929.0, 1930.0, 1931.0, 1932.0, 1933.0, 1934.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0, 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, ...]"
C-2 Date,object,"[1996-01-12, 1996-01-31, 1996-02-14, 1996-03-06, 1996-03-29, 1996-04-25, 1996-05-09, 1996-05-10, 1996-05-16, 1996-05-22, 1996-06-04, 1996-06-06, 1996-06-18, 1996-06-19, 1996-06-20, 1996-06-27, 1996-06-28, 1996-07-01, 1996-07-08, 1996-07-23, 1996-07-29, 1996-08-08, 1996-08-12, 1996-08-13, 1996-08-15, 1996-08-20, 1996-09-17, 1996-10-09, 1996-10-16, 1996-10-29, 1996-11-06, 1996-11-08, 1996-11-14, 1996-11-21, 1996-12-05, 1996-12-19, 1996-12-23, 1997-01-15, 1997-01-21, 1997-01-24, 1997-02-07, 1997-02-10, 1997-02-12, 1997-02-13, 1997-02-19, 1997-02-20, 1997-03-06, 1997-03-18, 1997-03-24, 1997-04-01, 1997-04-02, 1997-04-10, 1997-04-17, 1997-04-24, 1997-05-07, 1997-06-12, 1997-06-16, 1997-06-20, 1997-06-25, 1997-07-01, 1997-07-03, 1997-07-17, 1997-07-18, 1997-09-05, 1997-09-10, 1997-09-16, 1997-09-17, 1997-09-24, 1997-10-15, 1997-10-16, 1997-10-22, 1997-10-23, 1997-10-24, 1997-11-24, 1997-12-05, 1997-12-08, 1997-12-15, 1998-01-26, 1998-02-05, 1998-02-27, 1998-03-17, 1998-03-18, 1998-04-06, 1998-04-17, 1998-06-04, 1998-06-08, 1998-06-10, 1998-06-23, 1998-06-26, 1998-06-29, 1998-07-14, 1998-07-23, 1998-07-27, 1998-08-10, 1998-10-01, 1998-10-13, 1998-10-26, 1998-10-27, 1998-10-30, 1998-11-17, ...]"
C-3 Date,object,"[1992-11-13, 2010-05-14, 2010-07-29, 2010-08-27, 2010-10-08, 2011-01-11, 2011-03-03, 2011-05-26, 2012-07-12, 2012-11-02, 2013-04-03, 2013-08-07, 2013-12-25, 2016-06-20, 2016-11-08, 2017-02-09, 2017-09-01, 2017-09-07, 2017-09-12, 2017-11-08, 2018-01-30, 2018-02-05, 2018-02-13, 2018-02-26, 2018-03-09, 2018-04-04, 2018-05-17, 2018-06-13, 2018-06-26, 2018-06-28, 2018-07-11, 2018-07-19, 2018-07-30, 2018-10-24, 2018-10-30, 2018-11-09, 2018-11-23, 2018-12-14, 2019-01-04, 2019-01-14, 2019-01-17, 2019-02-25, 2019-02-27, 2019-03-25, 2019-03-28, 2019-03-29, 2019-04-03, 2019-04-04, 2019-05-02, 2019-05-15, 2019-05-17, 2019-05-21, 2019-05-23, 2019-05-24, 2019-06-03, 2019-06-21, 2019-06-28, 2019-07-03, 2019-07-12, 2019-07-18, 2019-07-19, 2019-07-22, 2019-07-24, 2019-07-25, 2019-07-26, 2019-07-29, 2019-07-31, 2019-08-07, 2019-08-08, 2019-08-14, 2019-08-16, 2019-08-19, 2019-08-22, 2019-08-29, 2019-09-06, 2019-09-10, 2019-09-13, 2019-09-17, 2019-09-19, 2019-10-03, 2019-10-07, 2019-10-08, 2019-10-11, 2019-10-21, 2019-10-22, 2019-10-27, 2019-10-28, 2019-10-29, 2019-10-30, 2019-10-31, 2019-11-01, 2019-11-04, 2019-11-06, 2019-11-07, 2019-11-08, 2019-11-11, 2019-11-12, 2019-11-13, 2019-11-14, 2019-11-15, ...]"
Carrier Name,object,"[*** CARRIER UNDETERMINED ***, A I U INSURANCE COMPANY, ABF FREIGHT SYSTEM, INC., ABF FREIGHT SYSTEMS INC DEL, ACADIA INSURANCE COMPANY, ACCIDENT FUND GENERAL, ACCIDENT FUND GENERAL INS CO, ACCIDENT FUND INSURANCE, ACCIDENT FUND INSURANCE CO, ACCIDENT FUND NATIONAL INS CO, ACCIDENT FUND NATONAL INS CO, ACCREDITED SURETY AND CASUALTY, ACE AMERICAN INSURANCE CO, ACE AMERICAN INSURANCE CO., ACE FIRE UNDERWRITERS, ACE FIRE UNDERWRITERS INS, ACE PROPERTY & CASUALTY INS CO, ACE PROPERTY AND CASUALTY, ACIG INSURANCE COMPANY, ADDISON CENTRAL SCHOOL DISTRIC, ADDISON CSD, ADIRONDACK CENTRAL SCHOOL, ADIRONDACK CSD, ADMIRAL INDEMNITY COMPANY, ADMIRAL INDEMNITY CORP., AIG PROPERTY CASUALTY CO., AIG PROPERTY CASUALTY COMPANY, AIRBORNE EXPRESS, INC., AIU INSURANCE CO, AKRON CENTRAL SCHOOL DISTRICT, AKRON CSD, ALBANY CITY OF, ALBANY CITY SCHOOL DISTRICT, ALBANY, COUNTY OF, ALBION CENTRAL SCHOOL DIST, ALBION CENTRAL SCHOOL DISTRICT, ALDEN C.S.D., ALDEN CENTRAL SCHOOL DIST, ALEXANDRIA CENTRAL SCHOOL DIST, ALFRED-ALMOND CENTRAL SCHOOL, ALFRED-ALMOND CSD, ALL AMERICA INSURANCE CO, ALLEGANY COUNTY MUTUAL, ALLEGANY-LIMESTONE, ALLEGANY-LIMESTONE CSD, ALLIANCE NATIONAL INS CO, ALLIANCE NATIONAL INSURANCE CO, ALLIANZ GLOBAL RISKS US, ALLIED EASTERN INDEMNITY CO, ALLIED PROPERTY & CASUALTY, ALLIED PROPERTY AND CASUALTY, ALLMERICA FIN BENEFIT INS CO, ALLMERICA FINANCIAL ALLIANCE, ALLMERICA FINANCIAL BENEFIT, ALLSTATE INSURANCE COMPANY, ALTMAR-PARISH-WILLIAMSTOWN CSD, AMAGANSETT UFSD, AMCO INSURANCE COMPANY, AMERICAN AUTOMOBILE INS CO, AMERICAN AUTOMOBILE INS. CO., AMERICAN CASUALTY CO, AMERICAN CASUALTY CO OF, AMERICAN FIRE & CASUALTY CO, AMERICAN FIRE & CASUALTY CO., AMERICAN GUAR & LIAB INS CO, AMERICAN GUARANTEE & LIABILITY, AMERICAN HOME ASSURANCE CO, AMERICAN INSURANCE CO, AMERICAN MFG. MUT. INS. CO., AMERICAN MOTORISTS INS CO, AMERICAN PROTECTION INS CO., AMERICAN ZURICH INSURANCE CO, AMERISURE INS CO, AMERISURE INSURANCE COMPANY, AMERISURE MUTUAL INSURANCE CO, AMGUARD INSURANCE COMPANY, AMHERST C.S.D, AMHERST CENTRAL SCHOOL DIST, AMITYVILLE UFSD, AMITYVILLE UNION FREE, AMTRUST INS CO OF KANSAS INC, AMTRUST INSURANCE COMPANY, AMTRUST INSURANCE COMPANY OF, ANDES CENTRAL SCHOOL, ANDES CENTRAL SCHOOL DISTRICT, ANDOVER CENTRAL SCHOOL, ARCH INDEMNITY INSURANCE CO, ARCH INDEMNITY INSURANCE CO., ARCH INSURANCE CO, ARCH INSURANCE COMPANY, ARDSLEY UFSD, ARDSLEY UNION FREE SCHL DIST, ARGONAUT INSURANCE COMPANY, ARGONAUT-MIDWEST INS CO, ARGONAUT-MIDWEST INS, CO, ARGYLE CENTRAL SCHOOL DIST, ARI INSURANCE COMPANY, ARKPORT CENTRAL SCHOOL, ARLINGTON CENTRAL, ARLINGTON CSD, ...]"


# Checkpoint 2024.10.24 12:32

In [None]:
df[df['Age at Injury'].isnull()]

# Remove 19445 rows with NaN values

In [None]:
# Remove 19445 rows with NaN values
df_nulos = df[df['Age at Injury'].isnull()]
df = df.drop(df_nulos.index)
df = df.drop('OIICS Nature of Injury Description', axis=1)

# Calculate the number of NaNs for each column
nan_counts = df.isna().sum()

# Get the total number of rows (entries) in the DataFrame
total_rows = df.shape[0]

# Calculate the percentage of NaN values for each column
percentage_nans = (nan_counts / total_rows) * 100

# Format the percentage with '%' sign
percentage_nans = percentage_nans.apply(lambda x: f"{x:.2f}%")

# Combine all information into a DataFrame for better readability
nan_summary = pd.DataFrame({
    'NaN Count': nan_counts,
    'Total Values': [total_rows] * len(nan_counts),  # Ensure this column matches the length of nan_counts
    'Percentage NaN': percentage_nans
})

# Print the result
print("Summary of NaN values per column:")
print(nan_summary)

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

# Single Variables

In [None]:
df_numeric = df.select_dtypes(include=['number']) 
df_non_numeric = df.drop(df_numeric.columns, axis=1)

df_numeric

In [None]:
fig, axes = plt.subplots(6, 2, figsize=(12, 30), tight_layout=True)

axes = axes.flatten()

for i, column in enumerate(df_numeric):
    sns.histplot(x=df_numeric[column], bins=20, ax=axes[i])
    axes[i].set_title(f'Histogram de {column}')
    axes[i].set_ylabel('Frequency')

plt.tight_layout()  
plt.show()

### Age at Injury

In [None]:
df['Age at Injury'].value_counts

### Alternative Dispute Resolution

In [None]:
df['Alternative Dispute Resolution'].value_counts()

### New Feature: Days Difference

In [None]:
# Converter as colunas para o formato de data, se ainda não estiverem
df['Accident Date'] = pd.to_datetime(df['Accident Date'])
df['Assembly Date'] = pd.to_datetime(df['Assembly Date'])

# Calcular a diferença de dias entre as duas colunas
df['Days Difference'] = (df['Assembly Date'] - df['Accident Date']).dt.days

# DataFrame com a nova coluna
print(df[['Accident Date', 'Assembly Date', 'Days Difference']])

### Attorney/Representative

In [None]:
df['Attorney/Representative'].value_counts()

In [None]:
# Assuming df is your DataFrame and 'Attorney/Representative' is a non-numeric column
plt.figure(figsize=(12, 6))  # Setup figure size

# Create a count plot for 'Attorney/Representative'
sns.countplot(x=df['Attorney/Representative'])
plt.title('Count of Attorney/Representative')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

### Average Weekly Wage

In [None]:
(df['Average Weekly Wage']).value_counts()

In [None]:
# Assuming df is your DataFrame and 'Average Weekly Wage' is a numeric column
plt.figure(figsize=(12, 6))  # Setup figure size

# Create a boxplot for 'Average Weekly Wage'
sns.boxplot(x=df['Average Weekly Wage'])
plt.title('Boxplot of Average Weekly Wage')
plt.ylabel('Values')

plt.tight_layout()
plt.show()

In [None]:
# Assuming df is your DataFrame and 'Average Weekly Wage' is the column of interest
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed

# Create a histogram for the 'Average Weekly Wage'
sns.histplot(df['Average Weekly Wage'].dropna(), bins=20, kde=False, color='blue')
plt.title('Histogram of Average Weekly Wage')
plt.xlabel('Average Weekly Wage')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Calculate value counts including NaN
value_counts = df['Average Weekly Wage'].value_counts(dropna=False)

# Total number of entries
total_entries = df['Average Weekly Wage'].size

# Calculate the percentage of 0.0 values
percent_zero = (value_counts.get(0.0, 0) / total_entries) * 100

# Calculate the percentage of NaN values using np.isnan and sum
percent_nan = (df['Average Weekly Wage'].isna().sum() / total_entries) * 100

# Calculate the percentage of all other values
percent_others = 100 - (percent_zero + percent_nan)

# Print the results
print('Average Weekly Wage Percentages:')
print(f"0.0 values: {percent_zero:.2f}%")
print(f"NaN values: {percent_nan:.2f}%")
print(f"All other values: {percent_others:.2f}%")

### Birth Year

In [None]:
(df['Average Weekly Wage']).value_counts()

In [None]:
(df['Birth Year']==0.0).sum()

In [None]:
plt.figure(figsize=(12, 8))
sns.stripplot(x=df['Birth Year'], jitter=0.1, size=5, color='purple', alpha=0.6)
plt.title('Distribution of Birth Years')
plt.xlabel('Birth Year')
plt.grid(True)
plt.show()

### Carrier Name

In [None]:
carrier_counts = df['Carrier Name'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Top 20 Carrier Names by Frequency')
plt.xlabel('Carrier Name')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
(df['Carrier Name'] == 'STATE INSURANCE FUND').value_counts(normalize=True) * 100

### Claim Injury Type

In [None]:
carrier_counts = df['Claim Injury Type'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Claim Injury Type Count')
plt.xlabel('Claim Injury Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### County of Injury

In [None]:
carrier_counts = df['County of Injury'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(30).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Top 30 County of Injury')
plt.xlabel('County of Injury')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### COVID-19 Indicator

In [None]:
carrier_counts = df['COVID-19 Indicator'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('COVID-19 Indicator by Frequency')
plt.xlabel('COVID-19 Indicator')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### District Name

In [None]:
carrier_counts = df['District Name'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('District Name by Frequency')
plt.xlabel('District Name')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
values= df['District Name'].dropna().value_counts()

plt.pie(values, labels=values.index.astype(str), autopct='%1.1f%%')
plt.title('Distribution of District Name')
plt.show()

### Gender

In [None]:
carrier_counts = df['Gender'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(20).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Gender by Frequency')
plt.xlabel('Gender')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
df['Gender'].value_counts()

### IME-4 Count Distribution

In [None]:

# Assuming 'carrier_counts' contains the value counts of the 'IME-4 Count' column
top_10 = carrier_counts.head(10)
other = carrier_counts.iloc[10:].sum()  # Sum the counts beyond the top 10

# Create a new series that includes 'Other' using pd.concat
pie_data = pd.concat([top_10, pd.Series([other], index=['Other'])])

# Create a pie chart
plt.figure(figsize=(10, 8))
pie_data.plot(kind='pie', autopct='%1.1f%%', colors=['#ff9999','#66b3ff','#99ff99','#ffcc99','#c2c2f0','#ffb3e6', '#c4e17f', '#76d7c4', '#f7c6c7', '#f7b7a3', '#d4e157'])
plt.title('IME-4 Count Distribution including Other')
plt.ylabel('')  # Pie chart does not require a y-label
plt.show()

### Industry Code

In [None]:
carrier_counts = df['Industry Code'].value_counts()

plt.figure(figsize=(14, 8))  # Adjust the figure size as necessary
carrier_counts.head(24).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Industry Code by Frequency')
plt.xlabel('Industry Code')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

In [None]:
df['Industry Code'].value_counts()

### Industry Code Description

In [None]:
carrier_counts = df['Industry Code Description'].value_counts()

plt.figure(figsize=(14, 10))  # Adjust the figure size as necessary
carrier_counts.head(24).plot(kind='bar', color='skyblue')  # Show top 20 carriers for readability
plt.title('Industry Code Description by Frequency')
plt.xlabel('Industry Code Description')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better readability
plt.grid(True, axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for better visual comparison
plt.show()

### Number of Dependents

In [None]:
# Count the occurrences of each number of dependents
dependent_counts = df['Number of Dependents'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
sns.pointplot(x=dependent_counts.index, y=dependent_counts.values)
plt.title('Dot Plot of Number of Dependents')
plt.xlabel('Number of Dependents')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Drop NA values and count occurrences of each number of dependents
value_counts = df['Number of Dependents'].dropna().value_counts()

# Create a pie chart
plt.pie(value_counts, labels=value_counts.index.astype(str), autopct='%1.1f%%')
plt.title('Distribution of Number of Dependents')
plt.show()

### WCIO Nature of Injury Description

In [None]:
# Filtrar os dados para remover valores nulos na coluna de lesões
filtered_data = df.dropna(subset=['WCIO Nature of Injury Description'])

# Contar as 5 lesões mais comuns
top_injuries = filtered_data['WCIO Nature of Injury Description'].value_counts().head(5)

# Exibir as 5 lesões mais comuns
print(top_injuries)

In [None]:
# Substituir valores de 'SPRAIN OR TEAR' por 'STRAIN OR TEAR' (ou vice-versa, dependendo da sua escolha)
filtered_data['WCIO Nature of Injury Description'] = filtered_data['WCIO Nature of Injury Description'].replace('SPRAIN OR TEAR', 'STRAIN OR TEAR')

# Contar novamente as 5 lesões mais comuns após a unificação
top_injuries_unified = filtered_data['WCIO Nature of Injury Description'].value_counts().head(5)

# Exibir os resultados
print(top_injuries_unified)

### Relation between WCIO Nature of Injury Description and Industry Code Description

In [None]:
# Definir as 5 lesões mais comuns após a unificação
common_injuries = ['STRAIN OR TEAR']

# Filtrar o dataset para conter apenas essas lesões
filtered_data = filtered_data[filtered_data['WCIO Nature of Injury Description'].isin(common_injuries)]

# Gráfico de barras para visualizar a relação entre lesões e tipo de trabalho
plt.figure(figsize=(12, 8))
sns.countplot(x='Industry Code Description', hue='WCIO Nature of Injury Description', data=filtered_data)
plt.title('Relation between WCIO Nature of Injury Description and Industry Code Description')
plt.xticks(rotation=45)
plt.legend(title='Tipo de Lesão', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


# Gráfico de boxplot para analisar a idade dos trabalhadores por tipo de lesão
plt.figure(figsize=(10, 6))
sns.boxplot(x='WCIO Nature of Injury Description', y='Age at Injury', data=filtered_data)
plt.title('Relation between WCIO Nature of Injury Description and Industry Code Description')
plt.xticks(rotation=45)
plt.show()


In [None]:
(df['Alternative Dispute Resolution']).value_counts()

### Relation between Attorney/Representative and Claim Injury Type

In [None]:
# Filtrar os dados onde 'Alternative Dispute Resolution' é 'Y'
adr_yes = df[df['Alternative Dispute Resolution'] == 'Y']

# Contar os valores únicos de 'Claim Injury Type' para os casos com ADR 'Y'
claim_injury_type_counts_adr_yes = adr_yes['Claim Injury Type'].value_counts()

# Exibir os resultados
print("Claim Injury Type when 'Alternative Dispute Resolution' == 'Y'")
print(claim_injury_type_counts_adr_yes)

# Calcular a percentagem de cada 'Claim Injury Type' quando 'Alternative Dispute Resolution' é 'Y'
claim_injury_type_percentage_adr_yes = (claim_injury_type_counts_adr_yes / claim_injury_type_counts_adr_yes.sum()) * 100

# Exibir as percentagens
print(claim_injury_type_percentage_adr_yes)

In [None]:
# Criar gráfico para visualizar a relação entre 'Alternative Dispute Resolution' e 'Claim Injury Type'
plt.figure(figsize=(12, 6))
sns.countplot(x='Attorney/Representative', hue='Claim Injury Type', data=df)
plt.title('Relation between Attorney/Representative and Claim Injury Type')
plt.xlabel('Attorney/Representative')
plt.ylabel('Contagem')
plt.legend(title='Attorney/Representative', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Criar uma tabela cruzada (crosstab) para contar a frequência de 'Claim Injury Type' em função de 'Attorney/Representative'
claim_injury_type_by_attorney = pd.crosstab(df['Claim Injury Type'], df['Attorney/Representative'])

# Calcular a percentagem de cada valor no total para advogado e não advogado
claim_injury_type_percentage = claim_injury_type_by_attorney.apply(lambda x: x / x.sum() * 100, axis=1)

# Concatenar os valores absolutos com as percentagens
claim_injury_type_with_percentage = pd.concat([claim_injury_type_by_attorney, claim_injury_type_percentage], axis=1, keys=['Count', 'Percentage'])

# Exibir o resultado
print(claim_injury_type_with_percentage)

### Average Weekly Wage by Attorney/Representative

In [None]:
# Assuming df is your DataFrame
# Calculate the mean 'Average Weekly Wage' for each 'Attorney/Representative' category
mean_wage_by_lawyer = df.groupby('Attorney/Representative')['Average Weekly Wage'].mean().reset_index()

# Create a bar plot
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(x='Attorney/Representative', y='Average Weekly Wage', data=mean_wage_by_lawyer)

# Add title and labels
plt.title('Average Weekly Wage by Attorney/Representative')
plt.xlabel('Attorney/Representative (Y/N)')
plt.ylabel('Average Weekly Wage')

# Annotate the bar plot with the actual mean values
for index, row in mean_wage_by_lawyer.iterrows():
    bar_plot.text(index, row['Average Weekly Wage'], f"{row['Average Weekly Wage']:.2f}", 
                  color='black', ha="center", va="bottom")

# Show the plot
plt.show()

### Relation between Carrier Type and Claim Injury Type

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x='Carrier Type', hue='Claim Injury Type', data=df)
plt.title('Relation between Carrier Type and Claim Injury Type')
plt.xlabel('Carrier Type')
plt.ylabel('Contagem')
plt.xticks(rotation=45)
plt.legend(title='Claim Injury Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x='Carrier Type', hue='Attorney/Representative', data=df)
plt.title('Relation between Carrier Type and Claim Injury Type')
plt.xlabel('Carrier Type')
plt.ylabel('Contagem')
plt.xticks(rotation=45)
plt.legend(title='Attorney/Representative', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Criar uma tabela cruzada (crosstab) para contar a frequência de 'Carrier Type' em função de 'Attorney/Representative'
carrier_type_with_attorney_counts = pd.crosstab(df['Carrier Type'], df['Attorney/Representative'])

# Exibir os resultados
print(carrier_type_with_attorney_counts)

### Relation between COVID-19 Indicator and Claim Injury Type

In [None]:
covid_yes = df[df['COVID-19 Indicator'] == 'Y']

plt.figure(figsize=(12, 6))
sns.countplot(x='COVID-19 Indicator', hue='Claim Injury Type', data=covid_yes)
plt.title('Relation between COVID-19 Indicator and Claim Injury Type')
plt.xlabel('COVID-19 Indicator')
plt.ylabel('Contagem')
plt.legend(title='Claim Injury Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Filtrar os dados onde 'COVID-19 Indicator' é 'Y'
covid_yes = df[df['COVID-19 Indicator'] == 'Y']

# Contar o número de ocorrências de 'Claim Injury Type' para os casos com 'COVID-19 Indicator' = 'Y'
covid_yes_claim_injury_counts = covid_yes['Claim Injury Type'].value_counts()

# Calcular a percentagem de cada tipo de lesão
total_claims = covid_yes_claim_injury_counts.sum()
covid_yes_claim_injury_percentage = (covid_yes_claim_injury_counts / total_claims) * 100

# Calcular a percentagem acumulada
covid_yes_claim_injury_cumulative_percentage = covid_yes_claim_injury_percentage.cumsum()

# Criar um DataFrame com as contagens, percentagens e percentagem acumulada
covid_yes_claim_injury_df = pd.DataFrame({
    'Count': covid_yes_claim_injury_counts,
    'Percentage': covid_yes_claim_injury_percentage,
    'Cumulative Percentage': covid_yes_claim_injury_cumulative_percentage
})

# Exibir o resultado
print("Claim Injury Type para casos onde COVID-19 Indicator = 'Y'")
print(covid_yes_claim_injury_df)



### Relation betweenAge at Injury and Mean IME-4 Count

In [None]:
# Create a line plot
mean_ime_by_age = df.groupby('Age at Injury')['IME-4 Count'].mean().reset_index()

plt.figure(figsize=(14, 8))
sns.lineplot(x='Age at Injury', y='IME-4 Count', data=mean_ime_by_age, marker='o')

# Add title and labels
plt.title('Mean IME-4 Count by Age at Injury')
plt.xlabel('Age at Injury')
plt.ylabel('Mean IME-4 Count')

# Show the plot
plt.show()

### Relation between Average Weekly Wage and Attorney/Representative

In [None]:
# Remove rows where 'Average Weekly Wage' is 0.0 or NaN
filtered_df = df[df['Average Weekly Wage'] > 0]

# Ensure that the column names match exactly the ones in your dataset.
# Group by 'Attorney/Representative' and calculate the mean 'Average Weekly Wage'
mean_wage_by_lawyer = filtered_df.groupby('Attorney/Representative')['Average Weekly Wage'].mean().reset_index()

print(mean_wage_by_lawyer)

plt.figure(figsize=(8, 6))
sns.barplot(x='Attorney/Representative', y='Average Weekly Wage', data=mean_wage_by_lawyer, palette='viridis')

# Add title and labels
plt.title('Average Weekly Wage by Attorney/Representative')
plt.xlabel('Attorney/Representative (Y/N)')
plt.ylabel('Average Weekly Wage')

# Show the plot
plt.show()

### Relation between Mean Average Weekly Wage and Age at Injury

In [None]:
# Create a line plot
mean_wage_by_age = df.groupby('Age at Injury')['Average Weekly Wage'].mean().reset_index()

plt.figure(figsize=(14, 8))
sns.lineplot(x='Age at Injury', y='Average Weekly Wage', data=mean_wage_by_age, marker='o')

# Add title and labels
plt.title('Mean Average Weekly Wage by Age at Injury')
plt.xlabel('Age at Injury')
plt.ylabel('Average Weekly Wage')

# Show the plot
plt.show()

### Relation between Mean Days Difference and Age at Injury

In [None]:
# Create a line plot
mean_daysdif_by_age = df.groupby('Age at Injury')['Days Difference'].mean().reset_index()

plt.figure(figsize=(14, 8))
sns.lineplot(x='Age at Injury', y='Days Difference', data=mean_daysdif_by_age, marker='o')

# Add title and labels
plt.title('Mean Days Difference by Age at Injury')
plt.xlabel('Age at Injury')
plt.ylabel('Days Difference')

# Show the plot
plt.show()

### Relation between Mean Days Difference and Average Weekly Wage

In [None]:
# Create a line plot
mean_daysdif_by_wage = df.groupby('Average Weekly Wage')['Days Difference'].mean().reset_index()

plt.figure(figsize=(14, 8))
sns.lineplot(x='Average Weekly Wage', y='Days Difference', data=mean_daysdif_by_wage, marker='o')

# Add title and labels
plt.title('Mean Days Difference by Average Weekly Wage')
plt.xlabel('Average Weekly Wage')
plt.ylabel('Days Difference')

# Show the plot
plt.show()

### Relation between Mean Days Difference and District

In [None]:
mean_daysdif_by_district = df.groupby('District Name')['Days Difference'].mean().reset_index()

# Create a bar plot
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(x='District Name', y='Days Difference', data=mean_daysdif_by_district)

# Add title and labels
plt.title('Mean Days Difference by District')
plt.xlabel('District')
plt.ylabel('Days Difference')

# Annotate the bar plot with the actual mean values
for index, row in mean_daysdif_by_district.iterrows():
    bar_plot.text(index, row['Days Difference'], f"{row['Days Difference']:.2f}", 
                  color='black', ha="center", va="bottom")

# Show the plot
plt.show()

### Proportion between Atorney/Representative within District Name

In [None]:
cross_tab = pd.crosstab(df['District Name'], df['Attorney/Representative'], normalize='index')

# Plotar o gráfico de barras empilhadas com proporções
cross_tab.plot(kind='bar', stacked=True, figsize=(8, 6), color=['skyblue', 'salmon'])

# Adicionar rótulos e título
plt.title('Proportion of Attorney/Representative within District Name')
plt.xlabel('District Name')
plt.ylabel('Proportion')
plt.legend(title='Attorney/Representative', loc='upper right')
plt.show()

### Mean Days Difference by Attorney/Representative

In [None]:
mean_daysdif_by_lawyer = df.groupby('Attorney/Representative')['Days Difference'].mean().reset_index()

# Create a bar plot
plt.figure(figsize=(8, 6))
bar_plot = sns.barplot(x='Attorney/Representative', y='Days Difference', data=mean_daysdif_by_lawyer)

# Add title and labels
plt.title('Mean Days Difference by Attorney/Representative')
plt.xlabel('Attorney/Representative (Y/N)')
plt.ylabel('Days Difference')

# Annotate the bar plot with the actual mean values
for index, row in mean_daysdif_by_lawyer.iterrows():
    bar_plot.text(index, row['Days Difference'], f"{row['Days Difference']:.2f}", 
                  color='black', ha="center", va="bottom")

# Show the plot
plt.show()