You can read about the dataset and download it [here](https://www.osha.gov/Establishment-Specific-Injury-and-Illness-Data).

Let's import standard packages:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Temporarily switch the working directory two levels up while importing the
# project-local modules (presumably they live there and are resolved relative
# to the current directory - confirm against the repository layout).
# The try/finally guarantees the original directory is restored even when one
# of the imports fails, so later relative file paths keep working.
old_pwd = os.getcwd()
os.chdir('../..')
try:
    from constants import INSPECTION_DETAILS_FOLDER_NAME, MAPPING_FILES_FOLDER_NAME, ALL, STATE_NAMES, TWO_DIGIT_NAICS
    from helpers import get_naics_sector_numbers_by_names
    from scrapping_inspection_details import parse_inspection_file, Inspection, get_inspection_details_list
finally:
    os.chdir(old_pwd)

Below is a dictionary that maps each file name to the encoding used when reading that file. The file names for most years are commented out because we're only interested in the two latest years (2020 and 2021).

In [None]:
# Archive file name -> text encoding to use when reading that archive.
# The 2016-2019 archives are listed for reference but disabled; only the
# uncommented pairs end up in the mapping.
FILE_NAMES_ENCODING = dict((
    # ('ITA Data CY 2016.zip', 'cp1252'),
    # ('ITA Data CY 2017.zip', 'cp1252'),
    # ('ITA Data CY 2018.zip', 'cp1252'),
    # ('ITA Data CY 2019.zip', 'utf-8'),
    ('ITA Data CY 2020.zip', 'utf-8'),
    ('ITA Data CY 2021 submitted thru 8-29-2022.zip', 'utf-8'),
))
# Human-readable ownership label -> numeric code. The codes are consecutive
# integers starting at 1, in the order the labels are listed here.
OWNERSHIP_MAP = {
    label: code
    for code, label in enumerate(
        (
            'Not a government entity',
            'State Government entity',
            'Local Government entity',
        ),
        start=1,
    )
}

In [None]:
# Read every enabled archive with its own encoding and stack the resulting
# tables into one frame with a fresh consecutive index.
df = pd.concat(
    [
        pd.read_csv(archive, encoding=codec, low_memory=False)
        for archive, codec in FILE_NAMES_ENCODING.items()
    ],
    ignore_index=True,
)

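Let's drop the columns we don't need, remove rows where **year_filing_for** is empty, filter out rows with implausible values (a million or more employees, or negative hours or day counts), and reindex the dataframe: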

In [None]:
# Columns removed before the analysis.
_DROPPED_COLUMNS = [
    'id',  # Unique number for each record
    'street_address',
    'zip_code',
    'no_injuries_illnesses',  # Whether the establishment had any OSHA recordable work-related injuries or illnesses during the year
    'total_other_cases',
    'total_skin_disorders',
    'total_poisonings',
    'total_respiratory_conditions',
    'total_hearing_loss',
    'total_other_illnesses',
    'created_timestamp',  # The date and time a record was submitted to the ITA
    'change_reason',  # The reason why an establishment's injury and illness summary was changed, if applicable
]
df = df.drop(_DROPPED_COLUMNS, axis='columns')

# The filters are applied one after another, in this order: first rows without
# a filing year, then rows reporting a million or more employees, then rows
# with a negative value in any of the hour/day counters.
df = df.loc[df['year_filing_for'].notna()]
df = df.loc[df['annual_average_employees'] < 1000000]
for _counter in ('total_hours_worked', 'total_dafw_days', 'total_djtr_days'):
    df = df.loc[df[_counter] >= 0]

# Renumber the surviving rows 0..n-1 and show the result.
df.index = list(range(len(df)))
df

In [None]:
def my_plot(selected_states, naics, year, ownership_type, employee_number_range, days_of_job_transfer_or_restriction, total_hours_worked_range, days_away_from_work):
    """Scatter total injuries against headcount for the filtered records.

    The module-level ``df`` is narrowed by every filter, summed per
    (company_name, year_filing_for) pair, and drawn as a scatter plot. A
    least-squares straight line is added when at least two points remain.

    Args:
        selected_states: Selected values of the 'state' column; ``ALL``
            disables this filter.
        naics: Selected NAICS sector names; ``ALL`` disables this filter.
        year: Selected 'year_filing_for' values; ``ALL`` disables this filter.
        ownership_type: Selected ``OWNERSHIP_MAP`` labels; ``ALL`` disables
            this filter.
        employee_number_range: (low, high) inclusive bounds for
            'annual_average_employees'.
        days_of_job_transfer_or_restriction: (low, high) inclusive bounds for
            'total_djtr_days'.
        total_hours_worked_range: (low, high) inclusive bounds for
            'total_hours_worked'.
        days_away_from_work: (low, high) inclusive bounds for
            'total_dafw_days'.
    """
    t = df if ALL in selected_states else df[df['state'].isin(selected_states)]
    t = t if ALL in naics else t[t['naics_code'].astype(str).str.startswith(tuple(get_naics_sector_numbers_by_names(naics)))]
    t = t if ALL in year else t[t['year_filing_for'].isin(year)]
    t = t if ALL in ownership_type else t[t['establishment_type'].isin(tuple(map(lambda x: OWNERSHIP_MAP[x], ownership_type)))]
    t = t[t['annual_average_employees'].between(employee_number_range[0], employee_number_range[1])]
    t = t[t['total_hours_worked'].between(total_hours_worked_range[0], total_hours_worked_range[1])]
    t = t[t['total_dafw_days'].between(days_away_from_work[0], days_away_from_work[1])]
    input_df = t[t['total_djtr_days'].between(days_of_job_transfer_or_restriction[0], days_of_job_transfer_or_restriction[1])]

    # Select the columns with a list: pandas 2.x rejects subsetting a GroupBy
    # with a bare tuple of names. Only the two plotted columns are aggregated.
    grouped_df = (
        input_df
        .groupby(['company_name', 'year_filing_for'])[['annual_average_employees', 'total_injuries']]
        .sum()
    )
    employees = grouped_df['annual_average_employees']
    injuries = grouped_df['total_injuries']

    plt.scatter(employees, injuries)
    # NOTE(review): the title hard-codes the years and the x label says
    # "millions" although the plotted values are unscaled sums - confirm
    # that both texts are still what is wanted.
    plt.title("Linear regression between 'Number of Employees' vs 'Number of Injuries' for 2020 and 2021")
    plt.xlabel('Annual Average Number of Employees, millions')
    plt.ylabel('Total number of injuries')

    # A straight-line fit needs at least two points; with a narrower selection
    # np.polyfit (and the max below) would raise, so only the scatter is shown.
    if len(grouped_df) >= 2:
        slope, intercept = np.polyfit(employees, injuries, deg=1)
        xseq = np.linspace(0, employees.max(), num=1000)
        plt.plot(xseq, intercept + slope * xseq, color="k", lw=2.5, label='y={:.2f}x+{:.2f}'.format(slope, intercept))
        plt.legend(fontsize=9)

    plt.grid()
    plt.tight_layout()
    plt.show()

In [None]:
# Independent working copy of the cleaned frame. ``copy()`` is deep by
# default, and a ``global`` declaration has no effect at module level, so a
# plain assignment is all that is needed.
t = df.copy()
# Multi-select filters. Every option list starts with the ALL sentinel, which
# is also the initial selection.
state = widgets.SelectMultiple(
    description='State',
    options=[ALL, *STATE_NAMES.keys()],
    value=[ALL],
    rows=10,
)

# Distinct filing years present in the data, in order of first appearance.
filing_years = list(df['year_filing_for'].unique())
year = widgets.SelectMultiple(
    description='Year',
    options=[ALL, *filing_years],
    value=[ALL],
    rows=len(filing_years) + 1,
)

ownership_labels = list(OWNERSHIP_MAP)
ownership_type = widgets.SelectMultiple(
    description='Ownership',
    options=[ALL, *ownership_labels],
    value=[ALL],
    rows=len(ownership_labels) + 1,
)

# NOTE(review): this widget is configured exactly like ``year`` (same options,
# same 'Year' label) and is not wired up or displayed in the visible code -
# confirm whether it is still needed.
establishment_type = widgets.SelectMultiple(
    description='Year',
    options=[ALL, *filing_years],
    value=[ALL],
    rows=len(filing_years) + 1,
)

naics = widgets.SelectMultiple(
    description='NAICS sector',
    options=[ALL, *set(TWO_DIGIT_NAICS.values())],
    value=[ALL],
    rows=10,
    disabled=False,
)
def _range_slider(column, description):
    """Build an integer range slider covering the observed range of ``df[column]``.

    The slider starts with the full range selected, moves in steps of 1, shows
    its current values, and only reports a change once the handle is released.
    """
    lowest = df[column].min()
    highest = df[column].max()
    return widgets.IntRangeSlider(
        value=[lowest, highest],
        min=lowest,
        max=highest,
        step=1,
        description=description,
        disabled=False,
        continuous_update=False,
        readout=True,
    )


annual_average_employees = _range_slider(
    'annual_average_employees', 'Annual Average Number of Employees:')
total_hours_worked_range = _range_slider(
    'total_hours_worked', 'Total hours worked by all employees:')
days_away_from_work = _range_slider(
    'total_dafw_days', 'Days away from work:')
# total_djtr_days: total number of days of job transfer or restriction
days_of_job_transfer_or_restriction = _range_slider(
    'total_djtr_days', 'Days of job transfer or restriction:')
# Enabled, unstyled button with a check-mark icon, labelled for CSV export.
save_as_csv_button = widgets.Button(
    icon='check',
    description='Save as CSV file',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

def _filter_records(selected_states, naics, year, ownership_type, employee_number_range, days_of_job_transfer_or_restriction, total_hours_worked_range, days_away_from_work):
    """Return the rows of the module-level ``df`` that satisfy every filter.

    For the four categorical selections the presence of ``ALL`` disables the
    filter; the four range arguments are (low, high) pairs with inclusive
    bounds.
    """
    t = df if ALL in selected_states else df[df['state'].isin(selected_states)]
    t = t if ALL in naics else t[t['naics_code'].astype(str).str.startswith(tuple(get_naics_sector_numbers_by_names(naics)))]
    t = t if ALL in year else t[t['year_filing_for'].isin(year)]
    t = t if ALL in ownership_type else t[t['establishment_type'].isin(tuple(map(lambda x: OWNERSHIP_MAP[x], ownership_type)))]
    t = t[t['annual_average_employees'].between(employee_number_range[0], employee_number_range[1])]
    t = t[t['total_hours_worked'].between(total_hours_worked_range[0], total_hours_worked_range[1])]
    t = t[t['total_dafw_days'].between(days_away_from_work[0], days_away_from_work[1])]
    t = t[t['total_djtr_days'].between(days_of_job_transfer_or_restriction[0], days_of_job_transfer_or_restriction[1])]
    return t


def my_filter(selected_states, naics, year, ownership_type, employee_number_range, days_of_job_transfer_or_restriction, total_hours_worked_range, days_away_from_work):
    """Display the rows of ``df`` that match the given filter values.

    Args:
        selected_states: Selected 'state' values; ``ALL`` disables the filter.
        naics: Selected NAICS sector names; ``ALL`` disables the filter.
        year: Selected 'year_filing_for' values; ``ALL`` disables the filter.
        ownership_type: Selected ``OWNERSHIP_MAP`` labels; ``ALL`` disables
            the filter.
        employee_number_range: Inclusive (low, high) bounds for
            'annual_average_employees'.
        days_of_job_transfer_or_restriction: Inclusive (low, high) bounds for
            'total_djtr_days'.
        total_hours_worked_range: Inclusive (low, high) bounds for
            'total_hours_worked'.
        days_away_from_work: Inclusive (low, high) bounds for
            'total_dafw_days'.
    """
    display(_filter_records(
        selected_states,
        naics,
        year,
        ownership_type,
        employee_number_range,
        days_of_job_transfer_or_restriction,
        total_hours_worked_range,
        days_away_from_work,
    ))


def save_as_csv(some):
    """Write the currently selected rows to ``<YYYY-MM-DD>.csv``.

    The rows are recomputed from the current values of the filter widgets.
    The previous implementation called the non-existent ``pd.from_csv`` and
    then ``.to_csv`` on the plain-text rendering stored in the output widget,
    which is a string rather than a DataFrame, so it could never succeed.

    Args:
        some: The object passed by the button click handler; unused.
    """
    selection = _filter_records(
        selected_states=state.value,
        naics=naics.value,
        year=year.value,
        ownership_type=ownership_type.value,
        employee_number_range=annual_average_employees.value,
        days_of_job_transfer_or_restriction=days_of_job_transfer_or_restriction.value,
        total_hours_worked_range=total_hours_worked_range.value,
        days_away_from_work=days_away_from_work.value,
    )
    selection.to_csv(f'{pd.to_datetime("today").strftime("%Y-%m-%d")}.csv', index=False)

# The table and the scatter plot are driven by the same set of controls; the
# keys are the keyword-argument names of my_filter / my_plot.
controls = {
    'selected_states': state,
    'naics': naics,
    'year': year,
    'ownership_type': ownership_type,
    'employee_number_range': annual_average_employees,
    'total_hours_worked_range': total_hours_worked_range,
    'days_away_from_work': days_away_from_work,
    'days_of_job_transfer_or_restriction': days_of_job_transfer_or_restriction,
}

table = widgets.interactive_output(my_filter, controls)
save_as_csv_button.on_click(save_as_csv, remove=False)
scatter_plot = widgets.interactive_output(my_plot, controls)

# Render the filter controls first, then the table, the export button and the plot.
for element in (
    state,
    naics,
    year,
    ownership_type,
    annual_average_employees,
    total_hours_worked_range,
    days_away_from_work,
    days_of_job_transfer_or_restriction,
    table,
    save_as_csv_button,
    scatter_plot,
):
    display(element)

In [None]:
# Per-company, per-year totals of the headcount, hours and case/day counters.
# The columns are selected with a list because pandas 2.x rejects subsetting a
# GroupBy with a bare tuple of names; ``.sum()`` replaces ``agg`` with the
# Python builtin ``sum``.
grouped_df = (
    df
    .groupby(['company_name', 'year_filing_for'])[[
        'annual_average_employees',
        'total_hours_worked',
        'total_deaths',
        'total_dafw_cases',
        'total_djtr_cases',
        'total_dafw_days',
        'total_djtr_days',
        'total_injuries',
    ]]
    .sum()
)