# Clean USIS Dataset
**Author:** Jacob Kvasnicka <br>
**Date:** September 24, 2024

## Setup

In [1]:
import pandas as pd
import random
import json

from config_management import UnifiedConfiguration
from raw_processing import cehd_cleaning, cehd_loading

In [2]:
# Clean the USIS data for consistency with CEHD
# Determine which years to focus on
# Extract NAICS and see which sectors/subsectors covered
# Build ML models for CEHD, USIS, and the union

In [2]:
cehd_file = r"C:\Users\jmank\Repositories\ht-occupational-plus\raw_processing\tests\expected_cehd.feather"
cehd = pd.read_feather(cehd_file).set_index('index')

cehd.shape

(1346556, 29)

In [3]:
usis_file = r"C:\Users\jmank\Repositories\ht-occupational-plus\input\raw\osha\usis\USIS_data.feather"

usis = pd.read_feather(usis_file)

usis.shape

(765337, 49)

In [13]:
# Filter out non-personal
usis['sample_type_name'].value_counts() / len(usis)

Personal    0.954594
Area        0.045406
Name: sample_type_name, dtype: float64

In [38]:
# Filter out non-full-shift?
usis['exposure_type_name'].value_counts() / len(usis)

Full Shift Time Weighted Average (TWA)    0.585060
Ceiling                                   0.066895
Short Term Exposure Limit                 0.024594
Peak                                      0.016337
Name: exposure_type_name, dtype: float64

In [50]:
# How to handle short-term samples?
cehd['time_sampled'].describe()

count    1.346556e+06
mean     2.124963e+02
std      1.535882e+02
min      5.000000e-02
25%      8.600000e+01
50%      1.840000e+02
75%      3.300000e+02
max      8.214000e+03
Name: time_sampled, dtype: float64

In [46]:
# Convert all units to mg/m3 - check Jeff's code
cehd['unit_of_measurement'].value_counts()
usis['measure_unit_id'].value_counts()

M              1042056
P               255381
F                18616
%                15760
M.from.Perc      14743
Name: unit_of_measurement, dtype: int64

In [18]:
# Extract sampled year as separate column
usis['sample_date']

0         2006-03-30
1         2006-03-30
2         2006-03-30
3         2006-05-17
4         2007-04-04
             ...    
765332    2021-07-23
765333    2021-07-23
765334    2021-07-23
765335    2021-07-23
765336    2021-07-23
Name: sample_date, Length: 765337, dtype: object