In [None]:
Get consistent column names based on the field definitions page
Pass a dict to DataFrame.rename()
Use all capitals for consistency with CDR
Concatenate into a master dataframe
Move the functions into a module
Process the data into X and y. Use ChatGPT for guidance.

In [1]:
import pandas as pd
import numpy as np
import os
import chardet

import osha

In [2]:
# Location of the raw OSHA health-sample files (relative path, resolved
# against the notebook's working directory).
osha_dir = 'Input/Raw/OSHA/healthsamples'

# Alternate column spelling -> canonical column name handed to the loader.
rename_mapper = {
    'EIGHT_HR_TWA_CALC': 'EIGHT_HOUR_TWA_CALC',
    'BLANK_SAMPLE': 'BLANK_USED'
}

# Might not need all these columns
# NOTE(review): the keys below use the pre-rename names ('EIGHT_HR_TWA_CALC',
# 'BLANK_SAMPLE'), and this mapping is not passed to the
# osha.load_osha_data(osha_dir, rename_mapper) call -- confirm where it is
# meant to be applied (before or after renaming).
dtype_for_column = {
    # Identifiers are kept as strings: the 2022 rows carry alphanumeric IDs
    # (e.g. 'MI20221101', 'MI2022110101') that cannot be cast to Int64.
    'INSPECTION_NUMBER': 'string',
    'ESTABLISHMENT_NAME': 'string',
    'CITY': 'string',
    'STATE': 'string',
    # NOTE(review): integer ZIP codes lose any leading zero -- consider 'string'.
    'ZIP_CODE': 'Int64',
    'SIC_CODE': 'Int64',
    'NAICS_CODE': 'Int64',
    'SAMPLING_NUMBER': 'string',  # alphanumeric in recent data, see above
    'OFFICE_ID': 'Int64',
    'DATE_SAMPLED': 'string',  # Will parse as date later
    'DATE_REPORTED': 'string',  # Will parse as date later
    'EIGHT_HR_TWA_CALC': 'string',
    'LAB_NUMBER': 'string',
    'FIELD_NUMBER': 'string',
    'SAMPLE_TYPE': 'string',
    'BLANK_SAMPLE': 'string',
    'TIME_SAMPLED': 'Int64',
    'AIR_VOLUME_SAMPLED': 'string',
    'SAMPLE_WEIGHT': 'float64',
    'IMIS_SUBSTANCE_CODE': 'string',
    'SUBSTANCE': 'string',
    'SAMPLE_RESULT': 'float64',
    'UNIT_OF_MEASUREMENT': 'string',
    'QUALIFIER': 'string'
}

In [5]:
# Build the combined frame via the project-local `osha` helper; judging by the
# cell output it reports progress once per year while loading.
osha_data = osha.load_osha_data(
    osha_dir,
    rename_mapper,
)

# Bare last expression -> rich (truncated) display of the frame.
osha_data

Loading 1984 data...
Loading 1985 data...
Loading 1986 data...
Loading 1987 data...
Loading 1988 data...
Loading 1989 data...
Loading 1990 data...
Loading 1991 data...
Loading 1992 data...
Loading 1993 data...
Loading 1994 data...
Loading 1995 data...
Loading 1996 data...
Loading 1997 data...
Loading 1998 data...
Loading 1999 data...
Loading 2000 data...
Loading 2001 data...
Loading 2002 data...
Loading 2003 data...
Loading 2004 data...
Loading 2005 data...
Loading 2006 data...
Loading 2007 data...
Loading 2008 data...
Loading 2009 data...
Loading 2010 data...
Loading 2011 data...
Loading 2012 data...
Loading 2013 data...
Loading 2014 data...
Loading 2015 data...
Loading 2016 data...
Loading 2017 data...
Loading 2018 data...
Loading 2019 data...
Loading 2020 data...
Loading 2021 data...
Loading 2022 data...


Unnamed: 0,YEAR,INSPECTION_NUMBER,ESTABLISHMENT_NAME,CITY,STATE,ZIP_CODE,SIC_CODE,NAICS_CODE,SAMPLING_NUMBER,OFFICE_ID,...,BLANK_USED,TIME_SAMPLED,AIR_VOLUME_SAMPLED,IMIS_SUBSTANCE_CODE,SUBSTANCE,SAMPLE_RESULT,UNIT_OF_MEASUREMENT,QUALIFIER,SAMPLE_WEIGHT,EIGHT_HOUR_TWA_CALC
0,1984,111211,CONTROLLED CASTINGS CORP,Plainview,NY,11803.0,3365.0,0.0,5245543.0,214700.0,...,N,258.0,516.0000,T100,Thorium,0.0005,M,,,
1,1984,111211,CONTROLLED CASTINGS CORP,Plainview,NY,11803.0,3365.0,0.0,5245543.0,214700.0,...,Y,,,T100,Thorium,0.0000,,BLK,,
2,1984,111211,CONTROLLED CASTINGS CORP,Plainview,NY,11803.0,3365.0,0.0,5245501.0,214700.0,...,N,38.0,76.0000,T100,Thorium,0.0160,M,,,
3,1984,111211,CONTROLLED CASTINGS CORP,Plainview,NY,11803.0,3365.0,0.0,5245501.0,214700.0,...,Y,,,T100,Thorium,0.0000,,BLK,,
4,1984,111252,MID ISLAND NON FERROUS FOUNDRY,East Farmingdale,NY,11735.0,3365.0,0.0,5245345.0,214700.0,...,N,37.0,74.0000,1591,"Lead, Inorganic (as Pb)",0.0640,M,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33174,2022,MI20221101,"Ittner Bean and Grain, Inc",,,,,,MI2022110101,552651.0,...,N,,,E200,% Combustible Dust,75.0000,%,COMB,,N
33175,2022,MI20221101,"Ittner Bean and Grain, Inc",,,,,,MI2022110101,552651.0,...,N,,,M102,MAXIMUM NORMALIZED DP/DT,20.9700,BM/S,bm/s,,N
33176,2022,MI20221101,"Ittner Bean and Grain, Inc",,,,,,MI2022110101,552651.0,...,N,,,M104,Moisture Content,10.0000,%,MOIS,,N
33177,2022,TB20220524,American Surfacing Materials,,,,,,TB2022052401,830800.0,...,N,,,M124,QUALITATIVE MASS-SPEC ANALYSIS BY THERMAL DESO...,0.0000,N,,,N


In [7]:
# Column labels of the combined frame, as a plain Python list.
osha_data.columns.tolist()

['YEAR',
 'INSPECTION_NUMBER',
 'ESTABLISHMENT_NAME',
 'CITY',
 'STATE',
 'ZIP_CODE',
 'SIC_CODE',
 'NAICS_CODE',
 'SAMPLING_NUMBER',
 'OFFICE_ID',
 'DATE_SAMPLED',
 'DATE_REPORTED',
 'INSTRUMENT_TYPE',
 'LAB_NUMBER',
 'FIELD_NUMBER',
 'SAMPLE_TYPE',
 'BLANK_USED',
 'TIME_SAMPLED',
 'AIR_VOLUME_SAMPLED',
 'IMIS_SUBSTANCE_CODE',
 'SUBSTANCE',
 'SAMPLE_RESULT',
 'UNIT_OF_MEASUREMENT',
 'QUALIFIER',
 'SAMPLE_WEIGHT',
 'EIGHT_HOUR_TWA_CALC']

In [9]:
# Number of distinct (non-missing) inspection numbers.
osha_data.loc[:, 'INSPECTION_NUMBER'].nunique()

99558

In [11]:
# Number of distinct (non-missing) sampling numbers.
osha_data.loc[:, 'SAMPLING_NUMBER'].nunique()

300214

In [19]:
# Distinct sample-type codes, missing values included.
osha_data.loc[:, 'SAMPLE_TYPE'].unique()

array(['P', 'A', 'BU', 'W', None, 'BL', 'B', 'S', 'WB', 'L', 'U', 'M',
       'Z', 'N', 'Y', nan], dtype=object)

In [13]:
# Distinct values of the blank-sample flag, missing values included.
osha_data.loc[:, 'BLANK_USED'].unique()

array(['N', 'Y', nan], dtype=object)

In [20]:
# Distinct unit codes. The output includes stray digits and punctuation, so
# this column will need cleaning before it can be used as a feature.
osha_data.loc[:, 'UNIT_OF_MEASUREMENT'].unique()

array(['M', None, 'P', '%', 'D', 'F', 'Y', 'X', 'B', 'L', 'C', 'N', 'W',
       "'", '0', ')', 'T', 'V', '\\', 'U', '4', 'H', '[', 'O', ']', '5',
       '.', '+', 'S', ',', 'm', '@', '&', 'K', 'p', 'G', 'E', '2', 'R',
       '<', '*', '$', 'x', 'J', 'f', '~', '>', 'y', 'u', '^', 'c', 'BM/S',
       'AAAAA', 'mcg/m3', nan], dtype=object)

In [21]:
# Distinct qualifier strings. The output shows many near-duplicate spellings
# (e.g. 'BLK', 'BLNK', 'blk'), so expect to normalise them later.
osha_data.loc[:, 'QUALIFIER'].unique()

array([None, 'BLK', 'ND', 'SER', '<', 'BLAN', 'DET', 'N', '.', 'BULK',
       'BLNK', nan, 'DL=3', 'BADG', 'WIPE', 'WBLN', '8', 'ND,D', 'Y',
       'BL', '4', 'N.D.', '<QL', 'BUIA', 'DL=6', '@', '=<', '<QUA',
       'DL15', '<=', 'ND0.', '14', '71', 'LT', '.019', 'L', 'BLA', 'AIR',
       'DL', 'W', 'WBLK', 'IMP', '1', '0000', 'NND', 'WIP0', 'F', 'MD',
       '@=<', '17.', 'NDD', 'BUKJ', '@<', '5.4', 'BKL', '{', 'KLB',
       '0.03', 'nd', 'blk', '.ND', 'WIP', '.50', '.10', '2', '[ND', '137',
       'DL=1', 'CRIS', '<DL', '+ TI', '+ CA', '+ FE', '+ SI', '+ K',
       '+ SR', '+ RB', '+ ZN', '+ PB', '+ AL', '+ P', '+ CR', '? CU',
       '+ ZR', '? NI', '- ZN', '? MO', '+ MO', '+ CU', '? SR', '- PB',
       '- CU', '? ZR', '0', 'BUL;', 'BLK,', 'BL;K', 'BLKK', 'MG', '1ND',
       '43', '81', 'BLOO', '.14', 'BLK\\', '3', 'DL10', 'ND1', 'UIA',
       '0.23', 'NDDL', ']BLK', '@<=', '%', '0.02', 'D', 'R', 'TCD',
       'NONE', 'ND D', 'NOTE', '11.3', '.19', 'LIA', 'NLK', 'B;LL', '<@',
       