In [1]:
import polars as pl
import glob
import os

In [2]:
# Accidents: read every CSV chunk under data/accident/ and stack them into a
# single frame.

# Sorted so the chunks are always concatenated in the same, name-based order.
accident_csv_files = sorted(glob.glob('data/accident/*.csv'))

# Explicit dtype per column. Everything is read as a string except
# BUILD_STORIES and NONBUILD_HT, which are read as floats.
accident_schema = dict(
    SUMMARY_NR=pl.Utf8,
    REPORT_ID=pl.Utf8,
    EVENT_DATE=pl.Utf8,
    EVENT_TIME=pl.Utf8,
    EVENT_DESC=pl.Utf8,
    EVENT_KEYWORD=pl.Utf8,
    CONST_END_USE=pl.Utf8,
    BUILD_STORIES=pl.Float64,
    NONBUILD_HT=pl.Float64,
    PROJECT_COST=pl.Utf8,
    PROJECT_TYPE=pl.Utf8,
    SIC_LIST=pl.Utf8,
    FATALITY=pl.Utf8,
    STATE_FLAG=pl.Utf8,
    ABSTRACT_TEXT=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# EVENT_DATE arrives as text with a UTC offset; parse it and keep only the date.
# NOTE(review): confirm that dropping the time/offset part cannot shift the
# calendar day for the offsets present in the raw files.
accident_event_date = (
    pl.col('EVENT_DATE')
    .str.strptime(pl.Datetime, format='%Y-%m-%d %H:%M:%S%z')
    .dt.date()
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
accident_combined = pl.concat(
    [
        pl.read_csv(csv_path, schema_overrides=accident_schema, ignore_errors=True)
        for csv_path in accident_csv_files
    ]
).with_columns(accident_event_date)

In [3]:
# Accident injuries: read every CSV chunk under data/accident_injury/ and
# stack them into a single frame.

# Sorted so the chunks are always concatenated in the same, name-based order.
injuries_csv_files = sorted(glob.glob('data/accident_injury/*.csv'))

# Explicit dtype per column: the key/text columns stay strings, the remaining
# columns are read as floats.
injury_schema = dict(
    SUMMARY_NR=pl.Utf8,
    REL_INSP_NR=pl.Utf8,
    AGE=pl.Float64,
    SEX=pl.Utf8,
    NATURE_OF_INJ=pl.Float64,
    PART_OF_BODY=pl.Float64,
    SRC_OF_INJURY=pl.Float64,
    EVENT_TYPE=pl.Float64,
    EVN_FACTOR=pl.Float64,
    HUM_FACTOR=pl.Float64,
    OCC_CODE=pl.Float64,
    DEGREE_OF_INJ=pl.Float64,
    TASK_ASSIGNED=pl.Float64,
    HAZSUB=pl.Utf8,
    CONST_OP=pl.Float64,
    CONST_OP_CAUSE=pl.Utf8,
    FAT_CAUSE=pl.Utf8,
    FALL_DISTANCE=pl.Float64,
    FALL_HT=pl.Float64,
    INJURY_LINE_NR=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
injury_combined = pl.concat(
    [
        pl.read_csv(csv_path, schema_overrides=injury_schema, ignore_errors=True)
        for csv_path in injuries_csv_files
    ]
)

In [15]:
# Inspections: read every CSV chunk under data/inspection/, stack them into a
# single frame, and derive OPEN_YEAR and ESTAB_SIZE_FLAG.

# Sorted so the chunks are always concatenated in the same, name-based order.
inspection_csv_files = sorted(glob.glob('data/inspection/*.csv'))

# Explicit dtype per column. Everything is read as a string except
# NR_IN_ESTAB, which is read as a float.
inspection_schema = dict(
    ACTIVITY_NR=pl.Utf8,
    REPORTING_ID=pl.Utf8,
    STATE_FLAG=pl.Utf8,
    ESTAB_NAME=pl.Utf8,
    SITE_ADDRESS=pl.Utf8,
    SITE_CITY=pl.Utf8,
    SITE_STATE=pl.Utf8,
    SITE_ZIP=pl.Utf8,
    OWNER_TYPE=pl.Utf8,
    OWNER_CODE=pl.Utf8,
    ADV_NOTICE=pl.Utf8,
    SAFETY_HLTH=pl.Utf8,
    SIC_CODE=pl.Utf8,
    NAICS_CODE=pl.Utf8,
    INSP_TYPE=pl.Utf8,
    INSP_SCOPE=pl.Utf8,
    WHY_NO_INSP=pl.Utf8,
    UNION_STATUS=pl.Utf8,
    SAFETY_MANUF=pl.Utf8,
    SAFETY_CONST=pl.Utf8,
    SAFETY_MARIT=pl.Utf8,
    HEALTH_MANUF=pl.Utf8,
    HEALTH_CONST=pl.Utf8,
    HEALTH_MARIT=pl.Utf8,
    MIGRANT=pl.Utf8,
    MAIL_STREET=pl.Utf8,
    MAIL_CITY=pl.Utf8,
    MAIL_STATE=pl.Utf8,
    MAIL_ZIP=pl.Utf8,
    HOST_EST_KEY=pl.Utf8,
    NR_IN_ESTAB=pl.Float64,
    OPEN_DATE=pl.Utf8,
    CASE_MOD_DATE=pl.Utf8,
    CLOSE_CONF_DATE=pl.Utf8,
    CLOSE_CASE_DATE=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# OPEN_DATE arrives as text with a UTC offset; parse it and keep only the date.
inspection_open_date = (
    pl.col('OPEN_DATE')
    .str.strptime(pl.Datetime, format='%Y-%m-%d %H:%M:%S%z')
    .dt.date()
)

# Calendar year of the (already parsed) OPEN_DATE.
inspection_open_year = pl.col("OPEN_DATE").dt.year().alias("OPEN_YEAR")

# Bucket establishments by NR_IN_ESTAB: <=25, <=100, <=250, >250.
# Branches are tested in order, so each bucket excludes the ones above it.
# Rows matching no branch (e.g. a null NR_IN_ESTAB) get a null flag.
nr_in_estab = pl.col("NR_IN_ESTAB")
inspection_size_flag = (
    pl.when(nr_in_estab <= 25).then(pl.lit("Very Small"))
    .when(nr_in_estab <= 100).then(pl.lit("Small-Mid"))
    .when(nr_in_estab <= 250).then(pl.lit("Mid-Large"))
    .when(nr_in_estab > 250).then(pl.lit("Large"))
    .otherwise(pl.lit(None))
    .alias("ESTAB_SIZE_FLAG")
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
inspection_combined = (
    pl.concat(
        [
            pl.read_csv(csv_path, schema_overrides=inspection_schema, ignore_errors=True)
            for csv_path in inspection_csv_files
        ]
    )
    # The date must be parsed in its own step: OPEN_YEAR reads the parsed column.
    .with_columns(inspection_open_date)
    .with_columns(inspection_open_year, inspection_size_flag)
)

In [21]:
# Violations: read every CSV chunk under data/violation/, stack them into a
# single frame, parse the date columns and derive PENALTY_PCT_REDUCTION.

# Sorted so the chunks are always concatenated in the same, name-based order.
violation_csv_files = sorted(glob.glob('data/violation/*.csv'))

# Explicit dtype per column: penalties, counts and GRAVITY are floats,
# everything else is read as a string.
violation_schema = dict(
    ACTIVITY_NR=pl.Utf8,
    CITATION_ID=pl.Utf8,
    DELETE_FLAG=pl.Utf8,
    STANDARD=pl.Utf8,
    VIOL_TYPE=pl.Utf8,
    ISSUANCE_DATE=pl.Utf8,
    ABATE_DATE=pl.Utf8,
    ABATE_COMPLETE=pl.Utf8,
    CURRENT_PENALTY=pl.Float64,
    INITIAL_PENALTY=pl.Float64,
    CONTEST_DATE=pl.Utf8,
    FINAL_ORDER_DATE=pl.Utf8,
    NR_INSTANCES=pl.Float64,
    NR_EXPOSED=pl.Float64,
    REC=pl.Utf8,
    GRAVITY=pl.Float64,
    EMPHASIS=pl.Utf8,
    HAZCAT=pl.Utf8,
    FTA_INSP_NR=pl.Utf8,
    FTA_ISSUANCE_DATE=pl.Utf8,
    FTA_PENALTY=pl.Float64,
    FTA_CONTEST_DATE=pl.Utf8,
    FTA_FINAL_ORDER_DATE=pl.Utf8,
    HAZSUB1=pl.Utf8,
    HAZSUB2=pl.Utf8,
    HAZSUB3=pl.Utf8,
    HAZSUB4=pl.Utf8,
    HAZSUB5=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# Text columns that share one timestamp layout and are reduced to plain dates.
violation_date_columns = [
    'ISSUANCE_DATE',
    'ABATE_DATE',
    'CONTEST_DATE',
    'FINAL_ORDER_DATE',
    'FTA_ISSUANCE_DATE',
    'FTA_CONTEST_DATE',
    'FTA_FINAL_ORDER_DATE',
]
violation_date_exprs = [
    pl.col(date_column)
    .str.strptime(pl.Datetime, format='%Y-%m-%d %H:%M:%S%z')
    .dt.date()
    for date_column in violation_date_columns
]

# Relative change from INITIAL_PENALTY to CURRENT_PENALTY, computed only where
# the initial penalty is positive (other rows are null, which also avoids a
# division by zero). The value is a fraction, not multiplied by 100, and is
# negative when the current penalty is lower than the initial one.
# NOTE(review): the column name says "reduction" — confirm the sign convention
# downstream consumers expect.
initial_penalty = pl.col("INITIAL_PENALTY")
current_penalty = pl.col("CURRENT_PENALTY")
penalty_pct_reduction = (
    pl.when(initial_penalty > 0)
    .then((current_penalty - initial_penalty) / initial_penalty)
    .alias("PENALTY_PCT_REDUCTION")
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
violation_combined = (
    pl.concat(
        [
            pl.read_csv(csv_path, schema_overrides=violation_schema, ignore_errors=True)
            for csv_path in violation_csv_files
        ]
    )
    .with_columns(violation_date_exprs)
    .with_columns(penalty_pct_reduction)
)

In [7]:
# Accident abstracts: read every CSV chunk under data/accident_abstract/ and
# stack them into a single frame.

# Sorted so the chunks are always concatenated in the same, name-based order.
abstract_csv_files = sorted(glob.glob('data/accident_abstract/*.csv'))

# Every column is read as a string.
abstract_schema = dict(
    SUMMARY_NR=pl.Utf8,
    LINE_NR=pl.Utf8,
    ABSTRACT_TEXT=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
abstract_combined = pl.concat(
    [
        pl.read_csv(csv_path, schema_overrides=abstract_schema, ignore_errors=True)
        for csv_path in abstract_csv_files
    ]
)

In [2]:
# Citation text: read every CSV chunk under data/citation/ and stack them into
# a single frame.

# Sorted so the chunks are always concatenated in the same, name-based order.
citation_csv_files = sorted(glob.glob('data/citation/*.csv'))

# Every column is read as a string.
citation_schema = dict(
    ACTIVITY_NR=pl.Utf8,
    CITATION_ID=pl.Utf8,
    LINE_NR=pl.Utf8,
    LINE_TEXT=pl.Utf8,
    LOAD_DT=pl.Utf8,
)

# ignore_errors=True keeps reading when a value does not parse.
# NOTE(review): such values are not reported anywhere — confirm that is acceptable.
citation_combined = pl.concat(
    [
        pl.read_csv(csv_path, schema_overrides=citation_schema, ignore_errors=True)
        for csv_path in citation_csv_files
    ]
)

In [None]:
# Persist each combined frame as a Parquet file under data/clean/.

# Create the output folder up front so the export also works on a fresh
# checkout where data/clean/ does not exist yet; exist_ok makes re-running
# the cell harmless.
os.makedirs("data/clean", exist_ok=True)

accident_combined.write_parquet("data/clean/accidents.parquet")
injury_combined.write_parquet("data/clean/injuries.parquet")
inspection_combined.write_parquet("data/clean/inspections.parquet")
violation_combined.write_parquet("data/clean/violations.parquet")
abstract_combined.write_parquet("data/clean/abstracts.parquet")
citation_combined.write_parquet("data/clean/citations.parquet")