In [1]:
import pandas as pd
import datetime

In [2]:
# Each input file paired with the short label for its data source.
# `paths` and `data_sources` are derived from this one mapping so the two
# lists always have the same length and the same order.
source_files = {
    'gravity': "accident_gravity_reporting_regulated_jul2020_present.txt",
    'hazardous_liquid': "accident_hazardous_liquid_jan2010_present.txt",
    'gas_distribution': "incident_gas_distribution_jan2010_present.txt",
    'gas_transmission': "incident_gas_transmission_gathering_jan2010_present.txt",
    'liquefied_natural_gas': "incident_liquefied_natural_gas_jan2011_present.txt",
    'gas_gathering': "incident_type_r_reporting_regulated_gas_gathering_may2022_present.txt",
}

# Dicts preserve insertion order, so index i of one list matches index i of the other.
paths = list(source_files.values())
data_sources = list(source_files.keys())

In [3]:
# Read every file in `paths`, tag its rows with the matching entry of
# `data_sources`, and concatenate everything into one DataFrame.

def read_source(path, data_source):
    """Read one tab-separated export and label its rows with their origin.

    Parameters
    ----------
    path : str
        Location of the tab-separated text file to read.
    data_source : str
        Label written to the new ``data_source`` column for every row.

    Returns
    -------
    pandas.DataFrame
        The parsed file plus a ``data_source`` column.
    """
    frame = pd.read_csv(
        path,
        header=0,
        encoding='unicode_escape',
        sep='\t',
        engine='python',
    )
    frame['data_source'] = data_source
    return frame


# zip() would silently ignore trailing entries if the lists drifted apart.
assert len(paths) == len(data_sources), "paths and data_sources must align"

# Each file gets its OWN label. The earlier loop assigned data_sources[1] to
# every file from the third one onward, mislabeling them as 'hazardous_liquid'.
# A single concat also avoids re-copying the growing frame on every iteration.
df = pd.concat(
    [read_source(path, data_source) for path, data_source in zip(paths, data_sources)]
)

print(df.shape)

(8874, 871)


In [4]:
# Parse 'LOCAL_DATETIME' into datetimes, then derive the 'LOCAL_DATE' column from it.
# `assign` works on a copy, so the frame loaded above is not modified here.
df2 = df.assign(
    LOCAL_DATETIME=lambda frame: pd.to_datetime(frame['LOCAL_DATETIME'])
)
df2 = df2.assign(
    LOCAL_DATE=lambda frame: frame['LOCAL_DATETIME'].dt.date
)

# Carry on with an independent copy under the usual name.
df = df2.copy()

In [5]:
# Remove every column whose header contains 'Unnamed'.
is_unnamed = df.columns.str.contains('Unnamed')
columns_to_drop = df.columns[is_unnamed]
df = df.loc[:, ~is_unnamed]

In [6]:
# Keep only rows dated within 2010-01-01 .. 2022-12-31 (both inclusive)
# that have both a latitude and a longitude.
first_day = datetime.date(2010, 1, 1)
last_day = datetime.date(2022, 12, 31)

in_date_range = (df['LOCAL_DATE'] >= first_day) & (df['LOCAL_DATE'] <= last_day)
has_coordinates = (
    df['LOCATION_LATITUDE'].notnull() & df['LOCATION_LONGITUDE'].notnull()
)

df = df[in_date_range & has_coordinates]

In [7]:
# Share of missing values in each column (NaN count divided by row count).
nan_share_per_column = df.isna().sum().div(len(df))

# Keep the columns whose missing share is at most 0.8.
cols_nan_less_08 = nan_share_per_column.loc[nan_share_per_column.le(0.8)]
cols_to_keep = list(cols_nan_less_08.index)

# Write the reduced frame to disk; `df` itself is left as it was.
df3 = df[cols_to_keep]
df3.to_csv('df_3.csv', index=False)

In [8]:
# manual selection
# One column name per line; splitting on whitespace yields the list of names
# in exactly the order written here.
selected_cols = """
    LOCAL_DATE
    LOCATION_LATITUDE
    LOCATION_LONGITUDE
    COMMODITY_RELEASED_TYPE
    COMMODITY_SUBTYPE
    UNINTENTIONAL_RELEASE_BBLS
    INTENTIONAL_RELEASE_BBLS
    RECOVERED_BBLS
    FATALITY_IND
    FATAL
    INJURY_IND
    INJURE
    OPERATOR_TYPE
    ON_OFF_SHORE
    IGNITE_IND
    EXPLODE_IND
    NUM_PUB_EVACUATED
    FEDERAL
    LOCATION_TYPE
    CROSSING
    ITEM_INVOLVED
    PIPE_TYPE
    PIPE_DIAMETER
    MATERIAL_INVOLVED
    WILDLIFE_IMPACT_IND
    SOIL_CONTAMINATION
    LONG_TERM_ASSESSMENT
    REMEDIATION_IND
    WATER_CONTAM_IND
    EST_COST_OPER_PAID
    EST_COST_GAS_RELEASED
    EST_COST_PROP_DAMAGE
    EST_COST_EMERGENCY
    EST_COST_ENVIRONMENTAL
    EST_COST_OTHER
    CAUSE
    CAUSE_DETAILS
    NARRATIVE
    data_source
    SYSTEM_PART_INVOLVED
    SHUTDOWN_DUE_ACCIDENT_IND
    SHUTDOWN_EXPLAIN
    UPSTREAM_VALVE_TYPE_IND
    DOWNSTREAM_VALVE_TYPE_IND
    DEPTH_OF_COVER
    PIPE_FACILITY_TYPE
    PIPE_WALL_THICKNESS
    PIPE_SMYS
    PIPE_SPECIFICATION
    PIPE_SEAM_TYPE
    PIPE_MANUFACTURER
    PIPE_COATING_TYPE
    INSTALLATION_YEAR
    MANUFACTURED_YEAR
    RELEASE_TYPE
    LEAK_TYPE
    COULD_BE_HCA
    COMMODITY_REACHED_HCA
    ACCIDENT_PSIG
    MOP_PSIG
    MOP_CFR_SECTION
    ACCIDENT_PRESSURE
    PRESSURE_RESTRICTION_IND
    LENGTH_SEGMENT_ISOLATED
    INTERNAL_INSPECTION_IND
    OPERATION_COMPLICATIONS_IND
    PIPELINE_FUNCTION
    SCADA_IN_PLACE_IND
    SCADA_OPERATING_IND
    SCADA_FUNCTIONAL_IND
    SCADA_DETECTION_IND
    SCADA_CONF_IND
    CPM_IN_PLACE_IND
    CPM_OPERATING_IND
    CPM_FUNCTIONAL_IND
    CPM_DETECTION_IND
    CPM_CONF_IND
    INVESTIGATION_STATUS
    INVESTIGATION_STATUS_DETAILS
    EMPLOYEE_DRUG_TEST_IND
    CONTRACTOR_DRUG_TEST_IND
    EQ_FAILURE_TYPE
    UNINTENTIONAL_RELEASE
    INTENTIONAL_RELEASE
    CLASS_LOCATION_TYPE
    EST_COST_UNINTENTIONAL_RELEASE
    EST_COST_INTENTIONAL_RELEASE
""".split()

In [9]:
# Narrow the frame to the manually selected columns and renumber the rows
# from 0, discarding the old index.
df = df[selected_cols].reset_index(drop=True)

# Write the result to CSV without the index column.
df.to_csv('df_awarie_2010-2022.csv', index=False)