# Clean dataset

In [1]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None



In [2]:
df = pd.read_csv('../data/processed/pipeline_incidents_2010_present_all.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Keep only relevant columns

In [3]:
columns_keep = [
    'datafile_as_of',
    'ff',
    'significant',
    'serious',
    'report_number',
    'supplemental_number',
    'report_received_date',
    'report_type',
    'operator_id',
    'name',
    'operator_street_address',
    'operator_city_name',
    'operator_state_abbreviation',
    'operator_postal_code',
    'local_datetime',
    'time_zone',
    'daylight_savings_ind',
    'iyear',
    'location_street_address',
    'location_city_name',
    'location_county_name',
    'location_state_abbreviation',
    'location_postal_code',
    'location_latitude',
    'location_longitude',
    'total_cost',
    'total_cost_current',
    'injury_ind',
    'injure',
    'num_pub_evacuated',
    'fatal',
    'cause',
    'cause_details',
    'material_involved',
    'material_details',
    'narrative',
    'unintentional_release',
    'installation_year'
]

In [4]:
df_final = df[columns_keep]
df_final.columns

Index(['datafile_as_of', 'ff', 'significant', 'serious', 'report_number',
       'supplemental_number', 'report_received_date', 'report_type',
       'operator_id', 'name', 'operator_street_address', 'operator_city_name',
       'operator_state_abbreviation', 'operator_postal_code', 'local_datetime',
       'time_zone', 'daylight_savings_ind', 'iyear', 'location_street_address',
       'location_city_name', 'location_county_name',
       'location_state_abbreviation', 'location_postal_code',
       'location_latitude', 'location_longitude', 'total_cost',
       'total_cost_current', 'injury_ind', 'injure', 'num_pub_evacuated',
       'fatal', 'cause', 'cause_details', 'material_involved',
       'material_details', 'narrative', 'unintentional_release',
       'installation_year'],
      dtype='object')

## Create variable with decades

In [5]:
# create decades variable
conditions = [
    (df['installation_year'] >= '1900-01-01') & (df['installation_year'] < '1910-01-01'),
    (df['installation_year'] >= '1910-01-01') & (df['installation_year'] < '1920-01-01'),
    (df['installation_year'] >= '1920-01-01') & (df['installation_year'] < '1930-01-01'),
    (df['installation_year'] >= '1930-01-01') & (df['installation_year'] < '1940-01-01'),
    (df['installation_year'] >= '1940-01-01') & (df['installation_year'] < '1950-01-01'),
    (df['installation_year'] >= '1950-01-01') & (df['installation_year'] < '1960-01-01'),
    (df['installation_year'] >= '1960-01-01') & (df['installation_year'] < '1970-01-01'),
    (df['installation_year'] >= '1970-01-01') & (df['installation_year'] < '1980-01-01'),
    (df['installation_year'] >= '1980-01-01') & (df['installation_year'] < '1990-01-01'),
    (df['installation_year'] >= '1990-01-01') & (df['installation_year'] < '2000-01-01'),
    (df['installation_year'] >= '2000-01-01') & (df['installation_year'] < '2010-01-01'),
    df['installation_year'] >= '2010-01-01'
]

outputs = [
    '1900-1909', '1910-1919', '1920-1929', '1930-1939', '1940-1949', '1950-1959', '1960-1969', '1970-1979',
    '1980-1989', '1990-1999', '2000-2009', '>2010'
]

df_final['decade'] = np.select(conditions, outputs, np.nan)
df_final.decade.unique()

array(['nan', '1990-1999', '1970-1979', '1980-1989', '1960-1969',
       '2000-2009', '1950-1959', '1940-1949', '>2010', '1920-1929',
       '1930-1939', '1900-1909', '1910-1919'], dtype=object)

## Create variable of current pipeline age at time of incident

In [6]:
# convert to datetime
df_final['installation_year'] = pd.to_datetime(df_final['installation_year'], format='%Y-%m-%d')

In [7]:
df_final['year_dt'] = df_final.installation_year.dt.year

In [8]:
# convert to float to allow us to subtract iyear and installation_year
df_final['iyear'] = df_final.iyear.astype('float', errors='ignore')

In [9]:
# create pipeline_age variable
df_final['pipeline_age'] = df_final['iyear'] - df_final['year_dt']

In [10]:
df_final.to_csv('../data/processed/pipeline_incidents_2010_present_all_CLEAN.csv', index=False)