In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import fiona

pd.options.display.max_columns = None 

# set working directory
os.chdir('/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')

In [2]:
# Load Data
spills = gpd.read_file('flowlines_with_spills.geojson')
no_spills = gpd.read_file('flowlines_without_spills.geojson')

# Data Cleaning


## No Spills

In [3]:
print(no_spills.info())
print(no_spills.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 14522 entries, 0 to 14521
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ACTIONDESCRIPTION  2778 non-null   object        
 1   BEDDINGMATERIAL    7375 non-null   object        
 2   COMPANY_NAME       14522 non-null  object        
 3   CONSTRUCTDATE      14522 non-null  datetime64[ms]
 4   Diam_in            14522 non-null  float64       
 5   ENDLAT             14522 non-null  float64       
 6   ENDLONG            14522 non-null  float64       
 7   ENTIRELINEREMOVED  1743 non-null   object        
 8   FLOWLINEACTION     10202 non-null  object        
 9   FLOWLINEID         14512 non-null  float64       
 10  Fluid              14522 non-null  object        
 11  LOCATIONTYPE       14522 non-null  object        
 12  LOCATION_ID        14510 non-null  float64       
 13  Length_ft          14522 non-null  float64       
 14

In [4]:
no_spills.describe()

Unnamed: 0,CONSTRUCTDATE,Diam_in,ENDLAT,ENDLONG,FLOWLINEID,LOCATION_ID,Length_ft,MAXOPPRESSURE,OPERATOR_NUM,RECEIVE_DATE,SHAPE_Length,STARTLAT,STARTLOCATIONID,STARTLONG
count,14522,14522.0,14522.0,14522.0,14512.0,14510.0,14522.0,5814.0,14522.0,14522,14522.0,14522.0,14233.0,14522.0
mean,2000-08-06 06:02:37.664000,3.071835,39.987917,-104.572175,469338.000482,405387.083322,4864.275725,477.989336,37354.398292,2020-04-15 16:20:49.640000,1483.209327,39.987859,348130.117965,-104.572478
min,1900-01-11 00:00:00,0.0,37.010428,-109.031176,455152.0,159601.0,2.2,0.0,710.0,2018-04-25 14:38:45.767000,0.671472,36.993609,159652.0,-109.049983
25%,1991-12-19 00:00:00,2.0,40.009828,-104.918814,464583.5,328242.0,648.9825,40.0,10633.0,2019-07-09 13:57:38.594000,197.979321,40.00934,317513.0,-104.925054
50%,2005-12-12 00:00:00,2.0,40.169366,-104.685,469184.0,434152.0,1275.5,150.0,10699.0,2019-10-29 13:04:29.083000,389.715802,40.168958,328891.0,-104.687893
75%,2011-02-05 00:00:00,3.0,40.360843,-104.271865,474916.25,463250.0,2193.47,500.0,47120.0,2020-11-20 09:29:56.673000,668.733885,40.360968,338366.0,-104.274455
max,2020-06-01 00:00:00,278.0,40.99263,-102.045863,484080.0,481140.0,152023.9,4700.0,200077.0,2023-11-17 11:11:43.017000,46342.37979,40.996,482188.0,-102.046467
std,,3.914074,0.814513,1.269729,6456.010456,66646.265877,19250.560586,833.014496,35725.181548,,5866.711824,0.814311,50355.883113,1.269641


### Get line age from construction date

In [5]:
# Verify CONSTRUCTDATE is datetime type; convert if necessary
no_spills['CONSTRUCTDATE'] = pd.to_datetime(no_spills['CONSTRUCTDATE'])

# Calculate line_age
# Today's date
today = pd.Timestamp.now()

# Calculate the difference in years
no_spills['line_age_yr'] = (today - no_spills['CONSTRUCTDATE']).dt.days / 365.25

# Display the updated GeoDataFrame to verify 'line_age' column
print(no_spills[['CONSTRUCTDATE', 'line_age_yr']])

      CONSTRUCTDATE  line_age_yr
0        1983-11-09    41.774127
1        1983-12-07    41.697467
2        2006-06-05    19.203285
3        2007-04-08    18.362765
4        2005-05-10    20.273785
...             ...          ...
14517    2001-10-02    23.876797
14518    2008-02-22    17.486653
14519    2007-08-05    18.036961
14520    1998-09-27    26.891170
14521    1983-10-21    41.826146

[14522 rows x 2 columns]


In [6]:
# Print the list of column names
column_names = no_spills.columns.tolist()

print(column_names)

['ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 'Diam_in', 'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'LOCATIONTYPE', 'LOCATION_ID', 'Length_ft', 'MAXOPPRESSURE', 'Material', 'OPERATOR_NUM', 'Operator', 'PIPEMATERIAL', 'RECEIVE_DATE', 'SHAPE_Length', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'Status', 'TYPEOFFLUIDTRANS', 'geometry', 'line_age_yr']


In [7]:
mapping = {
    'KINDER MORGAN CO2 CO LP': 'KINDER MORGAN CO2 CO LLC',
    'BEEMAN OIL & GAS INC': 'BEEMAN OIL & GAS LLC',
}
no_spills['Operator'] = no_spills['Operator'].replace(mapping)

no_spills.rename(columns={'OPERATOR_NUM': 'operator_number'}, inplace=True)
no_spills.rename(columns={'Operator': 'operator_name'}, inplace=True)

# Extract unique OPERATOR_NUM values
unique_operator_nums = no_spills['operator_number'].unique()

# Create a DataFrame to map OPERATOR_NUM to Operator (assuming the column name for operator names is 'Operator')
no_spills_operator_mapping = no_spills[['operator_number', 'operator_name']].drop_duplicates().reset_index(drop=True)

# Display the mapping
print(no_spills_operator_mapping)

     operator_number                          operator_name
0              10633  CRESTONE PEAK RESOURCES OPERATING LLC
1              68710          PETERSON ENERGY OPERATING INC
2              10459               EXTRACTION OIL & GAS INC
3              10646                   AXIS EXPLORATION LLC
4              10575                            8 NORTH LLC
..               ...                                    ...
109            47120        KERR MCGEE OIL & GAS ONSHORE LP
110            61250              MULL DRILLING COMPANY INC
111           100264                         XTO ENERGY INC
112            10000          BP AMERICA PRODUCTION COMPANY
113            10447             URSA OPERATING COMPANY LLC

[114 rows x 2 columns]


In [8]:
# List of columns to be removed
columns_to_remove = [
    "BEDDINGMATERIAL", "COMPANY_NAME", "ENDLAT", "ENDLONG", "ENTIRELINEREMOVED",
    "RECEIVE_DATE", "STARTLAT", "STARTLOCATIONID", "STARTLONG","ACTIONDESCRIPTION","operator_name",
    "TYPEOFFLUIDTRANS", "PIPEMATERIAL", "CONSTRUCTDATE"
]

# Drop the columns from the DataFrame
no_spills = no_spills.drop(columns=columns_to_remove)

In [9]:
# drop any NAs
no_spills = no_spills.dropna()

In [10]:
# Reorder df
new_order = ['operator_number', 'FLOWLINEID', 'LOCATION_ID', 'Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 
             'Fluid', 'Material', 'Diam_in', 'Length_ft', 'MAXOPPRESSURE', 'SHAPE_Length', 'line_age_yr', 'geometry'] 
no_spills = no_spills[new_order]
no_spills

Unnamed: 0,operator_number,FLOWLINEID,LOCATION_ID,Status,FLOWLINEACTION,LOCATIONTYPE,Fluid,Material,Diam_in,Length_ft,MAXOPPRESSURE,SHAPE_Length,line_age_yr,geometry
1,10633,470445.0,470443.0,Active,Out of Service,Production Facilities,Multiphase,Carbon Steel,2.00,1025.98,250.0,312.594254,41.697467,"MULTILINESTRING ((507681.553 4440214.644, 5076..."
6,10459,462601.0,452637.0,Out of Service,Out of Service,Production Facilities,Multiphase,Carbon Steel,2.25,510.01,1140.0,155.389195,18.201232,"MULTILINESTRING ((496682.46 4457399.772, 49668..."
7,10459,462602.0,452637.0,Out of Service,Out of Service,Production Facilities,Multiphase,Carbon Steel,2.25,1597.85,1140.0,486.830336,15.285421,"MULTILINESTRING ((496340.456 4457384.126, 4963..."
8,10633,473671.0,336437.0,Abandoned,Out of Service,Production Facilities,Multiphase,Steel,2.00,3457.32,265.0,1053.369404,22.110883,"MULTILINESTRING ((507139.811 4448160.801, 5071..."
13,10633,473673.0,336437.0,Abandoned,Out of Service,Production Facilities,Multiphase,Steel,2.00,3457.32,320.0,1053.369404,15.279945,"MULTILINESTRING ((507139.811 4448160.801, 5071..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14393,96155,456386.0,433999.0,Active,Registration,Production Facilities,Multi-Phase,Carbon Steel,3.50,1404.32,150.0,427.915127,7.181383,"MULTILINESTRING ((597095.529 4518130.982, 5970..."
14394,96155,456381.0,433999.0,Active,Registration,Production Facilities,Multi-Phase,Carbon Steel,3.50,1404.32,150.0,427.915127,7.173169,"MULTILINESTRING ((597095.529 4518130.982, 5970..."
14395,96155,456382.0,433999.0,Active,Registration,Production Facilities,Multi-Phase,Carbon Steel,3.50,1404.32,150.0,427.915127,7.181383,"MULTILINESTRING ((597095.529 4518130.982, 5970..."
14501,35080,455592.0,443145.0,Active,Registration,Production Facilities,Oil,HDPE,3.50,1175.83,40.0,358.332547,7.756331,"MULTILINESTRING ((637207.682 4380630.737, 6371..."


### Consolidate variables uniformly

In [11]:
# List of specific columns for which to print unique values
columns_to_check = ['Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material']

# Create a dictionary to store unique values for each column
unique_values_dict = {}

# Loop through the specified columns and store unique values in the dictionary
for column in columns_to_check:
    if column in no_spills.columns:  # Check if the column exists in the DataFrame
        unique_values = no_spills[column].unique().tolist()  # Convert numpy array to list
        unique_values_dict[column] = unique_values
    else:
        unique_values_dict[column] = "Column not found in DataFrame."

# Print each column's unique values list
for column, values in unique_values_dict.items():
    print(f"Unique values in {column}: {values}")

Unique values in Status: ['Active', 'Out of Service', 'Abandoned', 'ACTIVE', 'Inactive', 'abandoned', 'InActive', 'Abandoned in Place', 'ABANDONED', 'Actove', 'OutofService', 'Avtive', 'PA', 'INACTIVE', 'OUT OF SERVICE', 'Out-of-Service', 'Shut in', 'Status', 'Future', 'Actve']
Unique values in FLOWLINEACTION: ['Out of Service', 'Abandonment Verification', 'Realignment', 'Pre-Abandonment Notice', 'Registration', 'Removed From Service', 'Abandonment']
Unique values in LOCATIONTYPE: ['Production Facilities', 'Well Site', 'Manifold', 'Compressor Station', 'Crude Oil Transfer Line', 'Produced Water Transfer System']
Unique values in Fluid: ['Multiphase', 'Emulsion', 'NATUAL GAS', 'CO2/Produced Water', 'Oil', 'Co2/Prod Water', 'Natural Gas', 'Gas', 'MULTIPHASE', 'Crude Oil Emulsion', 'Produced Water', 'CRUDE OIL EMULSION', 'Water', 'Condensate', 'Crude Oil', 'Unprocessed Production Fluids', 'NATURAL GAS', 'Multi-Phase', 'CO2/Prod Water', 'CO2Produced Water', 'CO2/Produced Wtaer', 'C02/Prod 

In [12]:
status_mapping = {
    'Active': 'Active', 'ACTIVE': 'Active', 'Actove': 'Active', 'Avtive': 'Active', 'Actve': 'Active',
    'Out of Service': 'Out of Service', 'OOS': 'Out of Service', 'OutofService': 'Out of Service', 'Out-of-Service': 'Out of Service',
    'Abandoned': 'Abandoned', 'abandoned': 'Abandoned', 'Abandoned in Place': 'Abandoned', 'ABANDONED': 'Abandoned',
    'Inactive': 'Inactive', 'InActive': 'Inactive', 'INACTIVE': 'Inactive',
    'PA': 'Pending Analysis', 'Shut in': 'Shut In',
    'Status': 'Unknown', 'Future': 'Future'
}
no_spills['Status'] = no_spills['Status'].replace(status_mapping)
print(no_spills['Status'].unique())


flowlineaction_mapping = {
    'Out of Service': 'Out of Service', 'Removed From Service': 'Out of Service',
    'Pre-Abandonment Notice': 'Pre-Abandonment Notice',
    'Abandonment Verification': 'Abandonment Verification',
    'Realignment': 'Realignment',
    'Registration': 'Registration',
    'Abandonment': 'Abandonment'
}
no_spills['FLOWLINEACTION'] = no_spills['FLOWLINEACTION'].replace(flowlineaction_mapping)
print(no_spills['FLOWLINEACTION'].unique())


locationtype_mapping = {
    'Production Facilities': 'Production Facilities', 'Well Site': 'Well Site', 'Manifold': 'Manifold',
    'Compressor Station': 'Compressor Station', 'Gathering Line': 'Gathering Line',
    'Crude Oil Transfer Line': 'Crude Oil Transfer Line', 'Produced Water Transfer System': 'Produced Water'
}
no_spills['LOCATIONTYPE'] = no_spills['LOCATIONTYPE'].replace(locationtype_mapping)
print(no_spills['LOCATIONTYPE'].unique())

['Active' 'Out of Service' 'Abandoned' 'Inactive' 'Pending Analysis'
 'OUT OF SERVICE' 'Shut In' 'Unknown' 'Future']
['Out of Service' 'Abandonment Verification' 'Realignment'
 'Pre-Abandonment Notice' 'Registration' 'Abandonment']
['Production Facilities' 'Well Site' 'Manifold' 'Compressor Station'
 'Crude Oil Transfer Line' 'Produced Water']


In [13]:
# Update the 'Fluid' column normalization and mapping in one comprehensive block
no_spills['Fluid'] = no_spills['Fluid'].str.strip().str.title().replace({
    'Natual Gas': 'Natural Gas',  # Correct common misspelling
    'Natural Gas Production': 'Natural Gas',
    'Co2': 'Co2/Produced Water',  # Assuming Co2 implies the mixed type
    'C02/Prod Water': 'Co2/Produced Water',
    'Co2/Prod Water': 'Co2/Produced Water',
    'Co2Produced Water': 'Co2/Produced Water',
    'Co2/Produced Wtaer': 'Co2/Produced Water',
    'Gas': 'Natural Gas',  # Assuming general Gas to be Natural Gas
    'Gas, Oil And Water': 'Full Well Stream',
    'Oil': 'Crude Oil',  # Assuming Oil to be categorized as Crude Oil
    'Crude Oil': 'Crude Oil',
    'Crude Oil Emulsion': 'Crude Oil Emulsion',
    'Emulsion': 'Crude Oil Emulsion',  # To unify with Crude Oil Emulsion
    'Crude Oil Emmulsion, Water And Oil': 'Crude Oil Emulsion',
    'Crude Oil And Water Emulsion': 'Crude Oil Emulsion',
    'Oil Water Emulsion': 'Crude Oil Emulsion',
    'Oil/Water': 'Crude Oil Emulsion',
    'Oil /Water/Gas': 'Full Well Stream',
    '3 Phase': 'Multiphase',  # Assumed to mean the same
    'Multiphase': 'Multiphase',
    'Multi-Phase': 'Multiphase',
    'Mulitphase': 'Multiphase',
    'Multi-Phase\xa0': 'Multiphase',  # Non-breaking space issue
    'Injection Produced Water': 'Produced Water',
    'Produced Water': 'Produced Water',
    'Water': 'Produced Water',
    'Saltwater': 'Produced Water',
    'Condensate': 'Condensate',  # Depending on your classification needs
    'Liquid': 'Other',  # General term, assuming to categorize broadly
    'Unprocessed Production Fluids': 'Other',
    'Production Fluids': 'Other',
    'Full Well Stream': 'Full Well Stream',  # Assuming similar handling
    'Other': 'Other',
    'Gas,  Oil And Water' : 'Full Well Stream',
    'Natural Gas Lift': 'Natural Gas',
    'Natuarl Gas': 'Natural Gas',
    'Natural Gas High Pressure': 'Natural Gas',
    'Natural Gas Supply':'Natural Gas',
    'Crude Oill Emulsion': 'Crude Oil Emulsion',
})

# Review the changes to ensure the mappings are applied correctly
print(no_spills['Fluid'].unique())

['Multiphase' 'Crude Oil Emulsion' 'Natural Gas' 'Co2/Produced Water'
 'Crude Oil' 'Produced Water' 'Condensate' 'Other' 'Fluid'
 'Full Well Stream' 'Oil Water']


In [14]:
no_spills['Material'] = no_spills['Material'].str.strip().str.title().replace({
    'Fiberglass': 'Fiberglass', 
    'Fibergalss': 'Fiberglass', 
    'Fiberspar': 'Fiberglass', 
    'Fiber Glass': 'Fiberglass',
    'Carbon Steel': 'Carbon Steel', 
    'Carbon Steel - Hdpe': 'Carbon Steel/HDPE', 
    'Carbon Steel, Hdpe,Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel, Hdpe, Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe/Stainless': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe': 'Carbon Steel/HDPE', 
    'Satinless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Steel': 'Steel', 
    'Lined Steel': 'Steel', 
    'Coated Steel': 'Steel', 
    'Flexsteel': 'Steel', 
    'Fiber Glass And Carbon Steel': 'Fiberglass/Carbon Steel', 
    'Fiberglass And Hdpe': 'Fiberglass/HDPE',
    'Hdpe': 'HDPE', 
    'Hdpe Poly': 'HDPE', 
    'Hdpe/Steel': 'HDPE/Steel', 
    'Hdpe Lined Steel': 'HDPE/Steel',
    'Poly': 'Poly', 
    'Polyline': 'Poly', 
    'Poly & Steel': 'Poly/Steel', 
    'Polycarbonate': 'Polycarbonate', 
    'Polycarbonate/Steel': 'Polycarbonate/Steel',
    'Pvc': 'PVC', 
    'Flexspar': 'Fiberglass', 
    'Stainless': 'Steel', 
    'Stainless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Unknown': 'Unknown', 
    'Other': 'Other', 
    'Other (Poly)': 'Other', 
    'Sdr7 Polyethelyne': 'Polyethylene', 
    'Sdr 11 Poly Pipe': 'Polyethylene', 
    'Sdr 11 Poly': 'Polyethylene', 
    'Poly Pipe': 'Polyethylene', 
    'Sdr_Poly': 'Polyethylene',
    'Duplex': 'Duplex', 
    'Fplp': 'Other', 
    'Flowline': 'Other',
    'Flex Steel': 'Steel',
    'Fiberglass And Carbon Steel': 'Fiberglass/Carbon Steel', 
    'Stainless Steel': 'Steel',
    'HDPE Lined Steel': 'HDPE/Steel',
    'Fiberglass/Hdpe': 'Fiberglass/HDPE',
})

# Verify the changes by printing the unique values in the 'Material' column
print(no_spills['Material'].unique())

['Carbon Steel' 'Steel' 'Poly' 'Fiberglass' 'HDPE'
 'Carbon Steel/HDPE/Stainless Steel' 'Carbon Steel/HDPE' 'Unknown' 'Other'
 'Carbon Steel/Hdpe/Stainless Steel' 'Duplex' 'Co2/Produced Water' 'PVC'
 'Polycarbonate' 'Polycarbonate/Steel' 'Carbon Steel/Stainless/Hdpe'
 'Polyethylene' 'Polypropylene' 'Fiberglass/Carbon Steel'
 'Carbon Steel And Hdpe' 'Hdpe/Steel, Flexsteel' 'Fiberglass/HDPE']


In [15]:
no_spills.columns = [
    'operator_number',  # Changed from OPERATOR_NUM to operator_number
    'flowline_id',       # Changed from FLOWLINEID to flowline_id
    'location_id',       # Changed from LOCATION_ID to location_id
    'status',            # Already appropriately named but changed to lowercase
    'flowline_action',   # Changed from FLOWLINEACTION to flowline_action
    'location_type',     # Changed from LOCATIONTYPE to location_type
    'fluid',             # Already appropriately named but changed to lowercase
    'material',          # Already appropriately named but changed to lowercase
    'diameter_in',       # Changed from Diam_in to diameter_in
    'length_ft',         # Changed from Length_ft to length_ft
    'max_operating_pressure', # Changed from MAXOPPRESSURE to max_operating_pressure
    'shape_length',      # Changed from SHAPE_Length to shape_length
    'line_age_yr',
    'geometry',           # Already appropriately named but changed to lowercase
]

# Print new column names to verify
print(no_spills.columns.tolist())

['operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'shape_length', 'line_age_yr', 'geometry']


In [16]:
no_spills.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3866 entries, 1 to 14513
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   operator_number         3866 non-null   int32   
 1   flowline_id             3866 non-null   float64 
 2   location_id             3866 non-null   float64 
 3   status                  3866 non-null   object  
 4   flowline_action         3866 non-null   object  
 5   location_type           3866 non-null   object  
 6   fluid                   3866 non-null   object  
 7   material                3866 non-null   object  
 8   diameter_in             3866 non-null   float64 
 9   length_ft               3866 non-null   float64 
 10  max_operating_pressure  3866 non-null   float64 
 11  shape_length            3866 non-null   float64 
 12  line_age_yr             3866 non-null   float64 
 13  geometry                3866 non-null   geometry
dtypes: float64(7), geome

In [17]:
# make variables integers
no_spills['diameter_in'] = no_spills['diameter_in'].astype(int)
no_spills['length_ft'] = no_spills['length_ft'].astype(int)
no_spills['max_operating_pressure'] = no_spills['max_operating_pressure'].astype(int)
no_spills['shape_length'] = no_spills['shape_length'].astype(int)
no_spills['line_age_yr'] = no_spills['line_age_yr'].astype(int)

In [18]:
# Create a new column 'Risk' in the no_spills DataFrame and set all its values to 0
no_spills['risk'] = 0

## Spills

In [19]:
print(spills.info())
print(spills.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   ACTIONDESCRIPTION         5 non-null      object        
 1   BEDDINGMATERIAL           94 non-null     object        
 2   COMPANY_NAME              109 non-null    object        
 3   CONSTRUCTDATE             109 non-null    datetime64[ms]
 4   Detailed Root Cause Type  94 non-null     object        
 5   Diam_in                   109 non-null    float64       
 6   ENDLAT                    109 non-null    float64       
 7   ENDLONG                   109 non-null    float64       
 8   ENTIRELINEREMOVED         1 non-null      object        
 9   FLOWLINEACTION            77 non-null     object        
 10  FLOWLINEID                109 non-null    float64       
 11  Fluid                     109 non-null    object        
 12  Gathering?    

In [20]:
spills.describe()

Unnamed: 0,CONSTRUCTDATE,Diam_in,ENDLAT,ENDLONG,FLOWLINEID,LOCATION_ID,Lat,Length_ft,Long,MAXOPPRESSURE,OPERATOR_NUM,RECEIVE_DATE,SHAPE_Length,STARTLAT,STARTLOCATIONID,STARTLONG,trkg_num
count,109,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,57.0,109.0,109,109.0,109.0,108.0,109.0,109.0
mean,1994-10-10 17:23:40.183000,4.018349,40.162537,-104.420141,472237.284404,390118.229358,40.160553,17837.505046,-104.420592,405.631579,38607.330275,2020-04-03 04:48:28.268000,5435.425622,40.162413,359445.537037,-104.421418,402771400.0
min,1955-04-29 00:00:00,0.0,37.106749,-108.062531,457741.0,159607.0,37.093239,107.77,-108.06363,0.0,8960.0,2018-12-03 19:09:59.127000,32.834947,37.101221,307298.0,-108.063693,401702100.0
25%,1976-08-17 00:00:00,2.0,40.04761,-104.90665,465855.0,317874.0,40.047251,925.07,-104.905802,24.0,10633.0,2019-07-03 13:07:23.247000,281.856665,40.0495,318131.25,-104.90658,402167400.0
50%,1994-01-13 00:00:00,3.0,40.11081,-104.644379,475134.0,430411.0,40.110337,2058.12,-104.644528,100.0,46290.0,2020-01-30 15:57:12.920000,627.071301,40.11142,329114.5,-104.644886,402922800.0
75%,2014-12-04 00:00:00,4.0,40.379446,-103.790655,476178.0,446980.0,40.378707,4375.74,-103.812439,700.0,46290.0,2020-11-18 15:58:53.707000,1333.192839,40.376553,432765.25,-103.797578,403353400.0
max,2018-05-09 00:00:00,12.75,40.973847,-102.08325,483960.0,480689.0,40.97725,141290.45,-102.083057,2700.0,100322.0,2023-10-13 16:08:29.477000,43053.037992,40.97303,482187.0,-102.080785,403977900.0
std,,2.833733,0.477498,0.857263,5790.90792,74723.623231,0.476802,41540.03928,0.85648,599.942248,29563.880014,,12658.087091,0.478185,55138.474229,0.857137,697429.0


### Get line age from construction date

In [21]:
# Verify CONSTRUCTDATE is datetime type; convert if necessary
spills['CONSTRUCTDATE'] = pd.to_datetime(spills['CONSTRUCTDATE'])

# Calculate line_age
# Today's date
today = pd.Timestamp.now()

# Calculate the difference in years
spills['line_age_yr'] = (today - spills['CONSTRUCTDATE']).dt.days / 365.25

# Display the updated GeoDataFrame to verify 'line_age' column
print(spills[['CONSTRUCTDATE', 'line_age_yr']])

    CONSTRUCTDATE  line_age_yr
0      2017-08-19     7.997262
1      2017-08-19     7.997262
2      2002-10-01    22.880219
3      2017-08-19     7.997262
4      2017-08-19     7.997262
..            ...          ...
104    1973-10-03    51.874059
105    1973-12-13    51.679671
106    1979-02-02    46.540726
107    2015-04-24    10.318960
108    1976-08-24    48.982888

[109 rows x 2 columns]


In [22]:
# Print the list of column names
column_names = spills.columns.tolist()

print(column_names)  

['ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 'Detailed Root Cause Type', 'Diam_in', 'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'Gathering?', 'LOCATIONTYPE', 'LOCATION_ID', 'Lat', 'Length_ft', 'Long', 'MAXOPPRESSURE', 'Material', 'Metallic?', 'OPERATOR_NUM', 'Operator', 'Operator Name', 'PIPEMATERIAL', 'Preventative Measure', 'RECEIVE_DATE', 'Root Cause', 'Root Cause Type', 'SHAPE_Length', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'Spill Type', 'Spill_Desc', 'Status', 'TYPEOFFLUIDTRANS', 'facility_status', 'facility_type', 'incident_date', 'trkg_num', 'geometry', 'line_age_yr']


In [23]:
spills.rename(columns={'OPERATOR_NUM': 'operator_number'}, inplace=True)
spills.rename(columns={'COMPANY_NAME': 'operator_name'}, inplace=True)

# Extract unique OPERATOR_NUM values
unique_operator_nums = spills['operator_number'].unique()

# Create a DataFrame to map OPERATOR_NUM to Operator (assuming the column name for operator names is 'Operator')
spills_operator_mapping = spills[['operator_number', 'operator_name']].drop_duplicates().reset_index(drop=True)

# Apply strip() to remove leading and trailing spaces, and rstrip('.') to remove trailing dots from the 'operator_name' column
spills_operator_mapping['operator_name'] = spills_operator_mapping['operator_name'].str.strip().str.rstrip('.')

# Display the mapping
print(spills_operator_mapping)

    operator_number                               operator_name
0             96155               WHITING OIL & GAS CORPORATION
1             10112            FOUNDATION ENERGY MANAGEMENT LLC
2             10690                       IMPETRO RESOURCES LLC
3             10633       CRESTONE PEAK RESOURCES OPERATING LLC
4             10699                 OWN RESOURCES OPERATING LLC
5             46290                     KP KAUFFMAN COMPANY INC
6             10110         GREAT WESTERN OPERATING COMPANY LLC
7            100322                            NOBLE ENERGY INC
8              8960  BONANZA CREEK ENERGY OPERATING COMPANY LLC
9             95520                         WESCO OPERATING INC
10            10672                  TIMBER CREEK OPERATING LLC
11            10706                              D90 ENERGY LLC
12            95620                   WESTERN OPERATING COMPANY
13            98220               YOUNG GAS STORAGE COMPANY LTD
14            10261      BAYSWATER EXPLO

In [24]:
# List of columns to be removed
columns_to_remove = [
    "ACTIONDESCRIPTION", "BEDDINGMATERIAL", "operator_name", "CONSTRUCTDATE", "ENDLAT",
    "ENDLONG", "ENTIRELINEREMOVED", "Lat", "Long","Operator","Operator Name", "Spill Type",
    "STARTLAT", "STARTLOCATIONID", "STARTLONG", "trkg_num", "Root Cause", "Detailed Root Cause Type", "Root Cause Type", "Metallic?", 
    "Preventative Measure","Spill_Desc", "TYPEOFFLUIDTRANS", "facility_status", "facility_type", "PIPEMATERIAL", "RECEIVE_DATE"
]

# Drop the columns from the DataFrame
spills = spills.drop(columns=columns_to_remove)

In [25]:
# Print the list of column names
column_names = spills.columns.tolist()

print(column_names)  

['Diam_in', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'Gathering?', 'LOCATIONTYPE', 'LOCATION_ID', 'Length_ft', 'MAXOPPRESSURE', 'Material', 'operator_number', 'SHAPE_Length', 'Status', 'incident_date', 'geometry', 'line_age_yr']


In [26]:
# Reorder df
new_order = ['operator_number', 'FLOWLINEID', 'LOCATION_ID', 'Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 
             'Fluid', 'Material', 'Diam_in', 'Length_ft', 'MAXOPPRESSURE', 'SHAPE_Length', 'line_age_yr', 'geometry'] 
spills = spills[new_order]
spills

Unnamed: 0,operator_number,FLOWLINEID,LOCATION_ID,Status,FLOWLINEACTION,LOCATIONTYPE,Fluid,Material,Diam_in,Length_ft,MAXOPPRESSURE,SHAPE_Length,line_age_yr,geometry
0,96155,465855.0,442408.0,Active,Registration,Well Site,Crude Oil,Carbon Steel,12.375,141290.45,1100.0,43053.037992,7.997262,"MULTILINESTRING ((595672.003 4517988.031, 5956..."
1,96155,465855.0,442408.0,Active,Registration,Well Site,Crude Oil,Carbon Steel,12.375,141290.45,1100.0,43053.037992,7.997262,"MULTILINESTRING ((595672.003 4517988.031, 5956..."
2,10112,468013.0,467691.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,3.000,1967.95,100.0,599.638869,22.880219,"MULTILINESTRING ((579396.131 4497273.237, 5793..."
3,96155,465843.0,446980.0,Active,Registration,Production Facilities,Natural Gas,Carbon Steel,6.625,129999.02,150.0,39612.461236,7.997262,"MULTILINESTRING ((593633.376 4516002.016, 5936..."
4,96155,465843.0,446980.0,Active,Registration,Production Facilities,Natural Gas,Carbon Steel,6.625,129999.02,150.0,39612.461236,7.997262,"MULTILINESTRING ((593633.376 4516002.016, 5936..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,46290,479357.0,317590.0,Active,Registration,Production Facilities,Multiphase,Fiberglass,3.000,2996.83,43.0,913.069782,51.874059,"MULTILINESTRING ((511091 4442899, 510291.027 4..."
105,46290,475987.0,450409.0,Active,,Production Facilities,Multiphase,Fiberglass,3.000,1772.77,,540.123533,51.679671,"MULTILINESTRING ((502308 4438910, 502341.821 4..."
106,46290,478333.0,476278.0,Active,Registration,Manifold,Multiphase,Carbon Steel,6.625,5017.70,24.0,1528.782168,46.540726,"MULTILINESTRING ((498137.46 4440307.632, 49788..."
107,10373,463309.0,159607.0,Active,Registration,Well Site,Produced Water,Fiberglass,6.000,1570.35,1800.0,478.457939,10.318960,"MULTILINESTRING ((536178.024 4439769.413, 5361..."


In [27]:
spills.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   operator_number  109 non-null    int32   
 1   FLOWLINEID       109 non-null    float64 
 2   LOCATION_ID      109 non-null    float64 
 3   Status           109 non-null    object  
 4   FLOWLINEACTION   77 non-null     object  
 5   LOCATIONTYPE     109 non-null    object  
 6   Fluid            109 non-null    object  
 7   Material         109 non-null    object  
 8   Diam_in          109 non-null    float64 
 9   Length_ft        109 non-null    float64 
 10  MAXOPPRESSURE    57 non-null     float64 
 11  SHAPE_Length     109 non-null    float64 
 12  line_age_yr      109 non-null    float64 
 13  geometry         109 non-null    geometry
dtypes: float64(7), geometry(1), int32(1), object(5)
memory usage: 11.6+ KB


### Consolidate variables uniformly

In [28]:
# List of specific columns for which to print unique values
columns_to_check = ['Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material']

# Create a dictionary to store unique values for each column
unique_values_dict = {}

# Loop through the specified columns and store unique values in the dictionary
for column in columns_to_check:
    if column in spills.columns:  # Check if the column exists in the DataFrame
        unique_values = spills[column].unique().tolist()  # Convert numpy array to list
        unique_values_dict[column] = unique_values
    else:
        unique_values_dict[column] = "Column not found in DataFrame."

# Print each column's unique values list
for column, values in unique_values_dict.items():
    print(f"Unique values in {column}: {values}")

Unique values in Status: ['Active', 'Out of Service', 'abandoned', 'ACTIVE']
Unique values in FLOWLINEACTION: ['Registration', None, 'Pre-Abandonment Notice', 'Realignment', 'Abandonment Verification', 'Abandonment']
Unique values in LOCATIONTYPE: ['Well Site', 'Production Facilities', 'Manifold', 'Compressor Station']
Unique values in Fluid: ['Crude Oil', 'Multiphase', 'Natural Gas', 'Crude Oil Emulsion', 'Produced Water', 'Oil', 'Natural Gas Production', 'Multi-Phase\xa0', 'Oil and water', '3 Phase', 'PRODUCED WATER', 'MULTI PHASE', 'Natural Gas Lift']
Unique values in Material: ['Carbon Steel', 'HDPE', 'Fiberglass', 'Steel', 'Other (Poly)', 'Other', 'steel', 'Composite HDPE', 'PVC', 'Fiberglass Sleaved w/ HDPE', 'Poly', 'STEEL']


In [29]:
# Define the mapping dictionary for the 'Status' column
status_mapping = {
    'ACTIVE': 'Active',           # Normalize to 'Active'
    'Active': 'Active',           # No change needed, already in desired format
    'Out Of Service': 'Out of Service', # Normalize case and spacing
    'Out of Service': 'Out of Service', # No change needed
    'abandoned': 'Abandoned'      # Capitalize to 'Abandoned'
}

# Apply the mapping to the 'Status' column
spills['Status'] = spills['Status'].replace(status_mapping)
print(spills['Status'].unique())

# Define the mapping dictionary for the 'FLOWLINEACTION' column
flowlineaction_mapping = {
    'Registration': 'Registration',  # No change needed
    None: 'Unknown',                # Mapping None to 'Unknown' (or you could leave it as None if preferred)
    'Abandonment': 'Abandonment',   # No change needed
    'Out of Service': 'Out of Service', # Ensure consistent formatting if there's case variation elsewhere
    'Realignment': 'Realignment',   # No change needed
    'Pre-Abandonment Notice': 'Pre-Abandonment Notice' # No change needed
}

# Apply the mapping to the 'FLOWLINEACTION' column
spills['FLOWLINEACTION'] = spills['FLOWLINEACTION'].replace(flowlineaction_mapping)
print(spills['FLOWLINEACTION'].unique())

['Active' 'Out of Service' 'Abandoned']
['Registration' 'Unknown' 'Pre-Abandonment Notice' 'Realignment'
 'Abandonment Verification' 'Abandonment']


In [30]:
# Define the mapping dictionary for the 'Fluid' column, aligning with the provided categories
fluid_mapping = {
    'Crude Oil': 'Crude Oil',                         
    'Multiphase': 'Multiphase',                       
    'Natural Gas': 'Natural Gas',                     
    'Crude Oil Emulsion': 'Crude Oil Emulsion',       
    'Produced Water': 'Produced Water',               
    'PRODUCED WATER': 'Produced Water',               
    'Oil': 'Crude Oil',                               # Mapping general oil as Crude Oil
    'Liquid': 'Other',                                # Mapping generic liquids to 'Other'
    'Natural Gas Production': 'Natural Gas',          # Simplifying to 'Natural Gas'
    'Multi-Phase\xa0': 'Multiphase',                  # Standardizing 'Multiphase'
    'Mulitphase': 'Multiphase',                       # Standardizing 'Multiphase'
    'Oil and water': 'Oil Water',                     # Standardizing as 'Oil Water'
    '3 Phase': 'Multiphase',                          # Mapping to 'Multiphase'
    'Oil, Gas, Water': 'Oil Water',                   # Assuming mix of oil and water primarily
    'Mulitphase': 'Multiphase',                       # Correcting typo and standardizing
    'Co2/Produced Water': 'Co2/Produced Water',       # No change needed, already a standard category
    'Condensate': 'Condensate',                       # Adding as its own category
    'Full Well Stream': 'Full Well Stream'            # Adding as its own category
}

# Apply the mapping to the 'Fluid' column
spills['Fluid'] = spills['Fluid'].replace(fluid_mapping)
print(spills['Fluid'].unique())

['Crude Oil' 'Multiphase' 'Natural Gas' 'Crude Oil Emulsion'
 'Produced Water' 'Oil Water' 'MULTI PHASE' 'Natural Gas Lift']


In [31]:
# Define the mapping dictionary for the 'Material' column
material_mapping = {
    'Carbon Steel': 'Carbon Steel', 
    'HDPE': 'HDPE', 
    'LINED STEEL': 'Steel',          # Assuming Lined Steel is generally categorized as Steel
    'Fiberglass': 'Fiberglass', 
    'Steel': 'Steel', 
    'HDPE Poly': 'HDPE',             # Standardizing as HDPE
    'Other (Poly)': 'Other',         # Simplifying to 'Other'
    'Other': 'Other', 
    'FPLP': 'Other',                 # FPLP is often a specific type of lining or pipe, classify as Other if not specifically relevant
    'steel': 'Steel',                # Ensuring case consistency
    'Composite HDPE': 'HDPE',        # Composite materials containing HDPE, categorized as HDPE
    'PVC': 'PVC', 
    'Poly': 'Poly', 
    'Poly/Steel': 'Poly/Steel',      # This is already in the reference list
    'Carbon Steel/HDPE/Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/HDPE': 'Carbon Steel/HDPE', 
    'Unknown': 'Unknown',            # Keep as is if applicable
    'Duplex': 'Duplex',              # Duplex typically refers to stainless steel but keeping separate as may be specific
    'Co2/Produced Water': 'Co2/Produced Water',   # Specific type of material handling, categorized separately
    'Polycarbonate': 'Polycarbonate',
    'Polycarbonate/Steel': 'Polycarbonate/Steel',
    'Polyethylene': 'Polyethylene',  # Mapping to Polyethylene where applicable
    'Polypropylene': 'Polypropylene',
    'Fiberglass/Carbon Steel': 'Fiberglass/Carbon Steel',
    'Hdpe/Steel, Flexsteel': 'HDPE/Steel'         # Assuming Hdpe/Steel includes any combination like Flexsteel
}

# Apply the mapping to the 'Material' column
spills['Material'] = spills['Material'].replace(material_mapping)
print(spills['Material'].unique())

['Carbon Steel' 'HDPE' 'Fiberglass' 'Steel' 'Other' 'PVC'
 'Fiberglass Sleaved w/ HDPE' 'Poly' 'STEEL']


In [32]:
# Define the mapping dictionary for the 'Root Cause Type' column
# root_cause_mapping = {
#     'Corrosion': 'Corrosion', 
#     'Unknown': 'Unknown', 
#     'Incorrect Operation': 'Incorrect Operation', 
#     'Equipment Failure': 'Equipment Failure', 
#     'Other Outside Force Damage': 'Other Outside Force Damage', 
#     'Natural Force Damage': 'Natural Force Damage', 
#     'Pipe, Weld, or Joint Failure': 'Pipe, Weld, or Joint Failure', 
#     'Excavation Damage': 'Excavation Damage', 
#     'Other Outside Force': 'Other Outside Force Damage',  # Mapping to a more standardized category
#     'Pipe, Weld, Joint Failure': 'Pipe, Weld, or Joint Failure'  # Ensuring consistency in naming
# }

# Apply the mapping to the 'Root Cause Type' column
# spills['Root Cause Type'] = spills['Root Cause Type'].replace(root_cause_mapping)
# print(spills['Root Cause Type'].unique())

In [33]:
# Assuming 'spills' is your DataFrame
spills.columns = [
    'operator_number',         # Changed from OPERATOR_NUM to operator_number
    'flowline_id',             # Changed from FLOWLINEID to flowline_id
    'location_id',             # Changed from LOCATION_ID to location_id
    'status',                  # Already appropriately named but changed to lowercase
    'flowline_action',         # Changed from FLOWLINEACTION to flowline_action
    'location_type',           # Changed from LOCATIONTYPE to location_type
    'fluid',                   # Already appropriately named but changed to lowercase
    'material',                # Already appropriately named but changed to lowercase
    'diameter_in',             # Changed from Diam_in to diameter_in
    'length_ft',               # Changed from Length_ft to length_ft
    'max_operating_pressure',  # Changed from MAXOPPRESSURE to max_operating_pressure
    'shape_length',            # Changed from SHAPE_Length to shape_length
    'line_age_yr',             # Changed from line_age_yr to maintain consistency in the style
    # 'root_cause_type',         # Added to align with the column names provided earlier
    'geometry'                 # Already appropriately named but changed to lowercase
]

# Print new column names to verify
print(spills.columns.tolist())

['operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'shape_length', 'line_age_yr', 'geometry']


In [34]:
spills.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   operator_number         109 non-null    int32   
 1   flowline_id             109 non-null    float64 
 2   location_id             109 non-null    float64 
 3   status                  109 non-null    object  
 4   flowline_action         109 non-null    object  
 5   location_type           109 non-null    object  
 6   fluid                   109 non-null    object  
 7   material                109 non-null    object  
 8   diameter_in             109 non-null    float64 
 9   length_ft               109 non-null    float64 
 10  max_operating_pressure  57 non-null     float64 
 11  shape_length            109 non-null    float64 
 12  line_age_yr             109 non-null    float64 
 13  geometry                109 non-null    geometry
dtypes: float64(7), geo

In [35]:
# drop any NAs
spills = spills.dropna()
spills.shape

(57, 14)

In [36]:
# make variables integers
spills['diameter_in'] = spills['diameter_in'].astype(int)
spills['length_ft'] = spills['length_ft'].astype(int)
spills['max_operating_pressure'] = spills['max_operating_pressure'].astype(int)
spills['shape_length'] = spills['shape_length'].astype(int)
spills['line_age_yr'] = spills['line_age_yr'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [37]:
spills['risk'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [38]:
spills.head()

Unnamed: 0,operator_number,flowline_id,location_id,status,flowline_action,location_type,fluid,material,diameter_in,length_ft,max_operating_pressure,shape_length,line_age_yr,geometry,risk
0,96155,465855.0,442408.0,Active,Registration,Well Site,Crude Oil,Carbon Steel,12,141290,1100,43053,7,"MULTILINESTRING ((595672.003 4517988.031, 5956...",1
1,96155,465855.0,442408.0,Active,Registration,Well Site,Crude Oil,Carbon Steel,12,141290,1100,43053,7,"MULTILINESTRING ((595672.003 4517988.031, 5956...",1
2,10112,468013.0,467691.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,3,1967,100,599,22,"MULTILINESTRING ((579396.131 4497273.237, 5793...",1
3,96155,465843.0,446980.0,Active,Registration,Production Facilities,Natural Gas,Carbon Steel,6,129999,150,39612,7,"MULTILINESTRING ((593633.376 4516002.016, 5936...",1
4,96155,465843.0,446980.0,Active,Registration,Production Facilities,Natural Gas,Carbon Steel,6,129999,150,39612,7,"MULTILINESTRING ((593633.376 4516002.016, 5936...",1


# Merge Operator Numer and Name Mapping

In [39]:
# Merge the DataFrames
operator_mapping = pd.merge(no_spills_operator_mapping, spills_operator_mapping, how='outer')

# Drop duplicate rows
operator_mapping.drop_duplicates(inplace=True)
operator_mapping.shape

(115, 2)

In [40]:
print(f"No Spills: {len(no_spills)}")
print(f"Spills: {len(spills)}")

No Spills: 3866
Spills: 57


# Download Cleaned Data

In [41]:
no_spills.to_file("no_spills_cleaned.geojson", driver='GeoJSON')

In [42]:
spills.to_file("spills_cleaned.geojson", driver='GeoJSON')

In [43]:
operator_mapping.to_csv('operator_mapping.csv', index=False)