In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import fiona
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = None 

# set working directory
os.chdir('/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')

In [4]:
# Load Data
combined_gdf = gpd.read_file('combined_gdf.geojson')

# Data Cleaning


In [5]:
print(combined_gdf.info())
print(combined_gdf.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 464153 entries, 0 to 464152
Data columns (total 42 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   trkg_num                  562 non-null     float64       
 1   Operator Name             562 non-null     object        
 2   facility_type             518 non-null     object        
 3   Spill_Desc                562 non-null     object        
 4   Spill Type                315 non-null     object        
 5   Root Cause                392 non-null     object        
 6   Preventative Measure      383 non-null     object        
 7   Root Cause Type           562 non-null     object        
 8   Detailed Root Cause Type  487 non-null     object        
 9   Long                      562 non-null     float64       
 10  Lat                       562 non-null     float64       
 11  facility_status           10 non-null      object        

In [6]:
combined_gdf.describe()

Unnamed: 0,trkg_num,Long,Lat,nearest_flowline_index,CONSTRUCTDATE,Diam_in,ENDLAT,ENDLONG,FLOWLINEID,LOCATION_ID,Length_ft,MAXOPPRESSURE,OPERATOR_NUM,RECEIVE_DATE,SHAPE_Length,STARTLAT,STARTLOCATIONID,STARTLONG,risk
count,562.0,562.0,562.0,562.0,464153,464153.0,464153.0,464153.0,463691.0,464042.0,464153.0,275058.0,464153.0,464153,464153.0,464153.0,457911.0,464153.0,464153.0
mean,402406600.0,-106.119528,39.875744,258977.298932,2006-11-27 19:14:23.443000,6.489194,40.125575,-104.206951,469099.310334,426842.368187,78508.049994,416.670819,50679.598497,2020-02-26 22:21:06.594000,23926.594335,40.125363,375042.929283,-104.20796,0.001211
min,401524300.0,-108.94917,37.014053,8192.0,1900-01-11 00:00:00,0.0,37.010428,-109.045542,455152.0,159601.0,2.2,0.0,710.0,2018-04-25 14:38:45.767000,0.671472,36.993609,159652.0,-109.049983,0.0
25%,401977800.0,-108.111547,39.529501,164291.0,2003-03-05 00:00:00,2.0,40.068558,-104.73404,465368.0,430314.0,4668.43,150.0,10699.0,2019-07-05 10:12:32.930000,1422.368635,40.071112,316405.0,-104.73256,0.0
50%,402311300.0,-104.987023,40.07419,183331.0,2009-11-01 00:00:00,4.0,40.307481,-103.80744,467786.0,439324.0,129999.02,150.0,46290.0,2019-09-17 10:43:58.840000,39612.461236,40.309013,337862.0,-103.810884,0.0
75%,402962500.0,-104.507921,40.359008,439655.75,2017-06-08 00:00:00,12.375,40.811009,-102.770461,473298.0,468602.0,141290.45,550.0,96155.0,2020-11-16 13:32:50.357000,43053.037992,40.811044,435694.0,-102.776453,0.0
max,403278600.0,-102.083057,40.99735,463904.0,2020-06-01 00:00:00,278.0,40.99263,-102.045863,484080.0,483552.0,152023.9,4700.0,200077.0,2023-11-17 11:11:43.017000,46342.37979,40.996,484366.0,-102.046467,1.0
std,495331.6,1.920377,0.776363,131809.969421,,4.916893,0.997899,1.703578,5204.946624,57409.758949,67059.815222,554.73702,40481.922523,,20436.136095,0.998024,62329.008703,1.702843,0.034776


### Get line age from construction date

In [7]:
# Verify CONSTRUCTDATE is datetime type; convert if necessary
combined_gdf['CONSTRUCTDATE'] = pd.to_datetime(combined_gdf['CONSTRUCTDATE'])

# Calculate line_age
# Today's date
today = pd.Timestamp.now()

# Calculate the difference in years
combined_gdf['line_age_yr'] = (today - combined_gdf['CONSTRUCTDATE']).dt.days / 365.25

# Display the updated GeoDataFrame to verify 'line_age' column
print(combined_gdf[['CONSTRUCTDATE', 'line_age_yr']])

       CONSTRUCTDATE  line_age_yr
0         1972-08-07    52.440794
1         2017-12-07     7.107461
2         2011-08-10    13.434634
3         1987-12-18    37.078713
4         1993-11-07    31.189596
...              ...          ...
464148    1998-09-27    26.302533
464149    1983-10-21    41.237509
464150    1983-10-21    41.237509
464151    1983-10-21    41.237509
464152    1983-10-21    41.237509

[464153 rows x 2 columns]


In [8]:
# Print the list of column names
column_names = combined_gdf.columns.tolist()

print(column_names)

['trkg_num', 'Operator Name', 'facility_type', 'Spill_Desc', 'Spill Type', 'Root Cause', 'Preventative Measure', 'Root Cause Type', 'Detailed Root Cause Type', 'Long', 'Lat', 'facility_status', 'Metallic?', 'nearest_flowline_index', 'ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 'Diam_in', 'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'LOCATIONTYPE', 'LOCATION_ID', 'Length_ft', 'MAXOPPRESSURE', 'Material', 'OPERATOR_NUM', 'Operator', 'PIPEMATERIAL', 'RECEIVE_DATE', 'SHAPE_Length', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'Status', 'TYPEOFFLUIDTRANS', 'risk', 'geometry', 'line_age_yr']


In [9]:
mapping = {
    'KINDER MORGAN CO2 CO LP': 'KINDER MORGAN CO2 CO LLC',
    'BEEMAN OIL & GAS INC': 'BEEMAN OIL & GAS LLC',
}
combined_gdf['Operator'] = combined_gdf['Operator'].replace(mapping)

combined_gdf.rename(columns={'OPERATOR_NUM': 'operator_number'}, inplace=True)
combined_gdf.rename(columns={'Operator': 'operator_name'}, inplace=True)

# Extract unique OPERATOR_NUM values
unique_operator_nums = combined_gdf['operator_number'].unique()

# Create a DataFrame to map OPERATOR_NUM to Operator (assuming the column name for operator names is 'Operator')
combined_gdf_operator_mapping = combined_gdf[['operator_number', 'operator_name']].drop_duplicates().reset_index(drop=True)

# Display the mapping
print(combined_gdf_operator_mapping)

     operator_number                        operator_name
0              10110  GREAT WESTERN OPERATING COMPANY LLC
1              69175                       PDC ENERGY INC
2              47120      KERR MCGEE OIL & GAS ONSHORE LP
3             100322                     NOBLE ENERGY INC
4              10459             EXTRACTION OIL & GAS INC
..               ...                                  ...
115            65110        O'BRIEN ENERGY RESOURCES CORP
116            41550        TYLER ROCKIES EXPLORATION LTD
117            10506               SEELEY OIL COMPANY LLC
118            11001                  BROWN OIL & GAS LLC
119            10639            CPX PICEANCE HOLDINGS LLC

[120 rows x 2 columns]


In [10]:
# List of columns to be removed
columns_to_remove = [
    'trkg_num', 'Operator Name', 'facility_type', 'Spill_Desc', 'Spill Type', 'Root Cause', 'Preventative Measure', 'Detailed Root Cause Type', 
    'Long', 'Lat', 'facility_status', 'Metallic?', 'nearest_flowline_index', 'ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 
    'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'PIPEMATERIAL', 'RECEIVE_DATE', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'TYPEOFFLUIDTRANS', 'operator_name']

# Drop the columns from the DataFrame
combined_gdf = combined_gdf.drop(columns=columns_to_remove)

In [11]:
# drop any NAs
# no_spills = combined_gdf.dropna()

In [12]:
'Root Cause Type', 'Diam_in', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'LOCATIONTYPE', 'LOCATION_ID', 
'Length_ft', 'MAXOPPRESSURE', 'Material', 'OPERATOR_NUM', 'SHAPE_Length','Status', 'geometry', 'line_age_yr'

('Length_ft',
 'MAXOPPRESSURE',
 'Material',
 'OPERATOR_NUM',
 'SHAPE_Length',
 'Status',
 'geometry',
 'line_age_yr')

In [13]:
combined_gdf.head()

Unnamed: 0,Root Cause Type,Diam_in,FLOWLINEACTION,FLOWLINEID,Fluid,LOCATIONTYPE,LOCATION_ID,Length_ft,MAXOPPRESSURE,Material,operator_number,SHAPE_Length,Status,risk,geometry,line_age_yr
0,Unknown,2.0,Pre-Abandonment Notice,470450.0,Oil,Production Facilities,470449.0,542.71,,Steel,10110,165.354619,Active,1,"LINESTRING (545287.5 4410654.506, 545239.243 4...",52.440794
1,Unknown,3.5,Registration,477982.0,Produced Fluids,Production Facilities,447490.0,404.27,,Carbon Steel,69175,123.17501,New Construction,1,"LINESTRING (529087.406 4468617.814, 529106.928...",7.107461
2,Unknown,2.0,Registration,457300.0,PRODUCED WATER,Production Facilities,318070.0,18.23,,Steel,47120,5.553076,ACTIVE,1,"LINESTRING (526647.996 4445503.586, 526647.873...",13.434634
3,Unknown,2.0,Abandonment,465120.0,Multiphase,Production Facilities,464594.0,1884.61,,Carbon Steel,100322,574.207157,Active,1,"LINESTRING (534361.603 4464302.795, 534361.11 ...",37.078713
4,Unknown,2.0,Registration,466186.0,Multiphase,Production Facilities,455178.0,768.11,,Carbon Steel,100322,234.027984,Abandoned,1,"LINESTRING (521614.711 4482858.937, 521636.926...",31.189596


In [14]:
# Reorder df
new_order = ['operator_number', 'FLOWLINEID', 'LOCATION_ID', 'Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid',
             'Material', 'Diam_in', 'Length_ft', 'MAXOPPRESSURE', 'SHAPE_Length', 'line_age_yr', 'geometry', 'Root Cause Type']
combined_gdf = combined_gdf[new_order]
combined_gdf

Unnamed: 0,operator_number,FLOWLINEID,LOCATION_ID,Status,FLOWLINEACTION,LOCATIONTYPE,Fluid,Material,Diam_in,Length_ft,MAXOPPRESSURE,SHAPE_Length,line_age_yr,geometry,Root Cause Type
0,10110,470450.0,470449.0,Active,Pre-Abandonment Notice,Production Facilities,Oil,Steel,2.0,542.71,,165.354619,52.440794,"LINESTRING (545287.5 4410654.506, 545239.243 4...",Unknown
1,69175,477982.0,447490.0,New Construction,Registration,Production Facilities,Produced Fluids,Carbon Steel,3.5,404.27,,123.175010,7.107461,"LINESTRING (529087.406 4468617.814, 529106.928...",Unknown
2,47120,457300.0,318070.0,ACTIVE,Registration,Production Facilities,PRODUCED WATER,Steel,2.0,18.23,,5.553076,13.434634,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown
3,100322,465120.0,464594.0,Active,Abandonment,Production Facilities,Multiphase,Carbon Steel,2.0,1884.61,,574.207157,37.078713,"LINESTRING (534361.603 4464302.795, 534361.11 ...",Unknown
4,100322,466186.0,455178.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,768.11,,234.027984,31.189596,"LINESTRING (521614.711 4482858.937, 521636.926...",Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464148,100322,455156.0,455096.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,800.31,,243.837753,26.302533,"LINESTRING (519344.121 4482696.977, 519317.553...",
464149,100322,455152.0,319507.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,3.0,642.93,,195.886304,41.237509,"LINESTRING (515773.134 4480495.258, 515730.241...",
464150,100322,455152.0,319507.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,3.0,642.93,,195.886304,41.237509,"LINESTRING (515730.241 4480520.951, 515687.347...",
464151,100322,455152.0,319507.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,3.0,642.93,,195.886304,41.237509,"LINESTRING (515687.347 4480546.644, 515644.453...",


### Consolidate variables uniformly

In [15]:
# List of specific columns for which to print unique values
columns_to_check = ['Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material']

# Create a dictionary to store unique values for each column
unique_values_dict = {}

# Loop through the specified columns and store unique values in the dictionary
for column in columns_to_check:
    if column in combined_gdf.columns:  # Check if the column exists in the DataFrame
        unique_values = combined_gdf[column].unique().tolist()  # Convert numpy array to list
        unique_values_dict[column] = unique_values
    else:
        unique_values_dict[column] = "Column not found in DataFrame."

# Print each column's unique values list
for column, values in unique_values_dict.items():
    print(f"Unique values in {column}: {values}")

Unique values in Status: ['Active', 'New Construction', 'ACTIVE', 'Abandoned', 'REMOVED', 'ABANDONED', 'Out of Service', 'Out Of Service', 'Future', 'abandoned', 'Inactive', 'Pre-Abandonment', 'InActive', 'OOS', 'Abandoned in Place', 'Shut in', 'Actove', 'Out of service', 'Removed', 'OutofService', 'Pre Abandonment', 'Avtive', 'shut in', 'PA', 'INACTIVE', 'Status', 'Out-of-Service', 'Actve', 'ABiP', 'active', 'Abandon', 'PreAbandonment', 'TA', 'Abadnon', 'SI']
Unique values in FLOWLINEACTION: ['Pre-Abandonment Notice', 'Registration', 'Abandonment', None, 'Out of Service', 'Abandonment Verification', 'Realignment', 'Removed From Service']
Unique values in LOCATIONTYPE: ['Production Facilities', 'Manifold', 'Well Site', 'Pit', 'Compressor Station', 'Gathering Line', 'Produced Water Transfer System', 'Crude Oil Transfer Line']
Unique values in Fluid: ['Oil', 'Produced Fluids', 'PRODUCED WATER', 'Multiphase', 'Natural Gas', 'Natural Gas Lift', 'Natural Gas Production', 'Crude Oil', '3 Pha

In [16]:
status_mapping = {
    'Active': 'Active', 'ACTIVE': 'Active', 'Actove': 'Active', 'Avtive': 'Active', 'Actve': 'Active', 'active': 'Active',
    'Out of Service': 'Out of Service', 'OOS': 'Out of Service', 'OutofService': 'Out of Service', 'Out-of-Service': 'Out of Service', 'Out Of Service': 'Out of Service', 'Out of service': 'Out of Service',
    'Abandoned': 'Abandoned', 'abandoned': 'Abandoned', 'Abandoned in Place': 'Abandoned', 'ABANDONED': 'Abandoned', 'Abandon': 'Abandoned','Abadnon': 'Abandoned','TA': 'Abandoned',
    'Inactive': 'Inactive', 'InActive': 'Inactive', 'INACTIVE': 'Inactive',
    'PA': 'Pending Analysis', 'ABiP':'Pending Analysis', 'Shut in': 'Shut In', 'shut in': 'Shut In','SI':'Shut In',
    'Status': 'Unknown', 'Future': 'Future',
    'REMOVED': 'Removed',
    'Pre Abandonment':'Pre-Abandonment', 'PreAbandonment': 'Pre-Abandonment'
}
combined_gdf['Status'] = combined_gdf['Status'].replace(status_mapping)
print(combined_gdf['Status'].unique())


flowlineaction_mapping = {
    'Out of Service': 'Out of Service', 'Removed From Service': 'Out of Service',
    'Pre-Abandonment Notice': 'Pre-Abandonment Notice',
    'Abandonment Verification': 'Abandonment',
    'Realignment': 'Realignment',
    'Registration': 'Registration',
    'Abandonment': 'Abandonment'
}
combined_gdf['FLOWLINEACTION'] = combined_gdf['FLOWLINEACTION'].replace(flowlineaction_mapping)
print(combined_gdf['FLOWLINEACTION'].unique())


locationtype_mapping = {
    'Production Facilities': 'Production Facilities', 'Well Site': 'Well Site', 'Manifold': 'Manifold',
    'Compressor Station': 'Compressor Station', 'Gathering Line': 'Gathering Line',
    'Crude Oil Transfer Line': 'Crude Oil Transfer Line', 'Produced Water Transfer System': 'Produced Water Transfer System'
}
combined_gdf['LOCATIONTYPE'] = combined_gdf['LOCATIONTYPE'].replace(locationtype_mapping)
print(combined_gdf['LOCATIONTYPE'].unique())

['Active' 'New Construction' 'Abandoned' 'Removed' 'Out of Service'
 'Future' 'Inactive' 'Pre-Abandonment' 'Shut In' 'Pending Analysis'
 'Unknown']
['Pre-Abandonment Notice' 'Registration' 'Abandonment' None
 'Out of Service' 'Realignment']
['Production Facilities' 'Manifold' 'Well Site' 'Pit' 'Compressor Station'
 'Gathering Line' 'Produced Water Transfer System'
 'Crude Oil Transfer Line']


In [17]:
# Update the 'Fluid' column normalization and mapping in one comprehensive block
combined_gdf['Fluid'] = combined_gdf['Fluid'].str.strip().str.title().replace({
    'Natual Gas': 'Natural Gas',  # Correct common misspelling
    'Natural Gas Production': 'Natural Gas',
    'Co2': 'Co2/Produced Water',  # Assuming Co2 implies the mixed type
    'C02/Prod Water': 'Co2/Produced Water',
    'Co2/Prod Water': 'Co2/Produced Water',
    'Co2Produced Water': 'Co2/Produced Water',
    'Co2/Produced Wtaer': 'Co2/Produced Water',
    'Gas': 'Natural Gas',  # Assuming general Gas to be Natural Gas
    'Gas, Oil And Water': 'Full Well Stream',
    'Oil': 'Crude Oil',  # Assuming Oil to be categorized as Crude Oil
    'Crude Oil': 'Crude Oil',
    'Crude Oil Emulsion': 'Crude Oil Emulsion',
    'Emulsion': 'Crude Oil Emulsion',  # To unify with Crude Oil Emulsion
    'Crude Oil Emmulsion, Water And Oil': 'Crude Oil Emulsion',
    'Crude Oil And Water Emulsion': 'Crude Oil Emulsion',
    'Oil Water Emulsion': 'Crude Oil Emulsion',
    'Oil/Water': 'Crude Oil Emulsion',
    'Oil Water': 'Crude Oil Emulsion',
    'Oil And Water': 'Crude Oil Emulsion',
    'Oil /Water/Gas': 'Full Well Stream',
    'Oil/Gas/Water': 'Full Well Stream',
    'Oil, Gas, Water': 'Full Well Stream',
    '3 Phase': 'Multiphase',  # Assumed to mean the same
    'Multiphase': 'Multiphase',
    'Multi-Phase': 'Multiphase',
    'Mulitphase': 'Multiphase',
    'Multi Phase': 'Multiphase',
    'Mulit Phase': 'Multiphase',
    'Multi-Phase\xa0': 'Multiphase',  # Non-breaking space issue
    'Injection Produced Water': 'Produced Water',
    'Produced Water': 'Produced Water',
    'Water': 'Produced Water',
    'Saltwater': 'Produced Water',
    'Condensate': 'Condensate',  # Depending on your classification needs
    'Liquid': 'Other',
    'Liquids (Wtr/Cond)': 'Other', # General term, assuming to categorize broadly
    'Unprocessed Production Fluids': 'Other',
    'Production Fluids': 'Other',
    'Produced Fluids': 'Other',
    'Full Well Stream': 'Full Well Stream',  # Assuming similar handling
    'Other': 'Other',
    'Gas,  Oil And Water' : 'Full Well Stream',
    'Natural Gas Lift': 'Natural Gas',
    'Natuarl Gas': 'Natural Gas',
    'Natural Gas High Pressure': 'Natural Gas',
    'Natural Gas Supply':'Natural Gas',
    'Crude Oill Emulsion': 'Crude Oil Emulsion',
    'Unk': 'Unknown', 
    'Poly':'Polymer fluids'
})

# Review the changes to ensure the mappings are applied correctly
print(combined_gdf['Fluid'].unique())

['Crude Oil' 'Other' 'Produced Water' 'Multiphase' 'Natural Gas'
 'Co2/Produced Water' 'Crude Oil Emulsion' 'Condensate' 'Full Well Stream'
 'Polymer fluids' 'Unknown' 'Steel']


In [18]:
combined_gdf['Material'] = combined_gdf['Material'].str.strip().str.title().replace({
    'Fiberglass': 'Fiberglass', 
    'Fibergalss': 'Fiberglass', 
    'Fiberspar': 'Fiberglass', 
    'Fiber Glass': 'Fiberglass',
    'Carbon Steel': 'Carbon Steel', 
    'Carbonsteel': 'Carbon Steel',
    'Carbon Steel Sch 80': 'Carbon Steel',
    'Carbon Steel Sch 80': 'Carbon Steel',
    'Carbon Steel - Hdpe': 'Carbon Steel/HDPE', 
    'Carbon Steel, Hdpe,Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel, Hdpe, Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe/Stainless': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe': 'Carbon Steel/HDPE', 
    'Satinless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Steel': 'Steel', 
    'Lined Steel': 'Steel', 
    'Coated Steel': 'Steel', 
    'Flexsteel': 'Steel', 
    'Flexpipe' : 'Steel',
    'Fiber Glass And Carbon Steel': 'Fiberglass/Carbon Steel', 
    'Fiberglass And Hdpe': 'Fiberglass/HDPE',
    'Hdpe': 'HDPE', 
    'Hdpe Poly': 'HDPE', 
    'Composite Hdpe': 'HDPE',
    'Hdpe/Steel': 'HDPE/Steel', 
    'Hdpe Lined Steel': 'HDPE/Steel',
     'Hdpe/Steel, Flexsteel': 'HDPE/Steel',
    'Poly': 'Polycarbonate', 
    'Polyline': 'Polycarbonate', 
    'Poly & Steel': 'Polycarbonate/Steel', 
    'Steel/Poly': 'Polycarbonate/Steel',
    'Poly/Steel': 'Polycarbonate/Steel',
    'Polycarbonate': 'Polycarbonate', 
    'Polycarbonate/Steel': 'Polycarbonate/Steel',
    'Pvc': 'PVC', 
    'Flexspar': 'Fiberglass', 
    'Stainless': 'Steel', 
    'Stainless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Hdpe/Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Unknown': 'Unknown', 
    'Other': 'Other', 
    'Other (Poly)': 'Polycarbonate', 
    'Sdr7 Polyethelyne': 'Polyethylene', 
    'Sdr 11 Poly Pipe': 'Polyethylene', 
    'Sdr 11 Poly': 'Polyethylene', 
    'Poly Pipe': 'Polyethylene', 
    'Sdr_Poly': 'Polyethylene',
    'Poly': 'Polyethylene',
    'Poly Sdr 7': 'Polypropylene',
    'Poly Sdr-7': 'Polypropylene',
    'Duplex': 'Duplex', 
    'Fplp': 'Other', 
    'Flowline': 'Other',
    'Flex Steel': 'Steel',
    'Other (Flex Steel)': 'Steel',
    'Fiberglass And Carbon Steel': 'Carbon Steel/Fiberglass', 
    'Stainless Steel': 'Steel',
    'HDPE Lined Steel': 'HDPE/Steel',
    'Fiberglass/Hdpe': 'Fiberglass/HDPE',
    'Unk': 'Unknown', 'Other (Unknown)': 'Unknown', 'Other': 'Unknown',
})

# Verify the changes by printing the unique values in the 'Material' column
print(combined_gdf['Material'].unique())

['Steel' 'Carbon Steel' 'Polyethylene' 'Fiberglass' 'HDPE' 'Other'
 'Polycarbonate' 'PVC' 'Unknown' 'Carbon Steel/HDPE/Stainless Steel'
 'Carbon Steel/HDPE' 'Duplex' 'Fiberglass/HDPE' 'Polycarbonate/Steel'
 'Polypropylene' 'Co2/Produced Water' 'Carbon Steel/Fiberglass' 'Oil'
 'HDPE/Steel']


In [19]:
# Define the mapping dictionary for the 'Root Cause Type' column
root_cause_mapping = {
    'Corrosion': 'Corrosion', 
    'Unknown': 'Unknown', 
    'Incorrect Operation': 'Incorrect Operation', 
    'Equipment Failure': 'Equipment Failure', 
    'Equipment failure': 'Equipment Failure',  
    'Other Outside Force Damage': 'Other Outside Force Damage', 
    'Natural Force Damage': 'Natural Force Damage', 
    'Pipe, Weld, or Joint Failure': 'Pipe, Weld, or Joint Failure', 
    'Pipe, Weld Joint Failure': 'Pipe, Weld, or Joint Failure',
    'Excavation Damage': 'Excavation Damage', 
    'Other Outside Force': 'Other Outside Force Damage',  # Mapping to a more standardized category
    'Pipe, Weld, Joint Failure': 'Pipe, Weld, or Joint Failure'  # Ensuring consistency in naming
}

# Apply the mapping to the 'Root Cause Type' column
combined_gdf['Root Cause Type'] = combined_gdf['Root Cause Type'].replace(root_cause_mapping)
print(combined_gdf['Root Cause Type'].unique())

['Unknown' 'Corrosion' 'Equipment Failure' 'Incorrect Operation'
 'Pipe, Weld, or Joint Failure' 'Other Outside Force Damage'
 'Natural Force Damage' 'Excavation Damage' None]


In [20]:
combined_gdf.columns = [
    'operator_number',  # Changed from OPERATOR_NUM to operator_number
    'flowline_id',       # Changed from FLOWLINEID to flowline_id
    'location_id',       # Changed from LOCATION_ID to location_id
    'status',            # Already appropriately named but changed to lowercase
    'flowline_action',   # Changed from FLOWLINEACTION to flowline_action
    'location_type',     # Changed from LOCATIONTYPE to location_type
    'fluid',             # Already appropriately named but changed to lowercase
    'material',          # Already appropriately named but changed to lowercase
    'diameter_in',       # Changed from Diam_in to diameter_in
    'length_ft',         # Changed from Length_ft to length_ft
    'max_operating_pressure', # Changed from MAXOPPRESSURE to max_operating_pressure
    'shape_length',      # Changed from SHAPE_Length to shape_length
    'line_age_yr',
    'geometry',
    'root_cause'           # Already appropriately named but changed to lowercase
]

# Print new column names to verify
print(combined_gdf.columns.tolist())

['operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'shape_length', 'line_age_yr', 'geometry', 'root_cause']


In [21]:
combined_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 464153 entries, 0 to 464152
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   operator_number         464153 non-null  int32   
 1   flowline_id             463691 non-null  float64 
 2   location_id             464042 non-null  float64 
 3   status                  464153 non-null  object  
 4   flowline_action         322930 non-null  object  
 5   location_type           464153 non-null  object  
 6   fluid                   464153 non-null  object  
 7   material                464153 non-null  object  
 8   diameter_in             464153 non-null  float64 
 9   length_ft               464153 non-null  float64 
 10  max_operating_pressure  275058 non-null  float64 
 11  shape_length            464153 non-null  float64 
 12  line_age_yr             464153 non-null  float64 
 13  geometry                464153 non-null  geometry
 

In [22]:
# Adding the 'risk' column based on whether 'root_cause' is present or not
combined_gdf['risk'] = combined_gdf['root_cause'].apply(lambda x: 1 if pd.notnull(x) else 0)

In [23]:
# Remove rows where 'max_operating_pressure' is NaN and 'risk' is 0
combined_gdf = combined_gdf[~((combined_gdf['max_operating_pressure'].isna()) & (combined_gdf['risk'] == 0))]

In [24]:
# Select the relevant numeric columns for KNN Imputation
columns_for_imputation = ['max_operating_pressure', 'diameter_in', 'length_ft', 'line_age_yr', 'shape_length', 'material', 'fluid']  # Add other relevant columns as needed

# Creating a copy of the data to prevent modifying the original directly
df_for_imputation = combined_gdf[columns_for_imputation].copy()

# Convert the 'material' column using Label Encoding
le = LabelEncoder()
df_for_imputation['material_encoded'] = le.fit_transform(df_for_imputation['material'])
df_for_imputation['fluid_encoded'] = le.fit_transform(df_for_imputation['fluid'])

# Drop the original 'material' and 'fluid' columns
df_for_imputation = df_for_imputation.drop(columns=['material', 'fluid'])

# Initialize the KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Fit and transform the data to impute missing values
df_imputed = imputer.fit_transform(df_for_imputation)

# Replace the original max_operating_pressure with the imputed values
combined_gdf['max_operating_pressure'] = df_imputed[:, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [25]:
# make variables integers
combined_gdf['diameter_in'] = combined_gdf['diameter_in'].astype(int)
combined_gdf['length_ft'] = combined_gdf['length_ft'].astype(int)
combined_gdf['max_operating_pressure'] = combined_gdf['max_operating_pressure'].astype(int)
combined_gdf['shape_length'] = combined_gdf['shape_length'].astype(int)
combined_gdf['line_age_yr'] = combined_gdf['line_age_yr'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [26]:
combined_gdf

Unnamed: 0,operator_number,flowline_id,location_id,status,flowline_action,location_type,fluid,material,diameter_in,length_ft,max_operating_pressure,shape_length,line_age_yr,geometry,root_cause,risk
0,10110,470450.0,470449.0,Active,Pre-Abandonment Notice,Production Facilities,Crude Oil,Steel,2,542,30,165,52,"LINESTRING (545287.5 4410654.506, 545239.243 4...",Unknown,1
1,69175,477982.0,447490.0,New Construction,Registration,Production Facilities,Other,Carbon Steel,3,404,2430,123,7,"LINESTRING (529087.406 4468617.814, 529106.928...",Unknown,1
2,47120,457300.0,318070.0,Active,Registration,Production Facilities,Produced Water,Steel,2,18,2030,5,13,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown,1
3,100322,465120.0,464594.0,Active,Abandonment,Production Facilities,Multiphase,Carbon Steel,2,1884,410,574,37,"LINESTRING (534361.603 4464302.795, 534361.11 ...",Unknown,1
4,100322,466186.0,455178.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,2,768,1600,234,31,"LINESTRING (521614.711 4482858.937, 521636.926...",Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464018,35080,455592.0,443145.0,Active,Registration,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637352.926 4380659.316, 637401.725...",,0
464019,35080,455592.0,443145.0,Active,Registration,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637401.725 4380648.425, 637450.525...",,0
464020,35080,455592.0,443145.0,Active,Registration,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637450.525 4380637.533, 637458.657...",,0
464097,39560,455244.0,318928.0,Abandoned,Registration,Production Facilities,Condensate,Steel,3,199,1000,60,41,"LINESTRING (496551.343 4443672.866, 496503.869...",,0


In [27]:
# Function to count only NaN values, excluding None
def count_only_nan(series):
    return series.apply(lambda x: 1 if isinstance(x, float) and np.isnan(x) else 0).sum()

# Apply the function to each column to count NaNs, not None
na_columns = combined_gdf.apply(count_only_nan)

# Filter columns with NaN values
columns_with_only_nan = na_columns[na_columns > 0]

# Print the results
print("Columns with NaN values (excluding None) and their counts:")
print(columns_with_only_nan)


Columns with NaN values (excluding None) and their counts:
flowline_id    34
location_id    90
dtype: int64


In [28]:
# Identify columns with NaN values
columns_with_na = combined_gdf.columns[combined_gdf.isna().any()]

# Create a dictionary to store the counts of NaNs with risk of 1 for each column
na_with_risk_1 = {}

# Iterate over each column with NaN values and count rows with risk = 1
for column in columns_with_na:
    count = combined_gdf[combined_gdf[column].isna() & (combined_gdf['risk'] == 1)].shape[0]
    na_with_risk_1[column] = count

# Print the counts of NaNs with risk of 1 for each column
print("Number of NaNs with risk of 1 in each column:")
print(na_with_risk_1)


Number of NaNs with risk of 1 in each column:
{'flowline_id': 0, 'location_id': 0, 'flowline_action': 101, 'root_cause': 0}


In [29]:
combined_gdf = combined_gdf.drop(columns=['flowline_action'])

In [30]:
combined_gdf = combined_gdf.dropna(subset=['flowline_id', 'location_id'])

In [31]:
combined_gdf


Unnamed: 0,operator_number,flowline_id,location_id,status,location_type,fluid,material,diameter_in,length_ft,max_operating_pressure,shape_length,line_age_yr,geometry,root_cause,risk
0,10110,470450.0,470449.0,Active,Production Facilities,Crude Oil,Steel,2,542,30,165,52,"LINESTRING (545287.5 4410654.506, 545239.243 4...",Unknown,1
1,69175,477982.0,447490.0,New Construction,Production Facilities,Other,Carbon Steel,3,404,2430,123,7,"LINESTRING (529087.406 4468617.814, 529106.928...",Unknown,1
2,47120,457300.0,318070.0,Active,Production Facilities,Produced Water,Steel,2,18,2030,5,13,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown,1
3,100322,465120.0,464594.0,Active,Production Facilities,Multiphase,Carbon Steel,2,1884,410,574,37,"LINESTRING (534361.603 4464302.795, 534361.11 ...",Unknown,1
4,100322,466186.0,455178.0,Abandoned,Production Facilities,Multiphase,Carbon Steel,2,768,1600,234,31,"LINESTRING (521614.711 4482858.937, 521636.926...",Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464018,35080,455592.0,443145.0,Active,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637352.926 4380659.316, 637401.725...",,0
464019,35080,455592.0,443145.0,Active,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637401.725 4380648.425, 637450.525...",,0
464020,35080,455592.0,443145.0,Active,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637450.525 4380637.533, 637458.657...",,0
464097,39560,455244.0,318928.0,Abandoned,Production Facilities,Condensate,Steel,3,199,1000,60,41,"LINESTRING (496551.343 4443672.866, 496503.869...",,0


In [32]:
# Calculate the total number of rows
total_rows = combined_gdf.shape[0]

# Count rows where risk is 1
risk_1_count = combined_gdf[combined_gdf['risk'] == 1].shape[0]

# Count rows where risk is 0
risk_0_count = combined_gdf[combined_gdf['risk'] == 0].shape[0]

# Print the results
print(f"Total number of rows: {total_rows}")
print(f"Total number of rows with risk = 1: {risk_1_count}")
print(f"Total number of rows with risk = 0: {risk_0_count}")

Total number of rows: 275214
Total number of rows with risk = 1: 562
Total number of rows with risk = 0: 274652


# Download Cleaned Data

In [33]:
combined_gdf.to_file("final_cleaned_gdf.geojson", driver='GeoJSON')

In [34]:
combined_gdf_operator_mapping.to_csv('operator_mapping.csv', index=False)