In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import fiona
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = None 

# set working directory
os.chdir('/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')

In [2]:
# Load Data
combined_gdf = gpd.read_file('combined_gdf.geojson')

# Data Cleaning


In [3]:
print(combined_gdf.info())
print(combined_gdf.shape)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 26629 entries, 0 to 26628
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   trkg_num                  562 non-null    float64       
 1   Operator Name             562 non-null    object        
 2   facility_type             518 non-null    object        
 3   Spill_Desc                562 non-null    object        
 4   Spill Type                315 non-null    object        
 5   Root Cause                392 non-null    object        
 6   Preventative Measure      383 non-null    object        
 7   Root Cause Type           562 non-null    object        
 8   Detailed Root Cause Type  487 non-null    object        
 9   Long                      562 non-null    float64       
 10  Lat                       562 non-null    float64       
 11  facility_status           10 non-null     object        
 12  Metallic? 

In [4]:
combined_gdf.describe()

Unnamed: 0,trkg_num,Long,Lat,nearest_flowline_index,CONSTRUCTDATE,Diam_in,ENDLAT,ENDLONG,FLOWLINEID,LOCATION_ID,Length_ft,MAXOPPRESSURE,OPERATOR_NUM,RECEIVE_DATE,SHAPE_Length,STARTLAT,STARTLOCATIONID,STARTLONG,risk
count,562.0,562.0,562.0,562.0,26629,26629.0,26629.0,26629.0,26618.0,26617.0,26629.0,15644.0,26629.0,26629,26629.0,26629.0,26296.0,26629.0,26629.0
mean,402406600.0,-106.119528,39.875744,14717.523132,2006-06-19 05:39:35.981000,6.027643,40.285825,-104.221555,468209.81644,420126.336251,60256.162555,431.206149,55029.632356,2020-03-12 21:34:10.374000,18362.595098,40.285785,376832.405043,-104.222172,0.021105
min,401524300.0,-108.94917,37.014053,605.0,1900-01-11 00:00:00,0.0,37.010428,-109.045542,455152.0,159601.0,2.2,0.0,710.0,2018-04-25 14:38:45.767000,0.671472,36.993609,159652.0,-109.049983,0.0
25%,401977800.0,-108.111547,39.529501,10120.0,2001-10-03 00:00:00,2.0,40.105102,-104.791736,464982.0,422835.0,1137.4,150.0,10633.0,2019-06-28 13:32:02.080000,347.034471,40.10441,320153.0,-104.791043,0.0
50%,402311300.0,-104.987023,40.07419,11159.0,2010-11-22 00:00:00,3.0,40.348638,-103.866455,465855.0,436612.0,3832.12,150.0,46685.0,2019-10-10 08:31:29.233000,1167.563347,40.348011,338136.5,-103.865346,0.0
75%,402962500.0,-104.507921,40.359008,23974.75,2017-06-08 00:00:00,12.375,40.811009,-103.78985,473098.0,461073.0,141290.45,550.0,96155.0,2020-11-18 13:44:52.847000,43053.037992,40.811044,435694.0,-103.791688,0.0
max,403278600.0,-102.083057,40.99735,26355.0,2020-06-01 00:00:00,278.0,40.99263,-102.045863,484080.0,483552.0,152023.9,4700.0,200077.0,2023-11-17 11:11:43.017000,46342.37979,40.996,484366.0,-102.046467,1.0
std,495331.6,1.920377,0.776363,6778.470601,,5.200918,0.742468,1.240436,5387.799531,57564.442523,68394.558122,606.742535,40678.696678,,20841.808974,0.742419,60817.837792,1.24021,0.143736


### Get line age from construction date

In [5]:
# Verify CONSTRUCTDATE is datetime type; convert if necessary
combined_gdf['CONSTRUCTDATE'] = pd.to_datetime(combined_gdf['CONSTRUCTDATE'])

# Calculate line_age
# Today's date
today = pd.Timestamp.now()

# Calculate the difference in years
combined_gdf['line_age_yr'] = (today - combined_gdf['CONSTRUCTDATE']).dt.days / 365.25

# Display the updated GeoDataFrame to verify 'line_age' column
print(combined_gdf[['CONSTRUCTDATE', 'line_age_yr']])

      CONSTRUCTDATE  line_age_yr
0        1972-08-07    52.569473
1        2018-01-04     7.159480
2        2011-08-10    13.563313
3        2004-05-10    20.813142
4        1993-11-07    31.318275
...             ...          ...
26624    2001-10-02    23.416838
26625    2008-02-22    17.026694
26626    2007-08-05    17.577002
26627    1998-09-27    26.431211
26628    1983-10-21    41.366188

[26629 rows x 2 columns]


In [6]:
# Print the list of column names
column_names = combined_gdf.columns.tolist()

print(column_names)

['trkg_num', 'Operator Name', 'facility_type', 'Spill_Desc', 'Spill Type', 'Root Cause', 'Preventative Measure', 'Root Cause Type', 'Detailed Root Cause Type', 'Long', 'Lat', 'facility_status', 'Metallic?', 'nearest_flowline_index', 'ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 'Diam_in', 'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'LOCATIONTYPE', 'LOCATION_ID', 'Length_ft', 'MAXOPPRESSURE', 'Material', 'OPERATOR_NUM', 'Operator', 'PIPEMATERIAL', 'RECEIVE_DATE', 'SHAPE_Length', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'Status', 'TYPEOFFLUIDTRANS', 'risk', 'geometry', 'line_age_yr']


In [7]:
mapping = {
    'KINDER MORGAN CO2 CO LP': 'KINDER MORGAN CO2 CO LLC',
    'BEEMAN OIL & GAS INC': 'BEEMAN OIL & GAS LLC',
}
combined_gdf['Operator'] = combined_gdf['Operator'].replace(mapping)

combined_gdf.rename(columns={'OPERATOR_NUM': 'operator_number'}, inplace=True)
combined_gdf.rename(columns={'Operator': 'operator_name'}, inplace=True)

# Extract unique OPERATOR_NUM values
unique_operator_nums = combined_gdf['operator_number'].unique()

# Create a DataFrame to map OPERATOR_NUM to Operator (assuming the column name for operator names is 'Operator')
combined_gdf_operator_mapping = combined_gdf[['operator_number', 'operator_name']].drop_duplicates().reset_index(drop=True)

# Display the mapping
print(combined_gdf_operator_mapping)

     operator_number                        operator_name
0              10110  GREAT WESTERN OPERATING COMPANY LLC
1              69175                       PDC ENERGY INC
2              47120      KERR MCGEE OIL & GAS ONSHORE LP
3             100322                     NOBLE ENERGY INC
4              10459             EXTRACTION OIL & GAS INC
..               ...                                  ...
115            65110        O'BRIEN ENERGY RESOURCES CORP
116            41550        TYLER ROCKIES EXPLORATION LTD
117            10506               SEELEY OIL COMPANY LLC
118            11001                  BROWN OIL & GAS LLC
119            10639            CPX PICEANCE HOLDINGS LLC

[120 rows x 2 columns]


In [8]:
# List of columns to be removed
columns_to_remove = [
    'trkg_num', 'Operator Name', 'facility_type', 'Spill_Desc', 'Spill Type', 'Root Cause', 'Preventative Measure', 'Detailed Root Cause Type', 
    'Long', 'Lat', 'facility_status', 'Metallic?', 'nearest_flowline_index', 'ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'CONSTRUCTDATE', 
    'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'PIPEMATERIAL', 'RECEIVE_DATE', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'TYPEOFFLUIDTRANS', 'operator_name']

# Drop the columns from the DataFrame
combined_gdf = combined_gdf.drop(columns=columns_to_remove)

In [9]:
# drop any NAs
# no_spills = combined_gdf.dropna()

In [10]:
'Root Cause Type', 'Diam_in', 'FLOWLINEACTION', 'FLOWLINEID', 'Fluid', 'LOCATIONTYPE', 'LOCATION_ID', 
'Length_ft', 'MAXOPPRESSURE', 'Material', 'OPERATOR_NUM', 'SHAPE_Length','Status', 'geometry', 'line_age_yr'

('Length_ft',
 'MAXOPPRESSURE',
 'Material',
 'OPERATOR_NUM',
 'SHAPE_Length',
 'Status',
 'geometry',
 'line_age_yr')

In [11]:
combined_gdf.head()

Unnamed: 0,Root Cause Type,Diam_in,FLOWLINEACTION,FLOWLINEID,Fluid,LOCATIONTYPE,LOCATION_ID,Length_ft,MAXOPPRESSURE,Material,operator_number,SHAPE_Length,Status,risk,geometry,line_age_yr
0,Unknown,2.0,,470450.0,Oil,Production Facilities,470449.0,542.71,,Steel,10110,165.354619,Active,1,"LINESTRING (545287.5 4410654.506, 545132.965 4...",52.569473
1,Unknown,3.5,Registration,477981.0,Produced Fluids,Production Facilities,447490.0,404.27,,Carbon Steel,69175,123.17501,New Construction,1,"LINESTRING (529087.406 4468617.814, 529165.786...",7.15948
2,Unknown,2.0,Abandonment,457300.0,PRODUCED WATER,Production Facilities,318070.0,18.23,,Steel,47120,5.553076,ACTIVE,1,"LINESTRING (526647.996 4445503.586, 526647.873...",13.563313
3,Unknown,2.0,Registration,457931.0,Multiphase,Production Facilities,422528.0,1135.36,,Carbon Steel,100322,345.926043,Active,1,"LINESTRING (534932.646 4463662.371, 535082.761...",20.813142
4,Unknown,2.0,Registration,466186.0,Multiphase,Production Facilities,455178.0,768.11,,Carbon Steel,100322,234.027984,Abandoned,1,"LINESTRING (521484.139 4483010.433, 521636.926...",31.318275


In [12]:
# Reorder df
new_order = ['operator_number', 'FLOWLINEID', 'LOCATION_ID', 'Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid',
             'Material', 'Diam_in', 'Length_ft', 'MAXOPPRESSURE', 'SHAPE_Length', 'line_age_yr', 'geometry', 'Root Cause Type']
combined_gdf = combined_gdf[new_order]
combined_gdf

Unnamed: 0,operator_number,FLOWLINEID,LOCATION_ID,Status,FLOWLINEACTION,LOCATIONTYPE,Fluid,Material,Diam_in,Length_ft,MAXOPPRESSURE,SHAPE_Length,line_age_yr,geometry,Root Cause Type
0,10110,470450.0,470449.0,Active,,Production Facilities,Oil,Steel,2.0,542.71,,165.354619,52.569473,"LINESTRING (545287.5 4410654.506, 545132.965 4...",Unknown
1,69175,477981.0,447490.0,New Construction,Registration,Production Facilities,Produced Fluids,Carbon Steel,3.5,404.27,,123.175010,7.159480,"LINESTRING (529087.406 4468617.814, 529165.786...",Unknown
2,47120,457300.0,318070.0,ACTIVE,Abandonment,Production Facilities,PRODUCED WATER,Steel,2.0,18.23,,5.553076,13.563313,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown
3,100322,457931.0,422528.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,1135.36,,345.926043,20.813142,"LINESTRING (534932.646 4463662.371, 535082.761...",Unknown
4,100322,466186.0,455178.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,768.11,,234.027984,31.318275,"LINESTRING (521484.139 4483010.433, 521636.926...",Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26624,100322,455232.0,446052.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,3.0,1413.74,,430.739655,23.416838,"LINESTRING (530328.842 4472004.142, 529900.46 ...",
26625,100322,455190.0,455177.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,527.60,,160.751553,17.026694,"LINESTRING (543138.306 4481306.544, 542978.929...",
26626,100322,455155.0,455096.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,3.0,2070.49,,630.836779,17.577002,"LINESTRING (519355.342 4482133.153, 519296.77 ...",
26627,100322,455156.0,455096.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,2.0,800.31,,243.837753,26.431211,"LINESTRING (519465.329 4482537.89, 519317.553 ...",


### Consolidate variables uniformly

In [13]:
# List of specific columns for which to print unique values
columns_to_check = ['Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material']

# Create a dictionary to store unique values for each column
unique_values_dict = {}

# Loop through the specified columns and store unique values in the dictionary
for column in columns_to_check:
    if column in combined_gdf.columns:  # Check if the column exists in the DataFrame
        unique_values = combined_gdf[column].unique().tolist()  # Convert numpy array to list
        unique_values_dict[column] = unique_values
    else:
        unique_values_dict[column] = "Column not found in DataFrame."

# Print each column's unique values list
for column, values in unique_values_dict.items():
    print(f"Unique values in {column}: {values}")

Unique values in Status: ['Active', 'New Construction', 'ACTIVE', 'Abandoned', 'REMOVED', 'ABANDONED', 'Out of Service', 'Out Of Service', 'Future', 'abandoned', 'ABiP', 'Inactive', 'Pre-Abandonment', 'InActive', 'OOS', 'Abandoned in Place', 'Shut in', 'Actove', 'Out of service', 'Removed', 'OutofService', 'Pre Abandonment', 'Avtive', 'shut in', 'PA', 'INACTIVE', 'Status', 'Out-of-Service', 'Actve', 'active', 'Abandon', 'PreAbandonment', 'TA', 'Abadnon', 'SI']
Unique values in FLOWLINEACTION: [None, 'Registration', 'Abandonment', 'Abandonment Verification', 'Out of Service', 'Pre-Abandonment Notice', 'Realignment', 'Removed From Service']
Unique values in LOCATIONTYPE: ['Production Facilities', 'Manifold', 'Well Site', 'Pit', 'Compressor Station', 'Gathering Line', 'Produced Water Transfer System', 'Crude Oil Transfer Line']
Unique values in Fluid: ['Oil', 'Produced Fluids', 'PRODUCED WATER', 'Multiphase', 'Natural Gas', 'Natural Gas Lift', '3 Phase', 'Produced Water', 'CO2', 'Natural 

In [14]:
status_mapping = {
    'Active': 'Active', 'ACTIVE': 'Active', 'Actove': 'Active', 'Avtive': 'Active', 'Actve': 'Active', 'active': 'Active',
    'Out of Service': 'Out of Service', 'OOS': 'Out of Service', 'OutofService': 'Out of Service', 'Out-of-Service': 'Out of Service', 'Out Of Service': 'Out of Service', 'Out of service': 'Out of Service',
    'Abandoned': 'Abandoned', 'abandoned': 'Abandoned', 'Abandoned in Place': 'Abandoned', 'ABANDONED': 'Abandoned', 'Abandon': 'Abandoned','Abadnon': 'Abandoned','TA': 'Abandoned',
    'Inactive': 'Inactive', 'InActive': 'Inactive', 'INACTIVE': 'Inactive',
    'PA': 'Pending Analysis', 'ABiP':'Pending Analysis', 'Shut in': 'Shut In', 'shut in': 'Shut In','SI':'Shut In',
    'Status': 'Unknown', 'Future': 'Future',
    'REMOVED': 'Removed',
    'Pre Abandonment':'Pre-Abandonment', 'PreAbandonment': 'Pre-Abandonment'
}
combined_gdf['Status'] = combined_gdf['Status'].replace(status_mapping)
print(combined_gdf['Status'].unique())


flowlineaction_mapping = {
    'Out of Service': 'Out of Service', 'Removed From Service': 'Out of Service',
    'Pre-Abandonment Notice': 'Pre-Abandonment Notice',
    'Abandonment Verification': 'Abandonment',
    'Realignment': 'Realignment',
    'Registration': 'Registration',
    'Abandonment': 'Abandonment'
}
combined_gdf['FLOWLINEACTION'] = combined_gdf['FLOWLINEACTION'].replace(flowlineaction_mapping)
print(combined_gdf['FLOWLINEACTION'].unique())


locationtype_mapping = {
    'Production Facilities': 'Production Facilities', 'Well Site': 'Well Site', 'Manifold': 'Manifold',
    'Compressor Station': 'Compressor Station', 'Gathering Line': 'Gathering Line',
    'Crude Oil Transfer Line': 'Crude Oil Transfer Line', 'Produced Water Transfer System': 'Produced Water Transfer System'
}
combined_gdf['LOCATIONTYPE'] = combined_gdf['LOCATIONTYPE'].replace(locationtype_mapping)
print(combined_gdf['LOCATIONTYPE'].unique())

['Active' 'New Construction' 'Abandoned' 'Removed' 'Out of Service'
 'Future' 'Pending Analysis' 'Inactive' 'Pre-Abandonment' 'Shut In'
 'Unknown']
[None 'Registration' 'Abandonment' 'Out of Service'
 'Pre-Abandonment Notice' 'Realignment']
['Production Facilities' 'Manifold' 'Well Site' 'Pit' 'Compressor Station'
 'Gathering Line' 'Produced Water Transfer System'
 'Crude Oil Transfer Line']


In [15]:
# Update the 'Fluid' column normalization and mapping in one comprehensive block
combined_gdf['Fluid'] = combined_gdf['Fluid'].str.strip().str.title().replace({
    'Natual Gas': 'Natural Gas',  # Correct common misspelling
    'Natural Gas Production': 'Natural Gas',
    'Co2': 'Co2/Produced Water',  # Assuming Co2 implies the mixed type
    'C02/Prod Water': 'Co2/Produced Water',
    'Co2/Prod Water': 'Co2/Produced Water',
    'Co2Produced Water': 'Co2/Produced Water',
    'Co2/Produced Wtaer': 'Co2/Produced Water',
    'Gas': 'Natural Gas',  # Assuming general Gas to be Natural Gas
    'Gas, Oil And Water': 'Full Well Stream',
    'Oil': 'Crude Oil',  # Assuming Oil to be categorized as Crude Oil
    'Crude Oil': 'Crude Oil',
    'Crude Oil Emulsion': 'Crude Oil Emulsion',
    'Emulsion': 'Crude Oil Emulsion',  # To unify with Crude Oil Emulsion
    'Crude Oil Emmulsion, Water And Oil': 'Crude Oil Emulsion',
    'Crude Oil And Water Emulsion': 'Crude Oil Emulsion',
    'Oil Water Emulsion': 'Crude Oil Emulsion',
    'Oil/Water': 'Crude Oil Emulsion',
    'Oil Water': 'Crude Oil Emulsion',
    'Oil And Water': 'Crude Oil Emulsion',
    'Oil /Water/Gas': 'Full Well Stream',
    'Oil/Gas/Water': 'Full Well Stream',
    'Oil, Gas, Water': 'Full Well Stream',
    '3 Phase': 'Multiphase',  # Assumed to mean the same
    'Multiphase': 'Multiphase',
    'Multi-Phase': 'Multiphase',
    'Mulitphase': 'Multiphase',
    'Multi Phase': 'Multiphase',
    'Mulit Phase': 'Multiphase',
    'Multi-Phase\xa0': 'Multiphase',  # Non-breaking space issue
    'Injection Produced Water': 'Produced Water',
    'Produced Water': 'Produced Water',
    'Water': 'Produced Water',
    'Saltwater': 'Produced Water',
    'Condensate': 'Condensate',  # Depending on your classification needs
    'Liquid': 'Other',
    'Liquids (Wtr/Cond)': 'Other', # General term, assuming to categorize broadly
    'Unprocessed Production Fluids': 'Other',
    'Production Fluids': 'Other',
    'Produced Fluids': 'Other',
    'Full Well Stream': 'Full Well Stream',  # Assuming similar handling
    'Other': 'Other',
    'Gas,  Oil And Water' : 'Full Well Stream',
    'Natural Gas Lift': 'Natural Gas',
    'Natuarl Gas': 'Natural Gas',
    'Natural Gas High Pressure': 'Natural Gas',
    'Natural Gas Supply':'Natural Gas',
    'Crude Oill Emulsion': 'Crude Oil Emulsion',
    'Unk': 'Unknown', 
    'Poly':'Polymer fluids'
})

# Review the changes to ensure the mappings are applied correctly
print(combined_gdf['Fluid'].unique())

['Crude Oil' 'Other' 'Produced Water' 'Multiphase' 'Natural Gas'
 'Co2/Produced Water' 'Crude Oil Emulsion' 'Condensate' 'Full Well Stream'
 'Polymer fluids' 'Unknown' 'Steel']


In [16]:
combined_gdf['Material'] = combined_gdf['Material'].str.strip().str.title().replace({
    'Fiberglass': 'Fiberglass', 
    'Fibergalss': 'Fiberglass', 
    'Fiberspar': 'Fiberglass', 
    'Fiber Glass': 'Fiberglass',
    'Carbon Steel': 'Carbon Steel', 
    'Carbonsteel': 'Carbon Steel',
    'Carbon Steel Sch 80': 'Carbon Steel',
    'Carbon Steel Sch 80': 'Carbon Steel',
    'Carbon Steel - Hdpe': 'Carbon Steel/HDPE', 
    'Carbon Steel, Hdpe,Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel, Hdpe, Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe/Stainless': 'Carbon Steel/HDPE/Stainless Steel', 
    'Carbon Steel/Hdpe': 'Carbon Steel/HDPE', 
    'Satinless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Steel': 'Steel', 
    'Lined Steel': 'Steel', 
    'Coated Steel': 'Steel', 
    'Flexsteel': 'Steel', 
    'Flexpipe' : 'Steel',
    'Fiber Glass And Carbon Steel': 'Fiberglass/Carbon Steel', 
    'Fiberglass And Hdpe': 'Fiberglass/HDPE',
    'Hdpe': 'HDPE', 
    'Hdpe Poly': 'HDPE', 
    'Composite Hdpe': 'HDPE',
    'Hdpe/Steel': 'HDPE/Steel', 
    'Hdpe Lined Steel': 'HDPE/Steel',
     'Hdpe/Steel, Flexsteel': 'HDPE/Steel',
    'Poly': 'Polycarbonate', 
    'Polyline': 'Polycarbonate', 
    'Poly & Steel': 'Polycarbonate/Steel', 
    'Steel/Poly': 'Polycarbonate/Steel',
    'Poly/Steel': 'Polycarbonate/Steel',
    'Polycarbonate': 'Polycarbonate', 
    'Polycarbonate/Steel': 'Polycarbonate/Steel',
    'Pvc': 'PVC', 
    'Flexspar': 'Fiberglass', 
    'Stainless': 'Steel', 
    'Stainless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Hdpe/Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Unknown': 'Unknown', 
    'Other': 'Other', 
    'Other (Poly)': 'Polycarbonate', 
    'Sdr7 Polyethelyne': 'Polyethylene', 
    'Sdr 11 Poly Pipe': 'Polyethylene', 
    'Sdr 11 Poly': 'Polyethylene', 
    'Poly Pipe': 'Polyethylene', 
    'Sdr_Poly': 'Polyethylene',
    'Poly': 'Polyethylene',
    'Poly Sdr 7': 'Polypropylene',
    'Poly Sdr-7': 'Polypropylene',
    'Duplex': 'Duplex', 
    'Fplp': 'Other', 
    'Flowline': 'Other',
    'Flex Steel': 'Steel',
    'Other (Flex Steel)': 'Steel',
    'Fiberglass And Carbon Steel': 'Carbon Steel/Fiberglass', 
    'Stainless Steel': 'Steel',
    'HDPE Lined Steel': 'HDPE/Steel',
    'Fiberglass/Hdpe': 'Fiberglass/HDPE',
    'Unk': 'Unknown', 'Other (Unknown)': 'Unknown', 'Other': 'Unknown',
})

# Verify the changes by printing the unique values in the 'Material' column
print(combined_gdf['Material'].unique())

['Steel' 'Carbon Steel' 'Polyethylene' 'Fiberglass' 'HDPE' 'Other'
 'Polycarbonate' 'PVC' 'Carbon Steel/HDPE/Stainless Steel'
 'Carbon Steel/HDPE' 'Unknown' 'Duplex' 'Fiberglass/HDPE'
 'Polycarbonate/Steel' 'Polypropylene' 'Co2/Produced Water'
 'Carbon Steel/Fiberglass' 'Oil' 'HDPE/Steel']


In [17]:
# Define the mapping dictionary for the 'Root Cause Type' column
root_cause_mapping = {
    'Corrosion': 'Corrosion', 
    'Unknown': 'Unknown', 
    'Incorrect Operation': 'Incorrect Operation', 
    'Equipment Failure': 'Equipment Failure', 
    'Equipment failure': 'Equipment Failure',  
    'Other Outside Force Damage': 'Other Outside Force Damage', 
    'Natural Force Damage': 'Natural Force Damage', 
    'Pipe, Weld, or Joint Failure': 'Pipe, Weld, or Joint Failure', 
    'Pipe, Weld Joint Failure': 'Pipe, Weld, or Joint Failure',
    'Excavation Damage': 'Excavation Damage', 
    'Other Outside Force': 'Other Outside Force Damage',  # Mapping to a more standardized category
    'Pipe, Weld, Joint Failure': 'Pipe, Weld, or Joint Failure'  # Ensuring consistency in naming
}

# Apply the mapping to the 'Root Cause Type' column
combined_gdf['Root Cause Type'] = combined_gdf['Root Cause Type'].replace(root_cause_mapping)
print(combined_gdf['Root Cause Type'].unique())

['Unknown' 'Corrosion' 'Equipment Failure' 'Incorrect Operation'
 'Pipe, Weld, or Joint Failure' 'Other Outside Force Damage'
 'Natural Force Damage' 'Excavation Damage' None]


In [18]:
combined_gdf.columns = [
    'operator_number',  # Changed from OPERATOR_NUM to operator_number
    'flowline_id',       # Changed from FLOWLINEID to flowline_id
    'location_id',       # Changed from LOCATION_ID to location_id
    'status',            # Already appropriately named but changed to lowercase
    'flowline_action',   # Changed from FLOWLINEACTION to flowline_action
    'location_type',     # Changed from LOCATIONTYPE to location_type
    'fluid',             # Already appropriately named but changed to lowercase
    'material',          # Already appropriately named but changed to lowercase
    'diameter_in',       # Changed from Diam_in to diameter_in
    'length_ft',         # Changed from Length_ft to length_ft
    'max_operating_pressure', # Changed from MAXOPPRESSURE to max_operating_pressure
    'shape_length',      # Changed from SHAPE_Length to shape_length
    'line_age_yr',
    'geometry',
    'root_cause'           # Already appropriately named but changed to lowercase
]

# Print new column names to verify
print(combined_gdf.columns.tolist())

['operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'shape_length', 'line_age_yr', 'geometry', 'root_cause']


In [19]:
combined_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 26629 entries, 0 to 26628
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   operator_number         26629 non-null  int32   
 1   flowline_id             26618 non-null  float64 
 2   location_id             26617 non-null  float64 
 3   status                  26629 non-null  object  
 4   flowline_action         17665 non-null  object  
 5   location_type           26629 non-null  object  
 6   fluid                   26629 non-null  object  
 7   material                26629 non-null  object  
 8   diameter_in             26629 non-null  float64 
 9   length_ft               26629 non-null  float64 
 10  max_operating_pressure  15644 non-null  float64 
 11  shape_length            26629 non-null  float64 
 12  line_age_yr             26629 non-null  float64 
 13  geometry                26629 non-null  geometry
 14  root_cause    

In [20]:
# Adding the 'risk' column based on whether 'root_cause' is present or not
combined_gdf['risk'] = combined_gdf['root_cause'].apply(lambda x: 1 if pd.notnull(x) else 0)

In [21]:
# Remove rows where 'max_operating_pressure' is NaN and 'risk' is 0
combined_gdf = combined_gdf[~((combined_gdf['max_operating_pressure'].isna()) & (combined_gdf['risk'] == 0))]

In [22]:
# Select the relevant numeric columns for KNN Imputation
columns_for_imputation = ['max_operating_pressure', 'diameter_in', 'length_ft', 'line_age_yr', 'shape_length', 'material', 'fluid']  # Add other relevant columns as needed

# Creating a copy of the data to prevent modifying the original directly
df_for_imputation = combined_gdf[columns_for_imputation].copy()

# Convert the 'material' column using Label Encoding
le = LabelEncoder()
df_for_imputation['material_encoded'] = le.fit_transform(df_for_imputation['material'])
df_for_imputation['fluid_encoded'] = le.fit_transform(df_for_imputation['fluid'])

# Drop the original 'material' and 'fluid' columns
df_for_imputation = df_for_imputation.drop(columns=['material', 'fluid'])

# Initialize the KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Fit and transform the data to impute missing values
df_imputed = imputer.fit_transform(df_for_imputation)

# Replace the original max_operating_pressure with the imputed values
combined_gdf['max_operating_pressure'] = df_imputed[:, 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [23]:
# make variables integers
combined_gdf['diameter_in'] = combined_gdf['diameter_in'].astype(int)
combined_gdf['length_ft'] = combined_gdf['length_ft'].astype(int)
combined_gdf['max_operating_pressure'] = combined_gdf['max_operating_pressure'].astype(int)
combined_gdf['shape_length'] = combined_gdf['shape_length'].astype(int)
combined_gdf['line_age_yr'] = combined_gdf['line_age_yr'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [24]:
combined_gdf

Unnamed: 0,operator_number,flowline_id,location_id,status,flowline_action,location_type,fluid,material,diameter_in,length_ft,max_operating_pressure,shape_length,line_age_yr,geometry,root_cause,risk
0,10110,470450.0,470449.0,Active,,Production Facilities,Crude Oil,Steel,2,542,34,165,52,"LINESTRING (545287.5 4410654.506, 545132.965 4...",Unknown,1
1,69175,477981.0,447490.0,New Construction,Registration,Production Facilities,Other,Carbon Steel,3,404,2430,123,7,"LINESTRING (529087.406 4468617.814, 529165.786...",Unknown,1
2,47120,457300.0,318070.0,Active,Abandonment,Production Facilities,Produced Water,Steel,2,18,2030,5,13,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown,1
3,100322,457931.0,422528.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,2,1135,435,345,20,"LINESTRING (534932.646 4463662.371, 535082.761...",Unknown,1
4,100322,466186.0,455178.0,Abandoned,Registration,Production Facilities,Multiphase,Carbon Steel,2,768,1006,234,31,"LINESTRING (521484.139 4483010.433, 521636.926...",Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26501,96155,456386.0,433999.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26502,96155,456381.0,433999.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26503,96155,456382.0,433999.0,Active,Registration,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26609,35080,455592.0,443145.0,Active,Registration,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637207.682 4380630.737, 637458.657...",,0


In [25]:
# Function to count only NaN values, excluding None
def count_only_nan(series):
    return series.apply(lambda x: 1 if isinstance(x, float) and np.isnan(x) else 0).sum()

# Apply the function to each column to count NaNs, not None
na_columns = combined_gdf.apply(count_only_nan)

# Filter columns with NaN values
columns_with_only_nan = na_columns[na_columns > 0]

# Print the results
print("Columns with NaN values (excluding None) and their counts:")
print(columns_with_only_nan)


Columns with NaN values (excluding None) and their counts:
flowline_id    2
location_id    9
dtype: int64


In [26]:
# Identify columns with NaN values
columns_with_na = combined_gdf.columns[combined_gdf.isna().any()]

# Create a dictionary to store the counts of NaNs with risk of 1 for each column
na_with_risk_1 = {}

# Iterate over each column with NaN values and count rows with risk = 1
for column in columns_with_na:
    count = combined_gdf[combined_gdf[column].isna() & (combined_gdf['risk'] == 1)].shape[0]
    na_with_risk_1[column] = count

# Print the counts of NaNs with risk of 1 for each column
print("Number of NaNs with risk of 1 in each column:")
print(na_with_risk_1)


Number of NaNs with risk of 1 in each column:
{'flowline_id': 0, 'location_id': 0, 'flowline_action': 100, 'root_cause': 0}


In [27]:
combined_gdf = combined_gdf.drop(columns=['flowline_action'])

In [28]:
combined_gdf = combined_gdf.dropna(subset=['flowline_id', 'location_id'])

In [29]:
combined_gdf


Unnamed: 0,operator_number,flowline_id,location_id,status,location_type,fluid,material,diameter_in,length_ft,max_operating_pressure,shape_length,line_age_yr,geometry,root_cause,risk
0,10110,470450.0,470449.0,Active,Production Facilities,Crude Oil,Steel,2,542,34,165,52,"LINESTRING (545287.5 4410654.506, 545132.965 4...",Unknown,1
1,69175,477981.0,447490.0,New Construction,Production Facilities,Other,Carbon Steel,3,404,2430,123,7,"LINESTRING (529087.406 4468617.814, 529165.786...",Unknown,1
2,47120,457300.0,318070.0,Active,Production Facilities,Produced Water,Steel,2,18,2030,5,13,"LINESTRING (526647.996 4445503.586, 526647.873...",Unknown,1
3,100322,457931.0,422528.0,Active,Production Facilities,Multiphase,Carbon Steel,2,1135,435,345,20,"LINESTRING (534932.646 4463662.371, 535082.761...",Unknown,1
4,100322,466186.0,455178.0,Abandoned,Production Facilities,Multiphase,Carbon Steel,2,768,1006,234,31,"LINESTRING (521484.139 4483010.433, 521636.926...",Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26501,96155,456386.0,433999.0,Active,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26502,96155,456381.0,433999.0,Active,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26503,96155,456382.0,433999.0,Active,Production Facilities,Multiphase,Carbon Steel,3,1404,150,427,6,"LINESTRING (597095.529 4518130.982, 597416.31 ...",,0
26609,35080,455592.0,443145.0,Active,Production Facilities,Crude Oil,HDPE,3,1175,40,358,7,"LINESTRING (637207.682 4380630.737, 637458.657...",,0


In [30]:
# Calculate the total number of rows
total_rows = combined_gdf.shape[0]

# Count rows where risk is 1
risk_1_count = combined_gdf[combined_gdf['risk'] == 1].shape[0]

# Count rows where risk is 0
risk_0_count = combined_gdf[combined_gdf['risk'] == 0].shape[0]

# Print the results
print(f"Total number of rows: {total_rows}")
print(f"Total number of rows with risk = 1: {risk_1_count}")
print(f"Total number of rows with risk = 0: {risk_0_count}")

Total number of rows: 15911
Total number of rows with risk = 1: 562
Total number of rows with risk = 0: 15349


# Download Cleaned Data

In [31]:
combined_gdf.to_file("final_cleaned_gdf.geojson", driver='GeoJSON')

In [32]:
combined_gdf_operator_mapping.to_csv('operator_mapping.csv', index=False)