In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Directory holding the project's data files. Set the COGCC_DATA_DIR
# environment variable to point at a different checkout; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
DATA_DIR = os.environ.get(
    'COGCC_DATA_DIR',
    '/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data',
)

# set working directory (os.chdir raises FileNotFoundError if DATA_DIR is missing)
os.chdir(DATA_DIR)

# 1. Read the flowline GeoJSON, then reproject it to WGS84 (EPSG:4326)
line_data = gpd.read_file("full_length_flowlines.geojson")
line_data = line_data.to_crs(epsg=4326)

# every flowline starts out with risk = 0
line_data["risk"] = 0

# 11. Parse CONSTRUCTDATE and derive line_age_yr (years elapsed up to now)
# Values that cannot be parsed become NaT because of errors='coerce'.
construct_dates = pd.to_datetime(line_data['CONSTRUCTDATE'], errors='coerce')
today = pd.Timestamp.now()

line_data['CONSTRUCTDATE'] = construct_dates
# whole elapsed days divided by 365.25 days per year
line_data['line_age_yr'] = (today - construct_dates).dt.days / 365.25

print(line_data[['CONSTRUCTDATE', 'line_age_yr']].head())

# 12. Collapse known operator-name variants onto a single spelling, then
#     rename the operator columns to snake_case
mapping = {}
mapping['KINDER MORGAN CO2 CO LP'] = 'KINDER MORGAN CO2 CO LLC'
mapping['BEEMAN OIL & GAS INC'] = 'BEEMAN OIL & GAS LLC'

# only exact matches of a mapping key are rewritten; other names pass through
line_data['Operator'] = line_data['Operator'].replace(to_replace=mapping)

line_data.rename(
    columns={
        'OPERATOR_NUM': 'operator_number',
        'Operator': 'operator_name',
    },
    inplace=True,
)

# 14. Remove columns that are not needed downstream.
#     Names absent from line_data are skipped instead of raising.
columns_to_remove = [
    'trkg_num',
    'Operator Name',
    'facility_type',
    'Spill_Desc',
    'Spill Type',
    'Root Cause',
    'Preventative Measure',
    'Detailed Root Cause Type',
    'Long',
    'Lat',
    'facility_status',
    'Metallic?',
    'ACTIONDESCRIPTION',
    'BEDDINGMATERIAL',
    'COMPANY_NAME',
    'ENDLAT',
    'ENDLONG',
    'ENTIRELINEREMOVED',
    'PIPEMATERIAL',
    'RECEIVE_DATE',
    'STARTLAT',
    'STARTLOCATIONID',
    'STARTLONG',
    'TYPEOFFLUIDTRANS',
    'operator_name',
    'SHAPE_Length',
    'matched_crudeoil_idx',
]

# keep the order of columns_to_remove so the printed list reads the same way
existing_columns = set(line_data.columns)
cols_to_drop = [name for name in columns_to_remove if name in existing_columns]
print(f"Dropping these existing columns: {cols_to_drop}")

line_data = line_data.drop(columns=cols_to_drop)

# 15. Put the columns into a fixed order; fail loudly if any is absent
new_order = [
    'unique_id',
    'operator_number',
    'FLOWLINEID',
    'LOCATION_ID',
    'Status',
    'FLOWLINEACTION',
    'LOCATIONTYPE',
    'Fluid',
    'Material',
    'Diam_in',
    'Length_ft',
    'MAXOPPRESSURE',
    'line_age_yr',
    'CONSTRUCTDATE',
    'geometry',
    'risk',
]

# selecting by new_order also discards every column that is not listed
available_columns = set(line_data.columns)
missing_cols = [name for name in new_order if name not in available_columns]
if len(missing_cols) > 0:
    raise KeyError(f"Missing columns before reordering: {missing_cols}")

line_data = line_data[new_order]
print("After reordering, columns are:", list(line_data.columns))

# 16. List the distinct values of the categorical columns for inspection
columns_to_check = ['Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material']
for column in columns_to_check:
    # report a missing column instead of raising, then move on to the next one
    if column not in line_data.columns:
        print(f"{column}: Column not found in DataFrame.")
        continue
    print(f"Unique values in {column}: {line_data[column].unique().tolist()}")

# Lookup table from a material label as it appears in the data to the
# category it is collapsed into. A value of None marks the label as missing.
# Entries are grouped by target category; misspelled keys are intentional,
# they mirror spellings that occur in the raw data.
material_mapping = {
    # -> 'Carbon Steel'
    'Carbon Steel': 'Carbon Steel',
    'Carbonsteel': 'Carbon Steel',
    'Carbon  Steel': 'Carbon Steel',
    'Carbon Steel Sch 80': 'Carbon Steel',

    # -> 'Carbon Steel/HDPE'
    'Carbon Steel - Hdpe': 'Carbon Steel/HDPE',
    'Carbon Steel-Hdpe': 'Carbon Steel/HDPE',
    'Carbon Steel And Hdpe': 'Carbon Steel/HDPE',
    'Carbon Steel/Hdpe': 'Carbon Steel/HDPE',
    'Carbon Steel/HDPE': 'Carbon Steel/HDPE',
    'Hdpe/Steel': 'Carbon Steel/HDPE',
    'HDPE/Steel': 'Carbon Steel/HDPE',
    'Hdpe Lined Steel': 'Carbon Steel/HDPE',

    # -> 'Carbon Steel/HDPE/Flexsteel'
    'Hdpe/Steel, Flexsteel': 'Carbon Steel/HDPE/Flexsteel',

    # -> 'Carbon Steel/HDPE/Stainless Steel'
    'Carbon Steel, Hdpe,Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel, Hdpe, Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Hdpe/Stainless': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Hdpe/Stainless Steel': 'Carbon Steel/HDPE/Stainless Steel',
    'Carbon Steel/Stainless/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless/ Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless Steel/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless/Carbonsteel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Satinless/Carbon Steel/Hdpe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless/Carbon Steel/Hspe': 'Carbon Steel/HDPE/Stainless Steel',
    'Stainless/Carbon Steel/ Hdpe': 'Carbon Steel/HDPE/Stainless Steel',

    # -> 'Fiberglass/Carbon Steel'
    'Carbon Steel/Fiberglass': 'Fiberglass/Carbon Steel',
    'Carbon Steel Mixed With Fiberglass': 'Fiberglass/Carbon Steel',

    # -> 'Fiberglass' and its combinations
    'Fiberglass': 'Fiberglass',
    'Fibergalss': 'Fiberglass',
    'Fiberspar': 'Fiberglass',
    'Fiber Glass': 'Fiberglass',
    'Flexspar': 'Fiberglass',
    'Fiberoptic': 'Fiberglass',
    'Fiberglass & Fiberspar': 'Fiberglass',
    'Fiberglass And Carbon Steel': 'Fiberglass/Carbon Steel',
    'Fiberglass Sleaved W/ Hdpe': 'Fiberglass/HDPE',
    'Fiberglass And Hdpe': 'Fiberglass/HDPE',
    'Hdpe/Fiberglass': 'Fiberglass/HDPE',

    # -> 'Steel'
    'Steel': 'Steel',
    'Stainless': 'Steel',
    'Stainless Steel': 'Steel',
    'Coated Steel': 'Steel',
    'Lined Steel': 'Steel',
    'Flexpipe': 'Steel',
    'Flex Steel': 'Steel',
    'Flexsteel': 'Steel',
    'Other (Flexsteel)': 'Steel',
    'Other (Flex Steel)': 'Steel',
    'Other (Flex Pipe)': 'Steel',

    # -> 'HDPE'
    'Hdpe': 'HDPE',
    'Hdpe Poly': 'HDPE',
    'Hdpi Poly': 'HDPE',
    'Hdpe Line Sdr 7': 'HDPE',
    'Hdpe Sdr7': 'HDPE',
    'Hdpe Sdr11': 'HDPE',
    'Composite Hdpe': 'HDPE',
    'High-Density Polyethylene (Hdpe)': 'HDPE',

    # -> 'Polycarbonate'
    'Poly': 'Polycarbonate',
    'Polyline': 'Polycarbonate',
    'Polycarbonate': 'Polycarbonate',
    'Other (Poly)': 'Polycarbonate',
    'Sdr 7 Poly': 'Polycarbonate',

    # -> 'Polycarbonate/Steel' and 'Polycarbonate/HDPE'
    'Poly & Steel': 'Polycarbonate/Steel',
    'Steel/Poly': 'Polycarbonate/Steel',
    'Poly/Steel': 'Polycarbonate/Steel',
    'Polycarbonate/Steel': 'Polycarbonate/Steel',
    'Poly Sleeved Steel': 'Polycarbonate/Steel',
    'Hdpe Poly Sdr 11': 'Polycarbonate/HDPE',

    # -> 'Polyethylene'
    'Sdr7 Polyethelyne': 'Polyethylene',
    'Sdr 11 Poly Pipe': 'Polyethylene',
    'Sdr 11 Poly': 'Polyethylene',
    'Poly Pipe': 'Polyethylene',
    'Sdr_Poly': 'Polyethylene',
    'Sdr-11': 'Polyethylene',
    'Sdr-11 Poly': 'Polyethylene',

    # -> 'Polypropylene'
    'Poly Sdr 7': 'Polypropylene',
    'Poly Sdr-7': 'Polypropylene',

    # -> 'PVC' / 'Duplex'
    'Pvc': 'PVC',
    'Duplex': 'Duplex',

    # -> 'Other'
    'Zaplock': 'Other',
    'Plastic': 'Other',
    'Polypipe': 'Other',
    'Core Linepipe': 'Other',
    'Flex Pipe': 'Other',
    'Shawcor Fp150': 'Other',
    'Zapock': 'Other',
    'Other (Hdpe And Tubing)': 'Other',
    'Other (Please Specify)': 'Other',
    'Other (Stainless Steel)': 'Other',
    'Fplp': 'Other',
    'Oil': 'Other',
    'Gas': 'Other',
    'Co2/Produced Water': 'Other',
    'Other': 'Other',

    # -> None (treated as missing)
    'Unknown': None,
    'Other (Unknown)': None,
    'Unk': None,
    '0': None,
    'Material': None,
    'Sdr': None,
    'Flowline': None,
}


  CONSTRUCTDATE  line_age_yr
0    1988-05-25    37.119781
1    2004-03-01    21.352498
2    2007-12-18    17.555099
3    1992-04-14    33.232033
4    2009-05-11    16.158795
Dropping these existing columns: ['ACTIONDESCRIPTION', 'BEDDINGMATERIAL', 'COMPANY_NAME', 'ENDLAT', 'ENDLONG', 'ENTIRELINEREMOVED', 'PIPEMATERIAL', 'RECEIVE_DATE', 'STARTLAT', 'STARTLOCATIONID', 'STARTLONG', 'TYPEOFFLUIDTRANS', 'operator_name', 'SHAPE_Length']
After reordering, columns are: ['unique_id', 'operator_number', 'FLOWLINEID', 'LOCATION_ID', 'Status', 'FLOWLINEACTION', 'LOCATIONTYPE', 'Fluid', 'Material', 'Diam_in', 'Length_ft', 'MAXOPPRESSURE', 'line_age_yr', 'CONSTRUCTDATE', 'geometry', 'risk']
Unique values in Status: ['Abandoned', 'Partial Removed see comment', 'Active', None, 'Out of Service', 'ACTIVE', 'New Construction', 'Inactive', 'PreCommission', 'OutofService', 'InActive', 'PreAbandonment', 'Removed', 'Pre-Commissioned', 'ABANDONED', 'Out-of-Service', 'Abandon', 'ABiP', 'SI', 'Out Of Service', 

In [50]:
# Collapse free-text material labels onto the categories in material_mapping.
# Labels are stripped and title-cased so they line up with the mapping keys.
#
# Values that already equal one of the target categories are NOT title-cased.
# Without this guard, re-running the cell turns e.g. 'Carbon Steel/HDPE/Flexsteel'
# into 'Carbon Steel/Hdpe/Flexsteel', which has no key in material_mapping and
# would survive as a new, unintended category. With the guard the cell is
# idempotent.
canonical_materials = {label for label in material_mapping.values() if label is not None}

material_raw = line_data['Material'].str.strip()
already_canonical = material_raw.isin(canonical_materials)
# keep the title-cased label unless the raw one is already a target category
material_keyed = material_raw.str.title().where(~already_canonical, material_raw)

# labels without a key in material_mapping pass through unchanged
line_data['Material'] = material_keyed.replace(material_mapping)
print(line_data['Material'].unique())

['Carbon Steel' None 'Fiberglass' 'Steel' 'HDPE' 'Polycarbonate'
 'Carbon Steel/HDPE/Stainless Steel' 'PVC' 'Other' 'Duplex' 'Polyethylene'
 'Polycarbonate/Steel' 'Carbon Steel/HDPE' 'Aluminum' 'Polycarbonate/HDPE'
 'Fiberglass/Carbon Steel' 'Polypropylene' 'Fiberglass/HDPE'
 'Carbon Steel/HDPE/Flexsteel']
