In [18]:
import os

import geopandas as gpd
import pandas as pd

# Directory that holds the input files. The default is the original author's
# machine-specific location; set the COGCC_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = (
    os.environ.get('COGCC_DATA_DIR')
    or '/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data'
)
# Inputs are opened by bare file name throughout the notebook, so the working
# directory itself has to point at the data folder.
os.chdir(DATA_DIR)


def _rounded_xy(point, ndigits=6):
    """Return ``(x, y)`` of ``point`` rounded to ``ndigits`` decimals.

    The resulting tuple is hashable, so it can be used as a matching key
    between datasets without being affected by float noise.
    """
    return (round(point.x, ndigits), round(point.y, ndigits))


# --- Load datasets ---
flowlines_pts = gpd.read_file("flowline_points_50m_dedup.geojson")
spills_pts = gpd.read_file("spills_w_flowline_attributes.geojson")

# Ensure both are in the same CRS (spills are reprojected onto the flowline CRS)
if flowlines_pts.crs != spills_pts.crs:
    spills_pts = spills_pts.to_crs(flowlines_pts.crs)

# --- Coordinate keys for matching (rounded to avoid float noise) ---
# NOTE(review): 6 decimals is only a sensible tolerance if the CRS is in degrees — confirm.
flowlines_pts["coord_key"] = flowlines_pts.geometry.apply(_rounded_xy)
spills_pts["coord_key"] = spills_pts.geometry.apply(_rounded_xy)

# --- Find intersection of coordinate keys ---
common_coords = set(flowlines_pts["coord_key"]) & set(spills_pts["coord_key"])
print(f"Number of duplicate coordinates between datasets: {len(common_coords)}")

# --- Remove duplicates from flowlines, keep spills intact ---
# .copy() detaches the filtered frame so later edits do not touch flowlines_pts
flowlines_no_dups = flowlines_pts[~flowlines_pts["coord_key"].isin(common_coords)].copy()

print(f"Flowline points before removal: {len(flowlines_pts)}")
print(f"Flowline points after removal:  {len(flowlines_no_dups)}")

# --- Optional: save ---
# flowlines_no_dups.to_file("flowline_points_50m_dedup_nospilldups.geojson", driver="GeoJSON")

Number of duplicate coordinates between datasets: 73
Flowline points before removal: 369782
Flowline points after removal:  369709


In [40]:
def check_na_zero(df):
    """Count missing and zero values in every numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Non-numeric columns are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, with the columns ``column``, ``na_count``
        and ``zero_count``. These three columns are always present, even when
        ``df`` contains no numeric column (the result then has zero rows).
    """
    summary_columns = ['column', 'na_count', 'zero_count']
    results = []
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        values = df[col]
        results.append({
            'column': col,
            'na_count': values.isna().sum(),
            'zero_count': (values == 0).sum()
        })
    # Passing the column list explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=summary_columns)

# Report NA / zero counts for each source layer. The headers name the frame
# that is actually summarised, so the printed report cannot be mistaken for a
# different dataset.
print("\n--- NA and Zero Counts: flowlines_pts ---")
print(check_na_zero(flowlines_pts))

print("\n--- NA and Zero Counts: spills_pts ---")
print(check_na_zero(spills_pts))


--- NA and Zero Counts: combined ---
                   column  na_count  zero_count
0               unique_id         0           0
1         operator_number         0           0
2             flowline_id         0           0
3             location_id         0           0
4             diameter_in         0          56
5               length_ft         0           0
6  max_operating_pressure         0          84
7             line_age_yr         0           0
8                    risk         0      369782
9           geod_length_m         0           0

--- NA and Zero Counts: final_subset ---
                   column  na_count  zero_count
0               unique_id         0           0
1         operator_number         0           0
2             flowline_id         0           0
3             location_id         0           0
4             diameter_in         0           0
5               length_ft         0           0
6  max_operating_pressure         0           0
7       

In [23]:
import pandas as pd
import geopandas as gpd

# Bring the spill points into the flowline CRS if the two layers disagree
if flowlines_no_dups.crs != spills_pts.crs:
    spills_pts = spills_pts.to_crs(flowlines_no_dups.crs)

# Stack the two layers row-wise with a fresh 0..n-1 index; columns that exist
# in only one layer are kept (outer join on columns)
stacked_rows = pd.concat([flowlines_no_dups, spills_pts], ignore_index=True)
combined = gpd.GeoDataFrame(stacked_rows, geometry="geometry", crs=flowlines_no_dups.crs)

# Quick sanity check of the result's dimensions and schema
print(combined.shape)
print(combined.columns.tolist())

(370189, 22)
['unique_id', 'operator_name', 'operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'line_age_yr', 'construct_date', 'risk', 'geod_length_m', 'geometry', 'coord_key', 'match_distance_m', 'incident_date', 'root_cause']


In [27]:
# Total number of stacked rows
print(f"Final combined dataset size: {len(combined)}")

# Number of distinct locations once coordinates are rounded to 6 decimals
rounded_locations = combined.geometry.apply(lambda pt: (round(pt.x, 6), round(pt.y, 6)))
print(f"Unique coordinate locations: {rounded_locations.nunique()}")

# Optional: save to file
# combined.to_file("flowlines_and_spills_combined.geojson", driver="GeoJSON")

Final combined dataset size: 370189
Unique coordinate locations: 370189
['unique_id', 'operator_name', 'operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'line_age_yr', 'construct_date', 'risk', 'geod_length_m', 'geometry', 'coord_key', 'match_distance_m', 'incident_date', 'root_cause']


In [39]:
# Show the current column order of the combined dataset
print(list(combined.columns))


['unique_id', 'operator_name', 'operator_number', 'flowline_id', 'location_id', 'status', 'flowline_action', 'location_type', 'fluid', 'material', 'diameter_in', 'length_ft', 'max_operating_pressure', 'line_age_yr', 'avg_elevation', 'construct_date', 'incident_date', 'geod_length_m', 'match_distance_m', 'root_cause', 'risk', 'coord_key', 'geometry', 'avg_population']


Number of rows: 370189
Number of columns: 23
Rows with risk = 1: 480
Rows with risk = 0: 369709


In [31]:
import rasterio
from rasterio import features
from pyproj import CRS

# Open the raster that is sampled for 'avg_elevation' (presumably a USGS 30 m DEM,
# going by the file name — confirm). The handle is left open because the sampling
# code further down in this cell reads from it.
dem = rasterio.open('output_USGS30m.tif')
# Wrap the raster's CRS in a pyproj CRS so it can be compared with the GeoDataFrame's CRS.
dem_crs = CRS(dem.crs)
# Reproject the points into the raster's CRS so their x/y can be passed to the
# raster directly as sample coordinates.
if combined.crs != dem_crs:
    combined = combined.to_crs(dem_crs)

def get_elevation(point, dem):
    """Return the raster value under ``point``, or ``None`` when unavailable.

    Parameters
    ----------
    point : object exposing ``x`` and ``y``
        Location to sample; its coordinates are passed to ``dem`` unchanged.
    dem : object exposing ``sample(coords)`` and ``nodata``
        Raster to read from. ``sample`` is called with a single ``(x, y)``
        pair and the first element of the first record it yields is used.

    Returns
    -------
    The sampled value, or ``None`` if the value equals ``dem.nodata`` or if
    the lookup raises an ordinary exception (best-effort behaviour).
    """
    try:
        val = list(dem.sample([(point.x, point.y)]))[0][0]
        if dem.nodata is not None and val == dem.nodata:
            return None
        return val
    except Exception:
        # Deliberately best-effort: a failed lookup yields a missing value.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit propagate, so a long per-point loop
        # over this function can still be interrupted.
        return None

# Sample the raster once per point geometry; points for which get_elevation
# returns None end up without an elevation value.
combined['avg_elevation'] = combined.geometry.apply(lambda pt: get_elevation(pt, dem))
# Remove an 'index_right' column if one is present; errors='ignore' makes this a
# no-op when the column does not exist.
# NOTE(review): 'index_right' is the column name a geopandas sjoin adds — presumably
# a leftover from an earlier version of this notebook; confirm it is still needed.
combined = combined.drop(columns=['index_right'], errors='ignore')


In [None]:

# Load the census-tract population layer (reprojected to the points' CRS) and
# attach the tract value to every point as 'avg_population'.
pop_density = gpd.read_file('Population_Density_(Census_Tracts)').to_crs(combined.crs)

print('Summary of Census Tract Data:')
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
pop_density.info()
print('\nFirst few rows of the data:')
print(pop_density.head())

# Left join keeps every point; points that fall inside no tract get a missing value.
joined = gpd.sjoin(combined, pop_density, how='left', predicate='within')
# A point lying inside more than one (overlapping) tract appears several times in
# `joined` under the same index label, which would make the index-aligned
# assignment below fail. Keep the first match per point so the result lines up
# one-to-one with `combined`.
# NOTE(review): 'Populati_1' is presumably the truncated shapefile field holding
# the population-density value — confirm against the layer's schema.
tract_population = joined.loc[~joined.index.duplicated(keep='first'), 'Populati_1']
combined['avg_population'] = tract_population

In [None]:
# Preferred left-to-right column layout of the final dataset
desired = [
    'unique_id',
    'operator_name',
    'operator_number',
    'flowline_id',
    'location_id',
    'status',
    'flowline_action',
    'location_type',
    'fluid',
    'material',
    'diameter_in',
    'length_ft',
    'max_operating_pressure',
    'line_age_yr',
    'avg_elevation',
    'avg_population',
    'construct_date',
    'incident_date',
    'geod_length_m',
    'match_distance_m',
    'root_cause',
    'risk',
    'coord_key',
    'geometry',
]

# Preferred columns that actually exist come first, in the preferred order;
# every other column follows in its current order so nothing is dropped
leading = [name for name in desired if name in combined.columns]
trailing = [name for name in combined.columns if name not in desired]
ordered_cols = leading + trailing

# Re-select in the new order and re-declare the active geometry column
combined = combined[ordered_cols].set_geometry('geometry')

In [None]:
# combined.to_file("full_final_dataset.geojson", driver="GeoJSON")

In [None]:
# Final size of the dataset and the class balance of the 'risk' label
row_count, column_count = combined.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print(f"Rows with risk = 1: {combined['risk'].eq(1).sum()}")
print(f"Rows with risk = 0: {combined['risk'].eq(0).sum()}")