This notebook combines two flowline datasets into one. One dataset has only the end points of each flowline; the other has the entire flowline geometry. We combine them using a spatial join, requiring that the operator name is the same and allowing a buffer of at most 25 meters.

## Setup


In [1]:
import os
import pandas as pd
import geopandas as gpd
import shapely
from shapely.geometry import Point, Polygon, MultiLineString, LineString,MultiPolygon, MultiPoint
from shapely.ops import nearest_points
# Directory that holds the input GeoJSON files. Set the COGCC_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original author's local path is used.
DATA_DIR = os.environ.get('COGCC_DATA_DIR', '/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')
os.chdir(DATA_DIR)
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read both input layers from the working directory:
# the flowlines layer first, then the crude-oil off-location layer.
flowlines_gdf, crudeoil_gdf = map(
    gpd.read_file,
    ('flowlines.geojson', 'crudeoil_offlocation.geojson'),
)

In [3]:
# Print the (rows, columns) shape of each layer, one per line
print(flowlines_gdf.shape, crudeoil_gdf.shape, sep='\n')

(21942, 20)
(259979, 8)


In [4]:
# Preview the first two flowline records
flowlines_gdf.head(n=2)

Unnamed: 0,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,CONSTRUCTDATE,geometry
0,331872.0,485633.0,,Registration,,,2023-10-30T08:10:52.119999,10673,GADECO LLC ...,Well Site,,,,,HDPE,Native Materials,Produced Water,,2000-07-15,
1,470443.0,470446.0,,,,The flowline serving the Emerson 3-29J (05-123...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109444,-104.909686,40.105743,-104.90986,Carbon Steel,Native Materials,Multiphase,325.0,1983-11-09,"LINESTRING (507682.442 4439497.658, 507696.855..."


In [5]:
# Print the geometry of the second flowline record (position 1)
print(flowlines_gdf.geometry.iat[1])

LINESTRING (507682.4421174659 4439497.657752302, 507696.85463215865 4439908.45533576)


In [6]:
# Preview the first five crude-oil records (five is the default for head())
crudeoil_gdf.head()

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246..."
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281..."
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375..."
3,NOBLE ENERGY INC,Gas,Carbon Steel,3.0,Active,205.62,62.64984,"MULTILINESTRING ((544793.476 4489156.672, 5447..."
4,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Active,2069.9,630.658768,"MULTILINESTRING ((526448.455 4461830.702, 5258..."


In [7]:
# Count the coordinates in each crude-oil geometry
shapely.get_num_coordinates(crudeoil_gdf.geometry)

0         46
1          2
2         26
3          2
4          2
          ..
259974     5
259975    23
259976     2
259977    27
259978    15
Name: geometry, Length: 259979, dtype: int32

In [8]:
# Print the geometry of the second crude-oil record (position 1)
print(crudeoil_gdf.geometry.iat[1])

MULTILINESTRING ((527997.2812999999 4463899.920399999, 528172.7903000005 4463992.704299999))


In [9]:
# Put both layers in the same CRS: reproject the flowlines onto the
# crude-oil layer's CRS, but only when the two actually differ.
flowlines_gdf = (
    flowlines_gdf
    if flowlines_gdf.crs == crudeoil_gdf.crs
    else flowlines_gdf.to_crs(crudeoil_gdf.crs)
)

In [10]:
# Keep only crude-oil rows that have no missing value in any column.
# NOTE(review): this also drops rows that are missing only a descriptive
# attribute (e.g. Diam_in), which shrinks the pool of lines available for
# matching — confirm that is intended.
clean_crudeoil_gdf = crudeoil_gdf.dropna(axis='index', how='any')
clean_crudeoil_gdf.shape

(138943, 8)

## Match with Buffer 
Find spatial matches using a buffer, then update the geometry.

#### Test

In [11]:
from shapely.geometry import LineString

# Three toy segments used to sanity-check how `intersects` behaves:
# line1 and line2 overlap along a shared stretch, while line3 starts at
# the point (1, 1), which is line1's end point and lies on line2.
line1 = LineString([(0, 0), (1, 1)])
line2 = LineString([(0.5, 0.5), (2, 2)])
line3 = LineString([(1, 1), (1, 2)])

# Test every pair and print one result per line
print(
    *(
        first.intersects(second)
        for first, second in ((line1, line2), (line1, line3), (line2, line3))
    ),
    sep='\n',
)

True
True
True


Find matches

If you have more time, try these different methods to get more data
- do a scipy.spatial.KDTree.query to lessen the time it takes to search for a match

In [12]:
# Match every endpoint-only flowline to a full-geometry crude-oil line that
# belongs to the same operator.
#
# For each flowline the search goes through these stages, stopping at the
# first success:
#   1. the straight line between the flowline's two end points, unbuffered;
#   2. that line buffered by buffer_increment, 2 * buffer_increment, ... up to
#      max_buffer_distance (meters, assuming a metric projected CRS).
# At each stage, the first crude-oil row (in row order) that intersects the
# search geometry AND has the same normalised operator name is the match.
#
# The crude-oil layer's spatial index is queried instead of scanning every
# crude-oil row for every flowline and every buffer size.


def _first_company_match(search_geometry, company_key, spatial_index, operator_keys):
    """Return the position of the first crude-oil row matching the search.

    Parameters
    ----------
    search_geometry : shapely geometry
        Geometry that a crude-oil line must intersect.
    company_key : str
        Stripped, lower-cased operator name the crude-oil line must carry.
    spatial_index : spatial index of the crude-oil GeoDataFrame
        Queried with ``predicate="intersects"``.
    operator_keys : array-like of str
        Stripped, lower-cased ``Operator`` values, aligned by position with
        the rows behind ``spatial_index``.

    Returns
    -------
    int or None
        Integer position of the first matching row in row order, or ``None``
        when no intersecting row has the requested operator.
    """
    candidate_positions = spatial_index.query(search_geometry, predicate="intersects")
    # Sort so the lowest row position wins, mirroring a top-to-bottom scan.
    for position in sorted(candidate_positions):
        if operator_keys[position] == company_key:
            return position
    return None


# Initialize an empty list to store matched data
matched_flowlines = []

max_buffer_distance = 25  # Maximum buffer distance in meters
buffer_increment = 1    # Buffer increment in meters

# Built once: the spatial index and the normalised operator names.
crudeoil_sindex = clean_crudeoil_gdf.sindex
crudeoil_operator_keys = clean_crudeoil_gdf['Operator'].str.strip().str.lower().to_numpy()

for index, flowline in flowlines_gdf.iterrows():
    # Check for missing (or empty) geometry and skip if necessary
    flowline_geometry = flowline.geometry
    if flowline_geometry is None or flowline_geometry.is_empty:
        print(f"Missing geometry for flowline at index {index}. Skipping...")
        continue

    # A flowline with no usable company name can never satisfy the operator
    # check, so skip it instead of failing on `.strip()`.
    company_name = flowline['COMPANY_NAME']
    if not isinstance(company_name, str):
        print(f"Missing company name for flowline at index {index}. Skipping...")
        continue
    company_key = company_name.strip().lower()

    # Extract the two end points of the flowline and create a new LineString
    coords = list(flowline_geometry.coords)
    endpoint_linestring = LineString([coords[0], coords[-1]])

    # First, attempt to find a match without buffering
    matched_position = _first_company_match(
        endpoint_linestring, company_key, crudeoil_sindex, crudeoil_operator_keys
    )

    if matched_position is not None:
        print(f"Immediate company match found for flowline at index {index} without buffering.")
    else:
        # Start at the first positive distance: a zero-width buffer of a line
        # is an empty polygon and cannot intersect anything.
        buffer_distance = buffer_increment
        while buffer_distance <= max_buffer_distance:
            matched_position = _first_company_match(
                endpoint_linestring.buffer(buffer_distance),
                company_key,
                crudeoil_sindex,
                crudeoil_operator_keys,
            )
            if matched_position is not None:
                print(f"Company match found for flowline at index {index} with buffer distance of {buffer_distance} meters.")
                break
            buffer_distance += buffer_increment  # Increase the buffer distance
        else:
            # Loop ended without a break: max buffer reached with no match
            print(f"No company match found for flowline at index {index} even after expanding buffer to {max_buffer_distance} meters.")

    if matched_position is None:
        continue

    # Copy the flowline and overwrite/add every crude-oil column, including
    # `geometry`, so the record carries the full line instead of the end points.
    crudeoil_line = clean_crudeoil_gdf.iloc[matched_position]
    updated_flowline = flowline.copy()
    for col in crudeoil_line.index:
        updated_flowline[col] = crudeoil_line[col]
    matched_flowlines.append(updated_flowline)

# Convert the list of matched flowlines to a GeoDataFrame. The geometries come
# from the crude-oil layer, so its CRS is the one that applies.
matched_flowlines_gdf = gpd.GeoDataFrame(
    matched_flowlines,
    columns=flowlines_gdf.columns.union(clean_crudeoil_gdf.columns),
    crs=clean_crudeoil_gdf.crs,
)

Missing geometry for flowline at index 0. Skipping...
Immediate company match found for flowline at index 1 without buffering.
Immediate company match found for flowline at index 2 without buffering.
Company match found for flowline at index 3 with buffer distance of 7 meters.
Company match found for flowline at index 4 with buffer distance of 4 meters.
Company match found for flowline at index 5 with buffer distance of 7 meters.
Immediate company match found for flowline at index 6 without buffering.
Company match found for flowline at index 7 with buffer distance of 7 meters.
Immediate company match found for flowline at index 8 without buffering.
Company match found for flowline at index 9 with buffer distance of 1 meters.
No company match found for flowline at index 10 even after expanding buffer to 25 meters.
No company match found for flowline at index 11 even after expanding buffer to 25 meters.
No company match found for flowline at index 12 even after expanding buffer to 25 me

In [None]:
# Preview the first five matched records
matched_flowlines_gdf.head(5)

Unnamed: 0,ACTIONDESCRIPTION,BEDDINGMATERIAL,COMPANY_NAME,CONSTRUCTDATE,Diam_in,ENDLAT,ENDLONG,ENTIRELINEREMOVED,FLOWLINEACTION,FLOWLINEID,Fluid,LOCATIONTYPE,LOCATION_ID,Length_ft,MAXOPPRESSURE,Material,OPERATOR_NUM,Operator,PIPEMATERIAL,RECEIVE_DATE,SHAPE_Length,STARTLAT,STARTLOCATIONID,STARTLONG,Status,TYPEOFFLUIDTRANS,geometry
1,The flowline serving the Emerson 3-29J (05-123...,Native Materials,CRESTONE PEAK RESOURCES OPERATING LLC ...,1983-11-09,2.0,40.109444,-104.909686,,,470446.0,Multiphase,Production Facilities,470443.0,1361.28,325.0,Carbon Steel,10633,CRESTONE PEAK RESOURCES OPERATING LLC,Carbon Steel,2023-10-24T08:33:49.223000,414.75153,40.105743,,-104.90986,Active,Multiphase,"MULTILINESTRING ((507682.447 4439497.657, 5076..."
2,The flowline (12311399_FL) servicing the Emers...,Native Materials,CRESTONE PEAK RESOURCES OPERATING LLC ...,1983-12-07,2.0,40.109441,-104.90967,1.0,Out of Service,470445.0,Multiphase,Production Facilities,470443.0,1025.98,250.0,Steel,10633,CRESTONE PEAK RESOURCES OPERATING LLC,Carbon Steel,2023-10-24T08:33:49.223000,312.594204,40.112203,319521.0,-104.909862,Out of Service,Multiphase,"MULTILINESTRING ((507680.697 4440215.365, 5076..."
3,,,PETERSON ENERGY OPERATING INC ...,2006-06-05,2.0,40.063138,-105.034086,,,466061.0,Crude Oil Emulsion,Production Facilities,318620.0,276.03,,Fiberglass,68710,PETERSON ENERGY OPERATING INC,Carbon Steel,2023-10-24T16:00:01.117000,84.099157,40.063823,305594.0,-105.030068,Active,Multiphase,"MULTILINESTRING ((497028.859 4434818.938, 4970..."


In [None]:
# (rows, columns) of the matched result
matched_flowlines_gdf.shape

(4, 20)

In [None]:
# Write the matched flowlines to GeoJSON. Use the GeoDataFrame:
# `matched_flowlines` is a plain Python list and has no `to_file` method.
matched_flowlines_gdf.to_file("matched_flowlines.geojson", driver='GeoJSON')