This code combines the two different flowline datasets into one. One only had end points of an flowline and the other had the entire flowline. We combined them using a spatial join and asserted that the operator name is the same, with a max 50 meter buffer. 

## Setup


In [1]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiLineString, LineString,MultiPolygon, MultiPoint
from shapely.ops import nearest_points
os.chdir('/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')
pd.options.display.max_columns = None

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [40]:
# Load Data
flowlines_gdf = gpd.read_file('flowlines.geojson')
crudeoil_gdf = gpd.read_file('crudeoil_offlocation.geojson')

In [3]:
# Check size
print(flowlines_gdf.shape)
print(crudeoil_offlocation_gdf.shape)

(21942, 19)
(259979, 8)


In [4]:
flowlines_gdf.head(2)

Unnamed: 0,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,geometry
0,331872.0,485633.0,,Registration,,,2023-10-30T08:10:52.119999,10673,GADECO LLC ...,Well Site,,,,,HDPE,Native Materials,Produced Water,,
1,470443.0,470446.0,,,,The flowline serving the Emerson 3-29J (05-123...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109444,-104.909686,40.105743,-104.90986,Carbon Steel,Native Materials,Multiphase,325.0,"LINESTRING (507682.442 4439497.658, 507696.855..."


In [17]:
print(flowlines_gdf.geometry.iloc[1])

LINESTRING (507682.4421174659 4439497.657752302, 507696.85463215865 4439908.45533576)


In [24]:
crudeoil_offlocation_gdf.head(5)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246..."
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281..."
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375..."
3,NOBLE ENERGY INC,Gas,Carbon Steel,3.0,Active,205.62,62.64984,"MULTILINESTRING ((544793.476 4489156.672, 5447..."
4,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Active,2069.9,630.658768,"MULTILINESTRING ((526448.455 4461830.702, 5258..."


In [21]:
# from shapely import GeometryCollection, LineString, Point
import shapely

In [30]:
shapely.get_num_coordinates(crudeoil_offlocation_gdf.geometry)

0         46
1          2
2         26
3          2
4          2
          ..
259974     5
259975    23
259976     2
259977    27
259978    15
Name: geometry, Length: 259979, dtype: int32

In [18]:
print(crudeoil_offlocation_gdf.geometry.iloc[1])

MULTILINESTRING ((527997.2812999999 4463899.920399999, 528172.7903000005 4463992.704299999))


In [6]:
# Check if CRS is the same for both files
if flowlines_gdf.crs != crudeoil_offlocation_gdf.crs:
    flowlines_gdf = flowlines_gdf.to_crs(crudeoil_offlocation_gdf.crs)

In [7]:
drop_crudeoil_offlocation_gdf = crudeoil_offlocation_gdf.dropna()
drop_crudeoil_offlocation_gdf.shape

(138943, 8)

## Match with Buffer 
find spatial match with/ buffer, update geometry

In [34]:
from shapely.geometry import LineString

line1 = LineString([(0, 0), (1, 1)])
line2 = LineString([(0.5, 0.5), (2, 2)])
line3 = LineString([(1, 1), (1, 2)])

print(line1.intersects(line2))  # True, because both lines share a common point (1,1)
print(line1.intersects(line3))  # True, because line1 and line3 intersect at point (1,1)
print(line2.intersects(line3))  # True, because line2 and line3 intersect at point (1,1)

True
True
True


In [45]:
import geopandas as gpd

# Load your datasets if not already loaded
# flowlines_gdf = gpd.read_file("path_to_flowlines_dataset.shp")
# crudeoil_gdf = gpd.read_file("path_to_crudeoil_dataset.shp")

# Ensure both datasets use the same coordinate reference system
if flowlines_gdf.crs != crudeoil_gdf.crs:
    crudeoil_gdf = crudeoil_gdf.to_crs(flowlines_gdf.crs)

def replace_geometry_with_closest_match(line, index):
    if line.geometry is None:
        print(f"Index {index}: No geometry to process.")
        return None  # Return None if there's no geometry to process

    initial_buffer = 5
    step = 1
    max_buffer = 25
    buffer_size = initial_buffer

    while True:
        # Create buffer around the linestring
        buffered_line = line.geometry.buffer(buffer_size)

        # Find intersections with the same operator
        potential_matches = crudeoil_gdf[crudeoil_gdf.intersects(buffered_line) & (crudeoil_gdf['Operator'] == line['COMPANY_NAME'])]

        # Debugging logs
        print(f"Index {index}: Buffer size {buffer_size}, Matches found {len(potential_matches)}")

        if len(potential_matches) == 1:
            print(f"Index {index}: Exactly one match found, operator name matches.")
            # If exactly one match, replace the geometry
            return potential_matches.iloc[0].geometry
        elif len(potential_matches) > 1:
            print(f"Index {index}: More than one match found, decreasing buffer size.")
            # If more than one match, decrease the buffer
            buffer_size -= step
        else:
            print(f"Index {index}: No matches found, increasing buffer size.")
            # If no matches, increase the buffer
            buffer_size += step * 5
            if buffer_size > max_buffer:
                print(f"Index {index}: Maximum buffer size exceeded, no suitable match found.")
                # Exit loop if maximum buffer is exceeded
                return line.geometry  # Return original geometry if no suitable match found

        if buffer_size < 0:
            print(f"Index {index}: Buffer size below zero, returning original geometry.")
            # Ensure buffer size doesn't go below zero
            return line.geometry  # Return original geometry if buffer size underflows

# Apply the function to update the geometry of each feature in flowlines dataset
for idx, line in flowlines_gdf.iterrows():
    flowlines_gdf.at[idx, 'geometry'] = replace_geometry_with_closest_match(line, idx)

# Save the updated GeoDataFrame
flowlines_gdf.to_file("path_to_updated_flowlines_dataset.shp")


Index 0: No geometry to process.
Index 1: Buffer size 5, Matches found 0
Index 1: No matches found, increasing buffer size.
Index 1: Buffer size 10, Matches found 0
Index 1: No matches found, increasing buffer size.
Index 1: Buffer size 15, Matches found 0
Index 1: No matches found, increasing buffer size.
Index 1: Buffer size 20, Matches found 0
Index 1: No matches found, increasing buffer size.
Index 1: Buffer size 25, Matches found 0
Index 1: No matches found, increasing buffer size.
Index 1: Maximum buffer size exceeded, no suitable match found.
Index 2: Buffer size 5, Matches found 0
Index 2: No matches found, increasing buffer size.
Index 2: Buffer size 10, Matches found 0
Index 2: No matches found, increasing buffer size.
Index 2: Buffer size 15, Matches found 0
Index 2: No matches found, increasing buffer size.
Index 2: Buffer size 20, Matches found 0
Index 2: No matches found, increasing buffer size.
Index 2: Buffer size 25, Matches found 0
Index 2: No matches found, increasin

KeyboardInterrupt: 

In [33]:
import geopandas as gpd
from shapely.geometry import MultiLineString

# Ensure both datasets use the same coordinate reference system
if flowlines_gdf.crs != drop_crudeoil_offlocation_gdf.crs:
    drop_crudeoil_offlocation_gdf = drop_crudeoil_offlocation_gdf.to_crs(flowlines_gdf.crs)

def find_closest_match(line, initial_buffer=5, step=1, max_buffer=25):
    buffer_size = initial_buffer
    matched = False
    closest_match = None

    while not matched:
        # Create buffer around the linestring
        buffered_line = line.geometry.buffer(buffer_size)

        # Find intersections with the same operator
        potential_matches = drop_crudeoil_offlocation_gdf[drop_crudeoil_offlocation_gdf.intersects(buffered_line) & (drop_crudeoil_offlocation_gdf['operator'] == line['operator'])]

        if len(potential_matches) == 1:
            matched = True
            closest_match = potential_matches
        elif len(potential_matches) > 1:
            # If more than one match, decrease the buffer
            buffer_size -= step
        else:
            # If no matches, increase the buffer
            buffer_size += step * 5
            if buffer_size > max_buffer:
                # Stop if maximum buffer is exceeded
                break

    return closest_match

# Apply the function to each feature in flowlines dataset
flowlines_gdf['closest_match'] = flowlines_gdf.apply(find_closest_match, axis=1)

# You might want to inspect or export the resulting data
print(flowlines_gdf[['closest_match']])


AttributeError: 'NoneType' object has no attribute 'buffer'

In [8]:
# Initialize an empty list to store matched data
matched_flowlines = []

max_buffer_distance = 49.5  # Maximum buffer distance in meters
initial_buffer_distance = 0  # Initial buffer distance
matches_found = 0  # Counter for matches found

for index, flowline in flowlines_gdf.iterrows():
    # Check if two matches are already found
    # if matches_found == 2:
        # break
    
    # Check for missing geometry and skip if necessary
    if flowline.geometry is None:
        print(f"Missing geometry for flowline at index {index}. Skipping...")
        continue

    # Extract the two end points of the flowline
    coords = list(flowline.geometry.coords)
    flp_start, flp_end = Point(coords[0]), Point(coords[-1])

    buffer_distance = initial_buffer_distance
    match_found = False

    while not match_found and buffer_distance <= max_buffer_distance:
        # Buffer the flowline geometry
        buffered_flowline = flowline.geometry.buffer(buffer_distance)
        
        # Iterate over the segments of the drop_crudeoil_offlocation_gdf multistring
        for _, crudeoil_line in drop_crudeoil_offlocation_gdf.iterrows():
            # Check if the buffer intersects with the crude oil geometry
            if buffered_flowline.intersects(crudeoil_line.geometry):
                # Find the intersection points
                intersection = buffered_flowline.intersection(crudeoil_line.geometry)
                
                # Ensure the intersection is a LineString
                if isinstance(intersection, LineString):
                    intersection_start, intersection_end = intersection.coords[0], intersection.coords[-1]
                    
                    # Check if the company names match
                    if flowline['COMPANY_NAME'].strip().lower() == crudeoil_line['Operator'].strip().lower():
                        print(f"Company match found at buffer distance {buffer_distance} meters for flowline at index {index}.")
                        match_found = True
                        matches_found += 1  # Increment the counter for matches found

                        # Create a new GeoDataFrame with the updated flowline
                        updated_flowline = flowline.copy()
                        updated_line = LineString([intersection_start, intersection_end])
                        updated_flowline.geometry = updated_line

                        # Append the updated flowline to the list
                        matched_flowlines.append(updated_flowline)
                        break

        if match_found:
            break
        else:
            buffer_distance += 0.5  # Increase buffer by 0.5 meters if no match found

    if not match_found:
        print(f"No company match found for flowline at index {index} even after expanding buffer to {buffer_distance} meters.")

# Convert the list of matched flowlines to a GeoDataFrame
matched_flowlines_gdf = gpd.GeoDataFrame(matched_flowlines, columns=flowlines_gdf.columns, crs=flowlines_gdf.crs)

Missing geometry for flowline at index 0. Skipping...
Company match found at buffer distance 0.5 meters for flowline at index 1.
Company match found at buffer distance 1.5 meters for flowline at index 2.
Company match found at buffer distance 7.0 meters for flowline at index 3.
Company match found at buffer distance 3.5 meters for flowline at index 4.
Company match found at buffer distance 6.5 meters for flowline at index 5.
Company match found at buffer distance 0.5 meters for flowline at index 6.
Company match found at buffer distance 6.5 meters for flowline at index 7.
Company match found at buffer distance 0.5 meters for flowline at index 8.
Company match found at buffer distance 0.5 meters for flowline at index 9.
No company match found for flowline at index 10 even after expanding buffer to 50.0 meters.
No company match found for flowline at index 11 even after expanding buffer to 50.0 meters.
No company match found for flowline at index 12 even after expanding buffer to 50.0 met

KeyboardInterrupt: 

In [9]:
matched_flowlines_gdf = gpd.GeoDataFrame(matched_flowlines, columns=flowlines_gdf.columns, crs=flowlines_gdf.crs)

In [10]:
matched_flowlines_gdf.shape


(84, 19)

## Spatial Join

In [11]:
all_flowlines = gpd.sjoin(matched_flowlines_gdf, crudeoil_offlocation_gdf, how='right', op='intersects')
all_flowlines.head()

  if await self.run_code(code, result, async_=asy):


Unnamed: 0,index_left,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
0,,,,,,,,,,,,,,,,,,,,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246..."
1,,,,,,,,,,,,,,,,,,,,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281..."
2,,,,,,,,,,,,,,,,,,,,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375..."
3,,,,,,,,,,,,,,,,,,,,NOBLE ENERGY INC,Gas,Carbon Steel,3.0,Active,205.62,62.64984,"MULTILINESTRING ((544793.476 4489156.672, 5447..."
4,,,,,,,,,,,,,,,,,,,,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Active,2069.9,630.658768,"MULTILINESTRING ((526448.455 4461830.702, 5258..."


In [12]:
all_flowlines.shape

(260026, 27)

In [13]:
matches_joined = all_flowlines.dropna()
matches_joined.head()

Unnamed: 0,index_left,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
1462,8.0,452637.0,462601.0,306712.0,Out of Service,1,The flowline (12325148FL) servicing the Elliot...,2023-10-24T10:08:19.323000,10459.0,EXTRACTION OIL & GAS INC ...,Production Facilities,40.267332,-105.037459,40.266917,-105.043033,Carbon Steel,Native Materials,Multiphase,1140.0,EXTRACTION OIL & GAS INC,Multiphase,Carbon Steel,2.375,Out of Service,510.01,155.389256,"MULTILINESTRING ((496681.603 4457400.496, 4966..."
9382,16.0,336437.0,473671.0,336315.0,Out of Service,1,The flowline (12321397_FL) servicing the Geist...,2023-10-19T11:30:16.719999,10633.0,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.18597,-104.918804,40.187553,-104.916936,Carbon Steel,Native Materials,Multiphase,265.0,CRESTONE PEAK RESOURCES OPERATING LLC,Multiphase,Steel,2.0,Abandoned,3457.32,1053.369404,"MULTILINESTRING ((507139.811 4448160.801, 5071..."
9382,21.0,336437.0,473673.0,331851.0,Out of Service,1,The flowline (12330823_FL) servicing the Geist...,2023-10-19T11:30:16.719999,10633.0,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.185945,-104.91876,40.186544,-104.919688,Carbon Steel,Native Materials,Multiphase,320.0,CRESTONE PEAK RESOURCES OPERATING LLC,Multiphase,Steel,2.0,Abandoned,3457.32,1053.369404,"MULTILINESTRING ((507139.811 4448160.801, 5071..."
16194,69.0,461164.0,461525.0,302916.0,Out of Service,1,The flowline (12330024FL) servicing the DF RAN...,2023-10-11T16:38:16.743000,10575.0,8 NORTH LLC ...,Production Facilities,40.906867,-104.20019,40.908605,-104.194062,Carbon Steel,Native Materials,Multiphase,57.0,8 NORTH LLC,Multiphase,Carbon Steel,2.375,Out of Service,2381.13,725.519247,"MULTILINESTRING ((567873.747 4528925.507, 5678..."
19477,57.0,321398.0,462649.0,321341.0,Pre-Abandonment Notice,1,The flowline serving the Becky 04-36 (05-013-0...,2023-11-17T11:11:43.016998,10459.0,EXTRACTION OIL & GAS INC ...,Production Facilities,40.089921,-105.066961,40.090185,-105.069508,Carbon Steel,Native Materials,Multiphase,410.0,EXTRACTION OIL & GAS INC,Multiphase,Carbon Steel,2.25,Out of Service,969.49,295.382313,"MULTILINESTRING ((494261.152 4438023.066, 4942..."


In [14]:
matches_joined.shape

(25, 27)

In [None]:
all_flowlines.to_file("all_flowlines.geojson", driver='GeoJSON')

KeyboardInterrupt: 

In [None]:
matches_joined.to_file("matched_flowlines.geojson", driver='GeoJSON')