This code combines the two different flowline datasets into one. One only had end points of an flowline and the other had the entire flowline. We combined them using a spatial join and asserted that the operator name is the same, with a max 50 meter buffer. 

## Setup


In [1]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiLineString, LineString,MultiPolygon, MultiPoint
from shapely.ops import nearest_points
os.chdir('/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data')
pd.options.display.max_columns = None

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load Data
flowlines_gdf = gpd.read_file('flowlines.geojson')
crudeoil_offlocation_gdf = gpd.read_file('crudeoil_offlocation.geojson')

In [3]:
# Check size
print(flowlines_gdf.shape)
print(crudeoil_offlocation_gdf.shape)

(21942, 19)
(259979, 8)


In [4]:
flowlines_gdf.head(2)

Unnamed: 0,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,geometry
0,331872.0,485633.0,,Registration,,,2023-10-30T08:10:52.119999,10673,GADECO LLC ...,Well Site,,,,,HDPE,Native Materials,Produced Water,,
1,470443.0,470446.0,,,,The flowline serving the Emerson 3-29J (05-123...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109444,-104.909686,40.105743,-104.90986,Carbon Steel,Native Materials,Multiphase,325.0,"LINESTRING (507682.442 4439497.658, 507696.855..."


In [5]:
crudeoil_offlocation_gdf.head(2)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246..."
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281..."


In [6]:
# Check if CRS is the same for both files
if flowlines_gdf.crs != crudeoil_offlocation_gdf.crs:
    flowlines_gdf = flowlines_gdf.to_crs(crudeoil_offlocation_gdf.crs)

In [7]:
import geopandas as gpd
from shapely.ops import nearest_points

# Initialize an empty GeoDataFrame to store matched data
matched_flowlines_gdf = gpd.GeoDataFrame(columns=flowlines_gdf.columns, crs=flowlines_gdf.crs)

max_buffer_distance = 49.5  # Maximum buffer distance in meters
initial_buffer_distance = 0  # Initial buffer distance

for index, flowline in flowlines_gdf.iterrows():
    # Check for missing geometry and skip if necessary
    if flowline.geometry is None:
        print(f"Missing geometry for flowline at index {index}. Skipping...")
        continue

    buffer_distance = initial_buffer_distance
    match_found = False
    
    while not match_found and buffer_distance <= max_buffer_distance:
        # Buffer the flowline geometry
        buffered_flowline = flowline.geometry.buffer(buffer_distance)
        temp_flowline_gdf = gpd.GeoDataFrame([flowline], geometry=[buffered_flowline], crs=flowlines_gdf.crs)
        
        # Perform the spatial join
        joined_gdf = gpd.sjoin(temp_flowline_gdf, crudeoil_offlocation_gdf, how='inner', predicate='intersects')

        if not joined_gdf.empty:
            for _, match in joined_gdf.iterrows():
                # Check if the company names match
                if match['COMPANY_NAME'].strip().lower() == match['Operator'].strip().lower():
                    print(f"Company match found at buffer distance {buffer_distance} meters for flowline at index {index}.")
                    match_found = True
                    
                    # Find the nearest point on the crude oil path to the original flowline
                    nearest_geom = nearest_points(flowline.geometry, match.geometry)[1]
                    
                    # Update the flowline's geometry to this nearest point
                    updated_flowline = flowline.copy()
                    updated_flowline.geometry = nearest_geom
                    
                    # Append the updated flowline to the matched_flowlines_gdf
                    matched_flowlines_gdf = pd.concat([matched_flowlines_gdf, gpd.GeoDataFrame([updated_flowline], crs=flowlines_gdf.crs)], ignore_index=True)
                    
                    break
            
            if match_found:
                break
        
        if not match_found:
            buffer_distance += 0.5  # Increase buffer by 1 meter if no match found

    if not match_found:
        print(f"No company match found for flowline at index {index} even after expanding buffer to {buffer_distance} meters.")


Missing geometry for flowline at index 0. Skipping...
Company match found at buffer distance 0.5 meters for flowline at index 1.


  matched_flowlines_gdf = pd.concat([matched_flowlines_gdf, gpd.GeoDataFrame([updated_flowline], crs=flowlines_gdf.crs)], ignore_index=True)


Company match found at buffer distance 0.5 meters for flowline at index 2.
Company match found at buffer distance 7.0 meters for flowline at index 3.
Company match found at buffer distance 3.5 meters for flowline at index 4.
Company match found at buffer distance 6.5 meters for flowline at index 5.
Company match found at buffer distance 0.5 meters for flowline at index 6.
Company match found at buffer distance 6.5 meters for flowline at index 7.
Company match found at buffer distance 0.5 meters for flowline at index 8.
Company match found at buffer distance 0.5 meters for flowline at index 9.
No company match found for flowline at index 10 even after expanding buffer to 50.0 meters.
No company match found for flowline at index 11 even after expanding buffer to 50.0 meters.
No company match found for flowline at index 12 even after expanding buffer to 50.0 meters.
No company match found for flowline at index 13 even after expanding buffer to 50.0 meters.
No company match found for flowl

In [None]:
matched_flowlines_gdf.to_file("matched_flowlines.geojson", driver='GeoJSON')

In [None]:
matched_flowlines_gdf.head()

Unnamed: 0,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,geometry
0,470443.0,470446.0,,,,The flowline serving the Emerson 3-29J (05-123...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109444,-104.909686,40.105743,-104.90986,Carbon Steel,Native Materials,Multiphase,325.0,POINT (507682.442 4439497.658)
1,470443.0,470445.0,319521.0,Out of Service,1.0,The flowline (12311399_FL) servicing the Emers...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109441,-104.90967,40.112203,-104.909862,Carbon Steel,Native Materials,Multiphase,250.0,POINT (507681.545 4440214.668)
2,318620.0,466061.0,305594.0,,,,2023-10-24T16:00:01.117000,68710,PETERSON ENERGY OPERATING INC ...,Production Facilities,40.063138,-105.034086,40.063823,-105.030068,Carbon Steel,,Multiphase,,POINT (497435.796 4434841.420)
3,318620.0,466062.0,305595.0,,,,2023-10-24T16:00:01.117000,68710,PETERSON ENERGY OPERATING INC ...,Production Facilities,40.063142,-105.034085,40.060063,-105.034795,Carbon Steel,,Multiphase,,POINT (497032.513 4434424.239)
4,318620.0,466063.0,332338.0,,,,2023-10-24T16:00:01.117000,68710,PETERSON ENERGY OPERATING INC ...,Production Facilities,40.063132,-105.034086,40.059581,-105.029331,Carbon Steel,,Multiphase,,POINT (497498.492 4434370.573)


In [None]:
import geopandas as gpd

# Assuming matched_data_gdf and crudeoil_offlocation_gdf are already defined and properly formatted

# Perform a spatial join where each entry in matched_data_gdf is joined with entries in crudeoil_offlocation_gdf based on intersection
# 'how='inner'' will only keep rows that intersect between the two GeoDataFrames
combined_flowines = gpd.sjoin(matched_flowlines_gdf, crudeoil_offlocation_gdf, how='inner', predicate='intersects')

# The result, combined_gdf, will contain all columns from both matched_data_gdf and crudeoil_offlocation_gdf
# Columns from crudeoil_offlocation_gdf will be suffixed with '_right' if there are overlapping column names

# Print the resulting DataFrame to see some of its data
print(combined_flowines.head())

Empty GeoDataFrame
Columns: [LOCATION_ID, FLOWLINEID, STARTLOCATIONID, FLOWLINEACTION, ENTIRELINEREMOVED, ACTIONDESCRIPTION, RECEIVE_DATE, OPERATOR_NUM, COMPANY_NAME, LOCATIONTYPE, ENDLAT, ENDLONG, STARTLAT, STARTLONG, PIPEMATERIAL, BEDDINGMATERIAL, TYPEOFFLUIDTRANS, MAXOPPRESSURE, geometry, index_right, Operator, Fluid, Material, Diam_in, Status, Length_ft, SHAPE_Length]
Index: []


In [None]:
combined_flowines.to_file("combined_flowines.geojson", driver='GeoJSON')

In [None]:
combined_flowines.head()

## Spatial Join

In [None]:
# Spatial join to find intersecting geometries
# Add attributes from flowlines to matching entries in crude_oil_gdf
matches_gdf = gpd.sjoin(crudeoil_offlocation_gdf, flowlines_gdf, how="inner", op='intersects')
matches_gdf.shape

  if await self.run_code(code, result, async_=asy):


(108291, 27)

In [None]:
# check if operator name is the same
pd.set_option('display.max_colwidth',None)
matches_gdf[['COMPANY_NAME','Operator']].sample(10)

Unnamed: 0,COMPANY_NAME,Operator
86681,KP KAUFFMAN COMPANY INC,KP KAUFFMAN COMPANY INC
28022,NOBLE ENERGY INC,NOBLE ENERGY INC
234844,URSA OPERATING COMPANY LLC,TEP ROCKY MOUNTAIN LLC
120925,KP KAUFFMAN COMPANY INC,KP KAUFFMAN COMPANY INC
151143,NOBLE ENERGY INC,NOBLE ENERGY INC
41342,NOBLE ENERGY INC,NOBLE ENERGY INC
181647,CRESTONE PEAK RESOURCES OPERATING LLC,NOBLE MIDSTREAM SERVICES LLC
7358,NOBLE ENERGY INC,NOBLE ENERGY INC
87970,VERDAD RESOURCES LLC,NOBLE ENERGY INC
116794,CITATION OIL & GAS CORP,CITATION OIL & GAS CORP


In [None]:
# Normalize the text in 'COMPANY_NAME' and 'Operator' columns before comparison
# This involves converting the text to lower case and stripping any leading/trailing spaces
validated_matches = matches_gdf[
    matches_gdf['COMPANY_NAME'].str.lower().str.strip() == matches_gdf['Operator'].str.lower().str.strip()
]

# Now, you can check the shape to see if there are any matches
print(validated_matches.shape)

(98663, 27)


In [None]:
print(validated_matches[['COMPANY_NAME', 'Operator']].sample(10))

                                              COMPANY_NAME  \
34201   NOBLE ENERGY INC                                     
58470   CRESTONE PEAK RESOURCES OPERATING LLC                
6401    NOBLE ENERGY INC                                     
7081    BONANZA CREEK ENERGY OPERATING COMPANY LLC           
44870   CRESTONE PEAK RESOURCES OPERATING LLC                
78450   NOBLE ENERGY INC                                     
52948   CRESTONE PEAK RESOURCES OPERATING LLC                
73525   TIMBER CREEK OPERATING LLC                           
119281  OWN RESOURCES OPERATING LLC                          
14305   TIMBER CREEK OPERATING LLC                           

                                          Operator  
34201                             NOBLE ENERGY INC  
58470        CRESTONE PEAK RESOURCES OPERATING LLC  
6401                              NOBLE ENERGY INC  
7081    BONANZA CREEK ENERGY OPERATING COMPANY LLC  
44870        CRESTONE PEAK RESOURCES OPERATING LLC  

In [None]:
# Merge the matches back to the original 'crudeoil_offlocation_gdf' to add 'flowlines_gdf' attributes where intersections occurred

# Dropp the 'index_right' column which is added by sjoin and duplicate geometry column
validated_matches.drop(columns=['index_right','geometry','Operator','Fluid','Material','Diam_in','Status','Length_ft','SHAPE_Length'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validated_matches.drop(columns=['index_right','geometry','Operator','Fluid','Material','Diam_in','Status','Length_ft','SHAPE_Length'], inplace=True)


In [None]:
# Merge based on index, ensuring all original entries in 'crudeoil_offlocation_gdf' are retained and adding 'flowlines_gdf' attributes where matches were found
all_flowlines = crudeoil_offlocation_gdf.merge(validated_matches, left_index=True, right_index=True, how="left")

In [None]:
print(crudeoil_offlocation_gdf.shape)
print(flowlines_gdf.shape)
print(all_flowlines.shape)

(259979, 8)
(21942, 19)
(335177, 26)


In [None]:
all_flowlines.head(2)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 524635.855 4117065.943, 524642.839 4117043.875, 524644.806 4117038.110, 524644.948 4117037.721, 524647.344 4117029.801, 524649.373 4117015.664, 524647.124 4116999.342, 524639.679 4116984.231, 524630.942 4116970.047, 524621.881 4116956.759, 524614.674 4116944.822, 524608.274 4116932.252, 524601.572 4116917.128, 524593.134 4116899.610, 524582.780 4116881.528, 524570.950 4116863.161, 524560.121 4116845.209, 524550.495 4116828.320, 524541.644 4116811.514, 524532.448 4116793.800, 524522.199 4116774.155, 524511.641 4116754.211, 524501.932 4116736.341, 524493.554 4116719.288, 524485.915 4116703.239, 524478.660 4116688.610, 524472.318 4116676.228, 524466.204 4116663.878, 524461.745 4116654.085, 524460.993 4116652.434, 524460.799 4116652.268, 524459.589 4116649.783, 524455.570 4116641.354, 524449.475 4116628.485, 524442.644 4116614.823, 524434.908 4116600.482, 524425.811 4116585.828, 524414.601 4116570.923, 524399.704 4116555.207, 524380.607 4116537.113, 524372.634 4116530.176, 524364.897 4116524.118, 524357.109 4116516.189, 524345.951 4116505.112, 524333.116 4116492.371))",,,,,,,,,,,,,,,,,,
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 528172.790 4463992.704))",,,,,,,,,,,,,,,,,,


In [None]:
all_flowlines.to_file('all_flowlines.geojson', driver='GeoJSON')