In [1]:
import os
import pandas as pd
import geopandas as gpd

# Directory that holds the input GeoJSON files. It becomes the working
# directory, so every relative path used in this notebook resolves against it.
# Set the COGCC_DATA_DIR environment variable to run on another machine; the
# original location remains the default.
DATA_DIR = os.environ.get(
    'COGCC_DATA_DIR',
    '/Users/ichittumuri/Desktop/MINES/COGCC-Risk-Analysis/Data',
)
os.chdir(DATA_DIR)

# Do not truncate the column list when a frame is displayed.
pd.options.display.max_columns = None

# Load Data
flowlines_gdf = gpd.read_file('flowlines.geojson')
crudeoil_offlocation_gdf = gpd.read_file('crudeoil_offlocation.geojson')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Report (rows, columns) for each input layer, flowlines first
for layer in (flowlines_gdf, crudeoil_offlocation_gdf):
    print(layer.shape)

(21942, 19)
(259979, 8)


In [3]:
# Preview the first two flowline records
flowlines_gdf.head(n=2)

Unnamed: 0,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE,geometry
0,331872.0,485633.0,,Registration,,,2023-10-30T08:10:52.119999,10673,GADECO LLC ...,Well Site,,,,,HDPE,Native Materials,Produced Water,,
1,470443.0,470446.0,,,,The flowline serving the Emerson 3-29J (05-123...,2023-10-24T08:33:49.223000,10633,CRESTONE PEAK RESOURCES OPERATING LLC ...,Production Facilities,40.109444,-104.909686,40.105743,-104.90986,Carbon Steel,Native Materials,Multiphase,325.0,"LINESTRING (507682.442 4439497.658, 507696.855..."


In [4]:
# Preview the first two off-location crude-oil records
crudeoil_offlocation_gdf.head(n=2)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246..."
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281..."


In [5]:
# Both layers must share a CRS before they are joined spatially: the flowlines
# are reprojected to the crude-oil layer's CRS whenever the two differ.
target_crs = crudeoil_offlocation_gdf.crs
if flowlines_gdf.crs != target_crs:
    flowlines_gdf = flowlines_gdf.to_crs(target_crs)

In [6]:
# Spatial join to find intersecting geometries.
# Each crude-oil line is paired with every flowline record whose geometry it
# intersects, and the flowline attributes are appended to that row.
# how="inner" keeps only crude-oil lines that intersect at least one flowline.
# `predicate` is the current name of the deprecated `op` keyword, which newer
# geopandas releases no longer accept.
matches_gdf = gpd.sjoin(
    crudeoil_offlocation_gdf,
    flowlines_gdf,
    how="inner",
    predicate="intersects",
)
matches_gdf.shape

  if await self.run_code(code, result, async_=asy):


(108291, 27)

In [7]:
# Eyeball a random sample of the two operator-name columns side by side
name_columns = ['COMPANY_NAME', 'Operator']
print(matches_gdf[name_columns].sample(n=10))

                                             COMPANY_NAME  \
53028   TIMBER CREEK OPERATING LLC                    ...   
90664   CRESTONE PEAK RESOURCES OPERATING LLC         ...   
151744  SCHNEIDER ENERGY SERVICES INC                 ...   
128541  FOUNDATION ENERGY MANAGEMENT LLC              ...   
64816   KP KAUFFMAN COMPANY INC                       ...   
207470  KERR MCGEE OIL & GAS ONSHORE LP               ...   
49968   CRESTONE PEAK RESOURCES OPERATING LLC         ...   
29631   NOBLE ENERGY INC                              ...   
47190   MCCARTNEY ENGINEERING LLC                     ...   
84689   NOBLE ENERGY INC                              ...   

                                     Operator  
53028              TIMBER CREEK OPERATING LLC  
90664   CRESTONE PEAK RESOURCES OPERATING LLC  
151744          SCHNEIDER ENERGY SERVICES INC  
128541       FOUNDATION ENERGY MANAGEMENT LLC  
64816                 KP KAUFFMAN COMPANY INC  
207470        KERR MCGEE OIL & GAS ONSHO

In [8]:
# Normalise COMPANY_NAME so it can be compared with Operator by exact equality.
#
# Order matters: ellipses are removed first and whitespace is stripped last.
# Stripping first would leave the padding that sat in front of a trailing
# ellipsis ("NAME   ..." -> "NAME "), and the equality check would then fail.
matches_gdf['COMPANY_NAME'] = (
    matches_gdf['COMPANY_NAME']
    .str.replace(r"\.\.\.+", "", regex=True)  # Remove runs of three or more dots
    .str.replace(r"\s+", " ", regex=True)     # Collapse each whitespace run to one space
    .str.strip()                              # Drop leading and trailing whitespace
)

In [9]:
# Validate spatial merge by COMPANY_NAME/Operator:
# keep only the spatial matches whose cleaned flowline company name is
# identical to the crude-oil layer's operator name.
names_agree = matches_gdf['COMPANY_NAME'] == matches_gdf['Operator']
validated_matches = matches_gdf[names_agree]
validated_matches.shape

(98657, 27)

In [10]:
# Spot-check that the retained rows carry the same name in both columns
name_columns = ['COMPANY_NAME', 'Operator']
print(validated_matches[name_columns].sample(n=10))

                                     COMPANY_NAME  \
28671                      TEP ROCKY MOUNTAIN LLC   
47553                              PDC ENERGY INC   
55242                              PDC ENERGY INC   
60362  BONANZA CREEK ENERGY OPERATING COMPANY LLC   
92876                 OWN RESOURCES OPERATING LLC   
44416                     KP KAUFFMAN COMPANY INC   
96035                     KP KAUFFMAN COMPANY INC   
94201                            NOBLE ENERGY INC   
21146       CRESTONE PEAK RESOURCES OPERATING LLC   
83807                            NOBLE ENERGY INC   

                                         Operator  
28671                      TEP ROCKY MOUNTAIN LLC  
47553                              PDC ENERGY INC  
55242                              PDC ENERGY INC  
60362  BONANZA CREEK ENERGY OPERATING COMPANY LLC  
92876                 OWN RESOURCES OPERATING LLC  
44416                     KP KAUFFMAN COMPANY INC  
96035                     KP KAUFFMAN COMPANY INC  


In [11]:
# Preview the first ten validated matches
validated_matches.head(n=10)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry,index_right,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375...",11709,472043.0,472044.0,332067.0,Registration,,,2019-10-03T10:17:51.337002,69175,PDC ENERGY INC,Production Facilities,40.433678,-104.564457,40.429625,-104.562114,Carbon Steel,,Crude Oil,
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375...",7856,472043.0,472044.0,332067.0,,,,2020-06-02T08:02:30.473000,69175,PDC ENERGY INC,Well Site,40.433678,-104.564457,40.429625,-104.562114,Carbon Steel,,Crude Oil,
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375...",11710,472043.0,472045.0,305898.0,Registration,,,2019-10-03T10:17:51.337002,69175,PDC ENERGY INC,Production Facilities,40.433678,-104.564457,40.432222,-104.559722,Carbon Steel,,Crude Oil,
2,PDC ENERGY INC,,,,Partial Removed see comment,1902.59,579.687012,"MULTILINESTRING ((537519.211 4475984.687, 5375...",7857,472043.0,472045.0,305898.0,,,,2020-06-02T08:02:30.473000,69175,PDC ENERGY INC,Well Site,40.433678,-104.564457,40.432222,-104.559722,Carbon Steel,,Crude Oil,
5,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,1118.3,340.724313,"MULTILINESTRING ((530315.537 4449292.793, 5304...",21778,330724.0,456334.0,336483.0,Registration,,,2018-06-25T11:33:36.369999,100322,NOBLE ENERGY INC,Production Facilities,40.194494,-104.647039,40.194907,-104.64204,,,,
5,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,1118.3,340.724313,"MULTILINESTRING ((530315.537 4449292.793, 5304...",16585,330724.0,456334.0,336483.0,Abandonment,1.0,"Purged fluids, abandoned in place, cut risers ...",2019-06-10T12:58:49.646999,100322,NOBLE ENERGY INC,Production Facilities,40.194494,-104.647039,40.194907,-104.64204,,,,
14,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Crude Oil,Carbon Steel,6.0,Active,6160.57,1877.062024,"MULTILINESTRING ((556593.657 4470563.616, 5565...",14499,423766.0,467329.0,423761.0,Registration,,,2019-09-05T09:54:31.670000,8960,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Well Site,40.379677,-104.35514,40.37967,-104.35001,Carbon Steel,Native Materials,Crude Oil Emulsion,
14,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Crude Oil,Carbon Steel,6.0,Active,6160.57,1877.062024,"MULTILINESTRING ((556593.657 4470563.616, 5565...",14500,423766.0,467328.0,423761.0,Registration,,,2019-09-05T09:54:31.670000,8960,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Well Site,40.379677,-104.355142,40.37967,-104.34995,Carbon Steel,Native Materials,Crude Oil Emulsion,
14,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Crude Oil,Carbon Steel,6.0,Active,6160.57,1877.062024,"MULTILINESTRING ((556593.657 4470563.616, 5565...",14498,423761.0,467330.0,423766.0,Registration,,,2019-09-05T10:04:52.240002,8960,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Well Site,40.37967,-104.35001,40.379699,-104.355181,HDPE,Native Materials,Natural Gas,
14,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Crude Oil,Carbon Steel,6.0,Active,6160.57,1877.062024,"MULTILINESTRING ((556593.657 4470563.616, 5565...",14497,423761.0,467331.0,423766.0,Registration,,,2019-09-05T10:04:52.240002,8960,BONANZA CREEK ENERGY OPERATING COMPANY LLC,Well Site,40.37967,-104.34995,40.379699,-104.355181,HDPE,Native Materials,Natural Gas,


In [12]:
# Prepare the validated matches for merging back onto 'crudeoil_offlocation_gdf'.
#
# Drop 'index_right' (added by sjoin) together with the geometry and the
# crude-oil attribute columns, which 'crudeoil_offlocation_gdf' already carries;
# only the flowline attributes remain.
#
# The result is rebound rather than dropped with inplace=True: validated_matches
# is a boolean-mask selection of matches_gdf, and mutating it in place is what
# raised pandas' SettingWithCopyWarning.
columns_to_drop = [
    'index_right', 'geometry',
    'Operator', 'Fluid', 'Material', 'Diam_in', 'Status', 'Length_ft', 'SHAPE_Length',
]
validated_matches = validated_matches.drop(columns=columns_to_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validated_matches.drop(columns=['index_right','geometry','Operator','Fluid','Material','Diam_in','Status','Length_ft','SHAPE_Length'], inplace=True)


In [13]:
# Left-join the validated flowline attributes onto the crude-oil layer by index.
# Every row of 'crudeoil_offlocation_gdf' is retained; rows without a validated
# match get missing values in the flowline columns. A crude-oil line that
# matched several flowline records appears once per match.
all_flowlines = crudeoil_offlocation_gdf.merge(
    validated_matches,
    how="left",
    left_index=True,
    right_index=True,
)

In [14]:
# Compare (rows, columns) before and after the merge
for frame in (crudeoil_offlocation_gdf, all_flowlines):
    print(frame.shape)

(259979, 8)
(335174, 26)


In [15]:
# Preview the first two rows of the merged layer
all_flowlines.head(n=2)

Unnamed: 0,Operator,Fluid,Material,Diam_in,Status,Length_ft,SHAPE_Length,geometry,LOCATION_ID,FLOWLINEID,STARTLOCATIONID,FLOWLINEACTION,ENTIRELINEREMOVED,ACTIONDESCRIPTION,RECEIVE_DATE,OPERATOR_NUM,COMPANY_NAME,LOCATIONTYPE,ENDLAT,ENDLONG,STARTLAT,STARTLONG,PIPEMATERIAL,BEDDINGMATERIAL,TYPEOFFLUIDTRANS,MAXOPPRESSURE
0,EVERGREEN NATURAL RESOURCES LLC,Gas,polly,4.0,Active,2277.71,693.972162,"MULTILINESTRING ((524642.670 4117088.796, 5246...",,,,,,,,,,,,,,,,,,
1,NOBLE ENERGY INC,Multiphase,Carbon Steel,3.0,Abandoned,651.58,198.525215,"MULTILINESTRING ((527997.281 4463899.920, 5281...",,,,,,,,,,,,,,,,,,


In [16]:
# Write the merged layer to disk as GeoJSON (relative path, so it is created
# in the current working directory)
all_flowlines.to_file(
    'all_flowlines.geojson',
    driver='GeoJSON',
)