# Libraries

In [1]:
import pandas as pd
import geopandas as gpd
import plotly.express as px
import shapely
import time

# Import Road Data within NYC

In [2]:
roads_within = gpd.read_parquet("road-data/nyc_combined_1.parquet")
roads_within.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 22034 entries, 0 to 22033
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   road_name    18542 non-null  object  
 1   county_name  22034 non-null  object  
 2   road_geom    22034 non-null  geometry
dtypes: geometry(1), object(2)
memory usage: 516.6+ KB


# Data Validation

In [6]:
for col in roads_within.columns:
    print(roads_within.value_counts([col]))

road_name      
Riverside Dr       45
Park Dr            28
F D R Dr           26
41st Ave           21
Pedestrian Opas    21
                   ..
Forest Ln           1
Forest Hill Rd      1
Forest Ct           1
Fordham St          1
la Salle St         1
Name: count, Length: 9100, dtype: int64
county_name
Queens         8298
Richmond       4689
Kings          3460
Bronx          3349
New York       2238
Name: count, dtype: int64
road_geom                                        
LINESTRING (-73.87999 40.80987, -73.87908 40.8...    5
LINESTRING (-73.92586 40.67478, -73.92539 40.6...    5
LINESTRING (-74.19230 40.58753, -74.19200 40.5...    4
LINESTRING (-73.96310 40.76053, -73.96173 40.7...    4
LINESTRING (-74.18414 40.60505, -74.18355 40.6...    4
                                                    ..
LINESTRING (-73.92426 40.73498, -73.92441 40.7...    1
LINESTRING (-73.92534 40.73557, -73.92541 40.7...    1
LINESTRING (-73.92310 40.73552, -73.92397 40.7...    1
LINESTRING (-73.923

In [11]:
roads_within.road_geom.geom_type.value_counts()

LineString    22034
Name: count, dtype: int64

In [27]:
roads_within[roads_within.road_geom.duplicated()]

Unnamed: 0,road_name,county_name,road_geom
464,Latimer Pl,Queens,"LINESTRING (-73.83144 40.76657, -73.83038 40.7..."
578,US Rte 9,New York,"LINESTRING (-73.91464 40.87079, -73.91459 40.8..."
895,Queensboro Brg,Queens,"LINESTRING (-73.95084 40.75522, -73.94914 40.7..."
951,Calhoun Rd,Queens,"LINESTRING (-73.83053 40.65133, -73.82747 40.6..."
1164,59th Street Brg,New York,"LINESTRING (-73.96310 40.76053, -73.96173 40.7..."
...,...,...,...
21763,Fort Hamilton Manor,Kings,"LINESTRING (-74.02724 40.60969, -74.02726 40.6..."
21858,McKee Ave,Queens,"LINESTRING (-73.83099 40.65177, -73.82759 40.6..."
21862,Little Neck Rd,Queens,"LINESTRING (-73.75358 40.77003, -73.75188 40.7..."
21920,Queens Midtown Tunl,Queens,"LINESTRING (-73.96138 40.74280, -73.96133 40.7..."


# Data Cleaning

Extract rows of Riverside Drive

In [19]:
riverside_dr = roads_within.query('road_name == "Riverside Dr"').copy().reset_index(names = "id").set_index("id")
riverside_dr

Unnamed: 0_level_0,road_name,county_name,road_geom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30,Riverside Dr,New York,"LINESTRING (-73.95368 40.82531, -73.95352 40.8..."
1130,Riverside Dr,New York,"LINESTRING (-73.95752 40.82114, -73.95745 40.8..."
1174,Riverside Dr,New York,"LINESTRING (-73.96125 40.81672, -73.96069 40.8..."
1705,Riverside Dr,New York,"LINESTRING (-73.96388 40.81230, -73.96390 40.8..."
1706,Riverside Dr,New York,"LINESTRING (-73.94829 40.83386, -73.94810 40.8..."
2791,Riverside Dr,New York,"LINESTRING (-73.96220 40.81375, -73.96211 40.8..."
2811,Riverside Dr,New York,"LINESTRING (-73.94910 40.83439, -73.94898 40.8..."
3371,Riverside Dr,New York,"LINESTRING (-73.94303 40.84911, -73.94296 40.8..."
3898,Riverside Dr,New York,"LINESTRING (-73.92810 40.86561, -73.92727 40.8..."
4989,Riverside Dr,New York,"LINESTRING (-73.96402 40.81133, -73.96332 40.8..."


Extract `road_geom` with duplicated records

In [65]:
duplicated_geom = roads_within[roads_within.road_geom.duplicated(keep=False)].groupby("road_geom").value_counts(["road_name"]).reset_index()
duplicated_geom = gpd.GeoDataFrame(duplicated_geom).set_geometry("road_geom")
duplicated_geom

Unnamed: 0,road_geom,road_name,count
0,"LINESTRING (-74.24619 40.51507, -74.24579 40.5...",Wood Ave,1
1,"LINESTRING (-74.24619 40.51507, -74.24579 40.5...",Wood Ln,1
2,"LINESTRING (-74.22362 40.52629, -74.22366 40.5...",State Rte 440,1
3,"LINESTRING (-74.22362 40.52629, -74.22366 40.5...",W Shore Expy,1
4,"LINESTRING (-74.22380 40.50528, -74.22456 40.5...",Clairmont Ave,1
...,...,...,...
907,"LINESTRING (-73.74349 40.59797, -73.74455 40.5...",Jarvis Ct,1
908,"LINESTRING (-73.74711 40.60315, -73.74778 40.6...",Gateway Blvd,1
909,"LINESTRING (-73.74711 40.60315, -73.74778 40.6...",Greenport Rd,1
910,"LINESTRING (-73.73857 40.59968, -73.74025 40.5...",Jarvis Ave,1


In [67]:
blake_ave_geom = duplicated_geom.query('road_name == "Blake Ave"').road_geom.values[0]
print(blake_ave_geom)
duplicated_geom[duplicated_geom.road_geom.geom_equals(blake_ave_geom)]

LINESTRING (-73.860775 40.672806, -73.859867 40.672945, -73.858924 40.673071, -73.858206 40.673145)


Unnamed: 0,road_geom,road_name,count
780,"LINESTRING (-73.86078 40.67281, -73.85987 40.6...",133rd Ave,1
781,"LINESTRING (-73.86078 40.67281, -73.85987 40.6...",Blake Ave,1


# Visualisation

visualise Riverside Drive, the road containing the most Linestring records

In [21]:
riverside_dr.explore(tiles="CartoDB positron", column=riverside_dr.index,)

show the roads with duplicated geometry but with different names

In [70]:
duplicated_geom.iloc[0:2, :].explore(tiles="CartoDB positron", column = "road_name", style_kwds=dict(opacity=0.3))