# Street processing


The goal of this file is to
* Process the 1836 dataset (find and resolve duplicate street names)
* Determine which streets appear in only one dataset and which appear in both
* Produce one final dataset containing all georeferenced streets

In [1]:
# Imports
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt
from shapely.ops import linemerge, Point
import warnings
# Silence Shapely's deprecation warnings when the warning class can be
# imported; otherwise just report it and carry on.
try:
    from shapely.errors import ShapelyDeprecationWarning
except ImportError:
    # Only a failed import is expected here. A bare `except:` would also hide
    # unrelated errors (and even KeyboardInterrupt).
    print("Couldn't install ShapelyDeprecationWarning")
else:
    warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning)

from preprocessing import preprocess
from collections import Counter
from paris_methods import duplicate_processing, duplicate_final, assign_gridnumber, translate_geopoints, create_grid, check_overlap
import pyproj

In [2]:
# Read both street layers and bring them straight into EPSG:3857 so that the
# two datasets share one coordinate reference system.
Openparis = gpd.read_file("data/voie.zip", encoding = 'utf-8').to_crs(epsg=3857)
Vasserot = gpd.read_file("data/vasserot.zip").to_crs(epsg=3857)

# Correct a known spelling mistake in the 1836 street names:
# "Rue Lafayette" becomes "Rue la Fayette" (short and full name columns).
mask = Vasserot["NOM_ENTIER"].eq("Rue Lafayette")
Vasserot.loc[mask, ["NOM", "NOM_ENTIER"]] = ["la Fayette", "Rue la Fayette"]


## Preprocess streets

In [3]:
# Run the project's name preprocessing on the full street name of each
# dataset. As used below, the result is available in a "<column>_prep" column.
Openparis = preprocess(Openparis, "l_longmin")
Vasserot = preprocess(Vasserot, "NOM_ENTIER")

# TODO Rename those streets
# Common column holding the preprocessed street name in both datasets
Vasserot["voie"] = Vasserot["NOM_ENTIER_prep"]
Openparis["voie"] = Openparis["l_longmin_prep"]

# Remove rows without a preprocessed street name
Vasserot = Vasserot.dropna(subset=["voie"])

# Add the year as a one-element list per row.
# Every row gets its OWN list object: `[[1836]] * n` would store the very same
# list in all rows, so an in-place change to one row's list (append, +=)
# would silently show up in every other row as well.
Vasserot = Vasserot.assign(year=[[1836] for _ in range(len(Vasserot))])
Openparis = Openparis.assign(year=[[2022] for _ in range(len(Openparis))])

# create buffer around streets, important for merging duplicate streets
# (distance is given in units of the layers' coordinate reference system)
buffer = 100
Vasserot["buffer"] = Vasserot["geometry"].apply(lambda x: x.buffer(buffer))
Openparis["buffer"] = Openparis["geometry"].apply(lambda x: x.buffer(buffer))

# Find all duplicates: rows whose preprocessed name occurs more than once
# (keep=False flags every occurrence) versus rows with a unique name.
is_duplicated_name = Vasserot.duplicated(subset=['voie'], keep=False)
Duplicates = Vasserot[is_duplicated_name].sort_values("voie")
Unique = Vasserot[~is_duplicated_name].sort_values("voie")


In [6]:
# Interactive map of all 1836 streets that share their name with another row.
# The helper columns "buffer" and "year" are left out for the display.
duplicates_for_map = Duplicates.drop(columns=["buffer", "year"])
duplicates_for_map.explore()


## Process 1836 streets

### Merge duplicates if streets are close

In [None]:
# Hand all rows with a duplicated name to duplicate_processing (imported from
# paris_methods), keyed on the preprocessed name column "voie".
# NOTE(review): per the section heading this presumably merges same-named
# streets that lie close to each other - confirm against paris_methods.
DuplicatesProcessed = duplicate_processing(Duplicates, "voie")

# Sanity check if all streetnames are in the newly created dataframe
# (left disabled; the earlier one-liner had unbalanced parentheses and
# compared a length with an array)
#Duplicates["voie"].nunique() == DuplicatesProcessed["voie"].nunique()

### Visualizing Nr. of unique streets before and after processing

In [None]:
# How often does each street name occur, before and after merging?
names_before = Duplicates["voie"].value_counts()
names_after = DuplicatesProcessed["voie"].value_counts()

# One bar series per dataset: x = number of rows sharing a name,
# y = number of names with that many rows. Both series share the same x
# positions, so they are drawn semi-transparent and labelled; otherwise the
# second series hides the first and the two cannot be told apart.
for series_label, name_counts in (("before processing", names_before),
                                  ("after processing", names_after)):
    freqs = Counter(name_counts)
    plt.bar(list(freqs.keys()), list(freqs.values()), width=0.9, alpha=0.6,
            label=series_label)
plt.xlabel("rows per street name")
plt.ylabel("number of street names")
plt.legend()
plt.show()

# Number of names that occur more than once
print("Duplicates before: ", (names_before > 1).sum())
print("Duplicates after: ", (names_after > 1).sum())

In [None]:
# Flag every row whose name is still shared with another row after merging
still_shared = DuplicatesProcessed.duplicated(subset=['voie'], keep=False)
StillDuplicates = DuplicatesProcessed.loc[still_shared].sort_values("voie")
NewlyUnique = DuplicatesProcessed.loc[~still_shared].sort_values("voie")
# Names that became unique through merging join the already unique streets
Unique = pd.concat([Unique, NewlyUnique])

#### Visualizing Streets that are still duplicates

In [None]:
# The "buffer" helper column has to go before the frame is shown on a map
StillDuplicates = StillDuplicates.drop("buffer", axis=1)

In [None]:
# Street names that still occur at least twice (count the names only once)
name_counts = StillDuplicates["voie"].value_counts()
a = name_counts[name_counts >= 2]
Trueduplicates = a.index

# Select Street to visualize: position of the name within Trueduplicates
street_position = 8
if street_position < len(Trueduplicates):
    mask = StillDuplicates["voie"] == Trueduplicates[street_position]
else:
    # Fewer duplicated names than street_position: select nothing instead of
    # failing with an IndexError.
    mask = pd.Series(False, index=StillDuplicates.index)
# Visualize
#StillDuplicates[mask].explore()
StillDuplicates.explore()

## Create dataset containing old and new streets


#### Preparing the Vasserot and Openparis dataframes for the final comparison

In [None]:
# Unique from Vasserot DF
# NOTE(review): the columns are picked by position, so this line depends on
# the exact column order produced by the earlier cells - verify after any
# change to the preprocessing.
Unique = Unique.iloc[:,[0,2,3,6,8,14,16,17,18]]
# Bring the column names in line with the Openparis frame
Unique = Unique.rename(columns={"ROWID":"rowid", "NOM_ENTIER":"streetname","TYPE":"type","ARTICLE":"article","NOM":"name", "voie":"streetname_prep"})
# Empty "matching" list per row. Each row gets its own list object:
# `[[]] * n` would store one shared list in every row, so appending a match
# for one street would add it to all streets.
Unique = Unique.assign(matching=[[] for _ in range(len(Unique))])

In [None]:
# Uniques from Openparis DF
# NOTE(review): the columns are picked by position, so this line depends on
# the exact column order produced by the earlier cells - verify after any
# change to the preprocessing.
Openparis = Openparis.iloc[:,[2,3,4,5,6,15,17,18,19]]
# Bring the column names in line with the Vasserot frame
Openparis = Openparis.rename(columns={"l_longmin": "streetname","c_desi":"type","c_liaison":"article","l_voie":"name","l_courtmin":"streetname_short","voie":"streetname_prep"})
# Openparis has no row ids, so random ones from [7000, 200000) are assigned.
# They are drawn WITHOUT replacement: np.random.randint may return the same
# number several times, which would give different streets the same rowid.
# (np.random.choice raises ValueError if there are more rows than ids.)
openparis_rowids = np.random.choice(np.arange(7000, 200000), size=len(Openparis), replace=False)
Openparis = Openparis.assign(rowid=openparis_rowids)
# Empty "matching" list per row; one independent list object per row instead
# of the single shared list that `[[]] * n` would create.
Openparis = Openparis.assign(matching=[[] for _ in range(len(Openparis))])

# Stack the 1836 streets and the 2022 streets into one frame
Merged = pd.concat([Unique, Openparis])


In [None]:
# Compare same-named streets of both datasets by location (duplicate_final),
# then preprocess the short name; the result is used later as "name_prep".
MergedProcessed = preprocess(duplicate_final(Merged, "streetname_prep"), "name")

# Rows whose preprocessed full name occurs more than once in the merged frame
shared_name = MergedProcessed.duplicated(subset=['streetname_prep'], keep=False)

# Split into duplicates and unique streets, sorted by name, and let pandas
# pick the best fitting dtypes for the columns
FinalDuplicates = MergedProcessed[shared_name].sort_values("streetname_prep").convert_dtypes()
FinalUnique = MergedProcessed[~shared_name].sort_values("streetname_prep").convert_dtypes()

### Visualize results in the final dataset

In [None]:
# Remove the "buffer" helper column, then show the remaining duplicates on an
# interactive map
FinalDuplicates = FinalDuplicates.drop("buffer", axis=1)
FinalDuplicates.explore()

In [None]:
# Show only the geometry column of the uniquely named streets on an
# interactive map
FinalUnique["geometry"].explore()

## Saving Dataframes

In [None]:

# Persist both result frames as pickle files in the data folder
for frame_to_save, pickle_path in (
    (FinalUnique, "data/FinalUnique.pkl"),
    (FinalDuplicates, "data/FinalDuplicate.pkl"),
):
    frame_to_save.to_pickle(pickle_path)


# Grouping on Short Streetnames

In [None]:
# Collapse all streets that share a preprocessed short name into one row:
# full names are joined into one string, the per-street values are collected
# in lists and the first short name is kept.
short_name_aggregations = {"streetname": ", ".join,
        "geometry": list, "year": list, "rowid": list, "name": "first", "buffer":list}
grouped_streets = FinalUnique.groupby("name_prep", as_index=False).agg(short_name_aggregations)

# Number of streets behind each short name
streets_per_name = grouped_streets['buffer'].str.len()

# Short names carried by exactly one street: unwrap the one-element lists
list_columns = ["year", "geometry", "rowid", "buffer"]
unique_short_streets = grouped_streets[streets_per_name == 1]
unique_short_streets[list_columns] = unique_short_streets[list_columns].apply(lambda column: column.str[0])
# Short names carried by several streets keep their lists
multiple_short_streets = grouped_streets[streets_per_name > 1]

print(f"#streets with unique short streetname: {len(unique_short_streets)}, not unique: {len(multiple_short_streets)}")
unique_short_streets.tail()

In [None]:
# Run check_overlap on the list of buffers of every short name and keep the
# result in the new column "all_overlap"
overlap_flags = multiple_short_streets["buffer"].apply(check_overlap)
multiple_short_streets["all_overlap"] = overlap_flags
# Split the groups by that flag
overlap = multiple_short_streets[multiple_short_streets["all_overlap"].eq(True)]
no_overlap = multiple_short_streets[multiple_short_streets["all_overlap"].eq(False)]

In [None]:
# Do the street types (first word of each full name) differ between the
# overlapping and the non-overlapping groups?
def _first_words_per_group(frame):
    """Return, per row, the tuple of first words of its ", "-joined street names."""
    return [tuple(full_name.split()[0] for full_name in names)
            for names in frame.streetname.str.split(", ")]


overlap_prefixes = _first_words_per_group(overlap)
print("overlapping top 10:\n", Counter(overlap_prefixes).most_common(10))

no_overlap_prefixes = _first_words_per_group(no_overlap)
print("\nNon-overlapping top 10:\n",Counter(no_overlap_prefixes).most_common(10))

# -> they do not differ a lot

In [37]:
# Group counts and the share of short names whose streets all overlap
overlap_count = len(overlap)
no_overlap_count = len(no_overlap)
overlap_share = overlap_count / len(multiple_short_streets)
print("#overlapping streets:", overlap_count, "\n#not overlapping streets:", no_overlap_count,
        "\nratio of overlapping/all =", overlap_share)

#overlapping streets: 491 
#not overlapping streets: 289 
ratio of overlapping/all = 0.6294871794871795


In [None]:
def _union_of(geometries):
    """Combine a list of geometries into a single geometry."""
    return gpd.GeoSeries(geometries).unary_union


# Overlapping streets with the same short name become one geometry; the
# helper column "all_overlap" is no longer needed afterwards
overlap["geometry"] = overlap["geometry"].apply(_union_of)
overlap = overlap.drop(columns="all_overlap")

# The merged groups now count as unique short street names
unique_short_streets = pd.concat([unique_short_streets, overlap])
# What remains non-unique are the groups whose streets do not all overlap
non_unique_short_streets = no_overlap.drop(columns="all_overlap")

## Save Short Streetdata

In [None]:
# Short names that are unique, including the merged overlapping groups
unique_short_streets.to_pickle("data/unique_short_streets.pkl")
# Short names that stay ambiguous; these will be neglected further on.
# Save non_unique_short_streets (only the non-overlapping groups) rather than
# multiple_short_streets: the latter still contains the overlapping groups
# that were already added to unique_short_streets, plus the "all_overlap"
# helper column.
non_unique_short_streets.to_pickle("data/not_unique_short_streets.pkl")