In [2]:
import pandas as pd
import geopandas as gp
import os

from shapely import wkt
from shapely.geometry import Point
from tqdm import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply


def _load_store_points(csv_path):
    """Read a stores CSV and return its points as an EPSG:3857 GeoDataFrame.

    Only ``store_id``, ``lon`` and ``lat`` are read from the file. The
    coordinates are turned into point geometries, labelled as EPSG:4326 and
    re-projected to EPSG:3857; the raw ``lon``/``lat`` columns are dropped.
    """
    coords = pd.read_csv(csv_path)[['store_id', 'lon', 'lat']]
    points = gp.points_from_xy(coords.lon, coords.lat)
    stores = gp.GeoDataFrame(coords, geometry=points).drop(columns=['lon', 'lat'])
    stores = stores.set_crs('epsg:4326', allow_override=True)
    return stores.to_crs('epsg:3857')


# Bus stops: the CSV stores geometries as WKT text, parsed here into shapely objects.
# No CRS is attached at this point.
busstops = pd.read_csv('data/busstops_norway.csv')
busstops = gp.GeoDataFrame(busstops, geometry=busstops['geometry'].apply(wkt.loads))

# Store locations for the three data splits.
train = _load_store_points('data/stores_train.csv')
train_extra = _load_store_points('data/stores_extra.csv')
test = _load_store_points('data/stores_test.csv')


In [3]:
# Label the raw bus-stop coordinates as EPSG:4326 only while no CRS is attached.
# Forcing the label with allow_override=True made this cell unsafe to re-run:
# on a second execution the already-projected EPSG:3857 coordinates would be
# relabelled as degrees and then projected again, silently corrupting them.
if busstops.crs is None:
    busstops = busstops.set_crs('epsg:4326')

# Re-projecting a frame that is already in the target CRS leaves the coordinates as they are.
busstops = busstops.to_crs('epsg:3857')
print(busstops.crs)
# busstops.plot()
busstops


epsg:3857


Unnamed: 0,busstop_id,stopplace_type,importance_level,side_placement,geometry
0,853cb081-cc32-4880-aa3e-26e96870d874,Plattform og lomme,Mangler viktighetsnivå,LEFT_AND_RIGHT,POINT (728101.520 8640047.651)
1,156b052b-2771-497a-b4f4-97fed59e1aca,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,LEFT_AND_RIGHT,POINT (656762.828 8431462.584)
2,7312a280-e14f-4b09-a421-02e8fe1bc63e,,Mangler viktighetsnivå,MIDDLE_LEFT,POINT (1199816.247 8384169.375)
3,d9cda2c7-355a-49c1-b56c-a33180d2a82e,,Mangler viktighetsnivå,MIDDLE_LEFT,POINT (1199818.130 8384192.832)
4,f803bcd3-182d-450b-bbb4-113c6ca885c2,,Mangler viktighetsnivå,MIDDLE_RIGHT,POINT (1199820.134 8384216.279)
...,...,...,...,...,...
68390,12edd887-a122-44e2-896b-a5f663917f88,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT (651699.196 8445068.826)
68391,38dd0aee-b497-43c2-943b-51ab5c6d0eb7,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT (651650.730 8445056.162)
68392,3803f0fe-d6dc-4211-a44d-bbc3453af289,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT (1185500.476 9086180.311)
68393,5b177625-d0dc-40ae-b554-e97f7d400e24,Plattform og lomme,Lokalt knutepunkt,MIDDLE,POINT (1020155.419 8640402.161)


In [4]:
# train = train.drop(columns=['bus_stops_count', *busstops.importance_level.unique()])

In [5]:
def cond_merge(row: pd.Series, radius: float = 1000, stops=None) -> dict:
    """Count the bus stops closer than ``radius`` to one store, overall and per importance level.

    Parameters
    ----------
    row : pd.Series
        One store record; only ``row.geometry`` is read.
    radius : float, default 1000
        Exclusive distance cut-off (``distance < radius``), expressed in the
        units of the layers' coordinate reference system.
    stops : GeoDataFrame, optional
        Bus-stop layer providing ``distance()`` and an ``importance_level``
        column. Defaults to the notebook-level ``busstops`` frame.

    Returns
    -------
    dict
        ``'bus_stops_count'`` mapped to the number of stops within the radius,
        plus one key per distinct ``importance_level`` value found in ``stops``
        mapped to the number of nearby stops at that level (0 when none).

    Notes
    -----
    NOTE(review): in this notebook both layers are projected to EPSG:3857
    (Web Mercator). Distances in that CRS are stretched by roughly
    1 / cos(latitude), so 1000 units is well under 1 km on the ground at
    Norwegian latitudes. If a true 1 km radius is intended, project both
    layers to a metric CRS (e.g. a UTM zone) before calling this - confirm
    with the feature's consumers, since that changes every count.
    """
    if stops is None:
        stops = busstops

    # No geometry-backend option (such as gp.options.use_pygeos) is toggled here:
    # such an option only affects frames created after it is set, so flipping it
    # per row does nothing for the already-loaded stops and only mutates global
    # state. Configure it once, before loading data, if it is needed at all.
    nearby = stops[stops.distance(row.geometry) < radius]

    counts = {'bus_stops_count': len(nearby)}

    # Reindex against every level present in the full layer so each store gets
    # the same set of keys, with 0 for levels that have no stop nearby.
    per_level = nearby.importance_level.value_counts().reindex(
        stops.importance_level.unique(), fill_value=0
    )
    counts.update(per_level.to_dict())

    return counts

# Create the output folder up front: the per-store computation below is slow
# (about 12 minutes in the recorded run), so a missing directory should fail
# - or rather be fixed - before that work starts, not at the final write.
os.makedirs('derived_data', exist_ok=True)

# Stack the training stores and the extra stores (training rows first) so both
# receive the same bus-stop features.
data = pd.concat([train, train_extra], ignore_index=True)

# cond_merge returns one dict per store; result_type='expand' turns the dict
# keys into columns, which are then joined back onto the stores by index.
bus_stop_features = data.progress_apply(cond_merge, axis=1, result_type='expand')
data = data.join(bus_stop_features)
data.to_parquet('derived_data/stores_bus_stops_lt_1km_train_with_extras')

# test = test.join(test.progress_apply(lambda row: cond_merge(row), axis=1, result_type='expand'))
# test.to_parquet('derived_data/stores_bus_stops_lt_1km_test')

100%|██████████| 41543/41543 [12:10<00:00, 56.83it/s]


In [16]:
# Display the combined store table to inspect its columns and rows.
data

Unnamed: 0,store_id,geometry,bus_stops_count,Mangler viktighetsnivå,Standard holdeplass,Lokalt knutepunkt,Nasjonalt knutepunkt,Regionalt knutepunkt,Annen viktig holdeplass
0,983540538-974187930-44774,POINT (1136007.376 8342763.593),2,2,0,0,0,0,0
1,987074191-973117734-44755,POINT (1194906.839 8380562.352),25,15,0,9,0,1,0
2,984890265-981157303-64491,POINT (1141543.072 8436509.485),9,9,0,0,0,0,0
3,914057442-992924179-126912,POINT (1600751.595 10237351.427),7,1,0,0,0,6,0
4,913018583-913063538-668469,POINT (1154921.068 9188593.406),19,4,11,2,0,0,2
...,...,...,...,...,...,...,...,...,...
41538,931186744-982303729-41793,POINT (1139514.362 8207041.705),7,5,1,0,0,0,1
41539,931186744-983814964-6308,POINT (1162461.103 9205008.601),8,5,3,0,0,0,0
41540,931186744-990018995-100187,POINT (592250.550 8492500.004),5,4,1,0,0,0,0
41541,931186744-971707283-20794,POINT (829508.323 7972078.436),3,3,0,0,0,0,0


In [31]:
# Sanity check on the train-only feature file: print the total of the
# bus_stops_count column, then show the number of rows.
bus_data_train = gp.read_parquet('derived_data/stores_bus_stops_lt_1km_train')
total_stops = bus_data_train['bus_stops_count'].sum()
print(total_stops)
len(bus_data_train)

117446


12859

In [33]:
# Sanity check on the combined train+extras feature file, restricted to its
# first 12859 rows. The combined frame is built with the training stores
# first; 12859 is presumably the number of training stores - TODO confirm,
# or derive it from the training frame instead of hard-coding it.
bus_data_train = gp.read_parquet(
    'derived_data/stores_bus_stops_lt_1km_train_with_extras'
).iloc[:12859]
total_stops = bus_data_train['bus_stops_count'].sum()
print(total_stops)
len(bus_data_train)

117446


12859