In [1]:
import logging
import os

# Remember the directory the kernel started in. os.chdir with a relative path
# is not idempotent: without this anchor, re-running the cell would climb two
# more directory levels on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Work from two levels above the notebook's directory (presumably the project
# root, so that the relative "./data/..." paths used later resolve — confirm).
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))

# Show INFO-level messages from the root logger in the cell outputs.
logging.getLogger().setLevel(logging.INFO)

In [2]:
import warnings

warnings.filterwarnings("ignore")


In [3]:
from src.etl import run_etl_pipeline
from src.pipeline.enrich_sites import EnrichSites

In [7]:
# Configuration values passed as keyword arguments to run_etl_pipeline.
# NOTE(review): this cell's execution count (7) is later than the cells that
# consume these values, and the recorded ETL output shows files saved under
# ./data/intermediate rather than ./data/staging — the saved outputs presumably
# predate the current output_dir; restart the kernel and run all to confirm.

# Path of the raw site CSV handed to the pipeline as `site_df_path`.
site_df_path = "./data/raw/DE_HomeworkAssignment.csv"
# Clustering parameters forwarded unchanged to the pipeline.
# presumably eps_km is a distance threshold in kilometres and the two counts
# are HDBSCAN size settings — TODO confirm against src.etl.
eps_km = 0.25
min_sample_size = 7
min_samples = 3
# Directory passed to the pipeline as `output_dir`; also used as the prefix
# when building the input paths for EnrichSites.
output_dir = "./data/staging"

In [5]:
# Run the ETL step with the configured parameters; it returns five file names
# (site data, the three POI tables, and the site/POI proximity table).
etl_output_file_names = run_etl_pipeline(
    site_df_path=site_df_path,
    eps_km=eps_km,
    min_sample_size=min_sample_size,
    min_samples=min_samples,
    output_dir=output_dir,
)

# Unpack in the order the pipeline returns them; later cells read these names.
(
    site_data_file_name,
    fast_food_poi_df_file_name,
    fuel_station_poi_df_file_name,
    supermarket_poi_df_file_name,
    proximity_data_file_name,
) = etl_output_file_names


INFO:root:Cleaning and removing outliers from site_data.
INFO:root:The number of sites that lie outside the approximated border of Germany is: 9
INFO:root:Successfuly removed outliers if any.
INFO:root:(ClusterSites): Successfuly initialized HDBSCAN model.
INFO:root:(ClusterSites): Successfully clustered sites.
Querying cluster POIs: 100%|██████████| 29/29 [00:22<00:00,  1.27it/s]
Querying individual sites: 100%|██████████| 174/174 [02:28<00:00,  1.17it/s]
INFO:root:(Transforms): Succesfully read 203 raw POI data file and converted to DataFrame.
INFO:root:(Transforms): Found 69 duplicates.
INFO:root:(Transforms): Successfuly deduplicated the dataset.
INFO:root:(Transforms): Found 21 duplicates.
INFO:root:(Transforms): Successfuly deduplicated the dataset.
INFO:root:(Transforms): Found 55 duplicates.
INFO:root:(Transforms): Successfuly deduplicated the dataset.
INFO:root:(Transforms): Successfully extracted latitudes and longitudes of ways(center).
INFO:root:(Transforms): Successfully e

Succesfully saved file: ./data/intermediate/fast_food_pois.parquet
Succesfully saved file: ./data/intermediate/fuel_pois.parquet
Succesfully saved file: ./data/intermediate/supermarket_pois.parquet
Succesfully saved file: ./data/intermediate/site_data.parquet
Succesfully saved file: ./data/intermediate/site_pois_proximities.parquet


In [6]:
# Map each EnrichSites keyword argument to the file name returned by the ETL
# step; every file is looked up under `output_dir`.
enrich_input_file_names = {
    "site_data_path": site_data_file_name,
    "fast_food_data_path": fast_food_poi_df_file_name,
    "fuel_data_path": fuel_station_poi_df_file_name,
    "supermarket_data_path": supermarket_poi_df_file_name,
    "proximity_data_path": proximity_data_file_name,
}

enrich_sites = EnrichSites(
    **{
        keyword: f"{output_dir}/{file_name}"
        for keyword, file_name in enrich_input_file_names.items()
    }
)

# The method and variable spellings ("enirch"/"enirched") are kept as-is: the
# method belongs to EnrichSites and the variable may be read by later cells.
enirched_site_data = enrich_sites.enirch_site_data_with_features()

# Last expression of the cell: rendered as the cell's rich output.
enirched_site_data

Unnamed: 0,site_id,locality,postalCode,state,operatorId,operatorName,lon,lat,total_num_pois,num_fast_food,num_fuel_stations,num_supermarkets,closest_category,has_toilet,has_wifi,has_seating,open_24hr,tags.brand
0,-4ZBQG-7Q6NqIB0E-IKSTEPG9lYhs0HdwxuBQCJ5rGM,Hamburg,22041,Hamburg,DEHHM,Hamburger Energiewerke,10.076880,53.579440,0.0,0.0,0.0,0.0,,,,,,
1,-7GAlj5KMjg81KmaOcdIlmUrDZLpYp_kp_pXqeePYdc,Hamburg,20095,Hamburg,DEHHM,Hamburger Energiewerke,9.993134,53.548933,2.0,2.0,0.0,0.0,fast_food,False,False,True,False,Asiahung
2,-7GAlj5KMjg81KmaOcdIlmUrDZLpYp_kp_pXqeePYdc,Hamburg,20095,Hamburg,DEHHM,Hamburger Energiewerke,9.993134,53.548933,2.0,2.0,0.0,0.0,fast_food,False,False,True,False,
3,-AolBOqEu0y-HEbZ74WdzDFCQc4N6zoS9z1bDtUBSPo,Hamburg,22767,Hamburg,DEHHM,Hamburger Energiewerke,9.999160,53.549600,0.0,0.0,0.0,0.0,,,,,,
4,-DTxSOKBxOlXgxzPImpaJZfBUD-YeW9Il9rIYuMiYKw,Hamburg,22337,Hamburg,DEHHM,Hamburger Energiewerke,10.052869,53.633256,0.0,0.0,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,zi2Iktv4CsTvWe1P6Hf_fvcpxjNpOttnSO1mvwZ3RtE,Hamburg,21109,Hamburg,DEBPE,Aral Pulse,10.018841,53.489877,1.0,0.0,1.0,0.0,fuel,False,False,False,False,Aral
1808,zq1BZuLFKrTx50k3UUaAqCJ9oYwMhls69kkkZMqvLhA,Hamburg,20251,Hamburg,DEHHM,Hamburger Energiewerke,9.976170,53.582280,0.0,0.0,0.0,0.0,,,,,,
1809,zqmLTfJfNbn-4U26pGDkeJW_k2dWUXFkwJevLVtFAJo,Hamburg,22337,Hamburg,DEHHM,Hamburger Energiewerke,10.035689,53.629118,0.0,0.0,0.0,0.0,,,,,,
1810,ztBbCmf3x8T6t3kEiu20MTybJjYirL2Lp_D2S6bbeaU,Hamburg,22337,Hamburg,DEEZE,eze.network GmbH,10.023960,53.615410,0.0,0.0,0.0,0.0,,,,,,
