In [1]:
import warnings
from datetime import datetime, timedelta
import numpy as np

import polars as pl
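# Enable the global string cache so that Categorical columns built in
# separate frames share one encoding and can be joined on each other.
# See https://stackoverflow.com/q/70234041/15937542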
pl.enable_string_cache(True)

warnings.filterwarnings('ignore')

#### The following districts will be removed as they lie outside of Istanbul.

In [2]:
# Districts to exclude from the dataset; rows whose DISTRICT is in this
# list are filtered out wherever the list is used further down.
drop_districts=[
    'Marmaraereğlisi',
    'Çorlu',
    'Çerkezköy',
    'Kapaklı',
    'Ergene',
    'Darıca',
    'Gebze',
    'Çayırova'
]

In [3]:
(
    # Checking for number of rows that will be removed
    pl.scan_parquet("datasets/00_tr_df_merged/tr_ist_td_merged_full.zstd")
    # Keep only the rows that belong to the districts slated for removal
    .filter(pl.col('DISTRICT').is_in(drop_districts))
    # Row count per GEOHASH, then a column-wise sum: the `count` column of
    # the single resulting row is the total number of affected rows
    # (the GEOHASH column is categorical, so its sum is null).
    .groupby('GEOHASH')
    .count()
    .sum()
    .collect(streaming=True)
)

GEOHASH,count
cat,u32
,2480640


    By removing these districts, we will reduce the number of rows by 2,480,640.

#### Four additional datasets will be added. They include:
    * Number of nearby Fuel Stations in Istanbul within 1km radius of a GEOHASH
    * Number of nearby Public Bicycle stands (ISBIKE) within 1km radius of a GEOHASH
    * Number of nearby bicycle and micromobility parking spaces within 1km radius of a GEOHASH
    * Location type of GEOHASH (highway, road, junction etc)

In [4]:
# Fuel stations: number of distinct `nearest` entries recorded per geohash.
nearby_fuel_stn_df = (
    pl.read_csv("datasets/13_gh_proximities/gh_nearby_fuelstns.csv")
    # only rows describing a geohash are relevant here
    .filter(pl.col('type') == 'geohash')
    .groupby('name')
    .agg(pl.col('nearest').n_unique())
    # rename and downcast both columns in one projection
    .select(
        pl.col('name').cast(pl.Categorical).alias('GEOHASH'),
        pl.col('nearest').cast(pl.UInt8).alias('NEARBY_FUEL_STNS_CNT'),
    )
)

nearby_fuel_stn_df.head()

GEOHASH,NEARBY_FUEL_STNS_CNT
cat,u8
"""sxkc0q""",2
"""sxk9ns""",2
"""sxk91r""",5
"""sxkb9b""",5
"""sxk9e1""",3


In [5]:
# ISBIKE stands: number of distinct `nearest` entries recorded per geohash.
nearby_isbike_stnds_df = (
    pl.read_csv("datasets/13_gh_proximities/gh_nearby_isbike_stnds.csv")
    # only rows describing a geohash are relevant here
    .filter(pl.col('type') == 'geohash')
    .groupby('name')
    .agg(pl.col('nearest').n_unique())
    # rename and downcast both columns in one projection
    .select(
        pl.col('name').cast(pl.Categorical).alias('GEOHASH'),
        pl.col('nearest').cast(pl.UInt8).alias('NEARBY_ISBIKE_STANDS_CNT'),
    )
)

nearby_isbike_stnds_df.head()

GEOHASH,NEARBY_ISBIKE_STANDS_CNT
cat,u8
"""sxk3q8""",2
"""sxk8yn""",3
"""sxk9qt""",4
"""sxk97s""",1
"""sxk99s""",1


In [6]:
# Bicycle & micromobility parking: number of distinct `nearest` entries
# recorded per geohash.
nearby_bike_mm_parking = (
    pl.read_csv("datasets/13_gh_proximities/gh_nearby_bike_mm_parking.csv")
    # only rows describing a geohash are relevant here
    .filter(pl.col('type') == 'geohash')
    .groupby('name')
    .agg(pl.col('nearest').n_unique())
    # rename and downcast both columns in one projection
    .select(
        pl.col('name').cast(pl.Categorical).alias('GEOHASH'),
        pl.col('nearest').cast(pl.UInt8).alias('NEARBY_BIKE_MM_PARKING_CNT'),
    )
)

nearby_bike_mm_parking.head()

GEOHASH,NEARBY_BIKE_MM_PARKING_CNT
cat,u8
"""sxk9ke""",2
"""sxk9hr""",1
"""sxk96b""",2
"""sxk9dy""",1
"""sxk9rg""",1


In [7]:
# Location type per geohash, both columns stored as Categorical.
gh_with_loc_info = (
    pl.read_csv("datasets/15_gh_info/ist_geohash_district_loc_info.csv")
    .select(
        pl.col('GEOHASH').cast(pl.Categorical),
        pl.col('loc_type').cast(pl.Categorical).alias('LOCATION_TYPE'),
    )
)

gh_with_loc_info.head()

GEOHASH,LOCATION_TYPE
cat,cat
"""sxk3xw""","""avenue"""
"""sxk9nm""","""other"""
"""sxk9q0""","""street"""
"""sxk3hx""","""other"""
"""sx7cmx""","""street"""


In [8]:
# Geohashes listed in the stadium proximity file, flagged with a constant 1.
stadium_df = (
    pl.read_csv("datasets/13_gh_proximities/gh_nearby_stadiums.csv")
    .filter(pl.col('type').eq('geohash'))
    .select(pl.col('name').cast(pl.Categorical).alias('GEOHASH'))
    # Keep exactly one row per GEOHASH (the other proximity frames get this
    # from their groupby). A geohash listed more than once would otherwise
    # multiply rows when this frame is left-joined on GEOHASH.
    .unique(maintain_order=True)
    .with_columns(pl.lit(1).alias('IS_NEARBY_STADIUM'))
)

stadium_df.head()

GEOHASH,IS_NEARBY_STADIUM
cat,i32
"""sxk3zw""",1
"""sxk3zt""",1
"""sxk3zx""",1
"""sxk97m""",1
"""sxk97q""",1


# Final process

In [13]:
# Build the modelling dataset: drop unused columns and out-of-scope districts,
# attach the per-geohash lookup frames, normalise school holiday labels, and
# derive lag features and the combined football/stadium status.
final_df = (
    pl.scan_parquet("datasets/00_tr_df_merged/tr_ist_td_merged_full.zstd")

    # drop the following columns
    .drop(['AVERAGE_SPEED', 'RAIN', 'SNOW', 'CLUSTER', 'LATITUDE', 'LONGITUDE'])

    # Filter out the chosen districts
    .filter(~pl.col('DISTRICT').is_in(drop_districts))

    # location information data
    .join(gh_with_loc_info.lazy(), on='GEOHASH', how='left')

    # Fuel stations
    .join(nearby_fuel_stn_df.lazy(), on='GEOHASH', how='left')

    # ISBIKE stands
    .join(nearby_isbike_stnds_df.lazy(), on='GEOHASH', how='left')

    # bicycle & micro-mobility parking spots
    .join(nearby_bike_mm_parking.lazy(), on='GEOHASH', how='left')

    # Join stadium col
    .join(stadium_df.lazy(), on='GEOHASH', how='left')

    # Geohashes without a match in the lookup frames get 0 instead of null
    .with_columns(pl.col(['NEARBY_FUEL_STNS_CNT', 'NEARBY_ISBIKE_STANDS_CNT', 'NEARBY_BIKE_MM_PARKING_CNT', 'IS_NEARBY_STADIUM'])
                 .fill_null(0))


    .select(pl.all(),

            # The expression below carries no alias; it is renamed from
            # 'literal' to SCHOOL_HOLIDAY_TYPE at the end of the chain, once
            # the original SCHOOL_HOLIDAY_TYPE column has been dropped.

            # insert National Sovereignty Holiday school holiday for 2022
            pl.when(pl.col('DATE_TIME').dt.date().eq(datetime(2022, 4, 23)))
            .then('National Sovereignty Holiday')

            # insert National Sovereignty Holiday school holiday for 2023
            .when(pl.col('DATE_TIME').dt.date().eq(datetime(2023, 4, 23)))
            .then('National Sovereignty Holiday')

            # insert labour day school holiday for 2022
            .when(pl.col('DATE_TIME').dt.date().eq(datetime(2022, 5, 1)))
            .then('Labour Day')

            # Replacing some of school holiday names
            .otherwise(pl.col('SCHOOL_HOLIDAY_TYPE')
                      .str.replace('^Easter holidays$|^Spring time holidays$', 'Spring holidays')
                      .str.replace('^holiday of youth$', 'Youth and Sports Day')
                      .str.replace('^National sovereignty and the child$', 'National Sovereignty Holiday')
                      .str.replace('^Eid Holiday$', 'Ramadan Holiday'))

            # Cast to Categorical Data Type
            .cast(pl.Categorical),

            # Lag features. shift() leaves nulls at the start of every
            # GEOHASH group; backward_fill() is applied *inside* the window
            # (before .over) so those nulls are filled from the same geohash
            # and never from a neighbouring group's rows.
            # NOTE(review): the lags assume rows are in chronological order
            # within each GEOHASH at this point - confirm for the source file.

            # Create 1 hour lag of Traffic Density by GEOHASH
            pl.col('NUMBER_OF_VEHICLES').shift(1).backward_fill().over('GEOHASH').alias('TD_LAG1'),

            # Create 24 hour lag of Traffic Density by GEOHASH
            pl.col('NUMBER_OF_VEHICLES').shift(24).backward_fill().over('GEOHASH').alias('TD_LAG24'),

            # Create 24*7 hours (1 week) lag of Traffic Density by GEOHASH
            pl.col('NUMBER_OF_VEHICLES').shift(24*7).backward_fill().over('GEOHASH').alias('TD_LAG168'),

            # Create 1 hour lag of PRCP by GEOHASH
            pl.col('PRCP').shift(1).backward_fill().over('GEOHASH').alias('PRCP_LAG1'),

            # Create 2 hour lags of PRCP by GEOHASH
            pl.col('PRCP').shift(2).backward_fill().over('GEOHASH').alias('PRCP_LAG2'),

            # Create 3 hour lags of PRCP by GEOHASH
            pl.col('PRCP').shift(3).backward_fill().over('GEOHASH').alias('PRCP_LAG3'),

            # Combining IS_NEARBY_STADIUM column and IS_FOOTBALL_MATCH column:
            #   2 -> near a stadium and IS_FOOTBALL_MATCH is 'true'
            #   1 -> near a stadium and IS_FOOTBALL_MATCH is 'false'
            #   0 -> everything else
            pl.when(pl.col('IS_NEARBY_STADIUM').eq(1) & pl.col('IS_FOOTBALL_MATCH').eq('true'))
            .then(2)
            .when(pl.col('IS_NEARBY_STADIUM').eq(1) & pl.col('IS_FOOTBALL_MATCH').eq('false'))
            .then(1)
            .otherwise(0)
            .cast(pl.Int8)
            .alias('FOOTBALL_STADIUM_STATUS'))


    # Replace the original holiday column with the cleaned one and remove
    # the two inputs that were folded into FOOTBALL_STADIUM_STATUS
    .drop(['SCHOOL_HOLIDAY_TYPE', 'IS_NEARBY_STADIUM', 'IS_FOOTBALL_MATCH'])
    .rename({'literal' : 'SCHOOL_HOLIDAY_TYPE'})
    .sort(['DATE_TIME', 'GEOHASH'])

    .collect(streaming=True)
)

final_df.head()

DATE_TIME,GEOHASH,NUMBER_OF_VEHICLES,CITIZEN_RETURN,CITIZEN_ARRIVAL,TOURIST_ARRIVAL,HOLIDAY_NAME,HOLIDAY_TYPE,HIGHWAY_TRNSPRT_PSNGRS,SEA_TRNSPRT_PSNGRS,RAIL_TRNSPRT_PSNGRS,IS_COVID_RESTRICTION,NEARBY_LANDMARKS_CNT,NEARBY_PARKING_CAPACITY,NEARBY_TAXI_STND_CNT,NEARBY_MINIBUS_STOPS_CNT,NEARBY_FERRIES_CNT,NEARBY_METRO_STNS_CNT,NEARBY_BUS_STOP_CNT,TEMP,HUMIDITY,PRCP,WINDSPEED,WTHR_CAT,DISTRICT,LOCATION_TYPE,NEARBY_FUEL_STNS_CNT,NEARBY_ISBIKE_STANDS_CNT,NEARBY_BIKE_MM_PARKING_CNT,SCHOOL_HOLIDAY_TYPE,TD_LAG1,TD_LAG24,TD_LAG168,PRCP_LAG1,PRCP_LAG2,PRCP_LAG3,FOOTBALL_STADIUM_STATUS
datetime[ns],cat,u16,u32,u32,u32,cat,cat,u32,u32,u32,bool,u8,u16,u8,u8,u8,u8,u8,f32,u8,f32,f32,cat,cat,cat,u8,u8,u8,cat,u16,u16,u16,f32,f32,f32,i8
2020-01-01 00:00:00,"""sxk6xe""",56,627933,692729,1017034,"""New Year's Day…","""National holid…",15430,2211,31247,False,0,0,0,0,0,0,8,3.8,84,0.0,22.1,"""normal""","""Eyüpsultan""","""other""",0,0,0,"""Christmas holi…",56,56,56,0.0,0.0,0.0,0
2020-01-01 00:00:00,"""sxkc0p""",8,627933,692729,1017034,"""New Year's Day…","""National holid…",15430,2211,31247,False,0,0,0,0,0,0,31,5.0,85,0.0,9.5,"""normal""","""Ümraniye""","""other""",0,0,0,"""Christmas holi…",8,8,8,0.0,0.0,0.0,0
2020-01-01 00:00:00,"""sxkd1m""",89,627933,692729,1017034,"""New Year's Day…","""National holid…",15430,2211,31247,False,0,0,0,0,0,0,4,3.8,84,0.0,22.1,"""normal""","""Eyüpsultan""","""road""",0,0,0,"""Christmas holi…",89,89,89,0.0,0.0,0.0,0
2020-01-01 00:00:00,"""sxkd1c""",93,627933,692729,1017034,"""New Year's Day…","""National holid…",15430,2211,31247,False,0,0,0,0,0,0,0,3.8,84,0.0,22.1,"""normal""","""Eyüpsultan""","""road""",0,0,0,"""Christmas holi…",93,93,93,0.0,0.0,0.0,0
2020-01-01 00:00:00,"""sx7cwp""",6,627933,692729,1017034,"""New Year's Day…","""National holid…",15430,2211,31247,False,0,0,0,0,0,0,0,0.5,86,0.0,22.5,"""normal""","""Silivri""","""road""",2,0,0,"""Christmas holi…",6,6,6,0.0,0.0,0.0,0


### Verifying Data

In [14]:
# List the distinct school holiday labels to confirm the renames took effect
final_df['SCHOOL_HOLIDAY_TYPE'].unique().to_list()

['Non Holiday',
 'Christmas holidays',
 'Winter holidays',
 'Schools closed (possibly distance learning) Covid-19',
 'National Sovereignty Holiday',
 'Labour Day',
 'Summer holidays',
 'Republic of Türkiye Day',
 'November vacation',
 'Ramadan Holiday',
 'Youth and Sports Day',
 'Spring holidays']

In [15]:
# Number of distinct calendar dates per school holiday type and year,
# shown as one row per holiday type and one column per year.
(
    # Reduce the hourly rows to unique (date, year, holiday type) combinations
    final_df.select([pl.col('DATE_TIME').dt.date().alias('DATE'), 
                     pl.col('DATE_TIME').dt.year().alias('YEAR'), 
                     'SCHOOL_HOLIDAY_TYPE'])
    .unique()
    # Count the distinct dates in every (year, holiday type) group
    .groupby(['YEAR', 'SCHOOL_HOLIDAY_TYPE'], maintain_order=True)
    .agg(pl.col('DATE').n_unique())
    # One column per year; each cell holds that year's date count
    .pivot(index='SCHOOL_HOLIDAY_TYPE', columns='YEAR', values='DATE', aggregate_function='first')
    .sort(['2020', 'SCHOOL_HOLIDAY_TYPE'])
    .to_pandas()
)

Unnamed: 0,SCHOOL_HOLIDAY_TYPE,2020,2021,2022,2023
0,Christmas holidays,1,1.0,2.0,1.0
1,National Sovereignty Holiday,1,1.0,1.0,1.0
2,Labour Day,1,1.0,1.0,
3,Republic of Türkiye Day,1,1.0,1.0,
4,Youth and Sports Day,1,1.0,1.0,
5,Ramadan Holiday,3,3.0,3.0,
6,November vacation,5,5.0,9.0,
7,Spring holidays,5,5.0,5.0,6.0
8,Schools closed (possibly distance learning) Co...,7,,,
9,Winter holidays,12,12.0,12.0,16.0


In [16]:
# Null count per column, transposed so that each column name becomes a row
(
    final_df
    .null_count()
    .to_pandas()
    .T
    # after the transpose the single data column is labelled 0
    .rename(columns={0 : "null_counts"})
)

Unnamed: 0,null_counts
DATE_TIME,0
GEOHASH,0
NUMBER_OF_VEHICLES,0
CITIZEN_RETURN,0
CITIZEN_ARRIVAL,0
TOURIST_ARRIVAL,0
HOLIDAY_NAME,0
HOLIDAY_TYPE,0
HIGHWAY_TRNSPRT_PSNGRS,0
SEA_TRNSPRT_PSNGRS,0


In [17]:
# Final dataset information: estimated in-memory size (GB) and shape
print(f'Estimated Size of merged df: {final_df.estimated_size("gb"):.3f} GB')
print(f'Dataset Shape: {final_df.shape[0]:,} rows, {final_df.shape[1]} columns')

Estimated Size of merged df: 4.937 GB
Dataset Shape: 50,429,952 rows, 37 columns


In [18]:
# Column order used when writing the train / validation / test files:
# keys and target first, then calendar and event fields, nearby-facility
# counts, weather, and finally the lag features.
arrange_columns = [
    'DATE_TIME',
    'GEOHASH',
    'NUMBER_OF_VEHICLES',
    'DISTRICT',
    'LOCATION_TYPE',
    'CITIZEN_RETURN',
    'CITIZEN_ARRIVAL',
    'TOURIST_ARRIVAL',
    'HOLIDAY_NAME',
    'HOLIDAY_TYPE',
    'SCHOOL_HOLIDAY_TYPE',
    'HIGHWAY_TRNSPRT_PSNGRS',
    'SEA_TRNSPRT_PSNGRS',
    'RAIL_TRNSPRT_PSNGRS',
    'IS_COVID_RESTRICTION',
    'FOOTBALL_STADIUM_STATUS',
    'NEARBY_LANDMARKS_CNT',
    'NEARBY_PARKING_CAPACITY',
    'NEARBY_TAXI_STND_CNT',
    'NEARBY_MINIBUS_STOPS_CNT',
    'NEARBY_FERRIES_CNT',
    'NEARBY_METRO_STNS_CNT',
    'NEARBY_BUS_STOP_CNT',
    'NEARBY_FUEL_STNS_CNT',
    'NEARBY_ISBIKE_STANDS_CNT',
    'NEARBY_BIKE_MM_PARKING_CNT',
    'TEMP',
    'HUMIDITY',
    'PRCP',
    'WINDSPEED',
    'WTHR_CAT',
    'TD_LAG1',
    'TD_LAG24',
    'TD_LAG168',
    'PRCP_LAG1',
    'PRCP_LAG2',
    'PRCP_LAG3',
]

# Sanity check: number of columns listed above
len(arrange_columns)

37

In [19]:
# Train set will be from Jan 2020 to end of April 2022
# (the cutoff is the last timestamp that still belongs to the train set)
train_cutoff_date = datetime(2022, 4, 30, 23, 0, 0)

# Validation set will be from May 2022 to end of March 2023
# (the cutoff is the last timestamp that still belongs to the validation set)
valid_cutoff_date = datetime(2023, 3, 31, 23, 0, 0)

# Test set will be from 1st April 2023 to 30 April 2023

In [20]:
# Train split: every row up to and including the train cutoff timestamp,
# written with the columns in their final order.
(
    final_df
    .filter(pl.col('DATE_TIME') <= train_cutoff_date)
    .select(arrange_columns)
    .write_parquet("datasets/19_model/train.zstd", compression='zstd')
)

In [21]:
# Validation split: rows strictly after the train cutoff and up to and
# including the validation cutoff, written with the columns in final order.
(
    final_df
    .filter(
        (pl.col('DATE_TIME') > train_cutoff_date)
        & (pl.col('DATE_TIME') <= valid_cutoff_date)
    )
    .select(arrange_columns)
    .write_parquet("datasets/19_model/valid.zstd", compression='zstd')
)

In [22]:
# Test split: every row strictly after the validation cutoff, written with
# the columns in their final order.
(
    final_df
    .filter(pl.col('DATE_TIME') > valid_cutoff_date)
    .select(arrange_columns)
    .write_parquet("datasets/19_model/test.zstd", compression='zstd')
)
