In [1]:
import warnings
import polars as pl
from datetime import datetime

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings - confirm that is intended.
warnings.filterwarnings("ignore")

### Citizen and Tourist Travel Dataset

In [2]:
tr_cz_path = "datasets/02_tr_tourist_citizen_monthlytravel/"

# Monthly tourist arrivals: load three columns, give them upper-case names
# and store them in compact unsigned integer types.
tr_df = (
    pl.read_csv(
        f"{tr_cz_path}tourist_monthly_arrivals.csv",
        columns=["TOTAL", "month", "year"],
    )
    .rename({"TOTAL": "TOURIST_ARRIVAL", "year": "YEAR", "month": "MONTH"})
    .with_columns(
        [
            pl.col("TOURIST_ARRIVAL").cast(pl.UInt32),
            pl.col("MONTH").cast(pl.UInt8),
            pl.col("YEAR").cast(pl.UInt16),
        ]
    )
)

# Monthly citizen travel: total people per (year, month), with one column
# per `citizen_travel_type` category.
cz_df = (
    pl.read_csv(f"{tr_cz_path}citizen_travel_df.csv")

    # Sum people per year / travel type / month, keeping first-seen order
    .groupby(["year", "citizen_travel_type", "month"], maintain_order=True)
    .agg(pl.col("number_of_people").sum())

    # Spread the travel-type categories into columns
    .pivot(
        values="number_of_people",
        index=["year", "month"],
        columns="citizen_travel_type",
        aggregate_function="sum",
    )

    # NOTE(review): "Travelling Abroad" is mapped to CITIZEN_ARRIVAL - confirm the naming is intended.
    .rename(
        {
            "Returning": "CITIZEN_RETURN",
            "Travelling Abroad": "CITIZEN_ARRIVAL",
            "year": "YEAR",
            "month": "MONTH",
        }
    )

    # Downcast to compact unsigned integer types
    .with_columns(
        [
            pl.col(["CITIZEN_RETURN", "CITIZEN_ARRIVAL"]).cast(pl.UInt32),
            pl.col("MONTH").cast(pl.UInt8),
            pl.col("YEAR").cast(pl.UInt16),
        ]
    )
)

### Holidays & Observances

In [3]:
# Holidays and observances from 2020 onwards, with `date` parsed to a date type
hol_df = (
    pl.read_csv("datasets/06_tr_holidays_obs/tr_holidays_obs.csv")

    # Parse the date strings and keep only the calendar date
    .with_columns(pl.col("date").str.to_datetime("%Y-%m-%d").dt.date())

    # Discard rows dated before 2020
    .filter(pl.col("date").dt.year() >= 2020)
)

# The holiday file has no rows for the start of Ramadan.
# Collect the "Ramadan Feast Eve" half-day dates here; they are printed below
# so they can be paired with separately supplied Ramadan start dates.
ramadan_hol_start = (
    hol_df
    .filter(
        (pl.col("holiday_name") == 'Ramadan Feast Eve')
        & (pl.col("holiday_type") == "Half Day")
    )
    .get_column('date')
    .to_list()
)

print(ramadan_hol_start)
print(hol_df.columns)

[datetime.date(2020, 5, 23), datetime.date(2021, 5, 12), datetime.date(2022, 5, 1), datetime.date(2023, 4, 20)]
['date', 'holiday_name', 'holiday_type']


In [4]:
# Ramadan start dates for 2020-2023, parsed from ISO strings into date objects
ramadan_start = [
    datetime.strptime(day_str, "%Y-%m-%d").date()
    for day_str in ("2020-04-24", "2021-04-13", "2022-04-02", "2023-03-23")
]

ramadan_start

[datetime.date(2020, 4, 24),
 datetime.date(2021, 4, 13),
 datetime.date(2022, 4, 2),
 datetime.date(2023, 3, 23)]

In [5]:
# One row per Ramadan day: every date from each start date up to, but not
# including, the matching date in ramadan_hol_start (closed="left").
ramadan_list = [
    {
        'date' : day,
        'holiday_name' : 'Ramadan Observance',
        'holiday_type' : 'Ramadan Observance',
    }
    for first_day, last_day in zip(ramadan_start, ramadan_hol_start)
    for day in pl.date_range(start=first_day, end=last_day, interval='1d',
                             closed="left", eager=True).to_list()
]

ramadan_df = pl.DataFrame(ramadan_list)

In [6]:
# Append the Ramadan observance rows to hol_df and reduce to one row per date.
# NOTE: this cell rebinds hol_df and renames its columns, so re-running it
# without first re-running the cells that build hol_df will fail.
hol_df = (
    pl.concat([hol_df, ramadan_df])

    # Some dates have two events, e.g. a Ramadan observance and a national holiday.
    # Sort so that, within a date, non-Ramadan rows come first (False sorts
    # before True) instead of relying on the alphabetical order of the names.
    .sort('date', pl.col('holiday_type') == 'Ramadan Observance', 'holiday_name')

    # Keep the first row per date; maintain_order keeps the result date-sorted.
    .groupby('date', maintain_order=True).first()
    .rename({'holiday_name' : 'HOLIDAY_NAME', 'holiday_type' : 'HOLIDAY_TYPE', 'date' : 'DATE'})
)

hol_df.head()

DATE,HOLIDAY_NAME,HOLIDAY_TYPE
date,str,str
2020-01-01,"""New Year's Day…","""National holid…"
2020-03-20,"""March Equinox""","""Season"""
2020-04-23,"""National Sover…","""National holid…"
2020-04-24,"""Ramadan Observ…","""Ramadan Observ…"
2020-04-25,"""Ramadan Observ…","""Ramadan Observ…"


### School Holidays
    Similar to Ramadan, a dataframe listing every school-holiday date will be created.

In [7]:
# School holidays that last a single day: rows with no `end` value
sch_hol_1 = (
    pl.read_csv("datasets/07_tr_ist_school_holidays/tr_ist_school_holidays.csv")
    .filter(pl.col('end').is_null())
    .drop('end')
    .rename({'reason' : 'SCHOOL_HOLIDAY_TYPE', 'start' : 'DATE'})

    # Parse the date strings and keep only the calendar date
    .with_columns(pl.col('DATE').str.to_datetime("%Y-%m-%d").dt.date())
)

sch_hol_1.get_column('SCHOOL_HOLIDAY_TYPE').unique().to_list()

['Youth and Sports Day',
 'Labour Day',
 'National Sovereignty Holiday',
 'Ramadan Holiday',
 'Republic of Türkiye Day',
 'National sovereignty and the child',
 'holiday of youth',
 'Christmas holidays']

In [8]:
# Multi-day school holidays as a dictionary:
#   {"<reason>_<start year>": {"start": <date string>, "end": <date string>}}
# NOTE(review): two ranges sharing the same reason and start year produce the
# same key, and the later one replaces the earlier - verify none exist in the CSV.
sch_hol_dict = {
    rec['reason']: {'start': rec['start'], 'end': rec['end']}
    for rec in (
        pl.read_csv("datasets/07_tr_ist_school_holidays/tr_ist_school_holidays.csv")

        # Only holidays that have an end date
        .filter(pl.col('end').is_not_null())

        # Suffix the reason with the four-digit year taken from `start`
        .with_columns(pl.col('start').str.extract(r'^(\d{4})-').alias("year"))
        .with_columns(pl.concat_str('reason', 'year', separator="_"))
        .drop('year')

        .unique()
        .select(['reason', 'start', 'end'])
        .to_dicts()
    )
}

sch_hol_dict

{'Easter holidays_2021': {'start': '2021-04-12', 'end': '2021-04-16'},
 'Schools closed (possibly distance learning) Covid-19_2020': {'start': '2020-03-16',
  'end': '2020-03-22'},
 "Fest des Fastenbrechens ('Id al Fitr)_2020": {'start': '2020-05-24',
  'end': '2020-05-26'},
 'Summer holidays_2020': {'start': '2020-06-22', 'end': '2020-08-30'},
 'Feast of the Breaking of the Fast_2021': {'start': '2021-05-13',
  'end': '2021-05-15'},
 'Spring time holidays_2022': {'start': '2022-04-11', 'end': '2022-04-15'},
 'Spring time holidays_2023': {'start': '2023-04-15', 'end': '2023-04-20'},
 'Sport holiday_2022': {'start': '2022-01-24', 'end': '2022-02-04'},
 'Ramadan Holiday_2022': {'start': '2022-05-03', 'end': '2022-05-05'},
 'Christmas holidays_2022': {'start': '2022-12-31', 'end': '2023-01-01'},
 'Christmas holidays_2023': {'start': '2023-12-31', 'end': '2024-01-01'},
 'Summer holidays_2022': {'start': '2022-06-17', 'end': '2022-09-11'},
 'Sport holiday_2023': {'start': '2023-01-21', 'end

In [9]:
# Expand every multi-day school holiday into one row per calendar day
# (both the start and the end date are included).
school_hol_list = [
    {
        'DATE' : day,
        'SCHOOL_HOLIDAY_TYPE' : label,
    }
    for label, span in sch_hol_dict.items()
    for day in pl.date_range(
        start=datetime.strptime(span['start'], "%Y-%m-%d").date(),
        end=datetime.strptime(span['end'], "%Y-%m-%d").date(),
        interval='1d',
        eager=True,
    ).to_list()
]

# Strip the "_<year>" suffix from the labels and apply the label replacements
sch_hol_2 = (
    pl.DataFrame(school_hol_list)
    .with_columns(
        pl.col('SCHOOL_HOLIDAY_TYPE')
        .str.extract(r'^(.+)_\d{4}$')
        .str.replace("Sport holiday", "Winter holidays")
        .str.replace(r"Fest des Fastenbrechens \('Id al Fitr\)|Feast of the Breaking of the Fast", "Eid Holiday")
        .str.replace("National sovereignty and the child", "National Sovereignty Holiday")
    )
)

sch_hol_2.get_column('SCHOOL_HOLIDAY_TYPE').unique().to_list()

['Winter holidays',
 'Schools closed (possibly distance learning) Covid-19',
 'November vacation',
 'Summer holidays',
 'Easter holidays',
 'Ramadan Holiday',
 'Spring time holidays',
 'Eid Holiday',
 'Christmas holidays']

In [10]:
# Merge multi-day (sch_hol_2) and single-day (sch_hol_1) school holidays
# into one table with a single row per date.
all_school_holidays = (
    pl.concat([sch_hol_2, sch_hol_1])

    # The single-day rows still carry the raw label "National sovereignty and
    # the child" next to "National Sovereignty Holiday"; unify them here, after
    # the concat, so the replacement reaches the rows that contain it.
    .with_columns(pl.col('SCHOOL_HOLIDAY_TYPE')
                  .str.replace("National sovereignty and the child", "National Sovereignty Holiday"))

    # Keep dates before 1 May 2023
    .filter(pl.col('DATE').lt(datetime(2023, 5, 1)))

    # Deterministic order, then keep one row per date so that a later join on
    # DATE cannot multiply rows if two holidays ever share a date.
    .sort('DATE', 'SCHOOL_HOLIDAY_TYPE')
    .unique(subset='DATE', keep='first', maintain_order=True)
)

all_school_holidays.head()

DATE,SCHOOL_HOLIDAY_TYPE
date,str
2020-01-01,"""Christmas holi…"
2020-01-20,"""Winter holiday…"
2020-01-21,"""Winter holiday…"
2020-01-22,"""Winter holiday…"
2020-01-23,"""Winter holiday…"


### Public Transport Passengers

##### At present, the public transport passenger dataset is not of hourly frequency. Therefore the following pre-processing steps will be done:
    * Group by date, hour and road_type and take the sum of total passengers
    * pivot the dataset to convert road_type categories as column
    * Resample the dataset at 1 hour frequency to get 29,184 total rows 
    * interpolate the data (similar to what was done to traffic density dataset) to fill missing values
    * Finally, sorting the data by datetime.

In [11]:
# A single all-null row stamped 2023-04-30 23:00:00.
# It is appended to the public transport passenger data so that the hourly
# upsampling extends up to this datetime.
extra_row = (
    pl.DataFrame([{"transition_date" : datetime(2023, 4, 30, 23, 0, 0),
                   "HIGHWAY" : None,
                   "SEA" : None,
                   "RAIL" : None}])
    .with_columns(
        pl.col(['HIGHWAY', 'SEA', 'RAIL']).cast(pl.UInt32),
        pl.col('transition_date').dt.cast_time_unit('ns'),
    )
)

extra_row

transition_date,HIGHWAY,SEA,RAIL
datetime[ns],u32,u32,u32
2023-04-30 23:00:00,,,


In [12]:
# Hourly public transport passengers per road type, upsampled to a 1-hour
# grid with gaps filled by interpolation.
pub_trnsprt_df = (
    # Read parquet
    pl.read_parquet("datasets/04_tr_public_transport_passengers/tr_ist_public_transport_travel.gz")

    # Convert column to show only date
    .with_columns(pl.col('transition_date').dt.date())

    # Groupby and aggregate to get sum
    .groupby(['transition_date', 'HOUR', 'road_type'], maintain_order=True)
    .agg(pl.col('number_of_passenger').sum())

    # Pivot to convert road_type categories to columns
    .pivot(index=['transition_date', 'HOUR'],
           columns='road_type',
           values='number_of_passenger')

    # Convert HOUR column from integer to time format ("HH:00:00").
    # The seconds directive is "%S"; lowercase "%s" is the Unix-timestamp
    # directive and only parsed here because the seconds are always "00".
    .with_columns((pl.col('HOUR').cast(pl.Utf8).str.zfill(2) + ":00:00")
                  .str.strptime(pl.Time, fmt="%H:%M:%S"))

    # Combine transition_date and HOUR column to get datetime format
    .with_columns(pl.col('transition_date').dt.combine(pl.col('HOUR'), time_unit='ns'))

    # After resampling, HOUR will have lots of null values, so dropping them for now.
    .drop('HOUR')

    # Append the last row created.
    .extend(extra_row)

    # Sort date for proper upsampling
    .sort('transition_date')

    # Upsampling to get 1 hour frequency for all days from
    # Jan 2020 to 30 April 2023, 23:00 hours
    .upsample(time_column='transition_date', every="1h")

    # Extracting HOUR & DAYOFWEEK
    .with_columns(pl.col('transition_date').dt.hour().alias("HOUR"),
                  pl.col('transition_date').dt.weekday().alias("DAYOFWEEK"))

    # Sorting by HOUR & DAYOFWEEK so that rows adjacent during interpolation
    # share the same hour and weekday, capturing hourly and weekly patterns
    .sort(['HOUR', 'DAYOFWEEK'])

    # Interpolating the road_type columns.
    # The last row remains null, hence filling by forward fill
    .with_columns(pl.col(['HIGHWAY', 'SEA', 'RAIL'])
                  .interpolate()
                  .forward_fill())

    # Sort by datetime to get proper order
    .sort('transition_date')

    # These columns no longer needed
    .drop(['HOUR', 'DAYOFWEEK'])

    # Renaming columns
    .rename({'transition_date' : 'DATE_TIME',
             'HIGHWAY' : 'HIGHWAY_TRNSPRT_PSNGRS',
             'SEA' : 'SEA_TRNSPRT_PSNGRS',
             'RAIL' : 'RAIL_TRNSPRT_PSNGRS'})
)

In [13]:
# Preview the first rows of the processed public transport passenger table
pub_trnsprt_df.head()

DATE_TIME,HIGHWAY_TRNSPRT_PSNGRS,SEA_TRNSPRT_PSNGRS,RAIL_TRNSPRT_PSNGRS
datetime[ns],u32,u32,u32
2020-01-01 00:00:00,15430,2211,31247
2020-01-01 01:00:00,12108,1464,27136
2020-01-01 02:00:00,9168,815,13970
2020-01-01 03:00:00,6585,517,8190
2020-01-01 04:00:00,5054,219,5310


## COVID19 Restrictions
    Here, a dataset with the dates of official covid restrictions implemented will be created.
    Sources: https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Turkey

#### COVID Restrictions TimeLine:
    * 16th March 2020 to 31st May 2020: Closure of schools, flight ban to some countries, Curfew for those over the age of 65, followed by people twenty and younger, curfew on weekends.

    * 20th Nov 2020 to 28th Feb 2021: Curfew on people age 65 and older and people twenty and younger. Businesses and places of worship to halt indoor activities

    * 30th March 2021 to 31st May 2021: Reintroduce lockdowns during weekends. Nationwide lockdowns from 29 April 2021 to 17th May. Weekend curfew remains after nationwide lockdown.

In [14]:
# Start and end dates (both inclusive) of the three restriction periods
start_dates_list = ["2020-03-16", "2020-11-20", "2021-03-30"]
end_dates_list = ["2020-05-31", "2021-02-28", "2021-05-31"]

# One row per day inside any restriction period, flagged with 1
covid_timeline_list = [
    {
        'DATE' : day,
        'IS_COVID_RESTRICTION' : 1,
    }
    for start, end in zip(start_dates_list, end_dates_list)
    for day in pl.date_range(
        start=datetime.strptime(start, "%Y-%m-%d").date(),
        end=datetime.strptime(end, "%Y-%m-%d").date(),
        interval='1d',
        eager=True,
    ).to_list()
]

covid_df = pl.DataFrame(covid_timeline_list)
covid_df.head()

DATE,IS_COVID_RESTRICTION
date,i64
2020-03-16,1
2020-03-17,1
2020-03-18,1
2020-03-19,1
2020-03-20,1


## Traffic Density Dataset
    Putting it all together - Part 1

In [15]:
%%time
td_df = (
    # Scan TD density file
    pl.scan_parquet("datasets/01_tr_density/ist_traffic_density_rev03.zstd")

    .drop(['MINIMUM_SPEED', 'MAXIMUM_SPEED'])

    # Create datetime features and downcast datatype
    .with_columns(pl.col("DATE_TIME").dt.date().alias("DATE"),
                  pl.col("DATE_TIME").dt.month().alias("MONTH").cast(pl.UInt8),
                  pl.col("DATE_TIME").dt.year().alias("YEAR").cast(pl.UInt16),
                  pl.col("DATE_TIME").dt.hour().alias("HOUR").cast(pl.UInt8))

    # left join citizen travel dataset
    .join(cz_df.lazy(), on=["YEAR", "MONTH"], how="left")

    # left join tourist arrival dataset
    .join(tr_df.lazy(), on=["YEAR", "MONTH"], how="left")

    # left join holiday df and fill missing value in holiday name and type columns with "Non Holiday" category
    .join(hol_df.lazy(), on='DATE', how='left')
    .with_columns(pl.col(['HOLIDAY_NAME', 'HOLIDAY_TYPE']).fill_null("Non Holiday"))

    # left join school holidays and fill missing value in school holidays with "Non Holiday" category
    .join(all_school_holidays.lazy(), on='DATE', how='left')
    .with_columns(pl.col('SCHOOL_HOLIDAY_TYPE').fill_null("Non Holiday"))

    # left join public transport passenger data
    .join(pub_trnsprt_df.lazy(), on='DATE_TIME', how='left')

    # left join covid_df
    .join(covid_df.lazy(), on='DATE', how='left')

    # Cast all categorical columns to pl.Categorical datatype
    .with_columns(pl.col(['HOLIDAY_NAME', 'HOLIDAY_TYPE', 'SCHOOL_HOLIDAY_TYPE', 'GEOHASH']).cast(pl.Categorical),
                  pl.col('IS_COVID_RESTRICTION').fill_null(0).cast(pl.Boolean))

    # Drop DATE column
    .drop(['DATE', 'MONTH', 'YEAR', 'HOUR'])
    
).collect(streaming=True)

td_df.head()

Wall time: 27.5 s


DATE_TIME,LATITUDE,LONGITUDE,GEOHASH,AVERAGE_SPEED,NUMBER_OF_VEHICLES,CITIZEN_RETURN,CITIZEN_ARRIVAL,TOURIST_ARRIVAL,HOLIDAY_NAME,HOLIDAY_TYPE,SCHOOL_HOLIDAY_TYPE,HIGHWAY_TRNSPRT_PSNGRS,SEA_TRNSPRT_PSNGRS,RAIL_TRNSPRT_PSNGRS,IS_COVID_RESTRICTION
datetime[ns],f32,f32,cat,u8,u16,u32,u32,u32,cat,cat,cat,u32,u32,u32,bool
2020-01-01 00:00:00,40.78949,29.415894,"""sxkbj3""",72,97,627933,692729,1017034,"""New Year's Day…","""National holid…","""Christmas holi…",15430,2211,31247,False
2020-01-01 01:00:00,40.78949,29.415894,"""sxkbj3""",74,58,627933,692729,1017034,"""New Year's Day…","""National holid…","""Christmas holi…",12108,1464,27136,False
2020-01-01 02:00:00,40.78949,29.415894,"""sxkbj3""",79,40,627933,692729,1017034,"""New Year's Day…","""National holid…","""Christmas holi…",9168,815,13970,False
2020-01-01 03:00:00,40.78949,29.415894,"""sxkbj3""",74,33,627933,692729,1017034,"""New Year's Day…","""National holid…","""Christmas holi…",6585,517,8190,False
2020-01-01 04:00:00,40.78949,29.415894,"""sxkbj3""",79,25,627933,692729,1017034,"""New Year's Day…","""National holid…","""Christmas holi…",5054,219,5310,False


In [16]:
# Number of nulls in each column, shown as one row per column
(
    td_df
    .null_count()
    .to_pandas()
    .transpose()
    .rename(columns={0 : "null_counts"})
)

Unnamed: 0,null_counts
DATE_TIME,0
LATITUDE,0
LONGITUDE,0
GEOHASH,0
AVERAGE_SPEED,0
NUMBER_OF_VEHICLES,0
CITIZEN_RETURN,0
CITIZEN_ARRIVAL,0
TOURIST_ARRIVAL,0
HOLIDAY_NAME,0


In [17]:
# Print every column of the merged frame together with its dtype
for col_name, col_dtype in td_df.schema.items():
    print(f"{col_name} : {col_dtype}")

DATE_TIME : Datetime(time_unit='ns', time_zone=None)
LATITUDE : Float32
LONGITUDE : Float32
GEOHASH : Categorical
AVERAGE_SPEED : UInt8
NUMBER_OF_VEHICLES : UInt16
CITIZEN_RETURN : UInt32
CITIZEN_ARRIVAL : UInt32
TOURIST_ARRIVAL : UInt32
HOLIDAY_NAME : Categorical
HOLIDAY_TYPE : Categorical
SCHOOL_HOLIDAY_TYPE : Categorical
HIGHWAY_TRNSPRT_PSNGRS : UInt32
SEA_TRNSPRT_PSNGRS : UInt32
RAIL_TRNSPRT_PSNGRS : UInt32
IS_COVID_RESTRICTION : Boolean


In [18]:
# Size and shape of the merged dataset
print(f'Estimated Size of merged df: {td_df.estimated_size("gb"):.3f} GB')
print(f'Dataset Shape: {td_df.height:,} rows, {td_df.width} columns')

Estimated Size of merged df: 2.914 GB
Dataset Shape: 52,910,592 rows, 16 columns


In [19]:
# Minimum and maximum speed kept as a separate dataset, keyed by DATE_TIME and GEOHASH.
# NOTE(review): in the displayed sample MINIMUM_SPEED is larger than MAXIMUM_SPEED -
# verify whether the two columns are swapped in the source file.
min_max_speed_df = (
    pl.scan_parquet("datasets/01_tr_density/ist_traffic_density_rev03.zstd")
    .select([
        pl.col('DATE_TIME'),
        pl.col('GEOHASH').cast(pl.Categorical),
        pl.col(['MINIMUM_SPEED', 'MAXIMUM_SPEED']),
    ])
    .collect(streaming=True)
)

min_max_speed_df.head()

DATE_TIME,GEOHASH,MINIMUM_SPEED,MAXIMUM_SPEED
datetime[ns],cat,u8,u8
2020-01-01 00:00:00,"""sxkbj3""",128,10
2020-01-01 01:00:00,"""sxkbj3""",149,7
2020-01-01 02:00:00,"""sxkbj3""",137,36
2020-01-01 03:00:00,"""sxkbj3""",124,42
2020-01-01 04:00:00,"""sxkbj3""",115,14


In [20]:
# Size and shape of the min/max speed dataset
print(f'Estimated Size of Min-Max speed dataset: {min_max_speed_df.estimated_size("mb"):.3f} MB')
print(f'Dataset Shape: {min_max_speed_df.height:,} rows, {min_max_speed_df.width} columns')

Estimated Size of Min-Max speed dataset: 706.484 MB
Dataset Shape: 52,910,592 rows, 4 columns


In [21]:
# Save the merged traffic density dataset as a zstd-compressed parquet file.
# The file is parquet despite the ".zstd" extension.
td_df.write_parquet("datasets/00_tr_df_merged/tr_ist_td_merged_01_ver01.zstd", compression="zstd")

In [22]:
# Save the min/max speed dataset as a zstd-compressed parquet file.
# The file is parquet despite the ".zstd" extension.
min_max_speed_df.write_parquet("datasets/00_tr_df_merged/tr_ist_td_min_max_speed.zstd", compression="zstd")