In [1]:
import warnings
import polars as pl
import numpy as np
from datetime import datetime, time

# Silence all warnings for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

#### This notebook contains the data pre-processing of the merged traffic density columns using Polars.
##### * First, we will remove the data for every GEOHASH that has fewer than about 28k hours of data (i.e. no more than 96% of the 29,184 possible hours).
##### * Then, we will transform the dataset to ensure every remaining GEOHASH location has 29,184 hours of data points. 

##### At present, the DATE_TIME column does not have a proper hourly frequency for each GEOHASH. An example will be shown soon.

##### **Here is an article about [Pandas VS Polars](https://towardsdatascience.com/pandas-dataframe-but-much-faster-f475d6be4cd4)**

In [2]:
%%time
# Load only the columns needed for this step from the Parquet file; the
# date-time feature columns are skipped for now to keep memory usage down.

selected_cols = [
    'DATE_TIME',
    'LATITUDE',
    'LONGITUDE',
    'GEOHASH',
    'MINIMUM_SPEED',
    'MAXIMUM_SPEED',
    'AVERAGE_SPEED',
    'NUMBER_OF_VEHICLES',
]

raw_df = pl.read_parquet(
    "datasets/01_tr_density/ist_traffic_density_rev01.gz",
    columns=selected_cols,
)

Wall time: 4.24 s


In [3]:
%%time
# Work on a clone so that raw_df stays available, untouched, if it is needed again
all_df = raw_df.clone()

Wall time: 0 ns


In [4]:
# Preview the first five rows of the working frame
all_df.head(5)

DATE_TIME,LATITUDE,LONGITUDE,GEOHASH,MINIMUM_SPEED,MAXIMUM_SPEED,AVERAGE_SPEED,NUMBER_OF_VEHICLES
datetime[ns],f32,f32,str,u8,u8,u8,u16
2020-01-01 00:00:00,41.080627,28.811646,"""sxk3xw""",135,18,81,132
2020-01-01 00:00:00,40.987244,29.108276,"""sxk9nm""",143,10,73,162
2020-01-01 00:00:00,41.003723,29.09729,"""sxk9q0""",128,6,50,110
2020-01-01 00:00:00,40.99823,28.67981,"""sxk3hx""",111,22,68,101
2020-01-01 00:00:00,41.042175,28.02063,"""sx7cmx""",99,99,99,1


In [5]:
# Summarise the working frame: column dtypes, row count and estimated in-memory size.

# Mapping of column name -> dtype
column_info = all_df.schema

# Dtypes in column order
data_types = all_df.dtypes

# Number of rows
num_rows = all_df.height

# estimated_size(unit='gb') already returns gigabytes, so the value is named and
# labelled as gigabytes rather than bytes
size_gb = all_df.estimated_size(unit='gb')

# Display the information
print("Column information:")
for col_name, col_dtype in column_info.items():
    print(col_name + ':', col_dtype)
print()
print("Number of rows:", num_rows)
print()
print("Size (gb):", np.round(size_gb, 3))

Column information:
DATE_TIME: Datetime(time_unit='ns', time_zone=None)
LATITUDE: Float32
LONGITUDE: Float32
GEOHASH: Utf8
MINIMUM_SPEED: UInt8
MAXIMUM_SPEED: UInt8
AVERAGE_SPEED: UInt8
NUMBER_OF_VEHICLES: UInt16

Number of rows: 66146858

Size (bytes): 2.156 gb


In [6]:
%%time
# Build a separate frame with one row per unique GEOHASH and its row count, then add
# that count as a percentage of the total number of hours from Jan 2020 to
# April 2023 (29,184 hours), rounded to 3 decimals.

geohash_hrs = all_df.groupby('GEOHASH').count()
coverage_pct = np.round(100 * (geohash_hrs['count'] / 29184), 3)
geohash_hrs = geohash_hrs.with_columns(pl.Series('perc', coverage_pct))

Wall time: 2.14 s


In [7]:
# Show the GEOHASH values whose rows cover more than 96% of the expected hours,
# in ascending order of coverage. The number of available rows is in the
# `count` column.
coverage_ok = pl.col('perc') > 96
geohash_hrs.filter(coverage_ok).sort('perc')

GEOHASH,count,perc
str,u32,f64
"""sxk3he""",28017,96.001
"""sxk9w1""",28017,96.001
"""sxkd8d""",28018,96.005
"""sxk9tb""",28025,96.029
"""sxk9tf""",28026,96.032
"""sxk8yd""",28027,96.035
"""sxk4rx""",28027,96.035
"""sxk3sd""",28027,96.035
"""sxkbut""",28029,96.042
"""sxk9zj""",28032,96.053


In [8]:
# Collect the GEOHASH values with more than 96% hourly coverage into a NumPy array
well_covered = geohash_hrs.filter(pl.col('perc') > 96).sort('perc')
selected_gh = np.array(well_covered['GEOHASH'])
selected_gh

array(['sxk3he', 'sxk9w1', 'sxkd8d', ..., 'sxk3nk', 'sxk3r9', 'sxk3nt'],
      dtype='<U6')

In [9]:
%%time
# Keep only the rows whose GEOHASH passed the coverage threshold.
#
# Only DATE_TIME is flagged as sorted. set_sorted() does not sort or verify anything,
# it just marks the column, and Polars may then take sorted-only fast paths.
# GEOHASH must NOT be flagged: the rows are not ordered by GEOHASH (the first rows
# of all_df go sxk3xw, sxk9nm, sxk9q0, sxk3hx, ...), so a sorted flag on it would be
# false and could silently produce wrong results.
# NOTE(review): this still assumes the source file is ordered by DATE_TIME — confirm.

filtered_df = (
    all_df.filter(pl.col('GEOHASH').is_in(selected_gh))
    .set_sorted('DATE_TIME')
)

Wall time: 3.43 s


In [10]:
# Filtering brings the row count down from 66+ million to 51.5 million records.
# It will grow again later, once DATE_TIME is forced onto an hourly frequency
# for every GEOHASH.
filtered_df.height

51548379

In [11]:
# Check whether every GEOHASH starts at "2020-01-01 00:00:00".
# The earliest timestamp per GEOHASH is taken with min() rather than first(), so the
# check does not depend on the rows already being in time order within each group.
# Sorting descending puts any GEOHASH with a late start at the top.
gh_hr_1 = (
    filtered_df.groupby('GEOHASH')
    .agg(pl.col('DATE_TIME').min())
    .sort("DATE_TIME", descending=True)
)
gh_hr_1.head()

GEOHASH,DATE_TIME
str,datetime[ns]
"""sx7cm5""",2020-01-01 02:00:00
"""sx7ckc""",2020-01-01 02:00:00
"""sxk6ru""",2020-01-01 00:00:00
"""sxk904""",2020-01-01 00:00:00
"""sxk961""",2020-01-01 00:00:00


#### The above dataframe shows that two GEOHASH values have their first DATE_TIME at 02:00 instead of 00:00.

In [12]:
%%time
# Put every GEOHASH on an hourly DATE_TIME frequency. The rows this inserts have
# null values, which will be filled in using appropriate methods later.

# NOTE: for some GEOHASH the first record is at 02:00 instead of 00:00. To make sure
# every single day has 24 hours of data, the upsample is given a -2h offset and the
# rows that land before 1 Jan 2020 00:00 are then filtered out.

# Rows with a DATE_TIME at or before this instant are dropped
last_excluded_hour = datetime(2019, 12, 31, 23)

# Identifying columns that are filled from the neighbouring rows
location_cols = ['GEOHASH', 'LATITUDE', 'LONGITUDE']

df = (
    filtered_df
    # Upsample DATE_TIME to 1-hour steps per GEOHASH, offset by -2 hours
    .upsample(time_column='DATE_TIME', every="1h", offset="-2h", by='GEOHASH')
    # Keep rows only from 1 Jan 2020 00:00 onwards
    .filter(pl.col('DATE_TIME') > last_excluded_hour)
    # Fill null GEOHASH and coordinates: backward fill first, then forward fill
    .with_columns(pl.col(location_cols).backward_fill().forward_fill())
)

Wall time: 19.7 s


#### For the above transformation, pandas took 2 min 23 s, while Polars took only about 20 seconds.

In [13]:
# Null count per column: roughly 1.36 million rows are missing the speed and
# vehicle-count values, while DATE_TIME, GEOHASH and the coordinates have no nulls
df.null_count()

DATE_TIME,LATITUDE,LONGITUDE,GEOHASH,MINIMUM_SPEED,MAXIMUM_SPEED,AVERAGE_SPEED,NUMBER_OF_VEHICLES
u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,1362213,1362213,1362213,1362213


In [14]:
# The row count has grown from approximately 51.5 million (filtered dataset)
# to 52.9 million (transformed dataset):
# 29184 hours * 1813 GEOHASH = 52,910,592 hours of data
df.height

52910592

In [15]:
# Example of null values: first five rows for a single GEOHASH
df.filter(pl.col('GEOHASH') == 'sxk9w1').head(5)

DATE_TIME,LATITUDE,LONGITUDE,GEOHASH,MINIMUM_SPEED,MAXIMUM_SPEED,AVERAGE_SPEED,NUMBER_OF_VEHICLES
datetime[ns],f32,f32,str,u8,u8,u8,u16
2020-01-01 00:00:00,41.053162,29.09729,"""sxk9w1""",46.0,7.0,24.0,7.0
2020-01-01 01:00:00,41.053162,29.09729,"""sxk9w1""",25.0,17.0,21.0,2.0
2020-01-01 02:00:00,41.053162,29.09729,"""sxk9w1""",44.0,15.0,29.0,5.0
2020-01-01 03:00:00,41.053162,29.09729,"""sxk9w1""",,,,
2020-01-01 04:00:00,41.053162,29.09729,"""sxk9w1""",61.0,9.0,33.0,5.0


#### As seen from the above sample of the dataset for GEOHASH='sxk9w1' (same as shown earlier), there is now a row for 03:00. The corresponding speed and NUMBER_OF_VEHICLES values are null. We will handle missing values in the next section.

#### Overall, Polars is a much more efficient library to handle large datasets such as this one. Polars beats pandas in every task done here.

In [16]:
# Every GEOHASH should now have 29184 rows; sorting by count puts any GEOHASH
# with fewer rows at the top
rows_per_gh = df.groupby('GEOHASH').count()
rows_per_gh.sort('count')

GEOHASH,count
str,u32
"""sxk6mw""",29184
"""sxk3kf""",29184
"""sxk9ub""",29184
"""sxkb8x""",29184
"""sxkf26""",29184
"""sxkceq""",29184
"""sxkc2z""",29184
"""sxk9wn""",29184
"""sxk3r4""",29184
"""sxk396""",29184


In [17]:
# Persist the transformed frame in Parquet format, compressed with zstd
df.write_parquet(
    "datasets/01_tr_density/ist_traffic_density_rev02.zstd",
    compression='zstd',
)