In [1]:
import uuid
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

In [2]:
# Build a synthetic hourly sales dataset (one row per hour per location) and
# write it to CSV. The resulting DataFrame stays bound to `data`.
SEED = 42  # seeds the generator behind `sales` so a re-run draws the same values
OUTPUT_CSV = Path("data/data_20240411.csv")

date_ranges = pd.date_range(start=datetime(1900, 1, 1), end=datetime(2025, 12, 31), freq="h")
# NOTE(review): "Sweeden" looks like a misspelling of "Sweden". It is kept as-is
# because it is emitted into the dataset; correct it deliberately if intended.
locations = ["Italy", "France", "Spain", "Portugal", "Greece", "Germany", "Austria", "Norway", "Sweeden", "Poland"]
location_multiplier = [0.7, 1.2, 0.9, 0.5, 0.3, 1.0, 1.4, 1.9, 2.1, 1.4]

# The two lists are paired by position, so they must have the same length.
assert len(locations) == len(location_multiplier), "each location needs exactly one multiplier"

n_hours = len(date_ranges)
n_locations = len(locations)
n_rows = n_hours * n_locations
rng = np.random.default_rng(SEED)

print("Started constructing data...")

# Columns are built in bulk instead of appending one dict per row.
# Row order is hour-major: each timestamp is repeated once per location while
# the location names and their multipliers are cycled in step with it.
columns = {
    # uuid4 has its own randomness source, so ids are NOT controlled by SEED.
    "id": [str(uuid.uuid4()) for _ in range(n_rows)],
    "date": date_ranges.repeat(n_locations),
    "location": np.tile(locations, n_hours),
    # `high` is exclusive, so the base draw is an integer in 1..49.
    "sales": rng.integers(low=1, high=50, size=n_rows) * np.tile(location_multiplier, n_hours),
}

print("Finished constructing data!")
print("Converting to dataframe...")

data = pd.DataFrame(columns)

print("Finished converting to dataframe!")
print("Saving to CSV...")

# Create the target directory first so the write does not fail on a fresh checkout.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_CSV, index=False)

print("Saved to CSV!")
print("Done!")

Started constructing data...
Finished constructing data!
Converting to dataframe...
Finished converting to dataframe!
Saving to CSV...
Saved to CSV!
Done!


In [3]:
# Print the (rows, columns) shape followed by an empty line, then let the
# cell display ten randomly sampled rows as its output.
print(data.shape, end="\n\n")
data.sample(n=10)

(11044810, 4)



Unnamed: 0,id,date,location,sales
10421737,49782b91-865e-4cfc-8b86-028833a0b69c,2018-11-21 21:00:00,Norway,3.8
507721,7c391869-d05d-42a6-ba83-2f95ba0cd169,1905-10-17 12:00:00,France,20.4
8596934,984f12e9-9707-4253-9ab5-336f09d33294,1998-01-27 13:00:00,Greece,11.4
7045707,79cc52f1-97db-4d5d-8993-24dcedf5de9f,1980-05-18 02:00:00,Norway,93.1
8474416,df828e54-2c65-45d2-8673-8276cc7e6da3,1996-09-04 01:00:00,Austria,1.4
7894456,3a214f6a-fe8e-4769-8fd2-523993f5746b,1990-01-22 13:00:00,Austria,42.0
6622461,db05260c-abfa-4ff5-b9c6-9badda6e2e95,1975-07-20 14:00:00,France,26.4
3342042,ee4f5f83-47b1-44ab-9a6c-4346294a5ce4,1938-02-16 04:00:00,Spain,39.6
1158154,8c400598-a264-405b-b710-c120f1ad5596,1913-03-19 15:00:00,Greece,6.0
7391407,733603ab-0f0a-44df-94ae-00f4cfd72696,1984-04-27 12:00:00,Norway,9.5


In [4]:
# PANDAS
# Eagerly read the full CSV into a pandas DataFrame.
df_pd = pd.read_csv(filepath_or_buffer="data/data_20240411.csv")
## => 9.8s

In [5]:
# POLARS
# Eagerly read the full CSV into a polars DataFrame.
df_pl = pl.read_csv(source="data/data_20240411.csv")
## => 2.0s

In [7]:
# PANDAS
# Read the CSV, parse `date` into datetimes, keep only the "Italy" rows and
# project down to the `date` and `sales` columns.
df_pd_italy = (
    pd.read_csv("data/data_20240411.csv")
      .assign(date=lambda frame: pd.to_datetime(frame["date"]))
      .loc[lambda frame: frame["location"] == "Italy", ["date", "sales"]]
)
## => 13.5s

In [8]:
# POLARS
# Same pipeline as the pandas cell: read the CSV, parse `date`, keep only the
# "Italy" rows and project down to the `date` and `sales` columns.
df_pl_italy = (
    pl.read_csv("data/data_20240411.csv")
      # to_datetime (rather than to_date) keeps the hour-of-day carried by the
      # "%H:%M:%S" part of each timestamp, so the result matches the pandas
      # version, which parses to full datetimes.
      .with_columns(pl.col("date").str.to_datetime("%Y-%m-%d %H:%M:%S"))
      .filter(pl.col("location") == "Italy")
      .select(pl.col(["date", "sales"]))
)
## => 1.8s (measured with the earlier to_date parse; re-time after this change)

In [11]:
# PANDAS
# Reload the CSV with `date` parsed to datetimes, then compute the total and
# mean of `sales` per (year, location).
df_pd = (
    pd.read_csv("data/data_20240411.csv")
        .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

# After reset_index the year key is exposed under the column name "date".
# Rows are ordered by year ascending, then by total_sales descending.
df_pd_res = (
    df_pd
        .groupby([df_pd["date"].dt.year, df_pd["location"]])
        .agg(
            total_sales=pd.NamedAgg(column="sales", aggfunc="sum"),
            avg_sales=pd.NamedAgg(column="sales", aggfunc="mean"),
        )
        .reset_index()
        .sort_values(["date", "total_sales"], ascending=[True, False])
)
## => 13.5s

In [12]:
# POLARS
# Eager pipeline: read the CSV, parse `date`, then compute the total and mean
# of `sales` per (year, location). The year key keeps the column name "date".
# Rows are ordered by year ascending, then by total_sales descending.
df_pl_res = (
    pl.read_csv("data/data_20240411.csv")
        .with_columns(pl.col("date").str.to_date("%Y-%m-%d %H:%M:%S"))
        .group_by(pl.col("date").dt.year(), "location")
        .agg(
            total_sales=pl.col("sales").sum(),
            avg_sales=pl.col("sales").mean(),
        )
        .sort(["date", "total_sales"], descending=[False, True])
)
## => 1.9s

In [13]:
# POLARS WITH LAZY EVALUATION
# scan_csv only describes the query; nothing is read until collect() is called.
# The query parses `date`, then computes the total and mean of `sales` per
# (year, location), ordered by year ascending and total_sales descending.
df_pl_lazy_query = (
    pl.scan_csv("data/data_20240411.csv")
        .with_columns(pl.col("date").str.to_date("%Y-%m-%d %H:%M:%S"))
        .group_by(pl.col("date").dt.year(), "location")
        .agg(
            total_sales=pl.col("sales").sum(),
            avg_sales=pl.col("sales").mean(),
        )
        .sort(["date", "total_sales"], descending=[False, True])
)

# Execute the lazy query and materialise the result.
df_pl_lazy_res = df_pl_lazy_query.collect()
## => 1.5s

In [15]:
# Display the first five rows of the collected lazy result.
df_pl_lazy_res.head(n=5)

date,location,total_sales,avg_sales
i32,str,f64,f64
1900,"""Sweeden""",461926.5,52.731336
1900,"""Norway""",415801.7,47.465947
1900,"""Austria""",307876.8,35.145753
1900,"""Poland""",306825.4,35.025731
1900,"""France""",260358.0,29.721233
