<a href="https://colab.research.google.com/github/imsumedhaa/FireDucks-in-Finance/blob/main/FireDucks_in_Finance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fireducks

FIREDUCKS_FLAGS="--benchmark-mode"
import pandas as pd
import numpy as np

# Create synthetic financial dataset
df = pd.DataFrame({
    "account_id": np.random.randint(10000, 50000, size=10_000_000),
    "region": np.random.choice(["North", "South", "East", "West"], size=10_000_000),
    "amount": np.random.exponential(scale=1200, size=10_000_000),
    "transaction_type": np.random.choice(["deposit", "withdrawal", "transfer"], size=10_000_000),
    "timestamp": pd.date_range(start="2023-01-01", periods=10_000_000, freq='s')
})

df.to_csv("financial_data.csv", index=False)
print("✅ Dataset with 10M rows created: financial_data.csv")

Collecting fireducks
  Downloading fireducks-1.2.7-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting firefw==1.2.7 (from fireducks)
  Downloading firefw-1.2.7-py3-none-any.whl.metadata (818 bytes)
Collecting pyarrow<19.1,>=19.0 (from fireducks)
  Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading fireducks-1.2.7-cp311-cp311-manylinux_2_28_x86_64.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading firefw-1.2.7-py3-none-any.whl (12 kB)
Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl (42.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, firefw, fireducks
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pya

In [2]:
#Reading Data: Pandas vs FireDucks

import pandas as pd
import fireducks.pandas as fd
import time

# Pandas
start = time.time()
pandas_df = pd.read_csv("financial_data.csv")
print("📄 Pandas Read Time:", round(time.time() - start, 2), "seconds")

# FireDucks
start = time.time()
fd_df = fd.read_csv("financial_data.csv")
print("🔥 FireDucks Read Time:", round(time.time() - start, 2), "seconds")

📄 Pandas Read Time: 8.04 seconds
🔥 FireDucks Read Time: 0.03 seconds


In [3]:
#Filtering High-Value Transactions

# Pandas
start = time.time()
pandas_filtered = pandas_df[pandas_df['amount'] > 5000]
print("📉 Pandas Filter Time:", round(time.time() - start, 2), "seconds")

# FireDucks
start = time.time()
fd_filtered = fd_df[fd_df['amount'] > 5000]
print("🔥 FireDucks Filter Time:", round(time.time() - start, 2), "seconds")

📉 Pandas Filter Time: 0.05 seconds
🔥 FireDucks Filter Time: 0.0 seconds


In [4]:
# Grouping by Region

# Pandas
start = time.time()
pandas_grouped = pandas_filtered.groupby("region")["amount"].agg(["mean", "sum"]).reset_index()
print("📊 Pandas GroupBy Time:", round(time.time() - start, 2), "seconds")

# FireDucks
start = time.time()
fd_grouped = fd_filtered.groupby("region")["amount"].agg(["mean", "sum"]).reset_index()
print("🔥 FireDucks GroupBy Time:", round(time.time() - start, 2), "seconds")

📊 Pandas GroupBy Time: 0.03 seconds
🔥 FireDucks GroupBy Time: 0.0 seconds


In [5]:
#Sorting by Total Amount

# Pandas
start = time.time()
pandas_sorted = pandas_grouped.sort_values(by="sum", ascending=False)
print("🔽 Pandas Sort Time:", round(time.time() - start, 2), "seconds")

# FireDucks
start = time.time()
fd_sorted = fd_grouped.sort_values(by="sum", ascending=False)
print("🔥 FireDucks Sort Time:", round(time.time() - start, 2), "seconds")

🔽 Pandas Sort Time: 0.0 seconds
🔥 FireDucks Sort Time: 0.0 seconds
