<a href="https://colab.research.google.com/github/imsumedhaa/FireDucks-in-Finance/blob/main/FireDucks_in_Finance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ✅ Install FireDucks
!pip install fireducks

# ✅ Set benchmark mode for FireDucks
import os
os.environ["FIREDUCKS_FLAGS"] = "--benchmark-mode"

import pandas as pd
import fireducks.pandas as fd
import numpy as np
import time

# 📊 Create synthetic financial dataset with 10M rows
df = pd.DataFrame({
    "account_id": np.random.randint(10000, 50000, size=10_000_000),
    "region": np.random.choice(["North", "South", "East", "West"], size=10_000_000),
    "amount": np.random.exponential(scale=1200, size=10_000_000),
    "transaction_type": np.random.choice(["deposit", "withdrawal", "transfer"], size=10_000_000),
    "timestamp": pd.date_range(start="2023-01-01", periods=10_000_000, freq='s')
})
df.to_csv("financial_data.csv", index=False)
print("✅ Dataset with 10M rows created: financial_data.csv")

✅ Dataset with 10M rows created: financial_data.csv


In [9]:
# 📄 Read CSV
start = time.time()
pandas_df = pd.read_csv("financial_data.csv")
print("📄 Pandas Read Time:", round((time.time() - start) * 1000, 2), "ms")

start = time.time()
fd_df = fd.read_csv("financial_data.csv")
print("🔥 FireDucks Read Time:", round((time.time() - start) * 1000, 2), "ms")

📄 Pandas Read Time: 9391.05 ms
🔥 FireDucks Read Time: 10.53 ms


In [6]:
# 🔎 Filter high-value transactions (amount > 5000)
start = time.time()
pandas_filtered = pandas_df[pandas_df['amount'] > 5000]
print("📉 Pandas Filter Time:", round((time.time() - start) * 1000, 2), "ms")

start = time.time()
fd_filtered = fd_df[fd_df['amount'] > 5000]
print("🔥 FireDucks Filter Time:", round((time.time() - start) * 1000, 2), "ms")

📉 Pandas Filter Time: 73.85 ms
🔥 FireDucks Filter Time: 0.78 ms


In [7]:
# 📊 Group by Region: Mean & Total Amount
start = time.time()
pandas_grouped = pandas_filtered.groupby("region")["amount"].agg(["mean", "sum"]).reset_index()
print("📊 Pandas GroupBy Time:", round((time.time() - start) * 1000, 2), "ms")

start = time.time()
fd_grouped = fd_filtered.groupby("region")["amount"].agg(["mean", "sum"]).reset_index()
print("🔥 FireDucks GroupBy Time:", round((time.time() - start) * 1000, 2), "ms")

📊 Pandas GroupBy Time: 12.34 ms
🔥 FireDucks GroupBy Time: 2.33 ms


In [8]:
# 🔽 Sort by Total Amount
start = time.time()
pandas_sorted = pandas_grouped.sort_values(by="sum", ascending=False)
print("🔽 Pandas Sort Time:", round((time.time() - start) * 1000, 2), "ms")

start = time.time()
fd_sorted = fd_grouped.sort_values(by="sum", ascending=False)
print("🔥 FireDucks Sort Time:", round((time.time() - start) * 1000, 2), "ms")

🔽 Pandas Sort Time: 1.15 ms
🔥 FireDucks Sort Time: 0.43 ms
