<a href="https://colab.research.google.com/github/imsumedhaa/Fireducks-vs-Pandas-AI-ML-Pipelines/blob/main/AI_%26_ML_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install FireDucks
!pip install fireducks

# Set environment variable for benchmark mode BEFORE importing FireDucks
import os
os.environ["FIREDUCKS_FLAGS"] = "--benchmark-mode"

import pandas as pd
import fireducks.pandas as fd
import numpy as np
import time

# Create sample dataset
# A seeded generator makes the synthetic data (and therefore the CSV that the
# benchmarks read) identical on every Restart & Run All.
SEED = 42
N_ROWS = 1_000_000
rng = np.random.default_rng(SEED)

df = pd.DataFrame({
    'product': rng.choice(['apple', 'banana', 'orange', 'kiwi'], size=N_ROWS),
    'region': rng.choice(['north', 'south', 'east', 'west'], size=N_ROWS),
    'price': rng.uniform(10, 200, size=N_ROWS),
    # `integers` excludes the upper bound, so units_sold spans 1..49.
    'units_sold': rng.integers(1, 50, size=N_ROWS),
})

# Persist without the index so the CSV holds only the four data columns.
df.to_csv('sales_data.csv', index=False)
print("✅ Created 'sales_data.csv'")

Collecting fireducks
  Downloading fireducks-1.2.8-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting firefw==1.2.8 (from fireducks)
  Downloading firefw-1.2.8-py3-none-any.whl.metadata (818 bytes)
Collecting pyarrow<19.1,>=19.0 (from fireducks)
  Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading fireducks-1.2.8-cp311-cp311-manylinux_2_28_x86_64.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading firefw-1.2.8-py3-none-any.whl (12 kB)
Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl (42.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, firefw, fireducks
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 20.0.0
    Uninstalling pyarrow-20.0.0:
      Successfully uninstalled pya

In [3]:
# ----------------------------------------
# 1. Pandas Benchmark: Read
# ----------------------------------------
# `time.perf_counter()` is a monotonic, high-resolution clock meant for
# measuring short durations; elapsed seconds are converted to milliseconds.
start = time.perf_counter()
pdf = pd.read_csv('sales_data.csv')
elapsed_ms = (time.perf_counter() - start) * 1000
print('📄 Pandas Read Time:', round(elapsed_ms, 2), 'ms')

# ----------------------------------------
# 2. FireDucks Benchmark: Read
# ----------------------------------------
# Same file, same call shape, read through the FireDucks pandas-compatible API.
start = time.perf_counter()
fdf = fd.read_csv('sales_data.csv')
elapsed_ms = (time.perf_counter() - start) * 1000
print('🔥 FireDucks Read Time:', round(elapsed_ms, 2), 'ms')


📄 Pandas Read Time: 344.89 ms
🔥 FireDucks Read Time: 35.67 ms


In [5]:
# ----------------------------------------
# 3. Pandas Benchmark: Filter
# ----------------------------------------
# Keep only rows whose price exceeds 100, timed with the monotonic
# high-resolution `time.perf_counter()` clock and reported in milliseconds.
start = time.perf_counter()
filtered_pdf = pdf[pdf['price'] > 100]
elapsed_ms = (time.perf_counter() - start) * 1000
print('📉 Pandas Filter Time:', round(elapsed_ms, 2), 'ms')

# ----------------------------------------
# 4. FireDucks Benchmark: Filter
# ----------------------------------------
# Identical boolean-mask filter applied to the FireDucks frame.
start = time.perf_counter()
filtered_fdf = fdf[fdf['price'] > 100]
elapsed_ms = (time.perf_counter() - start) * 1000
print('🔥 FireDucks Filter Time:', round(elapsed_ms, 2), 'ms')

📉 Pandas Filter Time: 17.69 ms
🔥 FireDucks Filter Time: 17.82 ms


In [6]:
# ----------------------------------------
# 5. Pandas Benchmark: GroupBy
# ----------------------------------------
# Mean price per region; `reset_index()` turns the group key back into a
# regular column. Timed with `time.perf_counter()` and reported in ms.
start = time.perf_counter()
grouped_pdf = pdf.groupby('region')['price'].mean().reset_index()
elapsed_ms = (time.perf_counter() - start) * 1000
print('📊 Pandas GroupBy Time:', round(elapsed_ms, 2), 'ms')

# ----------------------------------------
# 6. FireDucks Benchmark: GroupBy
# ----------------------------------------
# Identical aggregation applied to the FireDucks frame.
start = time.perf_counter()
grouped_fdf = fdf.groupby('region')['price'].mean().reset_index()
elapsed_ms = (time.perf_counter() - start) * 1000
print('🔥 FireDucks GroupBy Time:', round(elapsed_ms, 2), 'ms')

📊 Pandas GroupBy Time: 59.76 ms
🔥 FireDucks GroupBy Time: 13.19 ms


In [7]:
# ----------------------------------------
# 7. Pandas Benchmark: Sort
# ----------------------------------------
# Sort the grouped result by mean price, highest first. Note that the input
# is the grouped frame (one row per region), so this times sorting a very
# small table rather than the full dataset. Timed with `time.perf_counter()`.
start = time.perf_counter()
sorted_pdf = grouped_pdf.sort_values(by='price', ascending=False)
elapsed_ms = (time.perf_counter() - start) * 1000
print('🔽 Pandas Sort Time:', round(elapsed_ms, 2), 'ms')

# ----------------------------------------
# 8. FireDucks Benchmark: Sort
# ----------------------------------------
# Identical sort applied to the FireDucks grouped frame.
start = time.perf_counter()
sorted_fdf = grouped_fdf.sort_values(by='price', ascending=False)
elapsed_ms = (time.perf_counter() - start) * 1000
print('🔥 FireDucks Sort Time:', round(elapsed_ms, 2), 'ms')

# ----------------------------------------
print("✅ Benchmark Completed!")

🔽 Pandas Sort Time: 0.87 ms
🔥 FireDucks Sort Time: 7.32 ms
✅ Benchmark Completed!
