In [1]:
!pip install "modin[ray]" -q
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import time
import os
from typing import Dict, Any

import modin.pandas as mpd
import ray

# Start a local Ray runtime limited to 2 CPUs. ignore_reinit_error=True is
# passed so that re-running this cell against an already-initialised runtime
# does not raise.
ray.init(ignore_reinit_error=True, num_cpus=2)
# Echo the resources Ray reports so the reader can see what is available.
print(f"Ray initialized with {ray.cluster_resources()}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.9/68.9 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h

2025-07-01 17:11:21,502	INFO worker.py:1917 -- Started a local Ray instance.


Ray initialized with {'CPU': 2.0, 'node:__internal_head__': 1.0, 'node:172.28.0.12': 1.0, 'memory': 9341988455.0, 'object_store_memory': 4003709337.0}


In [2]:
def benchmark_operation(pandas_func, modin_func, data, operation_name: str) -> Dict[str, Any]:
    """Time one operation under pandas and under Modin and report the ratio.

    Args:
        pandas_func: Callable invoked once with ``data['pandas']``.
        modin_func: Callable invoked once with ``data['modin']``.
        data: Mapping providing the ``'pandas'`` and ``'modin'`` inputs.
        operation_name: Label used in the printed report and in the result.

    Returns:
        Dict with keys ``'operation'``, ``'pandas_time'``, ``'modin_time'``
        (both in seconds) and ``'speedup'`` (``pandas_time / modin_time``,
        or ``inf`` when the Modin run measured as zero seconds).
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock, which can be adjusted
    # while the benchmark is running.
    started = time.perf_counter()
    pandas_func(data['pandas'])
    pandas_time = time.perf_counter() - started

    # NOTE(review): only the time until modin_func returns is measured. If
    # Modin defers part of the work past that point the figure is optimistic
    # -- confirm against the configured engine.
    started = time.perf_counter()
    modin_func(data['modin'])
    modin_time = time.perf_counter() - started

    # Guard the division: a zero-length Modin measurement maps to infinity.
    speedup = pandas_time / modin_time if modin_time > 0 else float('inf')

    print(f"\n{operation_name}:")
    print(f"  Pandas: {pandas_time:.3f}s")
    print(f"  Modin:  {modin_time:.3f}s")
    print(f"  Speedup: {speedup:.2f}x")

    return {
        'operation': operation_name,
        'pandas_time': pandas_time,
        'modin_time': modin_time,
        'speedup': speedup
    }

In [3]:
def create_large_dataset(rows: int = 1_000_000):
    """Generate the same synthetic transactions table as pandas and Modin frames.

    The NumPy global RNG is seeded with 42, so repeated calls with the same
    ``rows`` produce identical data.

    Args:
        rows: Number of rows to generate.

    Returns:
        Dict with keys ``'pandas'`` and ``'modin'``, each holding a DataFrame
        built from the same column arrays.
    """
    np.random.seed(42)

    data = {
        'customer_id': np.random.randint(1, 50000, rows),
        'transaction_amount': np.random.exponential(50, rows),
        'category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books', 'Sports'], rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], rows),
        # Lower-case 'h' is the current hourly alias; upper-case 'H' is
        # deprecated in recent pandas releases.
        'date': pd.date_range('2020-01-01', periods=rows, freq='h'),
        'is_weekend': np.random.choice([True, False], rows, p=[0.3, 0.7]),
        'rating': np.random.uniform(1, 5, rows),
        'quantity': np.random.poisson(3, rows) + 1,
        'discount_rate': np.random.beta(2, 5, rows),
        'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+'], rows)
    }

    # Both frames are built from the same arrays so the two engines are
    # benchmarked on identical input.
    pandas_df = pd.DataFrame(data)
    modin_df = mpd.DataFrame(data)

    print(f"Dataset created: {rows:,} rows × {len(data)} columns")
    print(f"Memory usage: {pandas_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    return {'pandas': pandas_df, 'modin': modin_df}

# Build the paired pandas/Modin frames reused by every benchmark below.
dataset = create_large_dataset(rows=500_000)

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "ADVANCED MODIN OPERATIONS BENCHMARK", "=" * 60, sep="\n")

Dataset created: 500,000 rows × 10 columns
Memory usage: 112.5 MB

ADVANCED MODIN OPERATIONS BENCHMARK


In [4]:
def complex_groupby(df):
    """Aggregate transactions per (category, region) pair.

    For each pair this computes sum/mean/std/count of ``transaction_amount``,
    mean/min/max of ``rating`` and the sum of ``quantity``, then rounds every
    value to two decimals.
    """
    aggregations = {
        'transaction_amount': ['sum', 'mean', 'std', 'count'],
        'rating': ['mean', 'min', 'max'],
        'quantity': 'sum',
    }
    by_category_region = df.groupby(['category', 'region'])
    summary = by_category_region.agg(aggregations)
    return summary.round(2)

# The same function is timed on both engines.
groupby_results = benchmark_operation(
    pandas_func=complex_groupby,
    modin_func=complex_groupby,
    data=dataset,
    operation_name="Complex GroupBy Aggregation",
)


Complex GroupBy Aggregation:
  Pandas: 0.346s
  Modin:  6.456s
  Speedup: 0.05x


In [5]:
def advanced_cleaning(df):
    """Remove transaction-amount outliers and add two derived columns.

    Works on a copy of ``df``. Rows whose ``transaction_amount`` falls outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are dropped; the remaining rows gain
    ``transaction_score`` (amount * rating * quantity) and ``is_high_value``
    (amount above the median of the retained amounts).
    """
    working = df.copy()

    # Fences from the interquartile range of the unfiltered amounts.
    lower_quartile = working['transaction_amount'].quantile(0.25)
    upper_quartile = working['transaction_amount'].quantile(0.75)
    spread = upper_quartile - lower_quartile
    not_too_low = working['transaction_amount'] >= lower_quartile - 1.5 * spread
    not_too_high = working['transaction_amount'] <= upper_quartile + 1.5 * spread
    working = working[not_too_low & not_too_high]

    working['transaction_score'] = (
        working['transaction_amount'] * working['rating'] * working['quantity']
    )
    # The median is taken after filtering, i.e. over the retained rows only.
    retained_median = working['transaction_amount'].median()
    working['is_high_value'] = working['transaction_amount'] > retained_median

    return working

# The same function is timed on both engines.
cleaning_results = benchmark_operation(
    pandas_func=advanced_cleaning,
    modin_func=advanced_cleaning,
    data=dataset,
    operation_name="Advanced Data Cleaning",
)


Advanced Data Cleaning:
  Pandas: 0.148s
  Modin:  6.948s
  Speedup: 0.02x


In [6]:
def time_series_analysis(df):
    """Summarise transactions per calendar day and add a rolling mean.

    Args:
        df: Frame with ``date``, ``transaction_amount`` and ``rating`` columns.
            It is not modified.

    Returns:
        A frame of the same class as ``df`` with one row per calendar date and
        the columns ``transaction_sum``, ``transaction_mean``,
        ``transaction_count``, ``rating_mean`` and ``rolling_mean_7d``. The
        rolling mean spans 7 consecutive rows of ``transaction_sum``.
    """
    # set_index already returns a new frame, so no defensive copy is needed.
    df_ts = df.set_index('date')

    # Derive the per-row calendar dates and the grouping once instead of
    # rebuilding both for each of the four aggregates.
    by_day = df_ts.groupby(df_ts.index.date)

    # type(df) keeps the result in the caller's dataframe implementation.
    daily_stats = type(df)({
        'transaction_sum': by_day['transaction_amount'].sum(),
        'transaction_mean': by_day['transaction_amount'].mean(),
        'transaction_count': by_day['transaction_amount'].count(),
        'rating_mean': by_day['rating'].mean()
    })

    daily_stats['rolling_mean_7d'] = daily_stats['transaction_sum'].rolling(window=7).mean()

    return daily_stats

# The same function is timed on both engines.
ts_results = benchmark_operation(
    pandas_func=time_series_analysis,
    modin_func=time_series_analysis,
    data=dataset,
    operation_name="Time Series Analysis",
)


Time Series Analysis:
  Pandas: 0.971s
  Modin:  4.298s
  Speedup: 0.23x


In [7]:
def create_lookup_data():
    """Build the category and region lookup tables for both engines.

    Returns:
        ``{'pandas': {...}, 'modin': {...}}`` where each inner dict maps
        ``'categories'`` and ``'regions'`` to a DataFrame of that engine.
    """
    category_table = dict(
        category=['Electronics', 'Clothing', 'Food', 'Books', 'Sports'],
        commission_rate=[0.15, 0.20, 0.10, 0.12, 0.18],
        target_audience=['Tech Enthusiasts', 'Fashion Forward', 'Food Lovers', 'Readers', 'Athletes'],
    )
    region_table = dict(
        region=['North', 'South', 'East', 'West'],
        tax_rate=[0.08, 0.06, 0.09, 0.07],
        shipping_cost=[5.99, 4.99, 6.99, 5.49],
    )

    # One entry per engine; the same raw tables feed each constructor.
    engines = (('pandas', pd.DataFrame), ('modin', mpd.DataFrame))
    return {
        engine: {
            'categories': make_frame(category_table),
            'regions': make_frame(region_table),
        }
        for engine, make_frame in engines
    }

# Build the lookup tables once; the join benchmark reads them per engine.
lookup_data = create_lookup_data()

In [8]:
def advanced_joins(df, lookup):
    """Attach category and region attributes and derive cost columns.

    ``df`` is left-joined with ``lookup['categories']`` on ``category`` and
    with ``lookup['regions']`` on ``region``. The joined frame then gains
    ``commission_amount``, ``tax_amount`` and ``total_cost``
    (amount + tax + shipping).
    """
    enriched = (
        df.merge(lookup['categories'], on='category', how='left')
          .merge(lookup['regions'], on='region', how='left')
    )

    amount = enriched['transaction_amount']
    enriched['commission_amount'] = amount * enriched['commission_rate']
    enriched['tax_amount'] = amount * enriched['tax_rate']
    enriched['total_cost'] = amount + enriched['tax_amount'] + enriched['shipping_cost']

    return enriched

# Each engine gets its own lookup tables, bound in through a small lambda.
join_results = benchmark_operation(
    pandas_func=lambda frame: advanced_joins(frame, lookup_data['pandas']),
    modin_func=lambda frame: advanced_joins(frame, lookup_data['modin']),
    data=dataset,
    operation_name="Advanced Joins & Calculations",
)


Advanced Joins & Calculations:
  Pandas: 0.424s
  Modin:  0.281s
  Speedup: 1.51x


In [9]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "MEMORY EFFICIENCY COMPARISON", "=" * 60, sep="\n")

def get_memory_usage(df, name):
    """Print and return the deep memory footprint of a dataframe in MB.

    Args:
        df: Object exposing ``memory_usage(deep=True)`` whose result has
            ``.sum()`` returning a byte count.
        name: Label used in the printed line.

    Returns:
        The footprint in mebibytes (bytes / 1024**2).
    """
    # The computation is the same for every dataframe type, so there is no
    # per-engine branch.
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    print(f"{name} memory usage: {memory_mb:.1f} MB")
    return memory_mb

# Report the footprint of the same dataset under each engine.
pandas_memory = get_memory_usage(df=dataset['pandas'], name="Pandas")
modin_memory = get_memory_usage(df=dataset['modin'], name="Modin")


MEMORY EFFICIENCY COMPARISON
Pandas memory usage: 112.5 MB
Modin memory usage: 112.5 MB


In [10]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "PERFORMANCE SUMMARY", "=" * 60, sep="\n")

# Collect the per-benchmark result dicts in the order they were run.
results = [groupby_results, cleaning_results, ts_results, join_results]
speedups = [entry['speedup'] for entry in results]
avg_speedup = sum(speedups) / len(results)

# Pick the best-performing benchmark once and reuse it for both fields.
best = max(results, key=lambda entry: entry['speedup'])

print(f"\nAverage Speedup: {avg_speedup:.2f}x")
print(f"Best Operation: {best['operation']} ({best['speedup']:.2f}x)")

print("\nDetailed Results:")
for result in results:
    print(f"  {result['operation']}: {result['speedup']:.2f}x speedup")

print("\n" + "=" * 60, "MODIN BEST PRACTICES", "=" * 60, sep="\n")

best_practices = [
    "1. Use 'import modin.pandas as pd' to replace pandas completely",
    "2. Modin works best with operations on large datasets (>100MB)",
    "3. Ray backend is most stable; Dask for distributed clusters",
    "4. Some pandas functions may fall back to pandas automatically",
    "5. Use .to_pandas() to convert Modin DataFrame to pandas when needed",
    "6. Profile your specific workload - speedup varies by operation type",
    "7. Modin excels at: groupby, join, apply, and large data I/O operations"
]

# One tip per line, exactly as a per-item print loop would emit them.
print("\n".join(best_practices))

# Stop the Ray runtime started at the top of the notebook.
ray.shutdown()
print("\n✅ Tutorial completed successfully!")
print("🚀 Modin is now ready to scale your pandas workflows!")


PERFORMANCE SUMMARY

Average Speedup: 0.45x
Best Operation: Advanced Joins & Calculations (1.51x)

Detailed Results:
  Complex GroupBy Aggregation: 0.05x speedup
  Advanced Data Cleaning: 0.02x speedup
  Time Series Analysis: 0.23x speedup
  Advanced Joins & Calculations: 1.51x speedup

MODIN BEST PRACTICES
1. Use 'import modin.pandas as pd' to replace pandas completely
2. Modin works best with operations on large datasets (>100MB)
3. Ray backend is most stable; Dask for distributed clusters
4. Some pandas functions may fall back to pandas automatically
5. Use .to_pandas() to convert Modin DataFrame to pandas when needed
6. Profile your specific workload - speedup varies by operation type
7. Modin excels at: groupby, join, apply, and large data I/O operations

✅ Tutorial completed successfully!
🚀 Modin is now ready to scale your pandas workflows!
