In [1]:
import pandas as pd
import time
import tracemalloc

# Sample data (in-memory): five transactions spread over three customers.
data = {
    'customer_id': [1, 2, 1, 3, 2],
    'transaction_amount': [100, 200, 150, 50, 250],
    'transaction_date': ['2025-02-01', '2025-02-01', '2025-02-02', '2025-02-02', '2025-02-03'],
}
df_pandas = pd.DataFrame(data)

# Track Python-level allocations and elapsed time around the transformation only.
tracemalloc.start()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the system clock, which can be adjusted
# while the cell is running and would then skew (or even negate) the result.
start_time = time.perf_counter()

# Transformation: group by customer_id and sum transaction_amount.
df_pandas_grouped = df_pandas.groupby('customer_id')['transaction_amount'].sum()

# Stop tracking: read the clock first so the tracemalloc bookkeeping below is
# not counted as part of the transformation time.
end_time = time.perf_counter()
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

print(f"Pandas Execution Time: {end_time - start_time:.4f} seconds")
print(f"Pandas Memory Usage: current={current / 1024:.2f} KB, peak={peak / 1024:.2f} KB")
print(f"Pandas Result:\n{df_pandas_grouped}")

Pandas Execution Time: 0.0082 seconds
Pandas Memory Usage: current=85.37 KB, peak=95.40 KB
Pandas Result:
customer_id
1    250
2    450
3     50
Name: transaction_amount, dtype: int64


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum
import time
import tracemalloc

# Initialize Spark session (reuses an already-running session if one exists).
spark = SparkSession.builder.appName("PySparkExample").getOrCreate()

# Sample data (simulating distributed data): same five transactions as the Pandas cell.
data = [{'customer_id': 1, 'transaction_amount': 100, 'transaction_date': '2025-02-01'},
        {'customer_id': 2, 'transaction_amount': 200, 'transaction_date': '2025-02-01'},
        {'customer_id': 1, 'transaction_amount': 150, 'transaction_date': '2025-02-02'},
        {'customer_id': 3, 'transaction_amount': 50, 'transaction_date': '2025-02-02'},
        {'customer_id': 2, 'transaction_amount': 250, 'transaction_date': '2025-02-03'}]
df_spark = spark.createDataFrame(data)

# Track memory usage and time.
# NOTE(review): tracemalloc traces allocations made by the Python interpreter in
# this process only. Spark presumably performs the aggregation inside its JVM,
# whose memory would not be counted here -- confirm before comparing these
# figures with the Pandas ones.
tracemalloc.start()
# perf_counter() is a monotonic, high-resolution clock intended for interval
# timing; time.time() follows the adjustable system clock.
start_time = time.perf_counter()

# Transformation: group by customer_id and sum transaction_amount.
# `sum` here is pyspark.sql.functions.sum (imported at the top of this cell),
# which shadows the Python builtin of the same name.
df_spark_grouped = df_spark.groupBy('customer_id').agg(sum('transaction_amount').alias('total_amount'))

# Action to trigger computation and measure time correctly: the groupBy/agg
# above only describes the work; collect() is what actually runs it.
df_spark_grouped.collect()

# Stop tracking: read the clock first so tracemalloc bookkeeping is not timed.
end_time = time.perf_counter()
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

print(f"PySpark Execution Time: {end_time - start_time:.4f} seconds")
print(f"PySpark Memory Usage: current={current / 1024:.2f} KB, peak={peak / 1024:.2f} KB")
df_spark_grouped.show()

PySpark Execution Time: 2.2653 seconds
PySpark Memory Usage: current=349.06 KB, peak=480.80 KB
+-----------+------------+
|customer_id|total_amount|
+-----------+------------+
|          1|         250|
|          2|         450|
|          3|          50|
+-----------+------------+



In [3]:
import pandas as pd
import time
import psutil

# Generate sample data: 10 million rows; col2 cycles through 0..99.
data = {'col1': range(10000000), 'col2': [x % 100 for x in range(10000000)]}
df_pandas = pd.DataFrame(data)

# Handle used to read this process's resident set size (RSS). It is created
# before any timer starts so its construction is not counted in the timings.
process = psutil.Process()

# Transformation 1: Filter
# Memory snapshots are taken outside the timed region and the timer uses
# perf_counter() (monotonic, high resolution), so filter_time covers the
# filter itself rather than the instrumentation around it.
mem_before_filter = process.memory_info().rss / (1024 ** 2)
start_time = time.perf_counter()
df_filtered_pandas = df_pandas[df_pandas['col2'] > 50]
filter_time = time.perf_counter() - start_time
mem_after_filter = process.memory_info().rss / (1024 ** 2)

# Transformation 2: Aggregate (mean of every remaining column per col2 value)
mem_before_agg = process.memory_info().rss / (1024 ** 2)
start_time = time.perf_counter()
df_agg_pandas = df_filtered_pandas.groupby('col2').mean()
agg_time = time.perf_counter() - start_time
mem_after_agg = process.memory_info().rss / (1024 ** 2)

print(f"Pandas - Memory usage before filter: {mem_before_filter:.2f} MB")
print(f"Pandas - Memory usage after filter: {mem_after_filter:.2f} MB")
print(f"Pandas - Filter time: {filter_time:.4f} seconds")
print(f"Pandas - Memory usage before aggregation: {mem_before_agg:.2f} MB")
print(f"Pandas - Memory usage after aggregation: {mem_after_agg:.2f} MB")
print(f"Pandas - Aggregation time: {agg_time:.4f} seconds")

Pandas - Memory usage before filter: 323.96 MB
Pandas - Memory usage after filter: 515.84 MB
Pandas - Filter time: 0.2346 seconds
Pandas - Memory usage before aggregation: 515.84 MB
Pandas - Memory usage after aggregation: 521.59 MB
Pandas - Aggregation time: 0.1287 seconds


In [None]:
from pyspark.sql import SparkSession
import time
import psutil

spark = SparkSession.builder.master("local[*]").appName("PySparkExample").getOrCreate()

# Generate sample data: 10 million (col1, col2) rows; col2 cycles through 0..99.
data = list(zip(range(10000000), [x % 100 for x in range(10000000)]))
df_spark = spark.createDataFrame(data, ["col1", "col2"])

# Handle used to read this process's resident set size (RSS). It is created
# before any timer starts so its construction is not counted in the timings.
# NOTE(review): this reads the Python driver process only. Spark presumably
# holds the cached data in its JVM process, which these figures would not
# include -- confirm before comparing them with the Pandas numbers.
process = psutil.Process()

# Transformation 1: Filter
# Memory snapshots are taken outside the timed region and the timer uses
# perf_counter() (monotonic, high resolution), so filter_time covers the
# Spark work rather than the instrumentation around it.
mem_before_filter = process.memory_info().rss / (1024 ** 2)
start_time = time.perf_counter()
df_filtered_spark = df_spark.filter(df_spark['col2'] > 50)
df_filtered_spark.cache()  # Cache the result for subsequent operations
df_filtered_spark.count()  # Trigger execution and materialize the cache
filter_time = time.perf_counter() - start_time
mem_after_filter = process.memory_info().rss / (1024 ** 2)

# Transformation 2: Aggregate (mean of every numeric column per col2 value)
mem_before_agg = process.memory_info().rss / (1024 ** 2)
start_time = time.perf_counter()
df_agg_spark = df_filtered_spark.groupBy('col2').mean()
df_agg_spark.count()  # Trigger execution
agg_time = time.perf_counter() - start_time
mem_after_agg = process.memory_info().rss / (1024 ** 2)

print(f"PySpark - Memory usage before filter: {mem_before_filter:.2f} MB")
print(f"PySpark - Memory usage after filter: {mem_after_filter:.2f} MB")
print(f"PySpark - Filter time: {filter_time:.4f} seconds")
print(f"PySpark - Memory usage before aggregation: {mem_before_agg:.2f} MB")
print(f"PySpark - Memory usage after aggregation: {mem_after_agg:.2f} MB")
print(f"PySpark - Aggregation time: {agg_time:.4f} seconds")

spark.stop()

In [None]:
1. Pandas Internal Working
Pandas is a single-node, in-memory data processing library built on top of NumPy. It works efficiently for smaller datasets 
that fit into your machine's memory.

How Transformations Work in Pandas:

Eager Evaluation: Pandas applies transformations immediately when you call a function. Every time you perform an operation (like filter, groupby, etc.), it processes the data and returns the result right away.
In-Memory Processing: All operations are performed in memory. This makes Pandas very fast for small to medium datasets but memory-bound for larger datasets.
Optimization: Limited internal optimization; there is no query optimizer, so performance depends on how you write the code (vectorization, avoiding Python loops, etc.).

2. PySpark Internal Working
PySpark is the Python API for Apache Spark, which is a distributed data processing engine. It works across multiple machines 
(nodes) and is designed for big data processing.

How Transformations Work in PySpark:

Lazy Evaluation: When you apply transformations (like filter, select, groupBy), PySpark doesn't execute them immediately. Instead, it builds a DAG (Directed Acyclic Graph) representing the sequence of transformations.
Actions Trigger Execution: The actual computation only happens when an action (like collect(), show(), count(), or a write such as df.write.parquet(...)) is called. At that point, Spark optimizes the DAG and executes the tasks across the cluster.
Optimization with Catalyst and Tungsten:
Catalyst Optimizer: Optimizes query plans before execution.
Tungsten Engine: Provides efficient memory management and code generation for better performance.

3. Performance Comparison:

Pandas:

Fast for small to medium datasets (fits into RAM).
Time increases significantly as the dataset approaches memory capacity; beyond it, the operation slows down sharply or fails with a MemoryError.
Immediate execution after each step.

PySpark:

Overhead from starting the Spark context and distributing data.
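Slower for small datasets due to this overhead (in the five-row example above, PySpark took about 2.27 seconds versus about 0.008 seconds for Pandas).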
Scales efficiently for large datasets because of parallel processing.
Optimizes transformations before execution, reducing redundant computations.

When to Use What?
Pandas: Ideal for datasets that fit into memory (~1-2 GB), quick prototyping, and simple data analysis.
PySpark: Best for big data scenarios where datasets are large (GBs to TBs), and distributed processing across clusters is required.