In [49]:
import time
import argparse
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, length

from sentiment_analysis.config import logger
from sentiment_analysis.load import load_amazon_reviews, load_model
import sentiment_analysis.process as process
from sentiment_analysis.utils import run_command

In [50]:
target_mb_sizes = [2, 4, 8]
sample_ratio = 0.1
input_file, output_dir = '/Subscription_Boxes.jsonl', '/analysis_outputs'
results = []

In [51]:
spark = SparkSession.builder \
        .appName("SentimentAnalysisBenchmark") \
        .getOrCreate()

In [60]:
hadoop_conf = sc._jsc.hadoopConfiguration()
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)

# Define the HDFS path
path = sc._jvm.org.apache.hadoop.fs.Path(input_file)

# Get file status and size
file_status = fs.getFileStatus(path)
file_size_bytes = file_status.getLen()  # size in bytes
file_size_mb = file_size_bytes / (1024 * 1024)

print(f"File size: {file_size_bytes} bytes ({file_size_mb:.2f} MB)")


File size: 8953020 bytes (8.54 MB)


In [53]:
total_cores = sc.defaultParallelism
task_cpus = int(spark.conf.get("spark.task.cpus", "1"))
min_partitions = total_cores // task_cpus
logger.info(f"Using {min_partitions} min partitions")

2025-04-10 11:27:25,144 - INFO - Using 16 min partitions


In [54]:
for target_mb in target_mb_sizes:
    logger.info(f"Testing with target partition size of {target_mb} MB...")
    
    # Calculate number of partitions
    target_bytes = target_mb * 1024 * 1024
    num_partitions = max(1, math.ceil(file_size_bytes / target_bytes))
    
    logger.info(f"Using {num_partitions} partitions for {target_mb} MB target size")

2025-04-10 11:27:29,665 - INFO - Testing with target partition size of 2 MB...
2025-04-10 11:27:29,667 - INFO - Using 5 partitions for 2 MB target size
2025-04-10 11:27:29,670 - INFO - Testing with target partition size of 4 MB...
2025-04-10 11:27:29,672 - INFO - Using 3 partitions for 4 MB target size
2025-04-10 11:27:29,673 - INFO - Testing with target partition size of 8 MB...
2025-04-10 11:27:29,675 - INFO - Using 2 partitions for 8 MB target size


In [55]:
# Load the dataset once
reviews_df_original = load_amazon_reviews(spark, input_file, sample_ratio)

# Cache the original dataframe to improve performance
reviews_df_original.cache()

# Count total reviews and get estimated size per review
total_reviews = reviews_df_original.count()

# Estimate average review size by sampling
sample_df = reviews_df_original.sample(fraction=min(0.1, 1000.0/total_reviews))
sample_df = sample_df.withColumn("text_length", length(col("text")))

# Get average review text length from the full dataset
reviews_df_with_length = reviews_df_original.withColumn("text_length", length(col("text")))
avg_stats = reviews_df_with_length.agg({"text_length": "avg"}).collect()[0]
avg_text_length = avg_stats["avg(text_length)"]

# Estimate bytes per review (rough approximation including overhead)
# Text is UTF-8 so ~1-4 bytes per character + metadata
estimated_bytes_per_review = avg_text_length * 2 + 200  # Rough estimate

logger.info(f"Total reviews: {total_reviews:,}")
logger.info(f"Estimated average review size: {estimated_bytes_per_review} bytes")

if not process.bc_model:
    # Load model and tokenizer once
    tokenizer, model = load_model()
    # Broadcast model and tokenizer to all workers
    process.bc_tokenizer = spark.sparkContext.broadcast(tokenizer)
    process.bc_model = spark.sparkContext.broadcast(model)

25/04/10 11:27:30 WARN CacheManager: Asked to cache already cached data.
25/04/10 11:27:30 WARN CacheManager: Asked to cache already cached data.
2025-04-10 11:27:30,832 - INFO - Total reviews: 1,686
2025-04-10 11:27:30,833 - INFO - Estimated average review size: 700.2336892052194 bytes


In [59]:
total_size = reviews_df_original.rdd.map(lambda x: len(str(x))).sum()
file_size_mb = total_size / (1024 * 1024)
print(f"File size: {total_size} bytes ({file_size_mb:.2f} MB)")

File size: 662423 bytes (0.63 MB)


In [56]:
target_mb = 8
logger.info(f"Testing with target partition size of {target_mb} MB...")
    
# Calculate target reviews per partition
target_bytes = target_mb * 1024 * 1024
print(f"target_bytes {target_bytes}") 
reviews_per_partition = max(1, int(target_bytes / estimated_bytes_per_review))

# Calculate number of partitions
num_partitions = max(1, int(total_reviews / reviews_per_partition))

logger.info(f"Using {num_partitions} partitions for {target_mb} MB target size")
logger.info(f"Estimated {reviews_per_partition} reviews per partition")


2025-04-10 11:27:33,403 - INFO - Testing with target partition size of 8 MB...
2025-04-10 11:27:33,407 - INFO - Using 1 partitions for 8 MB target size
2025-04-10 11:27:33,409 - INFO - Estimated 11979 reviews per partition


target_bytes 8388608


In [9]:
for target_mb in target_mb_sizes:
    logger.info(f"Testing with target partition size of {target_mb} MB...")
    
    # Calculate number of partitions
    target_bytes = target_mb * 1024 * 1024
    num_partitions = max(1, math.ceil(file_size_bytes / target_bytes))
    
    logger.info(f"Using {num_partitions} partitions for {target_mb} MB target size")
    
    # Record start time
    start_time = time.time()
    
    # Create a fresh copy of the dataframe for this test
    reviews_df = reviews_df_original
    
    # Repartition the DataFrame to target partition count
    reviews_df = reviews_df.repartition(num_partitions)
    
    # Process time for sentiment analysis
    process_start_time = time.time()
    
    # Run sentiment analysis
    sentiment_results_df = reviews_df.withColumn(
        "result",
        process.batch_sentiment_analysis(reviews_df["text"]),
    )
    
    # Flatten the result column
    sentiment_results_df = sentiment_results_df.select(
        col("asin"),
        col("user_id"),
        col("result.review_text"),
        col("result.sentiment"),
        col("result.score"),
    )
    
    # Force execution with count action
    result_count = sentiment_results_df.count()
    
    # Write results to parquet
    output_path = f"{output_dir}/partition_size_{target_mb}MB"
    
    process_start_time = time.time()
    sentiment_results_df.write.option("header", "true").mode("overwrite").csv(output_path)
    process_time = time.time() - process_start_time
    
    # Total time
    total_time = time.time() - start_time
    
    # Actual partition stats
    actual_partitions = sentiment_results_df.rdd.getNumPartitions()
    actual_reviews_per_partition = total_reviews / actual_partitions
    
    # Store results
    results.append({
        "target_mb": target_mb,
        "num_partitions": actual_partitions,
        "reviews_per_partition": actual_reviews_per_partition,
        "total_reviews": total_reviews,
        "result_count": result_count,
        "process_time": process_time,
        "total_time": total_time,
        "throughput": total_reviews / process_time
    })
    
    # Unpersist the result dataframe to free up memory
    sentiment_results_df.unpersist(blocking=True)
    
    logger.info(f"Completed benchmark with target partition size {target_mb} MB:")
    logger.info(f"  Total reviews: {total_reviews}")
    logger.info(f"  Actual partitions: {actual_partitions}")
    logger.info(f"  Process Time: {process_time:.2f} seconds")
    logger.info(f"  Total Time: {total_time:.2f} seconds")
    logger.info(f"  Processing Throughput: {total_reviews / process_time:.2f} reviews/second")

2025-04-09 15:37:54,728 - INFO - Testing with target partition size of 2 MB...
2025-04-09 15:37:54,730 - INFO - Using 5 partitions for 2 MB target size
2025-04-09 15:38:03,349 - INFO - Processing 3244 reviews...         (0 + 4) / 5]
2025-04-09 15:38:03,391 - INFO - Processing 3244 reviews...
2025-04-09 15:38:03,454 - INFO - Processing 3241 reviews...
2025-04-09 15:38:03,513 - INFO - Processing 3243 reviews...
2025-04-09 15:48:19,789 - INFO - Done processing 3243 reviews..     (0 + 4) / 5]
2025-04-09 15:48:20,222 - INFO - Processing 3244 reviews...         (1 + 4) / 5]
2025-04-09 15:48:21,162 - INFO - Done processing 3241 reviews..
2025-04-09 15:48:23,799 - INFO - Done processing 3244 reviews..     (2 + 3) / 5]
2025-04-09 15:48:25,662 - INFO - Done processing 3244 reviews..     (3 + 2) / 5]
2025-04-09 15:58:19,510 - INFO - Done processing 3244 reviews..     (4 + 1) / 5]
2025-04-09 15:58:20,036 - INFO - Completed benchmark with target partition size 2 MB:
2025-04-09 15:58:20,037 - INFO -

In [13]:
output_dir = os.getcwd()
Path(output_dir).mkdir(parents=True, exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv(f"{output_dir}/partition_size_benchmark_results.csv", index=False)

# Create a summary report
with open(f"{output_dir}/benchmark_summary.txt", 'w') as f:
    f.write("Spark Partition Size Benchmark Summary\n")
    f.write("=====================================\n\n")
    
    # Find optimal partition size based on throughput
    optimal_idx = results_df['throughput'].idxmax()
    optimal_size = results_df.iloc[optimal_idx]['target_mb']
    optimal_partitions = results_df.iloc[optimal_idx]['num_partitions']
    
    f.write(f"Total reviews processed: {results_df.iloc[0]['total_reviews']}\n")
    f.write(f"Optimal target partition size: {optimal_size} MB\n")
    f.write(f"Optimal number of partitions: {optimal_partitions}\n\n")
    
    f.write("Detailed Results:\n")
    f.write("-----------------\n")
    for _, row in results_df.iterrows():
        f.write(f"Target Partition Size: {row['target_mb']} MB\n")
        f.write(f"  Actual Partitions: {int(row['num_partitions'])}\n")
        f.write(f"  Reviews per partition: {int(row['reviews_per_partition'])}\n")
        f.write(f"  Process Time: {row['process_time']:.2f} seconds\n")
        f.write(f"  Total Time: {row['total_time']:.2f} seconds\n")
        f.write(f"  Throughput: {row['throughput']:.2f} reviews/second\n\n")


In [14]:
# Create output directory if it doesn't exist
output_dir = os.getcwd()
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Set style
plt.style.use('ggplot')

# Figure 1: Total Time vs Partition Size (MB)
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_mb'], results_df['total_time'], 'o-', linewidth=2)
plt.title('Total Execution Time vs Partition Size')
plt.xlabel('Target Partition Size (MB)')
plt.ylabel('Time (seconds)')
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/total_time_vs_partition_size.png")

# Figure 2: Process Time vs Partition Size (MB)
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_mb'], results_df['process_time'], 'o-', linewidth=2)
plt.title('Processing Time vs Partition Size')
plt.xlabel('Target Partition Size (MB)')
plt.ylabel('Time (seconds)')
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/process_time_vs_partition_size.png")

# Figure 3: Throughput vs Partition Size (MB)
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_mb'], results_df['throughput'], 'o-', linewidth=2)
plt.title('Throughput vs Partition Size')
plt.xlabel('Target Partition Size (MB)')
plt.ylabel('Throughput (reviews/second)')
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/throughput_vs_partition_size.png")

# Figure 4: Time Breakdown
plt.figure(figsize=(12, 6))
width = 0.35
ind = np.arange(len(results_df))

plt.bar(ind, results_df['process_time'], width, label='Process Time')

plt.xlabel('Target Partition Size (MB)')
plt.ylabel('Time (seconds)')
plt.title('Time Breakdown by Partition Size')
plt.xticks(ind, results_df['target_mb'])
plt.legend()
plt.tight_layout()
plt.savefig(f"{output_dir}/time_breakdown.png")

# Figure 5: Number of Partitions vs Target Size
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_mb'], results_df['num_partitions'], 'o-', linewidth=2)
plt.title('Number of Partitions vs Target Partition Size')
plt.xlabel('Target Partition Size (MB)')
plt.ylabel('Number of Partitions')
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/partitions_vs_size.png")

plt.close('all')

In [17]:
# Get the executor memory status
executor_memory_status = sc.getExecutorMemoryStatus()

# Count the number of executors
num_executors = len(executor_memory_status)

print(f"Number of executors: {num_executors}")

AttributeError: 'SparkContext' object has no attribute 'getExecutorMemoryStatus'

In [32]:
import torch
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

In [33]:
print(torch.get_num_threads())
print(torch.get_num_interop_threads())

1
1


In [44]:
target_partitions_counts = [2,3,4,6,8,10,12,16,32,64,128]
results = []
for num_partitions in target_partitions_counts:
    logger.info(f"Testing with target partition count of {num_partitions}...")
    
    logger.info(f"Using {num_partitions} partitions")
    
    # Record start time
    start_time = time.time()
    
    # Create a fresh copy of the dataframe for this test
    reviews_df = reviews_df_original
    
    # Repartition the DataFrame to target partition count
    reviews_df = reviews_df.repartition(num_partitions)
    
    # Process time for sentiment analysis
    process_start_time = time.time()
    
    # Run sentiment analysis
    sentiment_results_df = reviews_df.withColumn(
        "result",
        process.batch_sentiment_analysis(reviews_df["text"]),
    )
    
    # Flatten the result column
    sentiment_results_df = sentiment_results_df.select(
        col("asin"),
        col("user_id"),
        col("result.review_text"),
        col("result.sentiment"),
        col("result.score"),
    )
    
    result_count = sentiment_results_df.count()
    
    # Write results to csv
    output_path = f"{output_dir}/partition_count_{num_partitions}"
    
    process_start_time = time.time()
    sentiment_results_df.write.option("header", "true").mode("overwrite").csv(output_path)
    process_time = time.time() - process_start_time
    
    # Total time
    total_time = time.time() - start_time
    
    # Actual partition stats
    actual_partitions = sentiment_results_df.rdd.getNumPartitions()
    actual_reviews_per_partition = total_reviews / actual_partitions
    
    # Store results
    results.append({
        "target_num_partitions": num_partitions,
        "actual_partitions": actual_partitions,
        "reviews_per_partition": actual_reviews_per_partition,
        "total_reviews": total_reviews,
        "result_count": result_count,
        "process_time": process_time,
        "total_time": total_time,
        "throughput": total_reviews / process_time
    })
    
    # Unpersist the result dataframe to free up memory
    sentiment_results_df.unpersist(blocking=True)
    
    logger.info(f"Completed benchmark with target partition count {num_partitions}:")
    logger.info(f"  Total reviews: {total_reviews}")
    logger.info(f"  Actual partitions: {actual_partitions}")
    logger.info(f"  Process Time: {process_time:.2f} seconds")
    logger.info(f"  Total Time: {total_time:.2f} seconds")
    logger.info(f"  Processing Throughput: {total_reviews / process_time:.2f} reviews/second")

2025-04-09 18:37:23,557 - INFO - Testing with target partition count of 2...
2025-04-09 18:37:23,560 - INFO - Using 2 partitions
2025-04-09 18:37:29,216 - INFO - Processing 843 reviews...          (0 + 2) / 2]
2025-04-09 18:37:29,509 - INFO - Processing 843 reviews...
2025-04-09 18:40:07,988 - INFO - Done processing 843 reviews..      (0 + 2) / 2]
2025-04-09 18:40:09,160 - INFO - Done processing 843 reviews..      (1 + 1) / 2]
2025-04-09 18:40:09,336 - INFO - Completed benchmark with target partition count 2:
2025-04-09 18:40:09,338 - INFO -   Total reviews: 1686
2025-04-09 18:40:09,340 - INFO -   Actual partitions: 2
2025-04-09 18:40:09,341 - INFO -   Process Time: 165.59 seconds
2025-04-09 18:40:09,342 - INFO -   Total Time: 165.71 seconds
2025-04-09 18:40:09,343 - INFO -   Processing Throughput: 10.18 reviews/second
2025-04-09 18:40:09,344 - INFO - Testing with target partition count of 3...
2025-04-09 18:40:09,345 - INFO - Using 3 partitions
2025-04-09 18:40:09,557 - INFO - Process

In [45]:
output_dir = os.getcwd()
Path(output_dir).mkdir(parents=True, exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv(f"{output_dir}/partition_count_benchmark_results.csv", index=False)

# Create a summary report
with open(f"{output_dir}/partition_count_benchmark_summary.txt", 'w') as f:
    f.write("Spark Partition Count Benchmark Summary\n")
    f.write("=====================================\n\n")
    
    # Find optimal partition size based on throughput
    optimal_idx = results_df['throughput'].idxmax()
    optimal_partitions = results_df.iloc[optimal_idx]['target_num_partitions']
    
    f.write(f"Total reviews processed: {results_df.iloc[0]['total_reviews']}\n")
    f.write(f"Optimal target partition size: {optimal_size} MB\n")
    f.write(f"Optimal number of partitions: {optimal_partitions}\n\n")
    
    f.write("Detailed Results:\n")
    f.write("-----------------\n")
    for _, row in results_df.iterrows():
        f.write(f"Target Partition Count: {row['target_num_partitions']} MB\n")
        f.write(f"  Actual Partitions: {int(row['actual_partitions'])}\n")
        f.write(f"  Reviews per partition: {int(row['reviews_per_partition'])}\n")
        f.write(f"  Process Time: {row['process_time']:.2f} seconds\n")
        f.write(f"  Total Time: {row['total_time']:.2f} seconds\n")
        f.write(f"  Throughput: {row['throughput']:.2f} reviews/second\n\n")


In [46]:
results_df

Unnamed: 0,target_num_partitions,actual_partitions,reviews_per_partition,total_reviews,result_count,process_time,total_time,throughput
0,2,2,843.0,1686,1686,165.594337,165.708975,10.181508
1,3,3,562.0,1686,1686,113.792464,113.866739,14.816447
2,4,4,421.5,1686,1686,87.004179,87.084807,19.378379
3,6,6,281.0,1686,1686,106.323035,106.43445,15.857335
4,8,8,210.75,1686,1686,84.050362,84.152804,20.059402
5,10,10,168.6,1686,1686,97.29338,97.384135,17.329031
6,12,12,140.5,1686,1686,83.341282,83.471644,20.23007
7,16,16,105.375,1686,1686,82.990737,83.100753,20.31552
8,32,32,52.6875,1686,1686,83.58827,83.71678,20.170294
9,64,64,26.34375,1686,1686,87.067641,87.213753,19.364255


In [48]:
# Create output directory if it doesn't exist
output_dir = os.getcwd()
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Set style
plt.style.use('ggplot')

# Figure 1: Total Time vs Partition Count
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_num_partitions'], results_df['total_time'], 'o-', linewidth=2)
plt.title('Total Execution Time vs Partition Count')
plt.xlabel('Target Partition Count')
plt.ylabel('Time (seconds)')
plt.xscale('log', base=2)  # Use logarithmic scale with base 2
plt.xticks(results_df['target_num_partitions'], labels=results_df['target_num_partitions'])  # Keep original tick values
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/total_time_vs_partition_count.png")

# Figure 2: Process Time vs Partition Count
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_num_partitions'], results_df['process_time'], 'o-', linewidth=2)
plt.title('Processing Time vs Partition Count')
plt.xlabel('Target Partition Count')
plt.ylabel('Time (seconds)')
plt.xscale('log', base=2)  # Use logarithmic scale with base 2
plt.xticks(results_df['target_num_partitions'], labels=results_df['target_num_partitions'])  # Keep original tick values
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/process_time_vs_partition_count.png")

# Figure 3: Throughput vs Partition Count
plt.figure(figsize=(10, 6))
plt.plot(results_df['target_num_partitions'], results_df['throughput'], 'o-', linewidth=2)
plt.title('Throughput vs Partition Count')
plt.xlabel('Target Partition Count')
plt.ylabel('Throughput (reviews/second)')
plt.xscale('log', base=2)  # Use logarithmic scale with base 2
plt.xticks(results_df['target_num_partitions'], labels=results_df['target_num_partitions'])  # Keep original tick values
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{output_dir}/throughput_vs_partition_count.png")

# Figure 4: Time Breakdown
plt.figure(figsize=(12, 6))
width = 0.35
ind = np.arange(len(results_df))
plt.bar(ind, results_df['process_time'], width, label='Process Time')
plt.xlabel('Target Partition Count')
plt.ylabel('Time (seconds)')
plt.title('Time Breakdown by Partition Count')
plt.xticks(ind, results_df['target_num_partitions'])  # Keep the original ticks for the bar chart
plt.legend()
plt.tight_layout()
plt.savefig(f"{output_dir}/time_breakdown_partition_count.png")

plt.close('all')