In [2]:
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
from tqdm import tqdm
import torch
import json
import pandas as pd
import logging
import numpy as np
import time

In [3]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

In [5]:
print(torch.get_num_threads())
print(torch.get_num_interop_threads())

1
1


In [6]:
import sys
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle

tokenizer = AutoTokenizer.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)

# Serialize and measure
model_bytes = pickle.dumps(model)
tokenizer_bytes = pickle.dumps(tokenizer)

print(f"Model size: {len(model_bytes)/1024/1024:.2f} MB")
print(f"Tokenizer size: {len(tokenizer_bytes)/1024/1024:.2f} MB")


Model size: 514.70 MB
Tokenizer size: 2.41 MB


In [7]:
import psutil, os
p = psutil.Process(os.getpid())
print(f"Number of active threads: {p.num_threads()}")

Number of active threads: 18


In [11]:
amazon_reviews = []
# Load the JSON data (assuming it's a list of review objects)
with open("Baby_Products.jsonl", "r") as f:
    for line in f:
        amazon_reviews.append(json.loads(line))

amazon_reviews_text = [review["text"] for review in amazon_reviews[:1000]]

# Initialize the DistilBERT tokenizer
bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)

# Compute token lengths for each review text
token_counts = [
    len(
        bertweet_tokenizer(
            review["text"], return_tensors="pt", padding="max_length", truncation=True
        )
    )
    for review in amazon_reviews
]
average_tokens = sum(token_counts) / len(token_counts)

print(f"Average token count: {average_tokens:.1f}")

bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)
bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)

fine_tuned_sentiment_results = []
processing_times = []
token_counts = []

print(f"Processing {len(amazon_reviews_text)} reviews")
for review_text in tqdm(amazon_reviews_text, desc="Processing reviews"):
    start_time = time.time()

    # Tokenize the review text
    inputs = bertweet_tokenizer(
        review_text, return_tensors="pt", padding="max_length", truncation=True
    )

    # Calculate the actual number of tokens (using attention_mask to ignore padding)
    token_count = inputs["attention_mask"].sum().item()
    token_counts.append(token_count)

    # # Perform sentiment analysis
    # with torch.no_grad():
    #     sentiment_batch = bertweet_model(**inputs).logits

    # predicted_class_id = sentiment_batch.argmax().item()
    # sentiment_label = bertweet_model.config.id2label[predicted_class_id]

    # fine_tuned_sentiment_results.append(
    #     {"review_text": review_text, "sentiment": sentiment_label}
    # )

    batch_time = time.time() - start_time
    processing_times.append(batch_time)

# Compute average and maximum values
avg_time = sum(processing_times) / len(processing_times)
max_time = max(processing_times)
avg_tokens = sum(token_counts) / len(token_counts)
max_tokens = max(token_counts)
total_batches = len(processing_times)
total_processing_time = sum(processing_times)
total_tokens = sum(token_counts) 
throughput = total_tokens / total_processing_time

print("Total number of tokens:", total_tokens)
print("Total number of batches:", total_batches)
print("Total time taken: {:.4f} seconds".format(total_processing_time))
print("Average time per batch: {:.4f} seconds".format(avg_time))
print("Maximum time for a batch: {:.4f} seconds".format(max_time))
print("Average batch size (in tokens): {:.2f}".format(avg_tokens))
print("Longest batch (in tokens):", max_tokens)
print("Throughput (in tokens/second):", throughput)

# bertweet_sentiment_amazon_df = pd.DataFrame(fine_tuned_sentiment_results)

# # Save the results to a CSV file
# bertweet_sentiment_amazon_df.to_csv(
#     "bertweet_sentiment_analysis_amazon_results.csv", index=False
# )

# print("Result saved to 'bertweet_sentiment_analysis_amazon_results.csv'.")

KeyboardInterrupt: 

In [8]:
amazon_reviews = []
# Load the JSON data (assuming it's a list of review objects)
with open("Subscription_Boxes.jsonl", "r") as f:
    for line in f:
        amazon_reviews.append(json.loads(line))

amazon_reviews_text = [review["text"] for review in amazon_reviews[:1000]]

# Initialize the DistilBERT tokenizer
bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)

# Compute token lengths for each review text
token_counts = [
    len(
        bertweet_tokenizer(
            review["text"], return_tensors="pt", padding="max_length", truncation=True
        )
    )
    for review in amazon_reviews
]
average_tokens = sum(token_counts) / len(token_counts)

print(f"Average token count: {average_tokens:.1f}")

bertweet_tokenizer = AutoTokenizer.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)
bertweet_model = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis"
)

fine_tuned_sentiment_results = []
processing_times = []
token_counts = []

print(f"Processing {len(amazon_reviews_text)} reviews")
for review_text in tqdm(amazon_reviews_text, desc="Processing reviews"):
    start_time = time.time()

    # Tokenize the review text
    inputs = bertweet_tokenizer(
        review_text, return_tensors="pt", padding="max_length", truncation=True
    )

    # Calculate the actual number of tokens (using attention_mask to ignore padding)
    token_count = inputs["attention_mask"].sum().item()
    token_counts.append(token_count)

    # Perform sentiment analysis
    with torch.no_grad():
        sentiment_batch = bertweet_model(**inputs).logits

    predicted_class_id = sentiment_batch.argmax().item()
    sentiment_label = bertweet_model.config.id2label[predicted_class_id]

    fine_tuned_sentiment_results.append(
        {"review_text": review_text, "sentiment": sentiment_label}
    )

    batch_time = time.time() - start_time
    processing_times.append(batch_time)

# Compute average and maximum values
avg_time = sum(processing_times) / len(processing_times)
max_time = max(processing_times)
avg_tokens = sum(token_counts) / len(token_counts)
max_tokens = max(token_counts)
total_batches = len(processing_times)
total_processing_time = sum(processing_times)
total_tokens = sum(token_counts) 
throughput = total_tokens / total_processing_time

print("Total number of batches:", total_batches)
print("Total time taken: {:.4f} seconds".format(total_processing_time))
print("Average time per batch: {:.4f} seconds".format(avg_time))
print("Maximum time for a batch: {:.4f} seconds".format(max_time))
print("Average batch size (in tokens): {:.2f}".format(avg_tokens))
print("Longest batch (in tokens):", max_tokens)
print("Throughput (in tokens/second):", throughput)

bertweet_sentiment_amazon_df = pd.DataFrame(fine_tuned_sentiment_results)

# Save the results to a CSV file
bertweet_sentiment_amazon_df.to_csv(
    "bertweet_sentiment_analysis_amazon_results.csv", index=False
)

print("Result saved to 'bertweet_sentiment_analysis_amazon_results.csv'.")

Average token count: 3.0
Processing 1000 reviews


Processing reviews: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:46<00:00,  4.41it/s]

Total number of batches: 1000
Total time taken: 224.6940 seconds
Average time per batch: 0.2247 seconds
Maximum time for a batch: 0.3317 seconds
Average batch size (in tokens): 55.40
Longest batch (in tokens): 128
Throughput (in tokens/second): 246.56195015504193
Result saved to 'bertweet_sentiment_analysis_amazon_results.csv'.





In [8]:
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sentiment_analysis.config import logger
from sentiment_analysis.utils import run_command
from sentiment_analysis.load import load_amazon_reviews, load_model
from sentiment_analysis.process import batch_sentiment_analysis
import sentiment_analysis.process as process
import math


In [6]:
import os

file_path = "Subscription_Boxes.jsonl"
size_bytes = os.path.getsize(file_path)
size_mb = size_bytes / (1024 * 1024)

print(f"File size: {size_bytes} bytes ({size_mb:.2f} MB)")


File size: 8953020 bytes (8.54 MB)


In [13]:
task_cpus

1

In [14]:
target_mb = 8
total_cores = 4
task_cpus = int(spark.conf.get("spark.task.cpus", "1"))
min_partitions = total_cores // task_cpus
logger.info(f"Testing with target partition size of {target_mb} MB...")
    
# Calculate target reviews per partition
target_bytes = target_mb * 1024 * 1024

# Calculate number of partitions
num_partitions = max(min_partitions, math.ceil(size_bytes / target_bytes))

logger.info(f"Using {num_partitions} partitions for {target_mb} MB target size")


2025-04-08 14:26:29,698 - INFO - Testing with target partition size of 8 MB...
2025-04-08 14:26:29,701 - INFO - Using 4 partitions for 8 MB target size


In [2]:
def merge_results_csv_in_hdfs(read_hdfs_path, write_hdfs_path, csv_file_name):
    logger.info(f"WRITING ANALYSIS SUMMARY OUTPUT {csv_file_name} TO HDFS...")
    final_path = f"{write_hdfs_path}/{csv_file_name}.csv"
    temp_csv_path = "/tmp/merged_results.csv"
    # Merge csv files from hdfs and save them locally
    merge_command = [
        "/home/almalinux/hadoop-3.4.0/bin/hdfs",
        "dfs",
        "-getmerge",
        f"{read_hdfs_path}/part-*.csv",
        temp_csv_path,
    ]
    run_command(merge_command)

    # Upload the merged csv to hdfs
    upload_command = [
        "/home/almalinux/hadoop-3.4.0/bin/hdfs",
        "dfs",
        "-put",
        "-f",
        temp_csv_path,
        final_path,
    ]
    run_command(upload_command)
    # Remove the local merged file
    delete_local_file(temp_csv_path)


In [3]:
input_path, output_path = "/Subscription_Boxes.jsonl", "/analysis_outputs"
# Initialize Spark session
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()
# Load the dataset
reviews_df = load_amazon_reviews(spark, input_path)
# Count total reviews
total_reviews = reviews_df.count()
logger.info(f"Processing {total_reviews:,} reviews")
# Load model and tokenizer
tokenizer, model = load_model()
# Broadcast model and tokenizer to all workers
process.bc_tokenizer = spark.sparkContext.broadcast(tokenizer)
process.bc_model = spark.sparkContext.broadcast(model)
sentiment_results_df = reviews_df.withColumn(
    "result",
    process.batch_sentiment_analysis(reviews_df["text"]),
)
# Flatten the result column
sentiment_results_df = sentiment_results_df.select(
    col("asin"),
    col("user_id"),
    col("result.review_text"),
    col("result.sentiment"),
    col("result.score"),
)
start_time = time.time()
sentiment_results_df.write.option("header", "true").mode("overwrite").csv(output_path)
end_time = time.time()
logger.info(f"Done processing all reviews in {end_time - start_time:.2f} seconds")
# Combine results into a single csv file
merge_results_csv_in_hdfs(
    output_path, "/summary_outputs", "sentiment_analysis_results"
)

# Stop Spark session
spark.stop()


25/04/08 13:40:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
2025-04-08 13:40:42,666 - INFO - Processing 16,216 reviews                      
2025-04-08 13:40:42,671 - INFO - Loading BERTweet model and tokenizer...
emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
2025-04-08 13:40:58,267 - INFO - Processing 7674 reviews...         (0 + 3) / 3]
2025-04-08 13:40:58,345 - INFO - Processing 1059 reviews...
2025-04-08 13:40:58,815 - INFO - Processing reviews using 1 torch threads...
2025-04-08 13:40:58,863 - INFO - Processing reviews using 1 torch threads...
2025-04-08 13:40:59,092 - INFO - Processing 7483 reviews...
2025-04-08 13:40:59,703 - INFO - Processing reviews using 1 torch threads...
2025-04-08 13:44:14,791 - INFO - Done processing 1059 reviews..     (0 + 3) / 3]
2025-04-08 13:50:02,343 - ERROR - KeyboardInterrupt while sending command.) / 3]
Traceback (most 

KeyboardInterrupt: 

In [11]:
total_cores = sc.defaultParallelism
print("Total cores (default parallelism):", total_cores)

Total cores (default parallelism): 36
