In [8]:
import ray
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Initialize Ray with dashboard enabled
ray.init(ignore_reinit_error=True, include_dashboard=True)

# Print dashboard URL
print("Ray Dashboard is running. Access it at: http://127.0.0.1:8265")


2024-09-29 11:48:42,993	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


Ray Dashboard is running. Access it at: http://127.0.0.1:8265






In [9]:
# Define a Ray Actor to load the model and tokenizer once and reuse for multiple tasks
@ray.remote
class SentimentAnalyzerActor:
    def __init__(self, model_name):
        # Load the pre-trained DistilBERT model and tokenizer for sentiment analysis
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.pipeline = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer)
    
    def analyze_sentiment(self, summary_text):
        # Analyze sentiment of the given summary text
        result = self.pipeline(summary_text, truncation=True, max_length=512)
        # Return the label (POSITIVE/NEGATIVE) and score (confidence)
        return result[0]['label'], result[0]['score']

# Create an instance of the SentimentAnalyzerActor
model_name = "../models/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analyzer = SentimentAnalyzerActor.remote(model_name)


In [10]:
# Load the DataFrame that contains the summary text
df_summary = pd.read_csv('../data/processed/Books_bart-large-summary_10000_rows.csv')  # Path to the CSV with summaries

# Display the first few rows of the DataFrame
df_summary.head()


Unnamed: 0,Id,summary
0,1882931173,This is only for Julie Strain fans. It's a col...
1,826414346,I don't care much for Dr. Seuss but after read...
2,826414346,"If people become the books they read and if ""t..."
3,826414346,Theodore Seuss Geisel (1904-1991) was one of t...
4,826414346,Philip Nel - Dr. Seuss: American IconThis is b...


In [11]:
# Submit Ray tasks for each summary in the DataFrame
sentiment_futures = [sentiment_analyzer.analyze_sentiment.remote(summary) for summary in df_summary['summary']]

# Retrieve the sentiment results when the tasks are done
sentiment_results = ray.get(sentiment_futures)

# Unpack the sentiment results into labels and scores
sentiment_labels, sentiment_scores = zip(*sentiment_results)

# Add the sentiment results back into the DataFrame
df_summary['sentiment_label'] = sentiment_labels
df_summary['sentiment_score'] = sentiment_scores

# Display the updated DataFrame with sentiment analysis
df_summary.head()


Unnamed: 0,Id,summary,sentiment_label,sentiment_score
0,1882931173,This is only for Julie Strain fans. It's a col...,NEGATIVE,0.924354
1,826414346,I don't care much for Dr. Seuss but after read...,POSITIVE,0.998793
2,826414346,"If people become the books they read and if ""t...",POSITIVE,0.999778
3,826414346,Theodore Seuss Geisel (1904-1991) was one of t...,POSITIVE,0.998776
4,826414346,Philip Nel - Dr. Seuss: American IconThis is b...,POSITIVE,0.982722


In [12]:
# Save the updated DataFrame with sentiment results to a new CSV file
df_summary.to_csv('../data/Books_summary_with_sentiment.csv', index=False)

# Print confirmation
print("Sentiment analysis completed and saved to 'Books_summary_with_sentiment.csv'.")


Sentiment analysis completed and saved to 'Books_summary_with_sentiment.csv'.


In [13]:
# Shut down Ray after processing
ray.shutdown()

# Print confirmation
print("Ray has been shut down.")


Ray has been shut down.
