The below code performs sentiment analysis on labelled dataset by using "roberta-base" model and its evaluation metrics are also computed.
We have used "facebook/bart-large" for trends extraction for comparing the output with finetuned model.

In [1]:
!pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Code for Sentiment Analysis using "roberta-base" model.

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

# Load the tokenizer and model for sentiment analysis
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a pipeline with the model and tokenizer
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Load the dataset
with open("/content/combined_dataset.json", "r") as f:
    data = json.load(f)

sentences = []
true_labels = []

# Prepare the dataset
# Ensure that each sentence is paired with its corresponding sentiment
for entry in data:
    for sentence in entry["sentence"]:  # Iterate through sentences within each entry
        sentences.append(sentence)
        true_labels.append(entry["sentiment"]) # Assign the sentiment to each sentence

# Convert true labels to binary: "positive" -> 1, "negative" -> 0
true_labels_binary = [1 if label == "positive" else 0 for label in true_labels]

# Process sentences and handle length constraints
predicted_labels_binary = []
for sentence in sentences:
    # Tokenize and truncate the input
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        # Predict sentiment
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()
        predicted_labels_binary.append(prediction)

# Calculate metrics
accuracy = accuracy_score(true_labels_binary, predicted_labels_binary)
precision = precision_score(true_labels_binary, predicted_labels_binary)
recall = recall_score(true_labels_binary, predicted_labels_binary)
f1 = f1_score(true_labels_binary, predicted_labels_binary)

# Print results
print("RoBERTa Sentiment Analysis Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa Sentiment Analysis Metrics:
Accuracy: 0.5934
Precision: 0.5934
Recall: 1.0000
F1 Score: 0.7448


Code for Trends Extraction using "facebook/bart-large" model

In [3]:
!pip install rouge-score

import json
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.model_selection import train_test_split
import torch
from rouge_score import rouge_scorer
import numpy as np

# Load data from JSON file
file_path = "/content/combined_dataset.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Extract sentences and trends from the loaded data
sentences = [item["sentence"] for item in data]
trends = [item["trends"] for item in data]

# Ensure alignment of sentences and trends
flat_sentences = []
flat_trends = []

for item in data:
    for sentence in item["sentence"]:
        flat_sentences.append(sentence)
        flat_trends.append("; ".join(item["trends"]))  # Associate the same trends for all sentences in a group

# Check alignment
assert len(flat_sentences) == len(flat_trends), "Mismatch in the number of sentences and trends."

# Split the data into training and testing subsets
train_texts, test_texts, train_trends, test_trends = train_test_split(
    flat_sentences, flat_trends, test_size=0.2, random_state=42
)

# Load BART model and tokenizer
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate trends using BART
def generate_trends(model, tokenizer, texts, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                text, return_tensors="pt", max_length=512, truncation=True, padding=True
            ).to(device)
            outputs = model.generate(inputs["input_ids"], max_length=50)
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predictions.append(generated_text)
    return predictions

# Generate trends for the test set
predicted_trends = generate_trends(model, tokenizer, test_texts, device)

# Compute ROUGE metrics using rouge_scorer
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def evaluate_trends(actual_trends, predicted_trends, k=5):
    rouge_1_scores, rouge_2_scores, rouge_l_scores = [], [], []
    mrr_scores = []
    precision_at_k = []
    recall_at_k = []
    f1_at_k = []

    for actual, predicted in zip(actual_trends, predicted_trends):
        # ROUGE scores
        rouge_scores = rouge_scorer_obj.score(" ".join(predicted.split()), " ".join(actual.split()))
        rouge_1_scores.append(rouge_scores['rouge1'].fmeasure)
        rouge_2_scores.append(rouge_scores['rouge2'].fmeasure)
        rouge_l_scores.append(rouge_scores['rougeL'].fmeasure)

        # For MRR, Precision@k, Recall@k, F1@k, assume that each trend is a set of relevant keywords
        actual_trends_set = set(actual.split("; "))  # Actual trends split by semicolon
        predicted_trends_list = predicted.split("; ")  # Predicted trends as list

        # MRR Calculation
        rank = next((i + 1 for i, trend in enumerate(predicted_trends_list) if trend in actual_trends_set), 0)
        mrr_scores.append(1 / rank if rank != 0 else 0)

        # Precision@k, Recall@k, F1@k (k = 5)
        top_k_predictions = predicted_trends_list[:k]
        relevant_predictions = set(top_k_predictions) & actual_trends_set

        # Precision@k = Relevant Predictions / k
        precision_at_k.append(len(relevant_predictions) / k)

        # Recall@k = Relevant Predictions / Total Relevant
        recall_at_k.append(len(relevant_predictions) / len(actual_trends_set))

        # F1@k = 2 * (Precision * Recall) / (Precision + Recall)
        p_at_k = len(relevant_predictions) / k
        r_at_k = len(relevant_predictions) / len(actual_trends_set)
        f1_at_k.append(2 * (p_at_k * r_at_k) / (p_at_k + r_at_k) if (p_at_k + r_at_k) > 0 else 0)



    # Average ROUGE scores
    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    # Average MRR, Precision@k, Recall@k, F1@k
    avg_mrr = sum(mrr_scores) / len(mrr_scores)
    avg_precision_at_k = sum(precision_at_k) / len(precision_at_k)
    avg_recall_at_k = sum(recall_at_k) / len(recall_at_k)
    avg_f1_at_k = sum(f1_at_k) / len(f1_at_k)

    return {
        "Average ROUGE-1": avg_rouge_1,
        "Average ROUGE-2": avg_rouge_2,
        "Average ROUGE-L": avg_rouge_l,
        "Mean Reciprocal Rank (MRR)": avg_mrr,
        "Precision@k": avg_precision_at_k,
        "Recall@k": avg_recall_at_k,
        "F1@k": avg_f1_at_k,
    }

# Helper function to flatten and get unique elements
def flatten_and_unique(lst):
    return list(set([item.strip().lower() for sublist in lst for item in sublist]))

# Flatten and deduplicate
flat_predicted = flatten_and_unique(predicted_trends)
flat_actual = flatten_and_unique(test_trends)

# Evaluate trends on the test set
test_metrics = evaluate_trends(flat_actual, flat_predicted, k=5)

# Display evaluation metrics
print("Evaluation Metrics for Facebook BART:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")


Evaluation Metrics for Facebook BART:
Average ROUGE-1: 0.0000
Average ROUGE-2: 0.0000
Average ROUGE-L: 0.0000
Mean Reciprocal Rank (MRR): 0.0345
Precision@k: 0.0069
Recall@k: 0.0345
F1@k: 0.0115
