## DSPy Explorations for Sentiment Classification

### Imports

In [1]:
import numpy as np
import pandas as pd
import openai
import warnings
import os
import io
import sys
import time
import pickle
from tqdm.auto import tqdm
import dsp
import dspy
from dspy import Retrieve
from dsp.utils import dotdict
from dspy.datasets import DataLoader
from dspy.evaluate import answer_exact_match
from dspy.evaluate.evaluate import Evaluate
from dspy.teleprompt import COPRO
from datasets import load_dataset
from dotenv import load_dotenv
import torch
import torch.nn.functional as F
import torch.distributed as dist
import torch.nn as nn
from transformers import ElectraTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from typing import List, Union, Optional
from transformers import AutoModel, AutoTokenizer
from utils import fix_random_seeds, prepare_device, setup_environment

### Preferences

In [2]:
# Set maximum column width for pandas to none
pd.set_option('display.max_colwidth', None)

In [3]:
# Suppress specific warnings
warnings.filterwarnings("ignore", message="resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.", category=FutureWarning)
warnings.filterwarnings("ignore", message="`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.", category=FutureWarning)


### Setup DDP Environment

I fine-tuned my models in a DDP multi-GPU environment using `ddp_sentiment_finetune.py`. Because I'm loading a checkpoint from that pipele, which has a DDP wrapper, I'm runninig some minimal functions here to setup a basic single-GPU DDP environemnt.

In [4]:
# Set some variables for the DDP environment
rank = 0
device_type = 'cuda'
world_size = 1
backend = 'nccl'
debug = False
random_seed = 42

In [5]:
# Define the device
device = prepare_device(rank, device_type)

In [6]:
# Initiate the process group
setup_environment(rank, world_size, backend, device, debug)

Rank 0 - Device: cuda:0
1 process groups initialized with 'nccl' backend on localhost:12355
NCCL Timeout: 1 hr 0 min. NCCL Blocking Wait: Enabled


In [7]:
# Set random seeds
fix_random_seeds(random_seed)

### Load ELECTRA Tokenizer and Fine-Tuned Model

We'll load an ELECTRA model that was fine-tuned on sentiment using `ddp_sentiment_finetune.py`. This will be part of our DSPy module's forward pass. We'll get the ELECTRA classification for the review, and that will be input/context in the prompt for the GPT4o-mini language model.

In [8]:
# Load the ELECTRA tokenizer from Hugging Face
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

In [9]:
# Function to unpickle the checkpoint saved with DDP and GPUs
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)

In [10]:
# Load the ELECTRA model from a checkpoint pickle file
with open('checkpoints/final_model_20240903-164225.pkl', 'rb') as f:
    electra_v4_model = CPU_Unpickler(f).load()

In [11]:
# Move the model to the device (GPU ideally) to speed up inference time
electra_v4_model.to(device)

TorchDDPNeuralClassifier(
	batch_size=16,
	max_iter=9,
	eta=2e-05,
	optimizer_class=<class 'torch.optim.adamw.AdamW'>,
	l2_strength=0.01,
	gradient_accumulation_steps=2,
	max_grad_norm=None,
	validation_fraction=0.1,
	early_stopping=None,
	n_iter_no_change=5,
	warm_start=False,
	tol=1e-05,
	finetune_bert=True,
	pooling=mean,
	hidden_dim=1024,
	hidden_activation=GELU(approximate='none'),
	num_layers=2,
	checkpoint_interval=1,
	target_score=None,
	interactive=False,
	freeze_bert=False,
	dropout_rate=0.3,
	show_progress=True,
	advance_epochs=1,
	use_zero=True,
	scheduler_class=<class 'torch.optim.lr_scheduler.CosineAnnealingWarmRestarts'>)

In [12]:
# Review the ELECTRA model with custom pooling and classifier head
electra_v4_model.model

DistributedDataParallel(
  (module): BERTClassifier(
    (bert): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0-11): 12 x ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ElectraSelfOutput(
                (dense): Linear(in_features=768, o

### ELECTRA Prediction Functions

In [13]:
# Set the class label mapping dictionary
numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

In [14]:
# Tokenize function
def tokenize(texts, tokenizer, device):
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    
    return input_ids, attention_mask

In [15]:
# Function to predict the class of a sentence
def predict_sentence(model, sentence, tokenizer, numeric_dict):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.model
    
    # Tokenize the input sentence
    input_ids, attention_mask = tokenize([sentence], tokenizer, device)

    # Set the model to evaluation mode
    model.eval()

    # Perform inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Get the predictions
    if isinstance(outputs, torch.Tensor):
        logits = outputs
    elif hasattr(outputs, 'logits'):
        logits = outputs.logits
    else:
        logits = outputs[0]

    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    predicted_label = numeric_dict[predicted_class]

    # Move probabilities back to CPU and convert to list
    probabilities = probabilities.cpu().squeeze().tolist()

    # Free up GPU memory
    del input_ids, attention_mask, logits, outputs
    torch.cuda.empty_cache()

    return predicted_class, predicted_label, probabilities

### Setup DSPy and OpenAI Environment

In [16]:
# Load the API keys in a `.env` file in the local root directory
load_dotenv()

True

In [17]:
# Set environment variables
os.environ["DSP_NOTEBOOK_CACHEDIR"] = 'cache'
openai_key = os.getenv('OPENAI_API_KEY')

In [18]:
# Define the language model
lm = dspy.OpenAI(model='gpt-4o-mini', api_key=openai_key)
dspy.settings.configure(lm=lm)

### Load Dataset

The dataset we're using is a merge of the Dynasent R1, Dynasent R2, and SST. There are 2 CSV files saved in `data/merged`: `train_all.csv` and `dev_all.csv`.

In [178]:
# Define the data loader
dl = DataLoader()

In [179]:
# Load in the merged dataset files (mix of Dynasent R1, R2 and SST)
train_df = pd.read_csv('data/merged/train_all.csv')
dev_df = pd.read_csv('data/merged/dev_all.csv')

In [180]:
# Rename some columns to match the names in the DSPy signature
train_df.rename(columns={'sentence': 'review', 'label': 'classification'}, inplace=True)
dev_df.rename(columns={'sentence': 'review', 'label': 'classification'}, inplace=True)

In [181]:
# Define the train and dev datasets with example objects for DSPy
train_dataset = dl.from_pandas(
    train_df,
    fields=('review', 'classification'),
    input_keys=['review'],
)
dev_dataset = dl.from_pandas(
    dev_df,
    fields=('review', 'classification'),
    input_keys=['review'],
)

### Create DSPy Signature, Module, Metric

In [56]:
# DSPy signature that defines the prompt: review, classifier_decision -> classification
class ElectraV7SentimentClassification(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."""

    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').")

In [57]:
# Review the signature
ElectraV7SentimentClassification

ElectraV7SentimentClassification(review, classifier_decision -> classification
    instructions="Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."
    review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
    classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
    classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').", '__dspy_field_type': 'output', 'prefix': 'Classification:'})
)

In [58]:
# DSPy module that uses the ELECTRA model to help classify the sentiment of a review
class ElectraV7Sentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(ElectraV7SentimentClassification)
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_v4_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def forward(self, review):
        classifier_decision, probabilities = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            classifier_decision=classifier_decision
        )
        # Sleep for 1 second to avoid rate limiting
        time.sleep(1)
        
        return dspy.Prediction(
            classification=prediction.classification.lower().strip(),
            classifier_decision=classifier_decision, 
            probabilities=probabilities
        )

In [59]:
# Create an instance of the ELECTRA sentiment module
electra_sentiment_v7 = ElectraV7Sentiment()

In [60]:
# Define a classification match metric that is flexible for the DSPy optimizer prompt variations
def classification_match(review, pred, trace=None, frac=1.0):
    # Define the possible field names, based on the prompts we saw in the COPRO optimizer
    field_names = ['classification', 'sentiment_classification']
    
    # Get the actual classification from the review
    actual_classification = None
    for field in field_names:
        if hasattr(review, field):
            actual_classification = getattr(review, field)
            break
    if actual_classification is None:
        raise ValueError("No classification field found in the review object")
    
    # Get the predicted classification
    predicted_classification = None
    for field in field_names:
        if hasattr(pred, field):
            predicted_classification = getattr(pred, field)
            break
    if predicted_classification is None:
        raise ValueError("No classification field found in the prediction object")
    
    # Clean up the predicted classification
    predicted_classification = predicted_classification.lower().strip()
    if ':' in predicted_classification:
        predicted_classification = predicted_classification.split(':')[-1].strip()
    
    # Extract just the sentiment if there's additional information
    sentiment_words = ['positive', 'neutral', 'negative']
    for word in sentiment_words:
        if word in predicted_classification:
            predicted_classification = word
            break
    
    # Perform the matching
    if isinstance(actual_classification, str):
        return dsp.answer_match(predicted_classification, [actual_classification], frac=frac)
    elif isinstance(actual_classification, list):
        return dsp.answer_match(predicted_classification, actual_classification, frac=frac)
    else:
        raise TypeError("Unexpected type for classification")

### Test the Prompt with ELECTRA Classification as Context

In [61]:
# Create a test result for the ELECTRA sentiment module
electra_v7_result = electra_sentiment_v7(review="Those 2 drinks are part of the HK culture and has years of history. It is so bad.")

In [62]:
# Review the result
electra_v7_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9999992847442627, 1.6079250997336203e-07, 6.279589683799713e-07]
)

In [63]:
# Inspect the last history of the language model
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

### Run a Full Evaluation on the Dev Dataset

In [286]:
# Define the evaluater, setting threads to 1 to avoid rate limiting
full_evaluater = Evaluate(
    devset=dev_dataset,
    num_threads=1,
    display_progress=True,
    display_table=True,
    return_outputs=True)

In [267]:
finals_large_results13 = full_evaluater(electra_sentiment_v7, metric=classification_match)

Average Metric: 4429 / 5421  (81.7): 100%|██████████| 5421/5421 [1:35:35<00:00,  1.06s/it]


Unnamed: 0,review,example_classification,pred_classification,classifier_decision,probabilities,classification_match
0,Found Thai Spoon on the Vegan Pittsburgh website.,neutral,neutral,neutral,"[4.697101303463569e-07, 0.9999978542327881, 1.621236606297316e-06]",✔️ [True]
1,Our bill came out to around $27 and we ate like the wealthy.,positive,positive,positive,"[0.0002764011442195624, 4.462958258955041e-06, 0.9997192025184631]",✔️ [True]
2,State Farm broke down the costs for me of the parts and labor.,neutral,neutral,neutral,"[3.5087659853161313e-06, 0.9990319013595581, 0.0009645841782912612]",✔️ [True]
3,The only con for this resto is the wait to get a seat.,negative,negative,negative,"[0.9999983310699463, 1.3033886148150486e-07, 1.499672521276807e-06]",✔️ [True]
4,We could hear the people above us stomping around even after midnight.,negative,negative,negative,"[0.9999862909317017, 1.3497856343747117e-05, 1.8418370473227696e-07]",✔️ [True]
5,"For your information, I got the salad because it sounded delicious not because I was on a diet - although it did only have half...",positive,positive,positive,"[4.743712634081021e-05, 1.3617607237392804e-06, 0.9999512434005737]",✔️ [True]
6,"It can not be enjoyed , even on the level that one enjoys a bad slasher flick , primarily because it is dull .",negative,negative,negative,"[0.9999980926513672, 3.132423103124893e-07, 1.5206121588562382e-06]",✔️ [True]
7,3rd strike with these guys.,negative,negative,negative,"[0.9999703168869019, 1.2263358257769141e-06, 2.8483986170613207e-05]",✔️ [True]
8,And then the service...PHENOMENAL!!,positive,positive,positive,"[3.887211619257869e-08, 3.464687097221031e-07, 0.9999996423721313]",✔️ [True]
9,"After about 10 minutes of coaxing with some buffing tools and liquids, it was gone!",positive,negative,neutral,"[0.005842412821948528, 0.7642890214920044, 0.229868546128273]",False


In [64]:
def convert_results_to_df(results):
    # Extract the list of tuples from the results tuple
    examples = results[1]

    # Initialize an empty list to store the extracted data
    data = []

    # Iterate over the list of tuples
    for example, prediction, match in examples:
        review = example['review']
        classification = example['classification']
        prediction_classification = prediction.classification
        data.append({
            'review': review,
            'classification': classification,
            'prediction': prediction_classification,
            'match': match
        })

    # Convert the list to a DataFrame
    results_df = pd.DataFrame(data)
    return results_df

In [None]:
finals_large_results13_df = convert_results_to_df(finals_large_results13)

### Explore Prompt Variations with COPRO Optimizer

We'll see if we can get any improvement by having a language model try to optimize the prompt, and evaluating the classification match metric. I had to make the metric more flexible in field names, bucause the model would sometimes suggest field names other than 'classification'.

In [54]:
# Create the COPRO optimizer for the ELECTRA sentiment module
copro_optimizer4 = COPRO(
    metric=classification_match,
    track_stats=True,
    breadth=5,
    depth=5,
    max_rounds=10
)

In [53]:
# Set some arguments for the COPRO optimizer
kwargs = dict(num_threads=1, display_progress=True, display_table=False)

In [73]:
compiled_prompt_opt4 = copro_optimizer4.compile(electra_sentiment_v7, trainset=train_dataset[300:350], eval_kwargs=kwargs)

Average Metric: 47 / 50  (94.0): 100%|██████████| 50/50 [01:11<00:00,  1.44s/it]
Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [01:40<00:00,  2.01s/it]
Average Metric: 47 / 50  (94.0): 100%|██████████| 50/50 [01:17<00:00,  1.54s/it]
Average Metric: 22 / 50  (44.0): 100%|██████████| 50/50 [02:21<00:00,  2.82s/it]
Average Metric: 50 / 50  (100.0): 100%|██████████| 50/50 [01:10<00:00,  1.40s/it]
Average Metric: 17 / 50  (34.0): 100%|██████████| 50/50 [02:41<00:00,  3.22s/it]
Average Metric: 19 / 50  (38.0): 100%|██████████| 50/50 [02:36<00:00,  3.14s/it]
Average Metric: 17 / 50  (34.0): 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]
Average Metric: 19 / 50  (38.0): 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]
Average Metric: 17 / 50  (34.0): 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]
Average Metric: 19 / 50  (38.0): 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]
Average Metric: 17 / 50  (34.0): 100%|██████████| 50/50 [00:52<00:00,  1.05s/it]
Average Metric: 19 / 50  (3

In [74]:
compiled_prompt_opt4.results_best

{140569734002816: {'depth': [0, 1, 2, 3, 4],
  'max': [100.0, 100.0, 100.0, 100.0, 100.0],
  'average': [79.6,
   67.14285714285714,
   67.14285714285714,
   67.14285714285714,
   67.14285714285714],
  'min': [44.0, 34.0, 34.0, 34.0, 34.0],
  'std': [21.36913662270893,
   26.74444439734071,
   26.74444439734071,
   26.74444439734071,
   26.74444439734071]}}

In [75]:
compiled_prompt_opt4.candidate_programs

[{'score': 100.0,
  'program': generate_answer = Predict(StringSignature(review, classifier_decision -> classification
      instructions="Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."
      review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
      classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
      classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.", '__dspy_field_type': 'output', 'prefix': 'Classification:'})
  )),
  'instruction': "Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.",
  'prefix': 'Classifica

### See if GPT4o is Better than Mini

In [121]:
lm = dspy.OpenAI(model='gpt-4o', api_key=openai_key)
dspy.settings.configure(lm=lm)

In [76]:
copro_optimizer5 = COPRO(
    metric=classification_match,
    track_stats=True,
    breadth=3,
    depth=3,
    max_rounds=10
)

In [78]:
compiled_prompt_opt5_gpt4o = copro_optimizer5.compile(electra_sentiment_v7, trainset=train_dataset[300:350], eval_kwargs=kwargs)

Average Metric: 50 / 50  (100.0): 100%|██████████| 50/50 [01:16<00:00,  1.53s/it]
Average Metric: 46 / 50  (92.0): 100%|██████████| 50/50 [01:09<00:00,  1.39s/it] 
Average Metric: 49 / 50  (98.0): 100%|██████████| 50/50 [01:14<00:00,  1.48s/it] 
Average Metric: 49 / 50  (98.0): 100%|██████████| 50/50 [01:14<00:00,  1.50s/it] 
Average Metric: 48 / 50  (96.0): 100%|██████████| 50/50 [01:10<00:00,  1.40s/it] 


In [86]:
compiled_prompt_opt5_gpt4o.results_best

{140569734002720: {'depth': [0, 1, 2],
  'max': [100.0, 100.0, 100.0],
  'average': [96.66666666666667, 97.0, 96.8],
  'min': [92.0, 92.0, 92.0],
  'std': [3.39934634239519, 3.0, 2.7129319932501073]}}

In [87]:
compiled_prompt_opt5_gpt4o.candidate_programs

[{'score': 100.0,
  'program': generate_answer = Predict(StringSignature(review, classifier_decision -> classification
      instructions="---\nBasic Instruction: Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.\nProposed Instruction: Examine the context and underlying emotions conveyed in the following review. Based on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'."
      review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
      classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
      classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'posi

Note: I ran the above prompt that scored 100 against a larger dataset, and it did not perform well.

### Evaluate GPT4 Only - No ELECTRA Classification Included

In [107]:
# Modified ElectraV7SentimentClassification class without BERT/ELECTRA input
class SentimentClassificationWithoutElectra(dspy.Signature):
    """Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."""

    review = dspy.InputField(desc="The review text to classify.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).")

# Modified ElectraV7Sentiment class without BERT/ELECTRA input
class SentimentClassifierWithoutElectra(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(SentimentClassificationWithoutElectra)

    def forward(self, review):
        prediction = self.generate_answer(review=review)
        return dspy.Prediction(classification=prediction.classification.lower().strip())


In [80]:
gpt_only_sentiment = SentimentClassifierWithoutElectra()

In [81]:
gpt_only_sentiment_result = gpt_only_sentiment(review="Those 2 drinks are part of the HK culture and has years of history. It is so bad.")

In [82]:
gpt_only_sentiment_result

Prediction(
    classification='classification: negative'
)

In [83]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m Classification: negative[0m





"\n\n\nClassify the sentiment of a review as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m Classification: negative\x1b[0m\n\n\n"

In [84]:
compiled_prompt_gpt4o_only = copro_optimizer5.compile(gpt_only_sentiment, trainset=train_dataset[300:350], eval_kwargs=kwargs)

Average Metric: 41 / 50  (82.0): 100%|██████████| 50/50 [00:22<00:00,  2.23it/s]
Average Metric: 39 / 50  (78.0): 100%|██████████| 50/50 [00:21<00:00,  2.30it/s]
Average Metric: 43 / 50  (86.0): 100%|██████████| 50/50 [00:15<00:00,  3.31it/s]
Average Metric: 0 / 19  (0.0):  19%|█▉        | 19/100 [13:02:29<55:35:52, 2471.02s/it]
Average Metric: 0 / 6  (0.0):  30%|███       | 6/20 [4:33:39<10:38:32, 2736.61s/it]
Average Metric: 21 / 50  (42.0): 100%|██████████| 50/50 [01:06<00:00,  1.34s/it]
Average Metric: 21 / 50  (42.0): 100%|██████████| 50/50 [00:00<00:00, 1318.04it/s]


### Evaluate the 100 Score COPRO Prompt

Let's evaluate the prompt that had the top score earlier above. It had a different instruction set.

In [117]:
class ElectraV8SentimentClassification(dspy.Signature):
    __doc__ = """Examine the context and underlying emotions conveyed in the following review.
    Based on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'."""

    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).")

class ElectraV8Sentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(ElectraV8SentimentClassification)
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_v4_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def forward(self, review):
        classifier_decision, probabilities = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            classifier_decision=classifier_decision
        )
        time.sleep(1)
        
        return dspy.Prediction(
            classification=prediction.classification.lower().strip(),
            classifier_decision=classifier_decision, 
            probabilities=probabilities
        )

In [123]:
electra_sentiment_v8 = ElectraV8Sentiment()

In [124]:
electra_sentiment_v8_result = electra_sentiment_v8(review="Those 2 drinks are part of the HK culture and has years of history. It is so bad.")

In [125]:
electra_sentiment_v8_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9999992847442627, 1.6079250997336203e-07, 6.279589683799713e-07]
)

In [126]:
lm.inspect_history(n=1)




Examine the context and underlying emotions conveyed in the following review.
Based on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nExamine the context and underlying emotions conveyed in the following review.\nBased on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [127]:
finals_large_results14_gpt4o = full_evaluater(electra_sentiment_v8, metric=classification_match)
# additional prompt doesn't seem to be working better, even with 4o

Average Metric: 31 / 45  (68.9):   1%|          | 45/5421 [00:59<2:20:10,  1.56s/it]

KeyboardInterrupt: 

In [106]:
lm.inspect_history(n=20)




Examine the context and underlying emotions conveyed in the following review.
Based on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).

---

Review: Our bill came out to around $27 and we ate like the wealthy.
Classifier Decision: positive
Classification:[32m positive[0m





Examine the context and underlying emotions conveyed in the following review.
Based on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sen

"\n\n\nExamine the context and underlying emotions conveyed in the following review.\nBased on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name).\n\n---\n\nReview: Our bill came out to around $27 and we ate like the wealthy.\nClassifier Decision: positive\nClassification:\x1b[32m positive\x1b[0m\n\n\n\n\n\nExamine the context and underlying emotions conveyed in the following review.\nBased on your analysis, classify the overall sentiment as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by

### Prepare Bake-Off Submission

In [66]:
bakeoff_df = pd.read_csv('data/bakeoff/cs224u-sentiment-test-unlabeled.csv')

In [67]:
bakeoff_df

Unnamed: 0,example_id,sentence
0,0,"This year we were at a restaurant that clearly ""skimped"" on menu items as well as food portions."
1,1,A long way.
2,2,"A friend and I went on a Thursday evening around 730/ 8 pm or so and it was a little busy, but we have to wait at all for a table."
3,3,You'll love to say I used to be married to that woman.
4,4,I feel like any place I move will be a downgrade from this place!
...,...,...
2995,2995,despite its many infuriating flaws -- not the least of which is amy 's self-absorbed personality -- amy 's o 's honesty will win you over .
2996,2996,A bone cyst is a hollow spot of bone filled with fluid and would have showed up as a different density on the X-ray.
2997,2997,The portions are big & the check is small.
2998,2998,Service and food was mediocre at best.


In [68]:
bakeoff_df.rename(columns={'sentence': 'review'}, inplace=True)

In [69]:
bakeoff_df

Unnamed: 0,example_id,review
0,0,"This year we were at a restaurant that clearly ""skimped"" on menu items as well as food portions."
1,1,A long way.
2,2,"A friend and I went on a Thursday evening around 730/ 8 pm or so and it was a little busy, but we have to wait at all for a table."
3,3,You'll love to say I used to be married to that woman.
4,4,I feel like any place I move will be a downgrade from this place!
...,...,...
2995,2995,despite its many infuriating flaws -- not the least of which is amy 's self-absorbed personality -- amy 's o 's honesty will win you over .
2996,2996,A bone cyst is a hollow spot of bone filled with fluid and would have showed up as a different density on the X-ray.
2997,2997,The portions are big & the check is small.
2998,2998,Service and food was mediocre at best.


In [70]:
bakeoff_dataset = dl.from_pandas(
    bakeoff_df,
    fields=['review'],
    input_keys=['review'],
)

In [71]:
bakeoff_dataset[0]

Example({'review': 'This year we were at a restaurant that clearly "skimped" on menu items as well as food portions.'}) (input_keys={'review'})

In [72]:
bakeoff_test_result = electra_sentiment_v7(bakeoff_df['review'][0])

In [73]:
bakeoff_test_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9999994039535522, 3.573900073661207e-07, 2.1121232407494972e-07]
)

In [74]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').

---

Review: This year we were at a restaurant that clearly "skimped" on menu items as well as food portions.
Classifier Decision: negative
Classification:[32m negative[0m





'\n\n\nClassify the sentiment of a review as either \'positive\', \'neutral\', or \'negative\'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: \'positive\', \'neutral\', or \'negative\' (do not repeat the field name, do not use \'mixed\').\n\n---\n\nReview: This year we were at a restaurant that clearly "skimped" on menu items as well as food portions.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n'

In [75]:
# Function to iterate through the bake-off dataset and make predictions
def make_batch_predictions(model, dataset) -> pd.DataFrame:
    results = []
    total = len(dataset) if hasattr(dataset, '__len__') else None
    
    with tqdm(total=total, desc="Processing reviews", file=sys.stdout) as pbar:
        for example in dataset:
            try:
                prediction = model(**example)
                
                result = {
                    'review': example['review'],
                    'classification': prediction.classification,
                    'classifier_decision': prediction.classifier_decision,
                    # Add any other fields you want to save
                }
                
                results.append(result)
            except Exception as e:
                print(f"Error processing review: {example['review'][:50]}... Error: {str(e)}")
            finally:
                pbar.update(1)
                pbar.refresh()  # Force refresh of the progress bar
    
    return pd.DataFrame(results)

In [76]:
bakeoff_results = make_batch_predictions(electra_sentiment_v7, bakeoff_dataset)

Processing reviews:   0%|          | 0/3000 [00:00<?, ?it/s]

In [77]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').

---

Review: I mean seriously, you can go to the Metro Health civic, get your prescriptions filled, and get your weekly groceries while waiting.
Classifier Decision: positive
Classification:[32m positive[0m





"\n\n\nClassify the sentiment of a review as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').\n\n---\n\nReview: I mean seriously, you can go to the Metro Health civic, get your prescriptions filled, and get your weekly groceries while waiting.\nClassifier Decision: positive\nClassification:\x1b[32m positive\x1b[0m\n\n\n"

In [78]:
bakeoff_results

Unnamed: 0,review,classification,classifier_decision
0,"This year we were at a restaurant that clearly ""skimped"" on menu items as well as food portions.",negative,negative
1,A long way.,neutral,neutral
2,"A friend and I went on a Thursday evening around 730/ 8 pm or so and it was a little busy, but we have to wait at all for a table.",neutral,neutral
3,You'll love to say I used to be married to that woman.,neutral,neutral
4,I feel like any place I move will be a downgrade from this place!,positive,positive
...,...,...,...
2995,despite its many infuriating flaws -- not the least of which is amy 's self-absorbed personality -- amy 's o 's honesty will win you over .,positive,positive
2996,A bone cyst is a hollow spot of bone filled with fluid and would have showed up as a different density on the X-ray.,neutral,neutral
2997,The portions are big & the check is small.,positive,positive
2998,Service and food was mediocre at best.,negative,negative


In [79]:
bakeoff_submission_df = bakeoff_df.copy()

In [80]:
bakeoff_submission_df['prediction'] = bakeoff_results['classification']

In [81]:
bakeoff_submission_df.rename(columns={'review': 'sentence'}, inplace=True)

In [82]:
bakeoff_submission_df

Unnamed: 0,example_id,sentence,prediction
0,0,"This year we were at a restaurant that clearly ""skimped"" on menu items as well as food portions.",negative
1,1,A long way.,neutral
2,2,"A friend and I went on a Thursday evening around 730/ 8 pm or so and it was a little busy, but we have to wait at all for a table.",neutral
3,3,You'll love to say I used to be married to that woman.,neutral
4,4,I feel like any place I move will be a downgrade from this place!,positive
...,...,...,...
2995,2995,despite its many infuriating flaws -- not the least of which is amy 's self-absorbed personality -- amy 's o 's honesty will win you over .,positive
2996,2996,A bone cyst is a hollow spot of bone filled with fluid and would have showed up as a different density on the X-ray.,neutral
2997,2997,The portions are big & the check is small.,positive
2998,2998,Service and food was mediocre at best.,negative


In [83]:
bakeoff_submission_df['prediction'].value_counts()

prediction
negative    1261
positive    1027
neutral      712
Name: count, dtype: int64

In [84]:
bakeoff_submission_df[bakeoff_submission_df['prediction'] == 'mixed']

Unnamed: 0,example_id,sentence,prediction


In [85]:
bakeoff_results[bakeoff_results['classification'] == 'mixed']

Unnamed: 0,review,classification,classifier_decision


In [86]:
bakeoff_submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   example_id  3000 non-null   int64 
 1   sentence    3000 non-null   object
 2   prediction  3000 non-null   object
dtypes: int64(1), object(2)
memory usage: 70.4+ KB


In [105]:
bakeoff_submission_df.to_csv("data/bakeoff/cs224u-sentiment-bakeoff-entry.csv", index=True)

In [106]:
def test_bakeoff_entry(filename="cs224u-sentiment-bakeoff-entry.csv"):
    gold_df = pd.read_csv(
        os.path.join("data", "bakeoff", "cs224u-sentiment-test-unlabeled.csv"))
    entry_df = pd.read_csv(filename)
    print(f"Gold columns: {gold_df.columns}")
    print(f"Entry columns: {entry_df.columns}")

    # Check that no required columns are missing:
    expected_cols = {'example_id', 'sentence', 'prediction'}
    missing_cols = expected_cols - set(entry_df.columns)
    errcount = 0
    if len(missing_cols) != 0:
        errcount += 1
        print(f"Entry is missing required columns {missing_cols}")
        return

    # Check that the predictions are in our space:
    labels = {'positive', 'negative', 'neutral'}
    predtypes = set(entry_df.prediction.unique())
    unexpected = predtypes - labels
    if len(unexpected) != 0:
        errcount += 1
        print(f"Prediction column has unexpected values: {unexpected}")

    # Check that the dataset hasn't been rearranged:
    for colname in ('example_id', 'sentence'):
        print(f"Checking alignment on column '{colname}'")
        if not entry_df[colname].equals(gold_df[colname]):
            print(f"entry_df[{colname}] does not match gold_df[{colname}]")
            
            errcount += 1
            print(f"Entry is misaligned with test data on column {colname}")

    # Clean bill of health:
    if errcount == 0:
        print("No errors detected with `test_bakeoff_entry`.")

In [107]:
test_bakeoff_entry("data/bakeoff/cs224u-sentiment-bakeoff-entry.csv")

Gold columns: Index(['example_id', 'sentence'], dtype='object')
Entry columns: Index(['Unnamed: 0', 'example_id', 'sentence', 'prediction'], dtype='object')
Checking alignment on column 'example_id'
Checking alignment on column 'sentence'
No errors detected with `test_bakeoff_entry`.


### Split the ELECTRA Model from Classifier Head

In [151]:
class ElectraSplitModel():
    def __init__(self, ddp_model):
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Access the Electra model and custom pooling layer explicitly, without using nn.Sequential
        self.electra_model = ddp_model.model.module.bert
        self.custom_pooling = ddp_model.model.module.custom_pooling

        # Access the classifier head for later use
        self.classifier_head = ddp_model.model.module.classifier

    def extract_embeddings(self, texts):
        # Tokenize the input texts
        input_ids, attention_mask = tokenize(texts, self.tokenizer, self.device)
        
        # Forward pass through Electra model (returns a dict-like object)
        outputs = self.electra_model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Extract the hidden states (last hidden state of Electra)
        last_hidden_state = outputs.last_hidden_state
        
        # Pass the hidden state through the custom pooling layer
        pooled_embeddings = self.custom_pooling(last_hidden_state, attention_mask)
        
        return pooled_embeddings


In [152]:
electra_split_model = ElectraSplitModel(electra_v4_model)

In [154]:
electra_split_model.electra_model

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0

In [156]:
electra_split_model.custom_pooling

PoolingLayer()

In [155]:
electra_split_model.classifier_head

Classifier(
  (layers): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=1024, out_features=1024, bias=True)
    (4): GELU(approximate='none')
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=1024, out_features=3, bias=True)
  )
)

In [110]:
train_df

Unnamed: 0,review,classification,source
0,Those 2 drinks are part of the HK culture and has years of history. It is so bad.,negative,dynasent_r2
1,"I was told by the repair company that was doing the car repair that fixing the rim was ""impossible"" and to replace it.",negative,dynasent_r1
2,It is there to give them a good time .,neutral,sst_local
3,Like leafing through an album of photos accompanied by the sketchiest of captions .,negative,sst_local
4,Johnny was a talker and liked to have fun.,positive,dynasent_r1
...,...,...,...
102092,I thought this place was supposed to be good.,negative,dynasent_r1
102093,They claim it's because people didn't like it because *somehow* they couldn't figure out that it was going to be bitter.,negative,dynasent_r1
102094,There is also another marbled-out full bathroom off of the living room.,neutral,dynasent_r1
102095,You put in your cell phone number & select a day/time to make an appointment.,neutral,dynasent_r1


In [157]:
embeddings_list = []
labels_list = []

print("Generating embeddings...")
for index, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Processing Sentences"):
    review = row['review']
    classification = row['classification']
    
    with torch.no_grad():
        # Extract embeddings using the updated method
        embeddings = electra_split_model.extract_embeddings([review]).cpu().numpy().flatten()
    
    embeddings_list.append(embeddings)
    labels_list.append(classification)


Generating embeddings...


Processing Sentences:   0%|          | 0/102097 [00:00<?, ?it/s]

In [158]:
embeddings_df = pd.DataFrame(embeddings_list)
embeddings_df['label'] = labels_list  # Add the labels as a column

In [162]:
embeddings_df['review'] = train_df['review']

In [165]:
embeddings_df['source'] = train_df['source']

In [167]:
embeddings_df.rename(columns={i: f"embedding_{i}" for i in range(768)}, inplace=True)

In [183]:
embeddings_df.rename(columns={'label': 'classification'}, inplace=True)

In [170]:
embeddings_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,classification,review,source
0,-0.402117,-0.450127,0.147037,-0.554020,-0.638906,0.147562,-0.204138,0.653244,-0.079969,-0.057176,...,-0.090892,-0.765751,-0.456145,-0.085832,-0.584845,0.295148,-0.233968,negative,Those 2 drinks are part of the HK culture and has years of history. It is so bad.,dynasent_r2
1,-0.393685,-0.628868,0.197301,-0.448643,-0.890617,0.287361,-0.098624,0.782526,-0.025718,0.147110,...,-0.164597,-0.941937,-0.554716,-0.111366,-0.607455,0.488544,-0.213159,negative,"I was told by the repair company that was doing the car repair that fixing the rim was ""impossible"" and to replace it.",dynasent_r1
2,0.000260,1.189879,0.580651,0.516785,0.385979,0.033826,-0.001303,-0.215182,0.069211,-0.054507,...,-0.161399,0.486307,0.496383,0.185416,0.754505,-0.208561,-0.229780,neutral,It is there to give them a good time .,sst_local
3,-0.450467,-0.560251,0.155656,-0.448020,-0.584685,0.215566,-0.249357,0.689859,-0.146624,0.004290,...,-0.085385,-0.825503,-0.488971,-0.138494,-0.620767,0.271081,-0.238336,negative,Like leafing through an album of photos accompanied by the sketchiest of captions .,sst_local
4,0.551893,-1.038651,-0.375347,-0.559218,0.393628,0.409123,0.235162,0.420329,-0.367803,0.080697,...,-0.235647,0.343479,-0.040225,0.274155,-0.153276,-0.332588,0.275649,positive,Johnny was a talker and liked to have fun.,dynasent_r1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102092,-0.482056,-0.601502,0.048537,-0.478421,-0.556970,0.264339,-0.153623,0.549176,-0.052104,-0.097786,...,-0.164706,-0.720283,-0.420324,0.063541,-0.664345,0.323033,-0.314557,negative,I thought this place was supposed to be good.,dynasent_r1
102093,-0.559109,-0.453376,0.037667,-0.263920,-0.524163,0.269627,0.087732,0.438318,0.031376,0.085030,...,-0.097832,-0.690008,-0.284175,-0.166281,-0.695879,0.289846,-0.367000,negative,They claim it's because people didn't like it because *somehow* they couldn't figure out that it was going to be bitter.,dynasent_r1
102094,-0.064309,1.084431,0.867599,0.232269,0.583336,-0.003340,-0.010600,-0.197908,-0.104622,-0.031977,...,-0.251795,0.367268,0.391235,0.298359,0.352074,-0.176239,-0.154379,neutral,There is also another marbled-out full bathroom off of the living room.,dynasent_r1
102095,-0.068502,1.033296,0.664439,0.420302,0.836877,-0.084326,-0.263820,-0.240083,-0.304729,-0.046530,...,-0.252680,0.453644,0.487078,0.236565,0.497269,-0.223335,-0.608245,neutral,You put in your cell phone number & select a day/time to make an appointment.,dynasent_r1


In [176]:
embeddings_df.info(max_cols=771)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102097 entries, 0 to 102096
Data columns (total 771 columns):
 #    Column          Non-Null Count   Dtype  
---   ------          --------------   -----  
 0    embedding_0     102097 non-null  float32
 1    embedding_1     102097 non-null  float32
 2    embedding_2     102097 non-null  float32
 3    embedding_3     102097 non-null  float32
 4    embedding_4     102097 non-null  float32
 5    embedding_5     102097 non-null  float32
 6    embedding_6     102097 non-null  float32
 7    embedding_7     102097 non-null  float32
 8    embedding_8     102097 non-null  float32
 9    embedding_9     102097 non-null  float32
 10   embedding_10    102097 non-null  float32
 11   embedding_11    102097 non-null  float32
 12   embedding_12    102097 non-null  float32
 13   embedding_13    102097 non-null  float32
 14   embedding_14    102097 non-null  float32
 15   embedding_15    102097 non-null  float32
 16   embedding_16    102097 non-null  flo

In [171]:
embeddings_df.to_csv("data/merged/train_all_embeddings.csv", index=False)

In [173]:
test_df = pd.read_csv('data/merged/train_all_embeddings.csv')

In [174]:
test_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,classification,review,source
0,-0.402117,-0.450127,0.147037,-0.554020,-0.638906,0.147562,-0.204138,0.653244,-0.079969,-0.057176,...,-0.090892,-0.765751,-0.456145,-0.085832,-0.584845,0.295148,-0.233968,negative,Those 2 drinks are part of the HK culture and has years of history. It is so bad.,dynasent_r2
1,-0.393685,-0.628868,0.197301,-0.448643,-0.890617,0.287361,-0.098624,0.782526,-0.025718,0.147110,...,-0.164597,-0.941937,-0.554716,-0.111366,-0.607455,0.488544,-0.213159,negative,"I was told by the repair company that was doing the car repair that fixing the rim was ""impossible"" and to replace it.",dynasent_r1
2,0.000260,1.189879,0.580651,0.516785,0.385979,0.033826,-0.001303,-0.215182,0.069211,-0.054507,...,-0.161399,0.486307,0.496383,0.185416,0.754505,-0.208561,-0.229780,neutral,It is there to give them a good time .,sst_local
3,-0.450467,-0.560251,0.155656,-0.448020,-0.584685,0.215566,-0.249357,0.689859,-0.146624,0.004290,...,-0.085385,-0.825503,-0.488971,-0.138494,-0.620767,0.271081,-0.238336,negative,Like leafing through an album of photos accompanied by the sketchiest of captions .,sst_local
4,0.551894,-1.038651,-0.375347,-0.559218,0.393628,0.409123,0.235162,0.420329,-0.367803,0.080697,...,-0.235647,0.343479,-0.040225,0.274155,-0.153276,-0.332588,0.275649,positive,Johnny was a talker and liked to have fun.,dynasent_r1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102092,-0.482056,-0.601502,0.048537,-0.478421,-0.556970,0.264339,-0.153623,0.549176,-0.052104,-0.097786,...,-0.164706,-0.720283,-0.420324,0.063541,-0.664345,0.323033,-0.314557,negative,I thought this place was supposed to be good.,dynasent_r1
102093,-0.559109,-0.453376,0.037667,-0.263920,-0.524163,0.269627,0.087732,0.438318,0.031376,0.085030,...,-0.097832,-0.690008,-0.284175,-0.166281,-0.695879,0.289846,-0.367000,negative,They claim it's because people didn't like it because *somehow* they couldn't figure out that it was going to be bitter.,dynasent_r1
102094,-0.064309,1.084431,0.867599,0.232269,0.583336,-0.003340,-0.010600,-0.197908,-0.104622,-0.031977,...,-0.251795,0.367268,0.391235,0.298359,0.352074,-0.176239,-0.154379,neutral,There is also another marbled-out full bathroom off of the living room.,dynasent_r1
102095,-0.068502,1.033296,0.664439,0.420302,0.836877,-0.084326,-0.263820,-0.240083,-0.304729,-0.046530,...,-0.252680,0.453644,0.487078,0.236565,0.497269,-0.223335,-0.608245,neutral,You put in your cell phone number & select a day/time to make an appointment.,dynasent_r1


In [177]:
test_df.info(max_cols=771)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102097 entries, 0 to 102096
Data columns (total 771 columns):
 #    Column          Non-Null Count   Dtype  
---   ------          --------------   -----  
 0    embedding_0     102097 non-null  float64
 1    embedding_1     102097 non-null  float64
 2    embedding_2     102097 non-null  float64
 3    embedding_3     102097 non-null  float64
 4    embedding_4     102097 non-null  float64
 5    embedding_5     102097 non-null  float64
 6    embedding_6     102097 non-null  float64
 7    embedding_7     102097 non-null  float64
 8    embedding_8     102097 non-null  float64
 9    embedding_9     102097 non-null  float64
 10   embedding_10    102097 non-null  float64
 11   embedding_11    102097 non-null  float64
 12   embedding_12    102097 non-null  float64
 13   embedding_13    102097 non-null  float64
 14   embedding_14    102097 non-null  float64
 15   embedding_15    102097 non-null  float64
 16   embedding_16    102097 non-null  flo

In [182]:
dev_embeddings_list = []
dev_labels_list = []

print("Generating embeddings...")
for index, row in tqdm(dev_df.iterrows(), total=len(dev_df), desc="Processing Sentences"):
    review = row['review']
    classification = row['classification']
    
    with torch.no_grad():
        # Extract embeddings using the updated method
        dev_embeddings = electra_split_model.extract_embeddings([review]).cpu().numpy().flatten()
    
    dev_embeddings_list.append(dev_embeddings)
    dev_labels_list.append(classification)


Generating embeddings...


Processing Sentences:   0%|          | 0/5421 [00:00<?, ?it/s]

In [184]:
dev_embeddings_df = pd.DataFrame(dev_embeddings_list)
dev_embeddings_df['label'] = dev_labels_list  # Add the labels as a column

In [185]:
dev_embeddings_df['review'] = dev_df['review']

In [186]:
dev_embeddings_df['source'] = dev_df['source']

In [187]:
dev_embeddings_df.rename(columns={i: f"embedding_{i}" for i in range(768)}, inplace=True)

In [188]:
dev_embeddings_df.rename(columns={'label': 'classification'}, inplace=True)

In [189]:
dev_embeddings_df

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,classification,review,source
0,0.225414,1.017563,0.625026,0.483827,0.354414,-0.097294,0.050955,-0.255838,-0.029534,-0.042653,...,-0.245365,0.615838,0.472009,0.312599,0.184574,-0.112558,-0.055385,neutral,Found Thai Spoon on the Vegan Pittsburgh website.,dynasent_r1
1,0.542622,-1.014068,-0.243230,-0.737740,0.230173,-0.255116,0.086105,0.250760,-0.611111,-0.258188,...,-0.012513,-0.240601,-0.285545,-0.022228,-0.298299,-0.366071,0.116083,positive,Our bill came out to around $27 and we ate like the wealthy.,dynasent_r1
2,0.121821,0.343868,0.166896,0.401580,0.584421,-0.029947,-0.106082,-0.270320,-0.376784,-0.097001,...,-0.181886,0.476996,0.557914,-0.128651,0.468908,-0.160955,-0.245353,neutral,State Farm broke down the costs for me of the parts and labor.,dynasent_r1
3,-0.481431,-0.511226,0.105461,-0.545051,-0.566164,0.182205,-0.138122,0.638711,-0.038914,-0.120969,...,-0.031384,-0.751729,-0.403850,-0.078003,-0.650540,0.354354,-0.288278,negative,The only con for this resto is the wait to get a seat.,dynasent_r1
4,-0.446629,-0.187822,0.161752,-0.050124,-0.827621,0.236889,0.035475,0.344158,-0.033137,0.161185,...,-0.165057,-0.914168,-0.328175,0.012489,-0.689582,0.459529,-0.231245,negative,We could hear the people above us stomping around even after midnight.,dynasent_r1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5416,0.199067,-0.216328,0.295911,-0.442693,0.343279,-0.015607,0.348022,-0.152800,-0.323240,-0.302808,...,-0.068553,-0.177792,0.278916,0.083909,-0.304750,-0.393848,-0.154282,neutral,I think it's really a matter of mastering the basics when it comes to having nice people that know what they're selling.,dynasent_r2
5417,-0.499727,-0.464927,0.302022,-0.390633,-0.815165,0.060372,-0.147000,0.454052,-0.128180,-0.054083,...,-0.121867,-0.612123,-0.413869,0.046400,-0.597213,0.256335,-0.287705,negative,A bloated gasbag thesis grotesquely impressed by its own gargantuan aura of self-importance ...,sst_local
5418,-0.188918,-0.511853,-0.122822,-0.483598,-0.153160,-0.214628,-0.013024,0.501246,-0.339550,-0.418352,...,-0.078241,-0.680062,-0.214437,-0.087417,-0.703153,0.312814,-0.206814,negative,"Its story may be a thousand years old , but why did it have to seem like it took another thousand to tell it to us ?",sst_local
5419,-0.397605,-0.473972,0.011632,-0.353383,-0.737209,0.085004,-0.066591,0.525708,-0.144680,0.024075,...,-0.065764,-0.786990,-0.361672,-0.091418,-0.587658,0.294250,-0.406102,neutral,"I felt sad for Lise not so much because of what happens as because she was captured by this movie when she obviously belongs in something lighter and sunnier , by Rohmer , for example .",sst_local


In [190]:
dev_embeddings_df.to_csv("data/merged/dev_all_embeddings.csv", index=False)

In [193]:
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
tweet_train_df = pd.read_json("hf://datasets/mteb/tweet_sentiment_extraction/" + splits["train"], lines=True)
tweet_dev_df = pd.read_json("hf://datasets/mteb/tweet_sentiment_extraction/" + splits["test"], lines=True)

In [194]:
tweet_train_df

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on the releases we already bought",0,negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband lost his job and can`t afford it,0,negative
27477,4f4c4fc327,"I`ve wondered about rake to. The client has made it clear .NET only, don`t force devs to learn a new lang #agile #ccnet",0,negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend Take care hun xxxx,2,positive
27479,ed167662a5,But it was worth it ****.,2,positive


In [195]:
tweet_dev_df

Unnamed: 0,id,text,label,label_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,1,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,2,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to quit her company, such a shame!",0,negative
3,01082688c6,happy bday!,2,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,2,positive
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep but i try it",0,negative
3530,416863ce47,"All alone in this old house again. Thanks for the net which keeps me alive and kicking! Whoever invented the net, i wanna kiss your hair!",2,positive
3531,6332da480c,I know what you mean. My little dog is sinking into depression... he wants to move someplace tropical,0,negative
3532,df1baec676,_sutra what is your next youtube video gonna be about? I love your videos!,2,positive


In [233]:
tweet_embeddings_list = []
tweet_labels_list = []

print("Generating embeddings...")
for index, row in tqdm(tweet_train_df.iterrows(), total=len(tweet_train_df), desc="Processing Sentences"):
    review = row['text']
    classification = row['label_text']
    
    with torch.no_grad():
        # Extract embeddings using the updated method
        tweet_embeddings = electra_split_model.extract_embeddings([review]).cpu().numpy().flatten()
    
    tweet_embeddings_list.append(tweet_embeddings)
    tweet_labels_list.append(classification)


Generating embeddings...


Processing Sentences:   0%|          | 0/27481 [00:00<?, ?it/s]

In [234]:
dev_tweet_embeddings_list = []
dev_tweet_labels_list = []

print("Generating embeddings...")
for index, row in tqdm(tweet_dev_df.iterrows(), total=len(tweet_dev_df), desc="Processing Sentences"):
    review = row['text']
    classification = row['label_text']
    
    with torch.no_grad():
        # Extract embeddings using the updated method
        dev_tweet_embeddings = electra_split_model.extract_embeddings([review]).cpu().numpy().flatten()
    
    dev_tweet_embeddings_list.append(dev_tweet_embeddings)
    dev_tweet_labels_list.append(classification)


Generating embeddings...


Processing Sentences:   0%|          | 0/3534 [00:00<?, ?it/s]

In [235]:
tweet_embeddings_df = pd.DataFrame(tweet_embeddings_list)
tweet_embeddings_df['label'] = tweet_labels_list  # Add the labels as a column

In [236]:
dev_tweet_embeddings_df = pd.DataFrame(dev_tweet_embeddings_list)
dev_tweet_embeddings_df['label'] = dev_tweet_labels_list  # Add the labels as a column

In [237]:
tweet_embeddings_df['text'] = tweet_train_df['text']

In [238]:
dev_tweet_embeddings_df['text'] = tweet_dev_df['text']

In [239]:
tweet_embeddings_df.to_csv("data/tweet/train_embeddings.csv", index=False)

In [240]:
dev_tweet_embeddings_df.to_csv("data/tweet/dev_embeddings.csv", index=False)

In [241]:
tweet_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,label,text
0,0.381554,0.794922,0.541201,0.481426,0.323598,-0.099398,0.167367,-0.480494,-0.050794,-0.061221,...,0.538282,-0.138111,0.555931,0.589623,-0.011869,0.220851,-0.195177,-0.048827,neutral,"I`d have responded, if I were going"
1,0.307270,-0.987401,-0.326024,-0.710464,0.059104,-0.137902,-0.160784,0.406591,-0.565777,-0.161692,...,-0.350296,-0.065568,0.053796,-0.404935,-0.119155,-0.207082,-0.291866,0.225825,negative,Sooo SAD I will miss you here in San Diego!!!
2,-0.695778,-0.456851,0.084439,-0.329923,-0.701522,-0.039037,-0.035405,0.493828,-0.075197,-0.006629,...,-0.046841,-0.050415,-0.766332,-0.284677,0.106289,-0.691197,0.295791,-0.349947,negative,my boss is bullying me...
3,-0.715640,-0.324932,-0.044517,-0.046939,-0.692270,0.264505,0.134966,-0.085385,0.017387,0.128579,...,0.010712,-0.015120,-0.510937,-0.062284,-0.172358,-0.503127,0.367855,-0.035547,negative,what interview! leave me alone
4,-0.608140,-0.298680,0.111056,-0.313429,-0.623134,0.123741,0.052431,0.408345,0.061402,0.081912,...,-0.272971,-0.049727,-0.630022,-0.220886,0.049405,-0.724035,0.405521,-0.139068,negative,"Sons of ****, why couldn`t they put them on the releases we already bought"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27476,-0.074931,0.550983,0.526824,0.501499,0.177432,-0.095743,0.236191,-0.472939,-0.111814,0.078022,...,0.749427,-0.059516,0.679032,0.572549,-0.060522,0.174653,-0.280192,-0.165760,negative,wish we could come see u on Denver husband lost his job and can`t afford it
27477,-0.423768,0.743161,0.117799,0.425489,0.095100,-0.000276,0.278295,-0.379624,0.183852,0.021073,...,0.465893,0.065546,0.385793,0.584576,0.170476,-0.273193,-0.010267,0.050660,negative,"I`ve wondered about rake to. The client has made it clear .NET only, don`t force devs to learn a new lang #agile #ccnet"
27478,0.479175,-1.027423,-0.451457,-0.574839,0.248438,0.358990,-0.016915,0.427665,-0.428300,0.083854,...,-0.376410,-0.209876,0.164584,-0.248789,0.288163,-0.153698,-0.263476,0.406769,positive,Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend Take care hun xxxx
27479,0.572013,-0.986481,-0.374878,-0.602228,0.387683,0.031786,0.040147,0.240205,-0.269468,0.024129,...,-0.423633,-0.175220,0.169771,-0.101836,0.150304,-0.244136,-0.459250,0.287517,positive,But it was worth it ****.


In [242]:
X_train = tweet_embeddings_df.drop(columns=['label', 'text'])
y_train = tweet_embeddings_df['label']

In [243]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

In [244]:
# Create a pipeline with Standard Scaler and Logistic Regression
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='newton-cg',
                                  C=0.001,
                                  multi_class='ovr',
                                  max_iter=1000))
])

In [245]:
pipe.fit(X_train, y_train)

In [246]:
# run a classification report
y_pred = pipe.predict(X_train)
print(classification_report(y_train, y_pred, zero_division=0))

              precision    recall  f1-score   support

    negative       0.70      0.70      0.70      7781
     neutral       0.68      0.68      0.68     11118
    positive       0.75      0.75      0.75      8582

    accuracy                           0.71     27481
   macro avg       0.71      0.71      0.71     27481
weighted avg       0.71      0.71      0.71     27481



In [250]:
def predict_sentence_logreg(logreg_model, electra_split_model, sentence, tokenizer, numeric_dict):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    with torch.no_grad():
        embeddings = electra_split_model.extract_embeddings([sentence]).cpu().numpy().flatten()
    
    predicted_probs = logreg_model.predict_proba([embeddings])
    
    predicted_class = predicted_probs.argmax(axis=1)[0]
    predicted_label = numeric_dict[predicted_class]
    probabilities = predicted_probs[0].tolist()

    del embeddings
    torch.cuda.empty_cache()

    return predicted_class, predicted_label, probabilities


In [259]:
test_sentence = "Maybe it would be your cup of tea"
_, predicted_label, predicted_probabilities = predict_sentence_logreg(pipe, electra_split_model, test_sentence, tokenizer, numeric_dict)

In [260]:
predicted_label, predicted_probabilities

('neutral', [0.11291774356329498, 0.7646277766201729, 0.12245447981653207])

In [None]:
class ElectraV9Sentiment(dspy.Module):
    def init(self):
        super().init()
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

        # Splitting the model into two parts
        self.electra_with_pooling = nn.Sequential(
            electra_v4_model.bert,  # Electra backbone with embeddings and encoder
            electra_v4_model.custom_pooling  # Custom pooling layer
        )

    def extract_embeddings(self, texts):
        # Tokenize input
        input_ids, attention_mask = tokenize(texts, self.tokenizer, device)
        
        # Get pooled embeddings from the model
        pooled_embeddings = self.electra_with_pooling(input_ids=input_ids, attention_mask=attention_mask)
        return pooled_embeddings

In [281]:
class ElectraV9SentimentClassification(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."""
    
    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.")
    logreg_decision = dspy.InputField(desc="The sentiment classification proposed by a Logistic Regression model.")
    classification = dspy.OutputField(desc="One word representing your sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').")

class ElectraV9Sentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(ElectraV9SentimentClassification)
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_v4_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities
    
    def classify_with_logistic_regression(self, review):        
        # Predict the sentiment using the logistic regression model
        _, predicted_label, probabilities = predict_sentence_logreg(pipe, electra_split_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def forward(self, review):
        classifier_decision, probabilities = self.classify_with_electra(review)
        logreg_decision, logreg_probabilities = self.classify_with_logistic_regression(review)

        prediction = self.generate_answer(
            review=review, 
            classifier_decision=classifier_decision,
            logreg_decision=logreg_decision
        )
        time.sleep(0.5)
        
        return dspy.Prediction(
            classification=prediction.classification.lower().strip(),
            classifier_decision=classifier_decision, 
            logreg_decision=logreg_decision,
            probabilities=probabilities,
            logreg_probabilities=logreg_probabilities
        )

In [282]:
electra_sentiment_v9 = ElectraV9Sentiment()

In [283]:
electra_sentiment_v9_result = electra_sentiment_v9(review="Those 2 drinks are part of the HK culture and has years of history. It is so bad.")

In [284]:
electra_sentiment_v9_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    logreg_decision='negative',
    probabilities=[0.9999992847442627, 1.6079250997336203e-07, 6.279589683799713e-07],
    logreg_probabilities=[0.7870939901343715, 0.187844166389211, 0.02506184347641748]
)

In [285]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.

---

Follow the following format.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.

Logreg Decision: The sentiment classification proposed by a Logistic Regression model.

Classification: One word representing your sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Classifier Decision: negative

Logreg Decision: negative

Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'positive', 'neutral', or 'negative'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\n\nLogreg Decision: The sentiment classification proposed by a Logistic Regression model.\n\nClassification: One word representing your sentiment classification: 'positive', 'neutral', or 'negative' (do not repeat the field name, do not use 'mixed').\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nClassifier Decision: negative\n\nLogreg Decision: negative\n\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [280]:
finals_large_results15 = full_evaluater(electra_sentiment_v9, metric=classification_match)

Average Metric: 10 / 11  (90.9):   0%|          | 11/5421 [00:15<2:09:11,  1.43s/it]

KeyboardInterrupt: 

In [None]:
def convert_results_to_df_plus(results):
    # Extract the list of tuples from the results tuple
    examples = results[1]

    # Initialize an empty list to store the extracted data
    data = []

    # Iterate over the list of tuples
    for example, prediction, match in examples:
        review = example['review']
        classification = example['classification']
        prediction_classification = prediction.classification
        logreg_match = prediction.logreg_decision == classification
        classifier_match = prediction.classifier_decision == classification
        data.append({
            'review': review,
            'classification': classification,
            'prediction': prediction_classification,
            'match': match,
            'classifier_decision': prediction.classifier_decision,
            'logreg_decision': prediction.logreg_decision,
            'classifier_match': classifier_match,
            'logreg_match': logreg_match
        })

    # Convert the list to a DataFrame
    results_df = pd.DataFrame(data)
    return results_df

In [None]:
finals_large_results15_df = convert_results_to_df_plus(finals_large_results15)

In [None]:
finals_large_results15_df.to_csv("data/merged/finals_large_results15.csv", index=False)

In [3]:
finals_large_results15_df = pd.read_csv('data/merged/finals_large_results15.csv')

In [4]:
finals_large_results15_df

Unnamed: 0,review,classification,prediction,match,classifier_decision,logreg_decision,classifier_match,logreg_match
0,Found Thai Spoon on the Vegan Pittsburgh website.,neutral,neutral,True,neutral,neutral,True,True
1,Our bill came out to around $27 and we ate lik...,positive,positive,True,positive,positive,True,True
2,State Farm broke down the costs for me of the ...,neutral,neutral,True,neutral,neutral,True,True
3,The only con for this resto is the wait to get...,negative,negative,True,negative,negative,True,True
4,We could hear the people above us stomping aro...,negative,negative,True,negative,negative,True,True
...,...,...,...,...,...,...,...,...
5416,I think it's really a matter of mastering the ...,neutral,positive,False,positive,positive,False,False
5417,A bloated gasbag thesis grotesquely impressed ...,negative,negative,True,negative,negative,True,True
5418,"Its story may be a thousand years old , but wh...",negative,negative,True,negative,negative,True,True
5419,I felt sad for Lise not so much because of wha...,neutral,negative,False,negative,negative,False,False


In [5]:
# Calculate the percentage of correct predictions for each column
match_percentage = finals_large_results15_df['match'].value_counts(normalize=True).get(True, 0) * 100
classifier_match_percentage = finals_large_results15_df['classifier_match'].value_counts(normalize=True).get(True, 0) * 100
logreg_match_percentage = finals_large_results15_df['logreg_match'].value_counts(normalize=True).get(True, 0) * 100

# Print the results
print(f"Match correct percentage: {match_percentage:.2f}%")
print(f"Classifier match correct percentage: {classifier_match_percentage:.2f}%")
print(f"Logreg match correct percentage: {logreg_match_percentage:.2f}%")

Match correct percentage: 81.61%
Classifier match correct percentage: 78.77%
Logreg match correct percentage: 76.46%


Average Metric: 88 / 100  (88.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]
Average Metric: 18 / 100  (18.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]
Average Metric: 77 / 100  (77.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]
Average Metric: 0 / 62  (0.0):  62%|██████▏   | 62/100 [12:19<01:43,  2.71s/it]

ERROR:dspy.evaluate.evaluate:[2m2024-09-04T08:17:14.596821Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value. {
    "error": {
        "message": "Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value.",
        "type": "internal_error",
        "param": null,
        "code": "request_timeout"
    }
}
 500 {'error': {'message': 'Timed out generating response. Please try again with a shorter prompt or with `max_tokens` set to a lower value.', 'type': 'internal_error', 'param': None, 'code': 'request_timeout'}} {'Date': 'Wed, 04 Sep 2024 08:17:14 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '251', 'Connection': 'keep-alive', 'vary': 'Origin', 'x-request-id': 'req_e6bbedaf405865d0e50a444edf47a343', 'strict-transport-security': 'max-age=15552000; includeSubDomains; preload', 

Average Metric: 0.0 / 100  (0.0): 100%|██████████| 100/100 [14:12<00:00,  8.53s/it] 
Average Metric: 95 / 100  (95.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]
Average Metric: 0 / 100  (0.0): 100%|██████████| 100/100 [04:39<00:00,  2.80s/it]
Average Metric: 17 / 100  (17.0): 100%|██████████| 100/100 [03:33<00:00,  2.14s/it]
Average Metric: 0 / 100  (0.0): 100%|██████████| 100/100 [04:14<00:00,  2.55s/it]
Average Metric: 0 / 100  (0.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]
Average Metric: 0 / 100  (0.0): 100%|██████████| 100/100 [01:44<00:00,  1.05s/it]


{140569829212272: {'depth': [0, 1, 2, 3, 4],
  'max': [95.0, 95.0, 95.0, 95.0, 95.0],
  'average': [55.6, 42.142857142857146, 36.875, 36.875, 36.875],
  'min': [0.0, 0.0, 0.0, 0.0, 0.0],
  'std': [38.89781484865185,
   39.42235975914674,
   39.42219393945497,
   39.42219393945497,
   39.42219393945497]}}

[{'score': 95.0,
  'program': generate_answer = Predict(StringSignature(review, classifier_decision -> classification
      instructions="Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."
      review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
      classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
      classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.", '__dspy_field_type': 'output', 'prefix': 'Classification:'})
  )),
  'instruction': "Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.",
  'prefix': 'Classificat




Attempted Instructions:
[1] «Instruction #1: Proposed Instruction: Carefully read the following review and assess its overall sentiment. Classify the sentiment as 'positive', 'neutral', or 'negative', considering specific phrases that reflect the reviewer’s emotions. Additionally, provide a concise explanation of your reasoning, highlighting key language or phrases that influenced your classification and their implications on the overall interpretation.»
[2] «Prefix #1: Sentiment Evaluation:»
[3] «Resulting Score #1: 17.0»
[4] «Instruction #2: Proposed Instruction: Analyze the given review carefully and determine its emotional tone based on the language and context used. Assign a sentiment label of 'positive', 'neutral', or 'negative

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Attempted Instructions: 
[1] «Instruction #1: Proposed Instruction: Care

'\n\n\nAttempted Instructions:\n[1] «Instruction #1: Proposed Instruction: Carefully read the following review and assess its overall sentiment. Classify the sentiment as \'positive\', \'neutral\', or \'negative\', considering specific phrases that reflect the reviewer’s emotions. Additionally, provide a concise explanation of your reasoning, highlighting key language or phrases that influenced your classification and their implications on the overall interpretation.»\n[2] «Prefix #1: Sentiment Evaluation:»\n[3] «Resulting Score #1: 17.0»\n[4] «Instruction #2: Proposed Instruction: Analyze the given review carefully and determine its emotional tone based on the language and context used. Assign a sentiment label of \'positive\', \'neutral\', or \'negative\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nAttempted Instructions: \n[1] «Instruction #1




Analyze the provided review in detail, considering the overall tone, word choices, and emotional undertones. Based on your assessment, classify the sentiment as either 'positive', 'neutral', or 'negative'. Be sure to justify your classification with reasoning to provide depth to your response.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Sentiment Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.

---

Review: I am always skeptical of auto places trying to nickel and dime me for services/parts I dont need--I have never had that experience here.
Classifier Decision: positive
Sentiment Classification:[32m Review: I am always skeptical of auto places trying to nickel and dime me for services/parts I dont need--I have never had that experience here.  
Classifier Decision: positive  
Sentiment Clas

'\n\n\nAnalyze the provided review in detail, considering the overall tone, word choices, and emotional undertones. Based on your assessment, classify the sentiment as either \'positive\', \'neutral\', or \'negative\'. Be sure to justify your classification with reasoning to provide depth to your response.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nSentiment Classification: One word representing the sentiment classification: \'positive\', \'neutral\', or \'negative\'.\n\n---\n\nReview: I am always skeptical of auto places trying to nickel and dime me for services/parts I dont need--I have never had that experience here.\nClassifier Decision: positive\nSentiment Classification:\x1b[32m Review: I am always skeptical of auto places trying to nickel and dime me for services/parts I dont need--I have never had that experience here.  \nClassifier D

generate_answer = Predict(StringSignature(review, classifier_decision -> classification
    instructions='Analyze the provided review to determine the overall sentiment conveyed. Classify the sentiment into one of three categories: \n\n1. Positive - the review expresses satisfaction or enjoyment.\n2. Neutral - the review is neither particularly favorable nor unfavorable.\n3. Negative - the review conveys dissatisfaction or disappointment.\n\nBe sure to consider the nuances in language that indicate sentiment.'
    review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
    classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
    classification = Field(annotation=str required=True json_schema_extra={'desc':

Average Metric: 17 / 20  (85.0): 100%|██████████| 20/20 [00:31<00:00,  1.56s/it]
Average Metric: 19 / 20  (95.0): 100%|██████████| 20/20 [00:20<00:00,  1.05s/it]
Average Metric: 12 / 20  (60.0): 100%|██████████| 20/20 [00:48<00:00,  2.42s/it]
Average Metric: 11 / 20  (55.0): 100%|██████████| 20/20 [00:46<00:00,  2.35s/it]
Average Metric: 12 / 20  (60.0): 100%|██████████| 20/20 [00:50<00:00,  2.52s/it]


generate_answer = Predict(StringSignature(review, classifier_decision -> classification
    instructions="Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."
    review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
    classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
    classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.", '__dspy_field_type': 'output', 'prefix': 'Classification:'})
))




Analyze the given review and determine the overall sentiment expressed within it. Label it as 'positive' if the sentiment is uplifting or favorable, 'neutral' if it conveys neither strong happiness nor disappointment, or 'negative' if the sentiment is critical or disparaging. Provide a clear reasoning behind your classification.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.
Sentiment Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.

---

Review: Even my fiancé was disgusted he tried it, as he didn't want his beer anymore then!
Classifier Decision: negative
Sentiment Classification:[32m negative[0m





Analyze the given review and determine the overall sentiment expressed within it. Label it as 'positive' if the sentiment is uplifting or favorable, 'neutral' if it conveys neither strong happi

'\n\n\nAnalyze the given review and determine the overall sentiment expressed within it. Label it as \'positive\' if the sentiment is uplifting or favorable, \'neutral\' if it conveys neither strong happiness nor disappointment, or \'negative\' if the sentiment is critical or disparaging. Provide a clear reasoning behind your classification.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\nSentiment Classification: One word representing the sentiment classification: \'positive\', \'neutral\', or \'negative\'.\n\n---\n\nReview: Even my fiancé was disgusted he tried it, as he didn\'t want his beer anymore then!\nClassifier Decision: negative\nSentiment Classification:\x1b[32m negative\x1b[0m\n\n\n\n\n\nAnalyze the given review and determine the overall sentiment expressed within it. Label it as \'positive\' if the sentiment is uplifting or favorable,

{140569736962256: {'depth': [0, 1, 2],
  'max': [95.0, 95.0, 95.0],
  'average': [90.0, 73.75, 71.0],
  'min': [85.0, 55.0, 55.0],
  'std': [5.0, 16.723860200324566, 15.937377450509228]}}

[{'score': 95.0,
  'program': generate_answer = Predict(StringSignature(review, classifier_decision -> classification
      instructions="Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'."
      review = Field(annotation=str required=True json_schema_extra={'desc': 'The review text to classify.', '__dspy_field_type': 'input', 'prefix': 'Review:'})
      classifier_decision = Field(annotation=str required=True json_schema_extra={'desc': 'The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.', '__dspy_field_type': 'input', 'prefix': 'Classifier Decision:'})
      classification = Field(annotation=str required=True json_schema_extra={'desc': "One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.", '__dspy_field_type': 'output', 'prefix': 'Classification:'})
  )),
  'instruction': "Classify the sentiment of a review as either 'positive', 'neutral', or 'negative'.",
  'prefix': 'Classificat

Average Metric: 16 / 20  (80.0): 100%|██████████| 20/20 [00:36<00:00,  1.85s/it]
Average Metric: 20 / 20  (100.0): 100%|██████████| 20/20 [00:29<00:00,  1.47s/it]
Average Metric: 19 / 20  (95.0): 100%|██████████| 20/20 [00:48<00:00,  2.44s/it]
Average Metric: 0 / 6  (0.0):  30%|███       | 6/20 [00:21<00:53,  3.82s/it]

KeyboardInterrupt: 




Attempted Instructions:
[1] «Instruction #1: Analyze the given review and determine the overall sentiment expressed within it. Label it as 'positive' if the sentiment is uplifting or favorable, 'neutral' if it conveys neither strong happiness nor disappointment, or 'negative' if the sentiment is critical or disparaging. Provide a clear reasoning behind your classification.»
[2] «Prefix #1: Sentiment Classification:»
[3] «Resulting Score #1: 80.0»
[4] «Instruction #2: Proposed Instruction: "Evaluate the provided review and classify its sentiment as 'positive', 'neutral', or 'negative'. In addition to your classification, please summarize the key phrases that influenced your decision and explain why

---

Follow the following format.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'positive', 'neutral', or 'negative'.



'\n\n\nAttempted Instructions:\n[1] «Instruction #1: Analyze the given review and determine the overall sentiment expressed within it. Label it as \'positive\' if the sentiment is uplifting or favorable, \'neutral\' if it conveys neither strong happiness nor disappointment, or \'negative\' if the sentiment is critical or disparaging. Provide a clear reasoning behind your classification.»\n[2] «Prefix #1: Sentiment Classification:»\n[3] «Resulting Score #1: 80.0»\n[4] «Instruction #2: Proposed Instruction: "Evaluate the provided review and classify its sentiment as \'positive\', \'neutral\', or \'negative\'. In addition to your classification, please summarize the key phrases that influenced your decision and explain why\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a BERT/ELECTRA model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: \'positiv