# GPT-4o/4o-mini Fine-Tuning and DSPy Experiments

Round 2: Seed 123

In [1]:
# Set different seed and temperature for variability
random_seed = 123
temperature = 0.1

## Initial Setup

### Imports

In [2]:
# OS and utilities
import os
import io
import time
import pickle
from time import sleep
from datetime import datetime
from dotenv import load_dotenv
import json
import warnings
from tqdm.notebook import tqdm

# Data processing
import pandas as pd
import numpy as np
from collections import defaultdict

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
import torch
import torch.nn.functional as F
from transformers import ElectraTokenizer
import tiktoken

# OpenAI
from openai import OpenAI

# DSPy and ReCOGS
import dsp
import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy import Retrieve

# Instrumentation and tracing
from phoenix.otel import register
from openinference.instrumentation.dspy import DSPyInstrumentor

# Typing
from typing import List, Union

# Local imports
from datawaza_funcs import eval_model
from utils import fix_random_seeds, prepare_device, setup_environment

### Environment and Preferences

In [3]:
# DSPy notebook cache
root_path = '.'
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(root_path, 'cache')

# Pandas display options
pd.set_option('display.max_colwidth', None)

In [4]:
# Suppress specific warnings
warnings.filterwarnings("ignore", message="resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.", category=FutureWarning)
warnings.filterwarnings("ignore", message="`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.", category=FutureWarning)

### API Keys

In [5]:
# Load API keys from .env file
load_dotenv()
openai_key = os.getenv('OPENAI_API_KEY')
arize_key = os.getenv('ARIZE_API_KEY')

### Language Model

In [6]:
# Create OpenAI client
client = OpenAI()
models = client.models.list()

# Print OpenAI models by creation date
sorted_models = sorted(models.dict()['data'], key=lambda x: x['created'], reverse=True)
print(f"{'Model ID':<40}{'Creation Date':<40}")
print('-'*80)
for model in sorted_models:
    if not model['id'].startswith('ft:'):
        print(f"{model['id']:<40}{datetime.fromtimestamp(int(model['created'])).strftime('%Y-%m-%d %H:%M:%S'):<40}")


Model ID                                Creation Date                           
--------------------------------------------------------------------------------
gpt-4o-mini-tts                         2025-03-19 17:05:59                     
o1-pro                                  2025-03-17 22:49:51                     
o1-pro-2025-03-19                       2025-03-17 22:45:04                     
gpt-4o-mini-transcribe                  2025-03-15 19:56:36                     
gpt-4o-transcribe                       2025-03-15 19:54:23                     
gpt-4o-mini-search-preview              2025-03-07 23:46:01                     
gpt-4o-mini-search-preview-2025-03-11   2025-03-07 23:40:58                     
gpt-4o-search-preview                   2025-03-07 23:05:20                     
gpt-4o-search-preview-2025-03-11        2025-03-07 22:56:10                     
computer-use-preview-2025-03-11         2025-03-07 19:50:21                     
gpt-4.5-preview-2025-02-27  

In [7]:
# Print the fine-tuned models
print(f"{'Model ID':<40}{'Creation Date':<40}")
print('-'*80)
for model in sorted_models:
    if model['id'].startswith('ft:'):
        print(f"{model['id']:<40}{datetime.fromtimestamp(int(model['created'])).strftime('%Y-%m-%d %H:%M:%S'):<40}")

Model ID                                Creation Date                           
--------------------------------------------------------------------------------
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIpoGP:ckpt-step-30042025-03-17 23:36:55                     
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIpYAW:ckpt-step-45062025-03-17 23:36:55                     
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIp15y2025-03-17 23:36:55                     
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo5jXI:ckpt-step-45062025-03-17 06:00:06                     
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo6eOL2025-03-17 06:00:06                     
ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo5hHh:ckpt-step-30042025-03-17 06:00:05                     
ft:gpt-4o-mini-2024-07-18:personal::BBq9HSob2025-03-16 21:49:27                     
ft:gpt-4o-mini-2024-07-18:personal::BBcWZcsN:ckpt-step-45062025-03-16 07:16:35                     
ft:gpt-4o-mini-2024-07-18:personal:

In [271]:
# Define the language model
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)

### Arize Phoenix for Traces

In [9]:
# Arize environment variables
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={arize_key}"
os.environ["PHOENIX_CLIENT_HEADERS"] = f"api_key={arize_key}"
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"

In [None]:
# Register the tracer provider
tracer_provider = register(
  project_name="final_project_round_2",
  endpoint="https://app.phoenix.arize.com/v1/traces",
  headers={"api_key": f"{arize_key}"}
)

In [None]:
# Start the DSPy instrumentor
DSPyInstrumentor().instrument(tracer_provider=tracer_provider)

## Sentiment Data

The dataset is a merge of DynaSent R1, DynaSent R2, and SST-3.

In [12]:
# Load the merged datasets
train_df = pd.read_csv('data/merged/train_all.csv')
val_df = pd.read_csv('data/merged/val_all.csv')
test_df = pd.read_csv('data/merged/test_all.csv')

In [13]:
# Print the lengths of each dataset
print(f"Train: {len(train_df)}")
print(f"Validation: {len(val_df)}")
print(f"Test: {len(test_df)}")

Train: 102097
Validation: 5421
Test: 6530


In [14]:
# Review the train dataset
train_df.head()

Unnamed: 0,sentence,label,source,split
0,Those 2 drinks are part of the HK culture and has years of history. It is so bad.,negative,dynasent_r2,train
1,"I was told by the repair company that was doing the car repair that fixing the rim was ""impossible"" and to replace it.",negative,dynasent_r1,train
2,It is there to give them a good time .,neutral,sst_local,train
3,Like leafing through an album of photos accompanied by the sketchiest of captions .,negative,sst_local,train
4,Johnny was a talker and liked to have fun.,positive,dynasent_r1,train


In [15]:
# Creat a list of dspy.Example objects from train_df
train_ex = [dspy.Example(review=row['sentence'], classification=row['label']).with_inputs('review') for _, row in train_df.iterrows()]
val_ex = [dspy.Example(review=row['sentence'], classification=row['label']).with_inputs('review') for _, row in val_df.iterrows()]
test_ex = [dspy.Example(review=row['sentence'], classification=row['label']).with_inputs('review') for _, row in test_df.iterrows()]

In [16]:
# Review the first few examples
train_ex[:5]

[Example({'review': 'Those 2 drinks are part of the HK culture and has years of history. It is so bad.', 'classification': 'negative'}) (input_keys={'review'}),
 Example({'review': 'I was told by the repair company that was doing the car repair that fixing the rim was "impossible" and to replace it.', 'classification': 'negative'}) (input_keys={'review'}),
 Example({'review': 'It is there to give them a good time .', 'classification': 'neutral'}) (input_keys={'review'}),
 Example({'review': 'Like leafing through an album of photos accompanied by the sketchiest of captions .', 'classification': 'negative'}) (input_keys={'review'}),
 Example({'review': 'Johnny was a talker and liked to have fun.', 'classification': 'positive'}) (input_keys={'review'})]

## ELECTRA Models

### Setup DDP Environment

I fine-tuned my models in a DDP multi-GPU environment using `finetune.py`. Because I'm loading a checkpoint from that pipeline, which has a DDP wrapper, I'm runninig some minimal functions here to setup a basic single-GPU DDP environemnt.

In [17]:
# Set some variables for the DDP environment
rank = 0
device_type = 'cuda'
world_size = 1
backend = 'nccl'
debug = False

In [18]:
# Define the device
device = prepare_device(rank, device_type)

In [19]:
# Initiate the process group
setup_environment(rank, world_size, backend, device, debug)

Rank 0 - Device: cuda:0
1 process groups initialized with 'nccl' backend on localhost:12355
NCCL Timeout: 1 hr 0 min. NCCL Blocking Wait: Enabled


In [274]:
# Set random seeds
fix_random_seeds(random_seed)

42

### Load ELECTRA Tokenizer and Fine-Tuned Model

We'll load an ELECTRA model that was fine-tuned on sentiment using `finetune.py`. This will be part of our DSPy module's forward pass. We'll get the ELECTRA classification for the review, and that will be input/context in the prompt for the GPT4o-mini language model.

#### Use This to Reproduce

Uncomment the following code to load the fine-tuned models from Hugging Face.

### NOTE: Need to use these models for round 1 vs round 2:

**ROUND 1**
- Base: from HF 'jbeno/electra-base-classifier-sentiment' (why? did not update because round 1 was best)
- Large: from local 'models/final_model_20241025-024222.pkl' (why? HF large was updated to round 2)

**ROUND 2**
- Base: from local 'final_model_20250323-020916.pkl' (why? did not upload this to HF because less good)
- Large: from HF 'jbeno/electra-large-classifier-sentiment' (why? HF large was updated because performed best)

NOTE: I am temporarily doing the opposite of this in this file because round1 file is busy running experiments.

In [None]:
#import sys
#!{sys.executable} -m pip install electra_classifier
from transformers import AutoTokenizer
from electra_classifier import ElectraClassifier

large_model_name = "jbeno/electra-large-classifier-sentiment"
base_model_name = "jbeno/electra-base-classifier-sentiment"

tokenizer = AutoTokenizer.from_pretrained(large_model_name)

electra_large_model = ElectraClassifier.from_pretrained(large_model_name)
electra_large_model.to(device)

electra_base_model = ElectraClassifier.from_pretrained(base_model_name)
electra_base_model.to(device)

ElectraClassifier(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

#### Original Code

What follows is the code used for the original research. You can skip this section if you've loaded it from Hugging Face above.

In [22]:
# Load the ELECTRA tokenizer from Hugging Face
#tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')

In [None]:
# Function to unpickle the checkpoint saved with DDP and GPUs
# class CPU_Unpickler(pickle.Unpickler):
#     def find_class(self, module, name):
#         if module == 'torch.storage' and name == '_load_from_bytes':
#             return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
#         else: return super().find_class(module, name)

In [24]:
# Load the ELECTRA Large model from a checkpoint pickle file
# NOTE: This file path does not exist in the repository, use the Hugging Face approach above instead
# with open('models/final_model_20241025-024222.pkl', 'rb') as f:
#     electra_large_model = CPU_Unpickler(f).load()

In [25]:
# Move the model to the device (GPU ideally) to speed up inference time
# electra_large_model.to(device)

In [None]:
# Review the ELECTRA Large model with custom pooling and classifier head
#electra_large_model

In [None]:
# Load the ELECTRA Base model from a checkpoint pickle file
# NOTE: This file path does not exist in the repository, use the Hugging Face approach above instead
# with open('final_model_20250323-020916.pkl', 'rb') as f:
#     electra_base_model = CPU_Unpickler(f).load()

In [None]:
# Move the model to the device (GPU ideally) to speed up inference time
# electra_base_model.to(device)

In [None]:
# Review the ELECTRA Base model with custom pooling and classifier head
#electra_base_model

### ELECTRA Prediction Functions

In [30]:
# Set the class label mapping dictionary
numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

In [31]:
# Tokenize function
def tokenize(texts, tokenizer, device):
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    
    return input_ids, attention_mask

In [32]:
# Function to predict the class of a sentence
def predict_sentence(model, sentence, tokenizer, numeric_dict):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #model = model.model  # Needed for DDP wrapped models
    
    # Tokenize the input sentence
    input_ids, attention_mask = tokenize([sentence], tokenizer, device)

    # Set the model to evaluation mode
    model.eval()

    # Perform inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Get the predictions
    if isinstance(outputs, torch.Tensor):
        logits = outputs
    elif hasattr(outputs, 'logits'):
        logits = outputs.logits
    else:
        logits = outputs[0]

    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    predicted_label = numeric_dict[predicted_class]

    # Move probabilities back to CPU and convert to list
    probabilities = probabilities.cpu().squeeze().tolist()

    # Free up GPU memory
    del input_ids, attention_mask, logits, outputs
    torch.cuda.empty_cache()

    return predicted_class, predicted_label, probabilities

In [33]:
test_review = "Those 2 drinks are part of the HK culture and has years of history. It is so bad."

In [34]:
# Test prediction with ELECTRA base model
predict_sentence(electra_base_model, test_review, tokenizer, numeric_dict)

(0,
 'negative',
 [0.9988487958908081, 0.0008097602985799313, 0.0003413360973354429])

In [35]:
# Test prediction with ELECTRA large model
predict_sentence(electra_large_model, test_review, tokenizer, numeric_dict)

(0,
 'negative',
 [0.9990077614784241, 0.0005539217963814735, 0.000438391842180863])

## DSPy Signatures, Modules, Instances

All the experiments involving GPT models use DSPy signatures and modules. These are defined here.

### Classify

In [36]:
# DSPy signature that defines the prompt: review -> classification
class Classify(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    review = dspy.InputField(desc="The review text to classify.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

In [37]:
# DSPy module to classify the sentiment of a review
class GPTSentiment(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(Classify)

    def forward(self, review):

        prediction = self.generate_answer(review=review)
        # Sleep to avoid rate limiting
        sleep(0.25)
        
        return dspy.Prediction(classification=prediction.classification.lower().strip())

In [38]:
# Create an instance of the GPT sentiment module
gpt_sentiment = GPTSentiment()

### Classify with Pred

In [276]:
# DSPy signature that defines the prompt: review, classifier_decision -> classification
class ClassifyWithPred(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a model fine-tuned on sentiment.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

In [277]:
# DSPy module that uses the ELECTRA model to help classify the sentiment of a review
class CollabSentiment(dspy.Module):
    def __init__(self, model_type='base'):
        super().__init__()
        self.generate_answer = dspy.Predict(ClassifyWithPred)
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
        
        # Determine which model and tokenizer to use
        if model_type == 'base':
            self.model = electra_base_model
            self.model_name = 'google/electra-base-discriminator'
        elif model_type == 'large':
            self.model = electra_large_model
            self.model_name = 'google/electra-large-discriminator'
        else:
            raise ValueError("Invalid model_type. Choose 'base' or 'large'.")

        # Load the ELECTRA tokenizer from Hugging Face
        self.tokenizer = ElectraTokenizer.from_pretrained(self.model_name)
    
    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(self.model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities
    
    def model_summary(self):
        # Display the model summary to confirm
        print(f"Model Summary for {self.model_name}:\n")
        
        # Print the model architecture
        print(self.model)
        #print(self.model.model)  # Needed for DDP wrapped models

    def forward(self, review):
        classifier_decision, probabilities = self.classify_with_electra(review)
    
        prediction = self.generate_answer(review=review, classifier_decision=classifier_decision)
        # Sleep to avoid rate limiting
        time.sleep(0.25)
        
        return dspy.Prediction(classification=prediction.classification.lower().strip(),
                               classifier_decision=classifier_decision, probabilities=probabilities)

In [278]:
# Create an instance of the Electra Base + GPT sentiment module
electra_base_gpt_sentiment = CollabSentiment(model_type='base')

In [279]:
# Verify the base model is loaded correctly
electra_base_gpt_sentiment.model_summary()

Model Summary for google/electra-base-discriminator:

ElectraClassifier(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, 

In [43]:
# Create an instance of the ELECTRA Large + GPT sentiment module
electra_large_gpt_sentiment = CollabSentiment(model_type='large')

In [44]:
# Verify the large model is loaded correctly
electra_large_gpt_sentiment.model_summary()

Model Summary for google/electra-large-discriminator:

ElectraClassifier(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_fe

### Classify with Probabilities

In [240]:
# DSPy signature that defines the prompt: review, classifier_decision -> classification
class ClassifyWithProbs(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    review = dspy.InputField(desc="The review text to classify.")
    negative_probability = dspy.InputField(desc="Probability the review is negative from a model fine-tuned on sentiment")
    neutral_probability = dspy.InputField(desc="Probability the review is neutral from a model fine-tuned on sentiment")
    positive_probability = dspy.InputField(desc="Probability the review is positive from a model fine-tuned on sentiment")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

In [241]:
# DSPy module that uses the ELECTRA model to help classify the sentiment of a review
class CollabSentimentProbs(dspy.Module):
    def __init__(self, model_type='base'):
        super().__init__()
        self.generate_answer = dspy.Predict(ClassifyWithProbs)
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
        
        # Determine which model and tokenizer to use
        if model_type == 'base':
            self.model = electra_base_model
            self.model_name = 'google/electra-base-discriminator'
        elif model_type == 'large':
            self.model = electra_large_model
            self.model_name = 'google/electra-large-discriminator'
        else:
            raise ValueError("Invalid model_type. Choose 'base' or 'large'.")

        # Load the ELECTRA tokenizer from Hugging Face
        self.tokenizer = ElectraTokenizer.from_pretrained(self.model_name)
    
    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(self.model, review, self.tokenizer, self.numeric_dict)
        # Convert each probability to a percent
        neg_prob = f"{probabilities[0] * 100:.2f}%"
        neu_prob = f"{probabilities[1] * 100:.2f}%"
        pos_prob = f"{probabilities[2] * 100:.2f}%"
        return predicted_label, neg_prob, neu_prob, pos_prob
    
    def model_summary(self):
        # Display the model summary to confirm
        print(f"Model Summary for {self.model_name}:\n")
        
        # Print the model architecture
        print(self.model)
        #print(self.model.model)  # Needed for DDP wrapped models

    def forward(self, review):
        classifier_decision, neg_prob, neu_prob, pos_prob = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            negative_probability = neg_prob,
            neutral_probability = neu_prob,
            positive_probability = pos_prob
        )

         # Sleep to avoid rate limiting
        time.sleep(0.25)

        return dspy.Prediction(
            classification=prediction.classification.lower().strip(),
            negative_probability=neg_prob,
            neutral_probability=neu_prob,
            positive_probability=pos_prob
        )

In [47]:
electra_large_gpt_sentiment_probs = CollabSentimentProbs(model_type='large')

### Classify with Prediction and Probabilities

In [243]:
# DSPy signature that defines the prompt: review, classifier_decision -> classification
class ClassifyWithPredProbs(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a model fine-tuned on sentiment.")
    negative_probability = dspy.InputField(desc="Probability the review is negative")
    neutral_probability = dspy.InputField(desc="Probability the review is neutral")
    positive_probability = dspy.InputField(desc="Probability the review is positive")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

In [244]:
# DSPy module that uses the ELECTRA model to help classify the sentiment of a review
class CollabSentimentPredProbs(dspy.Module):
    def __init__(self, model_type='base'):
        super().__init__()
        self.generate_answer = dspy.Predict(ClassifyWithPredProbs)
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
        
        # Determine which model and tokenizer to use
        if model_type == 'base':
            self.model = electra_base_model
            self.model_name = 'google/electra-base-discriminator'
        elif model_type == 'large':
            self.model = electra_large_model
            self.model_name = 'google/electra-large-discriminator'
        else:
            raise ValueError("Invalid model_type. Choose 'base' or 'large'.")

        # Load the ELECTRA tokenizer from Hugging Face
        self.tokenizer = ElectraTokenizer.from_pretrained(self.model_name)
    
    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(self.model, review, self.tokenizer, self.numeric_dict)
        # Convert each probability to a percent
        neg_prob = f"{probabilities[0] * 100:.2f}%"
        neu_prob = f"{probabilities[1] * 100:.2f}%"
        pos_prob = f"{probabilities[2] * 100:.2f}%"
        return predicted_label, neg_prob, neu_prob, pos_prob
    
    def model_summary(self):
        # Display the model summary to confirm
        print(f"Model Summary for {self.model_name}:\n")
        
        # Print the model architecture
        print(self.model)
        #print(self.model.model)  # Needed for DDP wrapped models

    def forward(self, review):
        classifier_decision, neg_prob, neu_prob, pos_prob = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            negative_probability = neg_prob,
            neutral_probability = neu_prob,
            positive_probability = pos_prob,
            classifier_decision=classifier_decision
        )

         # Sleep to avoid rate limiting
        time.sleep(0.25)

        return dspy.Prediction(
            classification=prediction.classification.lower().strip(), 
            classifier_decision=classifier_decision, 
            negative_probability=neg_prob,
            neutral_probability=neu_prob,
            positive_probability=pos_prob
        )

In [50]:
electra_large_gpt_sentiment_pred_probs = CollabSentimentPredProbs(model_type='large')

### Classify with Examples

In [245]:
# Retriever class to get the top similar reviews
class ElectraSentimentRetriever(Retrieve):
    def __init__(
        self,
        model,
        tokenizer,
        examples: List[dict],
        k: int = 3,
        max_length: int = 512,
        batch_size: int = 32,
    ):
        super().__init__(k=k)
        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model
        #self.model = model.model  # Needed for DDP wrapped models
        self.tokenizer = tokenizer
        self.model.eval()
        
        self.max_length = max_length
        self.batch_size = batch_size

        # Precompute embeddings for examples
        self.examples = examples
        self.example_texts = []
        self.example_classes = []
        for ex in self.examples:
            text = ex.get('review', ex.get('sentence', ''))
            classification = ex.get('classification', ex.get('label', ''))
            self.example_texts.append(text)
            self.example_classes.append(classification)
        
        # Precompute and store example embeddings
        self.example_embeddings = self._get_embeddings(self.example_texts)

    def _get_embeddings_batch(self, texts: List[str]) -> torch.Tensor:
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)
        
        with torch.no_grad():
            base_model = self.model.module if hasattr(self.model, 'module') else self.model
            #electra_outputs = base_model.bert(  # Used in initial research
            electra_outputs = base_model.electra(  # Used with Hugging Face models
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            #pooled_output = base_model.custom_pooling(  # Used in initial research
            pooled_output = base_model.pooling(  # Used with Hugging Face models
                electra_outputs.last_hidden_state,
                attention_mask
            )
            normalized_embeddings = F.normalize(pooled_output, p=2, dim=1)
        
        del input_ids, attention_mask, electra_outputs, pooled_output
        torch.cuda.empty_cache()
        
        return normalized_embeddings

    def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
        embeddings_list = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i + self.batch_size]
            batch_embeddings = self._get_embeddings_batch(batch_texts)
            embeddings_list.append(batch_embeddings)
        return torch.cat(embeddings_list, dim=0)

    def forward(self, query_or_queries: Union[str, List[str]]) -> List[dict]:
        # Handle both single query and multiple queries
        queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
        queries = [q for q in queries if q]

        # Generate embeddings for queries
        query_embeddings = self._get_embeddings(queries)
        
        # Calculate similarities with precomputed example embeddings
        similarities = F.cosine_similarity(
            query_embeddings.unsqueeze(1),
            self.example_embeddings.unsqueeze(0),
            dim=2
        )
        
        # Get top indices for each query
        top_indices = similarities.argsort(dim=1, descending=True)[:, :self.k * 2]

        results = []
        if len(queries) == 1:
            selected_reviews = {}
            
            # Try to get balanced examples first
            for idx in top_indices[0]:
                idx = int(idx)
                sentiment = self.example_classes[idx]
                if sentiment not in selected_reviews and len(selected_reviews) < self.k:
                    selected_reviews[sentiment] = {
                        'index': idx,
                        'similarity_score': float(similarities[0][idx]),
                        'review': self.example_texts[idx],
                        'classification': sentiment
                    }
            
            # Fill remaining slots with highest similarity scores
            if len(selected_reviews) < self.k:
                for idx in top_indices[0]:
                    idx = int(idx)
                    if len(results) >= self.k:
                        break
                    if not any(r.get('index') == idx for r in results):
                        results.append({
                            'index': idx,
                            'similarity_score': float(similarities[0][idx]),
                            'review': self.example_texts[idx],
                            'classification': self.example_classes[idx]
                        })
            
            # Add balanced selections to results
            results.extend(selected_reviews.values())
            results = results[:self.k]
        else:
            # Handle multiple queries (if needed)
            pass

        # Clean up GPU memory
        del query_embeddings, similarities
        torch.cuda.empty_cache()
        
        return results

In [52]:
# DSPy signature that defines the prompt: examples, review, classifier_decision -> classification
class ClassifyWithPredExamples(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    examples = dspy.InputField(
        desc="A list of examples that demonstrate different sentiment classes.",
        format=lambda examples: "\n".join([
            f"- {ex['classification']}: {ex['review']}"
            for i, ex in enumerate(examples)
        ]) if isinstance(examples, list) else examples
    )
    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a model fine-tuned on sentiment.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

class CollabSentimentExamples(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')

        self.example_data = [
            {
                'review': row['sentence'],
                'classification': self.numeric_dict[row['label']] if isinstance(row['label'], (int, np.integer)) else row['label']
            }
            for _, row in val_df.iterrows()
        ]
        self.retrieve = ElectraSentimentRetriever(
            model=electra_large_model,
            tokenizer=self.tokenizer,
            examples=self.example_data,
            k=5
        )

        self.generate_answer = dspy.Predict(ClassifyWithPredExamples)
        
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_large_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def forward(self, review):
        examples = self.retrieve(review)
        
        classifier_decision, probabilities = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            examples=examples, 
            classifier_decision=classifier_decision
        )
        
        return dspy.Prediction(
            examples=examples,
            classification=prediction.classification,
            classifier_decision=classifier_decision,
            probabilities=probabilities
        )

In [53]:
electra_large_gpt_sentiment_examples = CollabSentimentExamples()

In [54]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [55]:
test_example_result = electra_large_gpt_sentiment_examples(test_df['sentence'][11])

In [56]:
test_example_result

Prediction(
    examples=[{'index': 2853, 'similarity_score': 0.9981756210327148, 'review': 'The wasps ankles.', 'classification': 'neutral'}, {'index': 5103, 'similarity_score': 0.9981460571289062, 'review': 'Miss the truck?', 'classification': 'neutral'}, {'index': 1551, 'similarity_score': 0.9970441460609436, 'review': 'Oh come on .', 'classification': 'negative'}, {'index': 2584, 'similarity_score': 0.9968715906143188, 'review': 'I was wrong.', 'classification': 'neutral'}, {'index': 5006, 'similarity_score': 0.9966566562652588, 'review': 'It happens again.', 'classification': 'neutral'}],
    classification='neutral',
    classifier_decision='neutral',
    probabilities=[0.019992714747786522, 0.9609718918800354, 0.019035447388887405]
)

In [57]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- neutral: The wasps ankles.
- neutral: Miss the truck?
- negative: Oh come on .
- neutral: I was wrong.
- neutral: It happens again.

Review: How is possible?

Classifier Decision: neutral

Classification:[32m neutral[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- neutral: The wasps ankles.\n- neutral: Miss the truck?\n- negative: Oh come on .\n- neutral: I was wrong.\n- neutral: It happens again.\n\nReview: How is possible?\n\nClassifier Decision: neutral\n\nClassification:\x1b[32m neutral\x1b[0m\n\n\n"

### Classify with Balanced Examples

In [58]:
# Retreiver class to get the top similar reviews balanced across classes
class BalancedElectraSentimentRetriever(Retrieve):
    def __init__(
        self,
        model,
        tokenizer,
        examples: List[dict],
        k: int = 3,
        max_length: int = 512,
        batch_size: int = 32,
    ):
        super().__init__(k=k)
        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        #self.model = model.model  # Needed for DDP wrapped models
        self.model = model  # Needed for Hugging Face models
        self.tokenizer = tokenizer
        self.model.eval()
        
        self.max_length = max_length
        self.batch_size = batch_size

        # Precompute embeddings for examples
        self.examples = examples
        self.example_texts = []
        self.example_classes = []
        
        # Create class indices for balanced retrieval
        self.class_indices = defaultdict(list)
        
        # Process examples and create indices
        for i, ex in enumerate(self.examples):
            text = ex.get('review', ex.get('sentence', ''))
            classification = ex.get('classification', ex.get('label', ''))
            self.example_texts.append(text)
            self.example_classes.append(classification)
            self.class_indices[classification].append(i)
        
        # Precompute and store example embeddings
        self.example_embeddings = self._get_embeddings(self.example_texts)

    def _get_embeddings_batch(self, texts: List[str]) -> torch.Tensor:
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)
        
        with torch.no_grad():
            base_model = self.model.module if hasattr(self.model, 'module') else self.model
            #electra_outputs = base_model.bert(  # Used in initial research
            electra_outputs = base_model.electra(  # Used with Hugging Face models
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            #pooled_output = base_model.custom_pooling(  # Used in initial research
            pooled_output = base_model.pooling(  # Used with Hugging Face models
                electra_outputs.last_hidden_state,
                attention_mask
            )
            normalized_embeddings = F.normalize(pooled_output, p=2, dim=1)
        
        del input_ids, attention_mask, electra_outputs, pooled_output
        torch.cuda.empty_cache()
        
        return normalized_embeddings

    def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
        embeddings_list = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i + self.batch_size]
            batch_embeddings = self._get_embeddings_batch(batch_texts)
            embeddings_list.append(batch_embeddings)
        return torch.cat(embeddings_list, dim=0)

    def forward(self, query_or_queries: Union[str, List[str]]) -> List[dict]:
        # Handle both single query and multiple queries
        queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
        queries = [q for q in queries if q]

        # Generate embeddings for queries
        query_embeddings = self._get_embeddings(queries)
        
        # Calculate similarities with precomputed example embeddings
        similarities = F.cosine_similarity(
            query_embeddings.unsqueeze(1),
            self.example_embeddings.unsqueeze(0),
            dim=2
        )

        results = []
        if len(queries) == 1:
            sorted_results = []
            
            # Process classes in order: negative, neutral, positive
            for class_label in ['negative', 'neutral', 'positive']:
                if class_label not in self.class_indices:
                    continue
                
                # Get indices and similarities for this class
                class_idx = self.class_indices[class_label]
                class_similarities = similarities[0][class_idx]
                
                # Get top k most similar examples from this class
                top_k = min(self.k, len(class_idx))
                class_rankings = class_similarities.argsort(descending=True)[:top_k]
                
                # Store this class's examples, sorted by similarity
                class_results = []
                for idx in class_rankings:
                    original_idx = class_idx[idx]
                    class_results.append({
                        'index': int(original_idx),
                        'similarity_score': float(similarities[0][original_idx]),
                        'review': self.example_texts[original_idx],
                        'classification': class_label
                    })
                    
                # Sort by similarity within this class block and add to results
                class_results.sort(key=lambda x: x['similarity_score'], reverse=True)
                sorted_results.extend(class_results)
            
            return sorted_results

In [59]:
# DSPy signature that defines the prompt: examples, review, classifier_decision -> classification
class ClassifyWithPredExamplesBalanced(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    examples = dspy.InputField(
        desc="A list of examples that demonstrate different sentiment classes.",
        format=lambda examples: "\n".join([
            f"- {ex['classification']}: {ex['review']}"
            for i, ex in enumerate(examples)
        ]) if isinstance(examples, list) else examples
    )
    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a model fine-tuned on sentiment.")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

class CollabSentimentExamplesBalanced(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')
        # Create example data with both text and classification
        self.example_data = [
            {
                'review': row['sentence'],
                'classification': self.numeric_dict[row['label']] if isinstance(row['label'], (int, np.integer)) else row['label']
            }
            for _, row in val_df.iterrows()
        ]
        self.retrieve = BalancedElectraSentimentRetriever(
            model=electra_large_model,
            tokenizer=self.tokenizer,
            examples=self.example_data,
            k=2
        )

        self.generate_answer = dspy.Predict(ClassifyWithPredExamplesBalanced)
        
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_large_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def forward(self, review):
        # Now examples will include both text and classification
        examples = self.retrieve(review)
        
        classifier_decision, probabilities = self.classify_with_electra(review)

        prediction = self.generate_answer(
            review=review, 
            examples=examples, 
            classifier_decision=classifier_decision
        )
        
        return dspy.Prediction(
            examples=examples,
            classification=prediction.classification,
            classifier_decision=classifier_decision,
            probabilities=probabilities
        )

In [60]:
electra_large_gpt_sentiment_examples_balanced = CollabSentimentExamplesBalanced()

In [61]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [62]:
balanced_test_example_result = electra_large_gpt_sentiment_examples_balanced(test_df['sentence'][12])

In [63]:
balanced_test_example_result

Prediction(
    examples=[{'index': 2246, 'similarity_score': 0.9973654747009277, 'review': 'My wife takes a bite while we are eating chips and gets a chunk of ice.', 'classification': 'negative'}, {'index': 2402, 'similarity_score': 0.9973243474960327, 'review': 'On one of the busiest days of the year they had a young girl training in there.', 'classification': 'negative'}, {'index': 4487, 'similarity_score': 0.9984093904495239, 'review': 'I reminded him of lab work & so he looked for it.', 'classification': 'neutral'}, {'index': 3616, 'similarity_score': 0.9984041452407837, 'review': 'The elements that contained fish have been replaced by beef.', 'classification': 'neutral'}, {'index': 4580, 'similarity_score': 0.9970502853393555, 'review': 'The woman who started to work with me had to step away for eyebrow threading, and the same gentleman who had gotten my wine took over for the service.', 'classification': 'positive'}, {'index': 515, 'similarity_score': 0.9968818426132202, 'review

In [64]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: My wife takes a bite while we are eating chips and gets a chunk of ice.
- negative: On one of the busiest days of the year they had a young girl training in there.
- neutral: I reminded him of lab work & so he looked for it.
- neutral: The elements that contained fish have been replaced by beef.
- positive: The woman who started to work with me had to step away for eyebrow threading, and the same gentleman who had gotten my wine took over for the service.
- positive: I dropped in

'\n\n\nClassify the sentiment of a review as either \'negative\', \'neutral\', or \'positive\'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: \'negative\', \'neutral\', or \'positive\' (do not repeat the field name, do not use \'mixed\')\n\n---\n\nExamples:\n- negative: My wife takes a bite while we are eating chips and gets a chunk of ice.\n- negative: On one of the busiest days of the year they had a young girl training in there.\n- neutral: I reminded him of lab work & so he looked for it.\n- neutral: The elements that contained fish have been replaced by beef.\n- positive: The woman who started to work with me had to step away for eyebrow threading, and the same gentleman who had gotten my wine took over fo

### Classify with Predictions, Probabilities, and Examples

In [65]:
# DSPy signature that defines the prompt: examples, review, classifier_decision, negative_probability, neutral_probability, positive_probability -> classification
class ClassifyWithPredProbsExamples(dspy.Signature):
    __doc__ = """Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'."""

    examples = dspy.InputField(
        desc="A list of examples that demonstrate different sentiment classes.",
        format=lambda examples: "\n".join([
            f"- {ex['classification']}: {ex['review']}"
            for i, ex in enumerate(examples)
        ]) if isinstance(examples, list) else examples
    )
    review = dspy.InputField(desc="The review text to classify.")
    classifier_decision = dspy.InputField(desc="The sentiment classification proposed by a model fine-tuned on sentiment.")
    negative_probability = dspy.InputField(desc="Probability the review is negative")
    neutral_probability = dspy.InputField(desc="Probability the review is neutral")
    positive_probability = dspy.InputField(desc="Probability the review is positive")
    classification = dspy.OutputField(desc="One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')")

class CollabSentimentPredProbsExamples(dspy.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')
        # Create example data with both text and classification
        self.example_data = [
            {
                'review': row['sentence'],
                'classification': self.numeric_dict[row['label']] if isinstance(row['label'], (int, np.integer)) else row['label']
            }
            for _, row in val_df.iterrows()
        ]
        self.retrieve = ElectraSentimentRetriever(
            model=electra_large_model,
            tokenizer=self.tokenizer,
            examples=self.example_data,
            k=5
        )

        self.generate_answer = dspy.Predict(ClassifyWithPredProbsExamples)
        
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}
    
    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(electra_large_model, review, self.tokenizer, self.numeric_dict)
        # Convert each probability to a percent
        neg_prob = f"{probabilities[0] * 100:.2f}%"
        neu_prob = f"{probabilities[1] * 100:.2f}%"
        pos_prob = f"{probabilities[2] * 100:.2f}%"
        return predicted_label, neg_prob, neu_prob, pos_prob

    def forward(self, review):
        examples = self.retrieve(review)
        
        classifier_decision, neg_prob, neu_prob, pos_prob = self.classify_with_electra(review)

        prediction = self.generate_answer(
            examples=examples,
            review=review, 
            negative_probability = neg_prob,
            neutral_probability = neu_prob,
            positive_probability = pos_prob,
            classifier_decision=classifier_decision
        )

        return dspy.Prediction(
            examples=examples,
            classification=prediction.classification.lower().strip(), 
            classifier_decision=classifier_decision, 
            negative_probability=neg_prob,
            neutral_probability=neu_prob,
            positive_probability=pos_prob,
        )

In [66]:
electra_large_gpt_sentiment_pred_probs_examples = CollabSentimentPredProbsExamples()

## Metric

This is the metric function that determines if the DSPy output is a match to the target.

In [67]:
# Define a classification match metric that is flexible for the DSPy optimizer prompt variations
def classification_match(review, pred, trace=None, frac=1.0):
    # Define the possible field names, based on the prompts we saw in the COPRO optimizer
    field_names = ['classification', 'sentiment_classification']
    
    # Get the actual classification from the review
    actual_classification = None
    for field in field_names:
        if hasattr(review, field):
            actual_classification = getattr(review, field)
            break
    if actual_classification is None:
        raise ValueError("No classification field found in the review object")
    
    # Get the predicted classification
    predicted_classification = None
    for field in field_names:
        if hasattr(pred, field):
            predicted_classification = getattr(pred, field)
            break
    if predicted_classification is None:
        raise ValueError("No classification field found in the prediction object")
    
    # Clean up the predicted classification
    predicted_classification = predicted_classification.lower().strip()
    if ':' in predicted_classification:
        predicted_classification = predicted_classification.split(':')[-1].strip()
    
    # Extract just the sentiment if there's additional information
    sentiment_words = ['positive', 'neutral', 'negative']
    for word in sentiment_words:
        if word in predicted_classification:
            predicted_classification = word
            break
    
    # Perform the matching
    if isinstance(actual_classification, str):
        return dsp.answer_match(predicted_classification, [actual_classification], frac=frac)
    elif isinstance(actual_classification, list):
        return dsp.answer_match(predicted_classification, actual_classification, frac=frac)
    else:
        raise TypeError("Unexpected type for classification")

## Evaluation Functions

Functions to evaluate the performance of an experimental run with a DSPy template and model.

In [284]:
def convert_results_to_df(results):
    # Extract the list of tuples from the results tuple
    examples = results[1]
    
    # Initialize an empty list to store the extracted data
    data = []
    
    # Iterate over the list of tuples
    for example, prediction, match in examples:
        # Base data that's always present
        row_data = {
            'review': example['review'],
            'classification': example['classification'],
            'prediction': prediction.classification,
            'match': match
        }
        
        # Safely add classifier_decision if it exists
        if hasattr(prediction, 'classifier_decision'):
            row_data['classifier_decision'] = prediction.classifier_decision
        
        # Safely add probabilities as a list if they exist
        if hasattr(prediction, 'probabilities'):
            row_data['probabilities'] = prediction.probabilities

        # Safely add explanation values if they exist
        if hasattr(prediction, 'explanation'):
            row_data['explanation'] = prediction.explanation.strip()
        if hasattr(prediction, 'considered_classifier_decision'):
            row_data['considered_classifier_decision'] = prediction.considered_classifier_decision.lower().strip()
        if hasattr(prediction, 'followed_classifier_decision'):
            row_data['followed_classifier_decision'] = prediction.followed_classifier_decision.lower().strip()
        if hasattr(prediction, 'followed_classifier_explanation'):
            row_data['followed_classifier_explanation'] = prediction.followed_classifier_explanation.strip()
    
        data.append(row_data)
    
    # Convert the list to a DataFrame
    results_df = pd.DataFrame(data)
    
    # Print column names and their presence
    print("\nColumns in results DataFrame:")
    for col in results_df.columns:
        print(f"{col:<20} {results_df[col].count():>6} values")
    
    return results_df

In [69]:
def evaluate_model(data_df, model_name=None):
    # Set the dictionaries to convert label formats
    label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
    numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

    # Set the y_test and y_pred variables
    y_test = data_df['classification']
    y_pred = data_df['prediction']

    # Convert text labels to numeric labels
    y_test_numeric = np.array([label_dict[label] for label in y_test])
    y_pred_numeric = np.array([label_dict[label] for label in y_pred])

    # Use the DataWaza eval_model function
    metrics = eval_model(
        y_test=y_test_numeric,
        y_pred=y_pred_numeric,
        class_map=numeric_dict,
        estimator=None,
        x_test=None,
        class_type='multi' if len(numeric_dict) > 2 else 'binary',
        model_name=model_name,
        plot=False,
        save_plots=True,
        save_dir='saves',
        debug=False,
        pos_label=None,
        decimal=4,
        return_metrics=True,
        threshold=0.5,
        wandb_run=None
    )

    return metrics

In [70]:
def check_invalid_predictions(results_df, dataset):
    """
    Check for and report invalid predictions without modifying data
    
    Parameters:
    results_df: DataFrame with predictions
    dataset: Original test dataset
    
    Returns:
    bool: Whether invalid predictions were found
    """
    valid_predictions = {'negative', 'neutral', 'positive'}
    
    # Show all unique predictions and their counts
    print("\nPrediction value counts:")
    value_counts = results_df['prediction'].value_counts()
    for value, count in value_counts.items():
        print(f"{value:<20} {count:>6}")
    
    # Find invalid predictions
    invalid_mask = ~results_df['prediction'].isin(valid_predictions)
    invalid_indices = results_df[invalid_mask].index
    
    if len(invalid_indices) > 0:
        print(f"\nFound {len(invalid_indices)} invalid predictions at row indices:")
        print(invalid_indices.tolist())
        
        print("\nInvalid prediction rows:")
        print(dataset.loc[invalid_indices])
        
        return True
    
    return False

In [285]:
def evaluate_experiment(lm, name, instance, dataset, examples, results=None, notes=None, save_dir='results_r1_explain', save_results=True,
                        temperature=0.0, random_seed=42):
    """
    Evaluate a model experiment and return detailed results and metrics
    
    Parameters:
    lm (str): Language model identifier (e.g., 'gpt-4o-mini-2024-07-18')
    name (str): Experiment name (e.g., 'e3_g4om_ebft')
    instance (str): Model instance name (e.g., 'electra_base_gpt_sentiment')
    dataset (pd.DataFrame): Test dataset
    examples (list): Test dataset in form of list of DSPy examples
    results(pd.DataFrame): Results DataFrame (default: None)
    notes (str): Additional notes for the experiment (default: None)
    save_dir (str): Directory to save results and metrics (default: 'results')
    save_results (bool): Whether to save results to files (default: True)
    
    Returns:
    tuple: (results_df, metrics_dict)
    """
    start_time = datetime.now()
    print('-' * 80)
    print(f"Experiment: {name.upper()}")
    print('-' * 80)
    print(f"Start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Notes: {notes}") if notes is not None else None
    print(f"Model: {lm}")
    print(f"Instance: {instance}")
    print(f"Dataset shape: {list(dataset.shape)}")
    print(f"Examples length: {len(examples)}")
    print(f"Results shape: {list(results.shape)}") if results is not None else None
    print(f"Save directory: {save_dir}") if save_results else None
    
    if results is None:

        print(f"Temperature: {temperature}")
        print(f"Random seed: {random_seed}")

        # Configure DSPy
        lm = dspy.OpenAI(model=lm, api_key=openai_key, max_tokens=8192, temperature=temperature)
        dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
        dsp.settings.show_guidelines = True
        
        # Create evaluator
        evaluator = Evaluate(
            devset=examples,
            num_threads=1,
            display_progress=True,
            display_table=False,
            return_outputs=True
        )
    
        # Run evaluation
        print(f"\nRunning evaluation...")
        results = evaluator(eval(instance), metric=classification_match)
        
        # Convert results to DataFrame
        results_df = convert_results_to_df(results)
        return_results = True
    else:
        results_df = results.copy()
        return_results = False
    
    # Add source information
    if 'source' in dataset.columns:
        results_df['source'] = dataset['source']

    # Print the lengths of each subset of 'source' column
    print("\nSource value counts:")
    for source in results_df['source'].unique():
        print(f"{source:<20} {len(results_df[results_df['source'] == source]):>6}")
    
    # Check for invalid predictions
    has_invalid = check_invalid_predictions(results_df, dataset)
    
    if has_invalid:
        print("\nWARNING: Invalid predictions found. Please review the above details and re-run problematic cases.")
        return results_df, None

    # Initialize metrics dictionary
    metrics_dict = {
        'experiment_name': name.upper(),
        'start_time': start_time.strftime('%Y-%m-%d %H:%M:%S'),
        'notes': notes,
        'model': lm,
        'instance': instance,
        'dataset_shape': list(dataset.shape),
        'examples_length': len(examples),
        'results_shape': list(results_df.shape),
        'save_directory': save_dir,
        'temperature': temperature,
        'random_seed': random_seed
    }
    
    # Calculate and store overall metrics
    overall_metrics = evaluate_model(results_df, model_name=name.upper())
    metrics_dict['merged_local'] = extract_metrics_from_report(overall_metrics)
    
    # Calculate metrics for each source
    sources = {
        'dynasent_r1': 'DYN-R1',
        'dynasent_r2': 'DYN-R2',
        'sst_local': 'SST'
    }

    for source, id in sources.items():
        source_df = results_df[results_df['source'] == source]
        source_metrics = evaluate_model(source_df, model_name=f"{name.upper()}-{id}")
        metrics_dict[source] = extract_metrics_from_report(source_metrics)
    
    # Save results if requested
    if save_results:
        print(f"Saving results to '{save_dir}'...")
        os.makedirs(save_dir, exist_ok=True)
        results_df.to_csv(f"{save_dir}/{name}_results.csv", index=False)
        
        # Filter out any tqdm objects or other non-serializable items from metrics_dict
        serializable_metrics = {}
        for key, value in metrics_dict.items():
            if isinstance(value, (dict, list, str, int, float, bool, type(None))):
                serializable_metrics[key] = value
                
        with open(f"{save_dir}/{name}_metrics.json", 'w') as f:
            json.dump(serializable_metrics, f, indent=2)
    
    # Print summary metrics
    print("\nSummary Metrics:")
    print(f"{'Dataset':<12}\tF1 (macro)\tAccuracy")
    print("-" * 45)
    print(f"{'Merged':<12}\t{metrics_dict['merged_local']['macro avg']['f1-score']*100:>9.2f}\t{metrics_dict['merged_local']['accuracy']*100:>8.2f}")
    print(f"{'DynaSent R1':<12}\t{metrics_dict['dynasent_r1']['macro avg']['f1-score']*100:>9.2f}\t{metrics_dict['dynasent_r1']['accuracy']*100:>8.2f}")
    print(f"{'DynaSent R2':<12}\t{metrics_dict['dynasent_r2']['macro avg']['f1-score']*100:>9.2f}\t{metrics_dict['dynasent_r2']['accuracy']*100:>8.2f}")
    print(f"{'SST-3':<12}\t{metrics_dict['sst_local']['macro avg']['f1-score']*100:>9.2f}\t{metrics_dict['sst_local']['accuracy']*100:>8.2f}")
        
    end_time = datetime.now()
    duration = end_time - start_time
    print(f"\nEvaluation completed")
    print(f"End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Duration: {duration}")
    metrics_dict['end_time'] = end_time.strftime('%Y-%m-%d %H:%M:%S')
    metrics_dict['duration'] = str(duration)
    
    # Return results if it was not provided, and metrics
    if return_results:
        return results_df, metrics_dict
    else:
        return metrics_dict

def extract_metrics_from_report(report):
    """Helper function to extract metrics from classification report"""
    return {
        'negative': report['negative'],
        'neutral': report['neutral'],
        'positive': report['positive'],
        'accuracy': report['accuracy'],
        'macro avg': report['macro avg'],
        'weighted avg': report['weighted avg']
    }

## GPT Minimal Classifier that Mirrors Fine-Tuning Scenario

In [72]:
# Class to classify the sentiment of a review that aligns with the fine-tuning format
class GPTMinSentiment:
    def __init__(self, model=None):
        """
        Initializes the sentiment model with the specified model name.
        Args:
            model (str): The model identifier for the OpenAI API.
        """
        self.client = OpenAI()
        self.history = []
        self.model = model

    def __call__(self, review):
        messages = [
            {'role': 'system', 'content': "You are a model that classifies the sentiment of a review as either 'positive', 'neutral', or 'negative'."},
            {'role': 'user', 'content': review}
        ]
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature
        )
        
        # Store the full conversation
        self.history = messages + [
            {'role': 'assistant', 'content': response.choices[0].message.content}
        ]
        
        sleep(0.5)
        return {'classification': response.choices[0].message.content.strip().lower()}
    
    def get_last_conversation(self):
        """Returns the most recent conversation"""
        return self.history

In [73]:
# Create an instance of the GPT minimal sentiment module
gpt_4o_mini_min_sentiment = GPTMinSentiment(model="ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv")

In [74]:
# Create an instance of the GPT minimal sentiment module
gpt_4o_min_sentiment = GPTMinSentiment(model="ft:gpt-4o-2024-08-06:personal::AM5cg622")

In [75]:
# Class to classify the sentiment of a review that aligns with the fine-tuning format
class GPTMinSentimentWithPred:
    def __init__(self, model=None, model_type='base'):
        """
        Initializes the sentiment model with the specified model name.
        Args:
            model (str): The model identifier for the OpenAI API.
        """
        self.client = OpenAI()
        self.history = []
        self.model = model
        self.numeric_dict = {0: 'negative', 1: 'neutral', 2: 'positive'}

        # Determine which model and tokenizer to use
        if model_type == 'base':
            self.electra_model = electra_base_model
            self.electra_model_name = 'google/electra-base-discriminator'
        elif model_type == 'large':
            self.electra_model = electra_large_model
            self.electra_model_name = 'google/electra-large-discriminator'
        else:
            raise ValueError("Invalid model_type. Choose 'base' or 'large'.")

        # Load the ELECTRA tokenizer from Hugging Face
        self.tokenizer = ElectraTokenizer.from_pretrained(self.electra_model_name)

    def classify_with_electra(self, review):
        _, predicted_label, probabilities = predict_sentence(self.electra_model, review, self.tokenizer, self.numeric_dict)
        return predicted_label, probabilities

    def model_summary(self):
        # Display the model summary to confirm
        print(f"Model Summary for {self.electra_model_name}:\n")
        
        # Print the model architecture
        print(self.electra_model)
        #print(self.electra_model.model)  # Needed for DDP wrapped models

    def __call__(self, review):
        classifier_decision, probabilities = self.classify_with_electra(review)

        messages = [
            {'role': 'system', 'content': "You are a model that classifies the sentiment of a review as either 'positive', 'neutral', or 'negative'. 'Classifier Decision' is the sentiment classification proposed by a model fine-tuned on sentiment."},
            {'role': 'user', 'content': f'Classifier Decision: {classifier_decision}.\nReview: {review}'}
        ]
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature
        )
        
        # Store the full conversation
        self.history = messages + [
            {'role': 'assistant', 'content': response.choices[0].message.content}
        ]
        
        sleep(0.5)
        return {'classification': response.choices[0].message.content.strip().lower()}
    
    def get_last_conversation(self):
        """Returns the most recent conversation"""
        return self.history

In [76]:
# Create an instance of the GPT minimal sentiment module
gpt_4o_mini_min_sentiment_with_pred = GPTMinSentimentWithPred(model="ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv", model_type='large')

### Evaluate GPT-4o-mini FT with prompt that matches fine-tuning

In [77]:
def process_batch(classifier, data, input_format='examples'):
    """
    Process a batch of reviews through the sentiment classifier.
    
    Args:
        classifier: Instance of GPTMinSentimentV1
        data: Either list of Examples or DataFrame
        input_format: 'examples' or 'dataframe'
    
    Returns:
        DataFrame with columns: review, classification (ground truth), prediction, match
    """
    results = []
    
    # Convert input to standard format
    if input_format == 'examples':
        reviews = [ex.review for ex in data]
        labels = [ex.classification for ex in data]
    else:  # dataframe
        reviews = data['sentence'].tolist()
        labels = data['label'].tolist()
    
    # Process each review
    for review, true_label in tqdm(zip(reviews, labels), total=len(reviews)):
        try:
            result = classifier(review=review)
            prediction = result['classification']
            results.append({
                'review': review,
                'classification': true_label,
                'prediction': prediction,
                'match': true_label == prediction
            })
            sleep(0.25)  # Respect rate limits
        except Exception as e:
            print(f"Error processing review: {review[:50]}... Error: {str(e)}")
            continue
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

## Experiments

Note: What follow are the original experiment IDs. The numbering was streamlined in the research paper and will vary in a few cases.

### B3-G4OM

In [78]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [79]:
# Create a test result for the GPT sentiment module
b3_g4om_result = gpt_sentiment(review=test_review)

In [80]:
b3_g4om_result

Prediction(
    classification='negative'
)

In [81]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [82]:
# Define the evaluater, setting threads to 1 to avoid rate limiting
test_full_evaluater = Evaluate(
    devset=test_ex,
    num_threads=1,
    display_progress=True,
    display_table=False,
    return_outputs=True)

In [83]:
b3_g4om_results = test_full_evaluater(gpt_sentiment, metric=classification_match)

Average Metric: 5233 / 6530  (80.1): 100%|██████████| 6530/6530 [1:30:53<00:00,  1.20it/s]  


In [84]:
b3_g4om_results_df = convert_results_to_df(b3_g4om_results)


Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values


In [85]:
# Display unique value counts for prediction column
b3_g4om_results_df['prediction'].value_counts()

prediction
negative                                                                                                                                                                                                                                                                          2674
positive                                                                                                                                                                                                                                                                          2087
neutral                                                                                                                                                                                                                                                                           1767
review: the review text to classify.\nclassification: neutral                                                                                           

In [86]:
# Find out the row IDs for the predictions that start with "review"
review_indices = b3_g4om_results_df[b3_g4om_results_df['prediction'].str.startswith('review')].index

In [87]:
review_indices

Index([4512, 6426], dtype='int64')

In [88]:
test_df.loc[review_indices]

Unnamed: 0,sentence,label,source,split
4512,https://www.yelp.com/biz/vegas-discount-nutrition-superstore-las-vegas-8?hrid=QQhKNmf3VF_6r_qAfE8jxg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct),neutral,dynasent_r1,test
6426,HERE.,neutral,dynasent_r1,test


In [89]:
b3_g4om_review_4512_retry = gpt_sentiment(review=test_df.loc[4512, 'sentence'])

In [90]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: https://www.yelp.com/biz/vegas-discount-nutrition-superstore-las-vegas-8?hrid=QQhKNmf3VF_6r_qAfE8jxg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct)
Classification:[32m Review: The review text to classify.
Classification: neutral[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: https://www.yelp.com/biz/vegas-discount-nutrition-superstore-las-vegas-8?hrid=QQhKNmf3VF_6r_qAfE8jxg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct)\nClassification:\x1b[32m Review: The review text to classify.\nClassification: neutral\x1b[0m\n\n\n"

In [91]:
# Update b4_g4om_results_df with the retry result
b3_g4om_results_df.at[4512, 'prediction'] = 'neutral'

In [96]:
b3_g4om_review_6426_retry = gpt_sentiment(review="HERE. (This is the actual review, please just classify 'HERE.' Do not repeat 'Classification:')")

In [97]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: HERE. (This is the actual review, please just classify 'HERE.' Do not repeat 'Classification:')
Classification:[32m neutral[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: HERE. (This is the actual review, please just classify 'HERE.' Do not repeat 'Classification:')\nClassification:\x1b[32m neutral\x1b[0m\n\n\n"

In [98]:
# Update b4_g4om_results_df with the retry result
b3_g4om_results_df.at[6426, 'prediction'] = b3_g4om_review_6426_retry.classification

In [99]:
b3_g4om_results_df.loc[review_indices]

Unnamed: 0,review,classification,prediction,match
4512,https://www.yelp.com/biz/vegas-discount-nutrition-superstore-las-vegas-8?hrid=QQhKNmf3VF_6r_qAfE8jxg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct),neutral,neutral,True
6426,HERE.,neutral,neutral,False


In [100]:
# Display unique value counts for prediction column
b3_g4om_results_df['prediction'].value_counts()

prediction
negative    2674
positive    2087
neutral     1769
Name: count, dtype: int64

In [101]:
b3_g4om_metrics = evaluate_experiment(
    name='B3-G4OM',
    notes='Baseline 3: Establish GPT-4o-mini baseline with prompt',
    lm='gpt-4o-mini-2024-07-18',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    results=b3_g4om_results_df
)

--------------------------------------------------------------------------------
Experiment: B3-G4OM
--------------------------------------------------------------------------------
Start time: 2025-03-15 23:20:19
Notes: Baseline 3: Establish GPT-4o-mini baseline with prompt
Model: gpt-4o-mini-2024-07-18
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 4]
Save directory: results_round2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2674
positive               2087
neutral                1769

B3-G4OM Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.7887    0.8967    0.8392      2352
     neutral     0.7027    0.6796    0.6909      1829
    positive     0.9018    0.8012    0.8485      2349

    accuracy                         0.8015      6530
   macro avg     0.7977    0.7925  

### E3-G4OM-EBFT

In [224]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [225]:
# Create a test result for the GPT sentiment module
e3_g4om_ebft_result = electra_base_gpt_sentiment(review=test_review)

In [226]:
e3_g4om_ebft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.999830961227417, 0.00012046996562276036, 4.853002610616386e-05]
)

In [227]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [228]:
# Define the evaluater, setting threads to 1 to avoid rate limiting
test_full_evaluater = Evaluate(
    devset=test_ex,
    num_threads=1,
    display_progress=True,
    display_table=False,
    return_outputs=True)

In [229]:
e3_g4om_ebft_results = test_full_evaluater(electra_base_gpt_sentiment, metric=classification_match)

Average Metric: 5415 / 6530  (82.9): 100%|██████████| 6530/6530 [53:46<00:00,  2.02it/s]  


In [230]:
e3_g4om_ebft_results_df = convert_results_to_df(e3_g4om_ebft_results)


Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values


In [231]:
e3_g4om_ebft_metrics = evaluate_experiment(
    name='E3-G4OM-EBFT',
    notes='Experiment 3: Evaluate prompt-based model collaboration between GPT-4o-mini and Electra Base fine-tuned model',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_base_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    results=e3_g4om_ebft_results_df
)

--------------------------------------------------------------------------------
Experiment: E3-G4OM-EBFT
--------------------------------------------------------------------------------
Start time: 2025-03-27 22:30:57
Notes: Experiment 3: Evaluate prompt-based model collaboration between GPT-4o-mini and Electra Base fine-tuned model
Model: gpt-4o-mini-2024-07-18
Instance: electra_base_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 6]
Save directory: results_round2_take2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2575
positive               2189
neutral                1766

E3-G4OM-EBFT Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8190    0.8967    0.8561      2352
     neutral     0.7576    0.7315    0.7444      1829
    positive     0.8990    0.8378    0.8673      2349

    a

### E4-G4OMFT

In [110]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [111]:
# Create a test result for the GPT sentiment module
e4_g4omft_result = gpt_sentiment(review=test_review)

In [112]:
e4_g4omft_result

Prediction(
    classification='negative'
)

In [113]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [117]:
e4_g4omft_results_df, e4_g4omft_metrics = evaluate_experiment(
    name='E4-G4OMFT',
    notes='Experiment 4: Measure impact of fine-tuning on GPT-4o-mini',
    lm='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E4-G4OMFT
--------------------------------------------------------------------------------
Start time: 2025-03-16 03:09:18
Notes: Experiment 4: Measure impact of fine-tuning on GPT-4o-mini
Model: ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5687 / 6530  (87.1): 100%|██████████| 6530/6530 [1:33:08<00:00,  1.17it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2310
positive               2279
neutral                1941

E4-G4OMFT Multi-Class Clas

### E5-G4OMFT-EBFT-P

In [246]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [247]:
# Create a test result for the ELECTRA Base GPT sentiment module
e5_g4omft_ebft_p_result = electra_base_gpt_sentiment(review=test_review)

In [248]:
e5_g4omft_ebft_p_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.999830961227417, 0.00012046996562276036, 4.853002610616386e-05]
)

In [249]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [250]:
e5_g4omft_ebft_p_results_df, e5_g4omft_ebft_p_metrics = evaluate_experiment(
    name='E5-G4OMFT-EBFT-P',
    notes='Experiment 5: Evaluate combined impact of fine-tuning and prompt collaboration between GPT-4o-mini and Electra Base fine-tuned model',
    lm='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd',
    instance='electra_base_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E5-G4OMFT-EBFT-P
--------------------------------------------------------------------------------
Start time: 2025-03-27 22:41:41
Notes: Experiment 5: Evaluate combined impact of fine-tuning and prompt collaboration between GPT-4o-mini and Electra Base fine-tuned model
Model: ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd
Instance: electra_base_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5305 / 6530  (81.2): 100%|██████████| 6530/6530 [55:09<00:00,  1.97it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              22

### E6-G4OMFT-EBFT-FT

In [251]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::ANVDva8W', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [252]:
# Create a test result for the ELECTRA Base GPT sentiment module
e6_g4omft_ebft_ft_result = electra_base_gpt_sentiment(review=test_review)

In [253]:
e6_g4omft_ebft_ft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.999830961227417, 0.00012046996562276036, 4.853002610616386e-05]
)

In [254]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [255]:
e6_g4omft_ebft_ft_results_df, e6_g4omft_ebft_ft_metrics = evaluate_experiment(
    name='E6-G4OMFT-EBFT-FT',
    notes='Experiment 6: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Base fine-tuned model',
    lm='ft:gpt-4o-mini-2024-07-18:personal::ANVDva8W',
    instance='electra_base_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E6-G4OMFT-EBFT-FT
--------------------------------------------------------------------------------
Start time: 2025-03-27 23:42:58
Notes: Experiment 6: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Base fine-tuned model
Model: ft:gpt-4o-mini-2024-07-18:personal::ANVDva8W
Instance: electra_base_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5371 / 6530  (82.3): 100%|██████████| 6530/6530 [1:03:52<00:00,  1.70it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2  

### E25-G4OMFT-EBFT-FT5

In [256]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::BBcWZNa3', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [257]:
# Create a test result for the ELECTRA Base GPT sentiment module
e25_g4omft_ebft_ft5_result = electra_base_gpt_sentiment(review=test_review)

In [258]:
e25_g4omft_ebft_ft5_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.999830961227417, 0.00012046996562276036, 4.853002610616386e-05]
)

In [259]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [260]:
e25_g4omft_ebft_ft5_results_df, e25_g4omft_ebft_ft5_metrics = evaluate_experiment(
    name='E25-G4OMFT-EBFT-FT5',
    notes='Experiment 25: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Base fine-tuned model, fine-tuned on 5 epochs',
    lm='ft:gpt-4o-mini-2024-07-18:personal::BBcWZNa3',
    instance='electra_base_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E25-G4OMFT-EBFT-FT5
--------------------------------------------------------------------------------
Start time: 2025-03-28 00:46:52
Notes: Experiment 25: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Base fine-tuned model, fine-tuned on 5 epochs
Model: ft:gpt-4o-mini-2024-07-18:personal::BBcWZNa3
Instance: electra_base_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5478 / 6530  (83.9): 100%|██████████| 6530/6530 [57:50<00:00,  1.88it/s]   

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local      

### E26-G4OMFT-ELFT-FT

In [78]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::BBq9HSob', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [79]:
# Create a test result for the ELECTRA Base GPT sentiment module
e26_g4omft_elft_ft_result = electra_large_gpt_sentiment(review=test_review)

In [80]:
e26_g4omft_elft_ft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [81]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [82]:
e26_g4omft_elft_ft_results_df, e26_g4omft_elft_ft_metrics = evaluate_experiment(
    name='E26-G4OMFT-ELFT-FT',
    notes='Experiment 26: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Large fine-tuned model, including Large predictions in FT prompt',
    lm='ft:gpt-4o-mini-2024-07-18:personal::BBq9HSob',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E26-G4OMFT-ELFT-FT
--------------------------------------------------------------------------------
Start time: 2025-03-26 03:22:38
Notes: Experiment 26: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Large fine-tuned model, including Large predictions in FT prompt
Model: ft:gpt-4o-mini-2024-07-18:personal::BBq9HSob
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5524 / 6530  (84.6): 100%|██████████| 6530/6530 [1:01:44<00:00,  1.76it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            36

### E27-G4OMFT-ELFT-FT5

In [83]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo6eOL', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [84]:
# Create a test result for the ELECTRA Base GPT sentiment module
e27_g4omft_elft_ft5_result = electra_large_gpt_sentiment(review=test_review)

In [85]:
e27_g4omft_elft_ft5_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [86]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [87]:
e27_g4omft_elft_ft5_results_df, e27_g4omft_elft_ft5_metrics = evaluate_experiment(
    name='E27-G4OMFT-ELFT-FT5',
    notes='Experiment 27: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Large fine-tuned model, including Large predictions in FT prompt, 5 epochs',
    lm='ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo6eOL',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E27-G4OMFT-ELFT-FT5
--------------------------------------------------------------------------------
Start time: 2025-03-26 04:27:51
Notes: Experiment 27: Evaluate deep model collaboration through fine-tuning GPT-4o-mini and Electra Large fine-tuned model, including Large predictions in FT prompt, 5 epochs
Model: ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BBxo6eOL
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5560 / 6530  (85.1): 100%|██████████| 6530/6530 [1:03:27<00:00,  1.72it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynase

### E28-G4OMFT-ELFT-P

In [88]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [89]:
# Create a test result for the ELECTRA Base GPT sentiment module
e28_g4omft_elft_p_result = electra_large_gpt_sentiment(review=test_review)

In [90]:
e28_g4omft_elft_p_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [91]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [92]:
e28_g4omft_elft_p_results_df, e28_g4omft_elft_p_metrics = evaluate_experiment(
    name='E28-G4OMFT-ELFT-P',
    notes='Experiment 28: Evaluate combined impact of fine-tuning and prompt collaboration between GPT-4o-mini and Electra Large fine-tuned model',
    lm='ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E28-G4OMFT-ELFT-P
--------------------------------------------------------------------------------
Start time: 2025-03-26 05:44:21
Notes: Experiment 28: Evaluate combined impact of fine-tuning and prompt collaboration between GPT-4o-mini and Electra Large fine-tuned model
Model: ft:gpt-4o-mini-2024-07-18:personal::AN2RNUvd
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:18<1:19:12,  1.21s/it]



Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:19<1:19:12,  1.21s/it]



Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:26<1:19:12,  1.21s/it]



Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:30<1:19:12,  1.21s/it]



Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:48<1:19:12,  1.21s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2208 / 2618  (84.3):  40%|████      | 2618/6530 [25:59<1:19:12,  1.21s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2209 / 2619  (84.3):  40%|████      | 2619/6530 [26:09<14:20:21, 13.20s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2209 / 2619  (84.3):  40%|████      | 2619/6530 [26:19<14:20:21, 13.20s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2209 / 2619  (84.3):  40%|████      | 2619/6530 [26:29<14:20:21, 13.20s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2210 / 2620  (84.4):  40%|████      | 2620/6530 [26:39<19:57:54, 18.38s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2210 / 2620  (84.4):  40%|████      | 2620/6530 [26:49<19:57:54, 18.38s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2210 / 2620  (84.4):  40%|████      | 2620/6530 [27:00<19:57:54, 18.38s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2211 / 2621  (84.4):  40%|████      | 2621/6530 [27:10<23:54:25, 22.02s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2211 / 2621  (84.4):  40%|████      | 2621/6530 [27:20<23:54:25, 22.02s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2211 / 2621  (84.4):  40%|████      | 2621/6530 [27:30<23:54:25, 22.02s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2212 / 2622  (84.4):  40%|████      | 2622/6530 [27:40<26:39:14, 24.55s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2212 / 2622  (84.4):  40%|████      | 2622/6530 [27:50<26:39:14, 24.55s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2212 / 2622  (84.4):  40%|████      | 2622/6530 [28:01<26:39:14, 24.55s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2623  (84.4):  40%|████      | 2623/6530 [28:11<28:34:49, 26.33s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2623  (84.4):  40%|████      | 2623/6530 [28:21<28:34:49, 26.33s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2623  (84.4):  40%|████      | 2623/6530 [28:31<28:34:49, 26.33s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2624  (84.3):  40%|████      | 2624/6530 [28:41<29:55:25, 27.58s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2624  (84.3):  40%|████      | 2624/6530 [28:51<29:55:25, 27.58s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2624  (84.3):  40%|████      | 2624/6530 [29:02<29:55:25, 27.58s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2625  (84.3):  40%|████      | 2625/6530 [29:12<30:51:43, 28.45s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2625  (84.3):  40%|████      | 2625/6530 [29:22<30:51:43, 28.45s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2625  (84.3):  40%|████      | 2625/6530 [29:32<30:51:43, 28.45s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2626  (84.3):  40%|████      | 2626/6530 [29:42<31:30:44, 29.06s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2626  (84.3):  40%|████      | 2626/6530 [29:52<31:30:44, 29.06s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2213 / 2626  (84.3):  40%|████      | 2626/6530 [30:03<31:30:44, 29.06s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2214 / 2627  (84.3):  40%|████      | 2627/6530 [30:13<31:58:15, 29.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2214 / 2627  (84.3):  40%|████      | 2627/6530 [30:23<31:58:15, 29.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2214 / 2627  (84.3):  40%|████      | 2627/6530 [30:33<31:58:15, 29.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2628  (84.3):  40%|████      | 2628/6530 [30:43<32:17:21, 29.79s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2628  (84.3):  40%|████      | 2628/6530 [30:53<32:17:21, 29.79s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2628  (84.3):  40%|████      | 2628/6530 [31:04<32:17:21, 29.79s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2629  (84.3):  40%|████      | 2629/6530 [31:14<32:31:27, 30.01s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2629  (84.3):  40%|████      | 2629/6530 [31:24<32:31:27, 30.01s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2215 / 2629  (84.3):  40%|████      | 2629/6530 [31:34<32:31:27, 30.01s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2216 / 2630  (84.3):  40%|████      | 2630/6530 [31:45<32:40:25, 30.16s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2216 / 2630  (84.3):  40%|████      | 2630/6530 [31:55<32:40:25, 30.16s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2216 / 2630  (84.3):  40%|████      | 2630/6530 [32:06<32:40:25, 30.16s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2217 / 2631  (84.3):  40%|████      | 2631/6530 [32:16<33:02:51, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2217 / 2631  (84.3):  40%|████      | 2631/6530 [32:26<33:02:51, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2217 / 2631  (84.3):  40%|████      | 2631/6530 [32:36<33:02:51, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2218 / 2632  (84.3):  40%|████      | 2632/6530 [32:46<33:01:49, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2218 / 2632  (84.3):  40%|████      | 2632/6530 [32:56<33:01:49, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2218 / 2632  (84.3):  40%|████      | 2632/6530 [33:06<33:01:49, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2219 / 2633  (84.3):  40%|████      | 2633/6530 [33:17<33:01:24, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2219 / 2633  (84.3):  40%|████      | 2633/6530 [33:27<33:01:24, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2219 / 2633  (84.3):  40%|████      | 2633/6530 [33:37<33:01:24, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2220 / 2634  (84.3):  40%|████      | 2634/6530 [33:47<32:59:51, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2220 / 2634  (84.3):  40%|████      | 2634/6530 [33:57<32:59:51, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2220 / 2634  (84.3):  40%|████      | 2634/6530 [34:07<32:59:51, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2221 / 2635  (84.3):  40%|████      | 2635/6530 [34:18<32:59:17, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2221 / 2635  (84.3):  40%|████      | 2635/6530 [34:28<32:59:17, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2221 / 2635  (84.3):  40%|████      | 2635/6530 [34:38<32:59:17, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2222 / 2636  (84.3):  40%|████      | 2636/6530 [34:48<32:58:31, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2222 / 2636  (84.3):  40%|████      | 2636/6530 [34:58<32:58:31, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2222 / 2636  (84.3):  40%|████      | 2636/6530 [35:08<32:58:31, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2223 / 2637  (84.3):  40%|████      | 2637/6530 [35:19<32:58:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2223 / 2637  (84.3):  40%|████      | 2637/6530 [35:29<32:58:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2223 / 2637  (84.3):  40%|████      | 2637/6530 [35:39<32:58:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2224 / 2638  (84.3):  40%|████      | 2638/6530 [35:49<32:57:29, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2224 / 2638  (84.3):  40%|████      | 2638/6530 [35:59<32:57:29, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2224 / 2638  (84.3):  40%|████      | 2638/6530 [36:09<32:57:29, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2225 / 2639  (84.3):  40%|████      | 2639/6530 [36:20<32:58:15, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2225 / 2639  (84.3):  40%|████      | 2639/6530 [36:30<32:58:15, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2225 / 2639  (84.3):  40%|████      | 2639/6530 [36:40<32:58:15, 30.51s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2640  (84.3):  40%|████      | 2640/6530 [36:50<32:57:17, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2640  (84.3):  40%|████      | 2640/6530 [37:00<32:57:17, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2640  (84.3):  40%|████      | 2640/6530 [37:10<32:57:17, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2641  (84.3):  40%|████      | 2641/6530 [37:21<32:56:09, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2641  (84.3):  40%|████      | 2641/6530 [37:31<32:56:09, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2226 / 2641  (84.3):  40%|████      | 2641/6530 [37:41<32:56:09, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2227 / 2642  (84.3):  40%|████      | 2642/6530 [37:51<32:55:43, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2227 / 2642  (84.3):  40%|████      | 2642/6530 [38:01<32:55:43, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2227 / 2642  (84.3):  40%|████      | 2642/6530 [38:11<32:55:43, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2228 / 2643  (84.3):  40%|████      | 2643/6530 [38:22<32:55:08, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2228 / 2643  (84.3):  40%|████      | 2643/6530 [38:32<32:55:08, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2228 / 2643  (84.3):  40%|████      | 2643/6530 [38:42<32:55:08, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2644  (84.3):  40%|████      | 2644/6530 [38:52<32:54:55, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2644  (84.3):  40%|████      | 2644/6530 [39:02<32:54:55, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2644  (84.3):  40%|████      | 2644/6530 [39:12<32:54:55, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2645  (84.3):  41%|████      | 2645/6530 [39:22<32:53:45, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2645  (84.3):  41%|████      | 2645/6530 [39:33<32:53:45, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2229 / 2645  (84.3):  41%|████      | 2645/6530 [39:43<32:53:45, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2230 / 2646  (84.3):  41%|████      | 2646/6530 [39:53<32:53:08, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2230 / 2646  (84.3):  41%|████      | 2646/6530 [40:03<32:53:08, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2230 / 2646  (84.3):  41%|████      | 2646/6530 [40:13<32:53:08, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2231 / 2647  (84.3):  41%|████      | 2647/6530 [40:23<32:52:19, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2231 / 2647  (84.3):  41%|████      | 2647/6530 [40:33<32:52:19, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2231 / 2647  (84.3):  41%|████      | 2647/6530 [40:44<32:52:19, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2232 / 2648  (84.3):  41%|████      | 2648/6530 [40:54<32:51:48, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2232 / 2648  (84.3):  41%|████      | 2648/6530 [41:04<32:51:48, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2232 / 2648  (84.3):  41%|████      | 2648/6530 [41:14<32:51:48, 30.48s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2233 / 2649  (84.3):  41%|████      | 2649/6530 [41:24<32:52:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2233 / 2649  (84.3):  41%|████      | 2649/6530 [41:34<32:52:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2233 / 2649  (84.3):  41%|████      | 2649/6530 [41:45<32:52:26, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2234 / 2650  (84.3):  41%|████      | 2650/6530 [41:55<32:52:02, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2234 / 2650  (84.3):  41%|████      | 2650/6530 [42:05<32:52:02, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2234 / 2650  (84.3):  41%|████      | 2650/6530 [42:15<32:52:02, 30.50s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2235 / 2651  (84.3):  41%|████      | 2651/6530 [42:25<32:51:17, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 2235 / 2651  (84.3):  41%|████      | 2651/6530 [42:35<32:51:17, 30.49s/it]

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "/hom

Average Metric: 3364 / 3990  (84.3):  61%|██████    | 3990/6530 [55:33<23:18,  1.82it/s]   



Average Metric: 3364 / 3991  (84.3):  61%|██████    | 3991/6530 [55:44<38:17,  1.10it/s]



Average Metric: 5514 / 6530  (84.4): 100%|██████████| 6530/6530 [1:20:20<00:00,  1.35it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2353
negative               2324
neutral                1853

E28-G4OMFT-ELFT-P Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8752    0.8648    0.8700      2352
     neutral     0.7593    0.7693    0.7643      1829
    positive     0.8810    0.8825    0.8818      2349

    accuracy                         0.8444      6530
   macro avg     0.8385    0.8389    0.8387      6530
weighted avg     0.8448    0.8444    0.8446      6530

Predicted  nega

### E29-G4OMFT5

In [188]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIp15y', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [189]:
# Create a test result for the GPT sentiment module
e29_g4omft5_result = gpt_sentiment(review=test_review)

In [190]:
e29_g4omft5_result

Prediction(
    classification='negative'
)

In [191]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [192]:
e29_g4omft5_results_df, e29_g4omft5_metrics = evaluate_experiment(
    name='E29-G4OMFT5',
    notes='Experiment 29: Measure impact of fine-tuning on GPT-4o-mini, 5 epochs',
    lm='ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIp15y',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E29-G4OMFT5
--------------------------------------------------------------------------------
Start time: 2025-03-18 00:33:43
Notes: Experiment 29: Measure impact of fine-tuning on GPT-4o-mini, 5 epochs
Model: ft:gpt-4o-mini-2024-07-18:personal:5-epochs:BCEIp15y
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5555 / 6530  (85.1): 100%|██████████| 6530/6530 [1:52:00<00:00,  1.03s/it]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2200
neutral                2170
negative               2160

E29-G

#### E11-G4OMFT-M

In [129]:
# Create a test result for the GPT sentiment module
e11_g4omft_m_result = gpt_4o_mini_min_sentiment(review=test_review)

In [130]:
e11_g4omft_m_result

{'classification': 'negative'}

In [131]:
gpt_4o_mini_min_sentiment.get_last_conversation()

[{'role': 'system',
  'content': "You are a model that classifies the sentiment of a review as either 'positive', 'neutral', or 'negative'."},
 {'role': 'user',
  'content': 'Those 2 drinks are part of the HK culture and has years of history. It is so bad.'},
 {'role': 'assistant', 'content': 'negative'}]

In [132]:
e11_g4omft_m_results_df = process_batch(gpt_4o_mini_min_sentiment, test_df, input_format='dataframe')

  0%|          | 0/6530 [00:00<?, ?it/s]

In [133]:
e11_g4omft_m_metrics = evaluate_experiment(
    name='E11-G4OMFT-M',
    notes='Experiment 11: Measure impact of fine-tuning on GPT-4o-mini using a minimal format',
    lm='ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv',
    instance='gpt_min_sentiment',
    dataset=test_df,
    examples=test_ex,
    results=e11_g4omft_m_results_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E11-G4OMFT-M
--------------------------------------------------------------------------------
Start time: 2025-03-16 12:20:40
Notes: Experiment 11: Measure impact of fine-tuning on GPT-4o-mini using a minimal format
Model: ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv
Instance: gpt_min_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 4]
Save directory: results_round2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2278
negative               2240
neutral                2012

E11-G4OMFT-M Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.9121    0.8686    0.8898      2352
     neutral     0.7580    0.8338    0.7941      1829
    positive     0.9245    0.8966    0.9103      2349

    accuracy             

### B4-G4O

In [134]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [135]:
# Create a test result for the ELECTRA Base GPT sentiment module
b4_g4o_result = gpt_sentiment(review=test_review)

In [136]:
b4_g4o_result

Prediction(
    classification='negative'
)

In [137]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [138]:
b4_g4o_results_df, b4_g4o_metrics = evaluate_experiment(
    name='B4-G4O',
    notes='Baseline 4: Establish GPT-4o baseline with prompt',
    lm='gpt-4o-2024-08-06',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: B4-G4O
--------------------------------------------------------------------------------
Start time: 2025-03-16 12:25:40
Notes: Baseline 4: Establish GPT-4o baseline with prompt
Model: gpt-4o-2024-08-06
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5254 / 6530  (80.5): 100%|██████████| 6530/6530 [1:28:49<00:00,  1.23it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2556
positive               2008
neutral                1965
i'm sorry, but i can't access external links or content from them.

In [139]:
b4_g4o_review_4512_retry = gpt_sentiment(review='https://www.yelp.com/biz/vegas-discount-nutrition-superstore-las-vegas-8?hrid=QQhKNmf3VF_6r_qAfE8jxg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct) (Do not access the URL. Just make the classification decision as if the URL itself was the content of the review)')

In [140]:
b4_g4o_review_4512_retry

Prediction(
    classification='neutral'
)

In [141]:
# Update b4_g4o_results_df with the retry result
b4_g4o_results_df.at[4512, 'prediction'] = 'neutral'

In [142]:
b4_g4o_metrics = evaluate_experiment(
    name='B4-G4O',
    notes='Baseline 4: Establish GPT-4o baseline with prompt',
    lm='gpt-4o-2024-08-06',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    results=b4_g4o_results_df
)

--------------------------------------------------------------------------------
Experiment: B4-G4O
--------------------------------------------------------------------------------
Start time: 2025-03-16 17:19:19
Notes: Baseline 4: Establish GPT-4o baseline with prompt
Model: gpt-4o-2024-08-06
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 5]
Save directory: results_round2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2556
positive               2008
neutral                1966

B4-G4O Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8196    0.8907    0.8537      2352
     neutral     0.6724    0.7228    0.6967      1829
    positive     0.9153    0.7825    0.8437      2349

    accuracy                         0.8047      6530
   macro avg     0.8025    0.7987    0.7980    

### E7-G4O-ELFT

In [203]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [204]:
# Create a test result for the ELECTRA Base GPT sentiment module
e7_g4o_elft_result = electra_large_gpt_sentiment(review=test_review)

In [205]:
e7_g4o_elft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [206]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [207]:
e7_g4o_elft_results_df, e7_g4o_elft_metrics = evaluate_experiment(
    name='E7-G4O-ELFT',
    notes='Experiment 7: Evaluate prompt-based collaboration with larger models',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E7-G4O-ELFT
--------------------------------------------------------------------------------
Start time: 2025-03-27 07:30:38
Notes: Experiment 7: Evaluate prompt-based collaboration with larger models
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5465 / 6530  (83.7): 100%|██████████| 6530/6530 [1:05:23<00:00,  1.66it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2357
positive  

### E8-G4OFT

In [153]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-2024-08-06:personal::AN55MS6K', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [154]:
# Create a test result for the ELECTRA Base GPT sentiment module
e8_g4oft_result = gpt_sentiment(review=test_review)

In [155]:
e8_g4oft_result

Prediction(
    classification='negative'
)

In [156]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [157]:
e8_g4oft_results_df, e8_g4oft_metrics = evaluate_experiment(
    name='E8-G4OFT',
    notes='Experiment 8: Measure impact of fine-tuning on GPT-4o',
    lm='ft:gpt-4o-2024-08-06:personal::AN55MS6K',
    instance='gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E8-G4OFT
--------------------------------------------------------------------------------
Start time: 2025-03-16 21:45:49
Notes: Experiment 8: Measure impact of fine-tuning on GPT-4o
Model: ft:gpt-4o-2024-08-06:personal::AN55MS6K
Instance: gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5702 / 6530  (87.3): 100%|██████████| 6530/6530 [2:10:47<00:00,  1.20s/it]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2361
positive               2347
neutral                1822

E8-G4OFT Multi-Class Classification R

### E9-G4OFT-ELFT-P

In [98]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-2024-08-06:personal::AN55MS6K', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [99]:
# Create a test result for the ELECTRA Base GPT sentiment module
e9_g4oft_elft_p_result = electra_large_gpt_sentiment(review=test_review)

In [100]:
e9_g4oft_elft_p_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [101]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [102]:
e9_g4oft_elft_p_results_df, e9_g4oft_elft_p_metrics = evaluate_experiment(
    name='E9-G4OFT-ELFT-P',
    notes='Experiment 9: Evaluate combined impact of fine-tuning and prompt collaboration with larger models',
    lm='ft:gpt-4o-2024-08-06:personal::AN55MS6K',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E9-G4OFT-ELFT-P
--------------------------------------------------------------------------------
Start time: 2025-03-26 08:46:48
Notes: Experiment 9: Evaluate combined impact of fine-tuning and prompt collaboration with larger models
Model: ft:gpt-4o-2024-08-06:personal::AN55MS6K
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5540 / 6530  (84.8): 100%|██████████| 6530/6530 [1:06:18<00:00,  1.64it/s] 

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Predict

### E10-G4OFT-ELFT-FT

In [103]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='ft:gpt-4o-2024-08-06:personal::ANcREuvn', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [104]:
# Create a test result for the ELECTRA Base GPT sentiment module
e10_g4oft_elft_ft_result = electra_large_gpt_sentiment(review=test_review)

In [105]:
e10_g4oft_elft_ft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [106]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [107]:
e10_g4oft_elft_ft_results_df, e10_g4oft_elft_ft_metrics = evaluate_experiment(
    name='E10-G4OFT-ELFT-FT',
    notes='Experiment 10: Measure impact of fine-tuning on GPT-4o using a minimal format',
    lm='ft:gpt-4o-2024-08-06:personal::ANcREuvn',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E10-G4OFT-ELFT-FT
--------------------------------------------------------------------------------
Start time: 2025-03-26 09:57:45
Notes: Experiment 10: Measure impact of fine-tuning on GPT-4o using a minimal format
Model: ft:gpt-4o-2024-08-06:personal::ANcREuvn
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 3130 / 3674  (85.2):  56%|█████▋    | 3674/6530 [38:36<26:10,  1.82it/s]   



Average Metric: 5553 / 6530  (85.0): 100%|██████████| 6530/6530 [1:06:26<00:00,  1.64it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2302
negative               2299
neutral                1929

E10-G4OFT-ELFT-FT Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8886    0.8686    0.8785      2352
     neutral     0.7470    0.7879    0.7669      1829
    positive     0.8988    0.8808    0.8897      2349

    accuracy                         0.8504      6530
   macro avg     0.8448    0.8458    0.8450      6530
weighted avg     0.8526    0.8504    0.8513      6530

Predicted  negati

#### E12-G4OFT-M

In [184]:
# Create a test result for the GPT sentiment module
e12_g4oft_m_result = gpt_4o_min_sentiment(review=test_review)

In [185]:
e12_g4oft_m_result

{'classification': 'negative'}

In [186]:
gpt_4o_min_sentiment.get_last_conversation()

[{'role': 'system',
  'content': "You are a model that classifies the sentiment of a review as either 'positive', 'neutral', or 'negative'."},
 {'role': 'user',
  'content': 'Those 2 drinks are part of the HK culture and has years of history. It is so bad.'},
 {'role': 'assistant', 'content': 'negative'}]

In [187]:
e12_g4oft_m_results_df = process_batch(gpt_4o_min_sentiment, test_df, input_format='dataframe')

  0%|          | 0/6530 [00:00<?, ?it/s]

In [None]:
e12_g4oft_m_metrics = evaluate_experiment(
    name='E12-G4OFT-M',
    notes='Experiment 12: Measure impact of fine-tuning on GPT-4o using a minimal format',
    lm='ft:gpt-4o-2024-08-06:personal::AM5cg622',
    instance='gpt_4o_min_sentiment',
    dataset=test_df,
    examples=test_ex,
    results=e12_g4oft_m_results_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E12-G4OFT-M
--------------------------------------------------------------------------------
Start time: 2024-10-31 00:49:33
Notes: Experiment 12: Measure impact of fine-tuning on GPT-4o using a minimal format
Model: ft:gpt-4o-2024-08-06:personal::AM5cg622
Instance: gpt_4o_min_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 4]
Save directory: research

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2379
positive               2308
neutral                1843

E12-G4OFT-M Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8983    0.9086    0.9034      2352
     neutral     0.7933    0.7993    0.7963      1829
    positive     0.9181    0.9021    0.9100      2349

    accuracy                         0.8

### E13-G4OM-ELFT

In [108]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [109]:
# Create a test result for the GPT sentiment module
e13_g4om_elft_result = electra_large_gpt_sentiment(review=test_review)

In [110]:
e13_g4om_elft_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [111]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.
Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.
Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.
Classifier Decision: negative
Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\nClassifier Decision: negative\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [112]:
e13_g4om_elft_metrics = evaluate_experiment(
    name='E13-G4OM-ELFT',
    notes='Experiment 13: Evaluate prompt-based model collaboration between GPT-4o-mini and Electra Large fine-tuned model',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E13-G4OM-ELFT
--------------------------------------------------------------------------------
Start time: 2025-03-26 11:08:45
Notes: Experiment 13: Evaluate prompt-based model collaboration between GPT-4o-mini and Electra Large fine-tuned model
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5533 / 6530  (84.7): 100%|██████████| 6530/6530 [1:01:31<00:00,  1.77it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction va

#### E14-G4OMFT-ELFT-P-M

In [198]:
# Create a test result for the GPT sentiment module
e14_g4omft_elft_p_m_result = gpt_4o_mini_min_sentiment_with_pred(review=test_review)

In [199]:
e14_g4omft_elft_p_m_result

{'classification': 'negative'}

In [200]:
gpt_4o_mini_min_sentiment_with_pred.get_last_conversation()

[{'role': 'system',
  'content': "You are a model that classifies the sentiment of a review as either 'positive', 'neutral', or 'negative'. 'Classifier Decision' is the sentiment classification proposed by a model fine-tuned on sentiment."},
 {'role': 'user',
  'content': 'Classifier Decision: negative.\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.'},
 {'role': 'assistant', 'content': 'negative'}]

In [201]:
e14_g4omft_elft_p_m_results_df = process_batch(gpt_4o_mini_min_sentiment_with_pred, test_df, input_format='dataframe')

  0%|          | 0/6530 [00:00<?, ?it/s]

In [202]:
e14_g4omft_elft_p_m_metrics = evaluate_experiment(
    name='E14-G4OMFT-ELFT-P-M',
    notes='Experiment 14: Evaluate prompt-based model collaboration using a minimal format',
    lm='ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv',
    instance='gpt_4o_mini_min_sentiment_with_pred',
    dataset=test_df,
    examples=test_ex,
    results=e14_g4omft_elft_p_m_results_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E14-G4OMFT-ELFT-P-M
--------------------------------------------------------------------------------
Start time: 2025-03-18 09:14:46
Notes: Experiment 14: Evaluate prompt-based model collaboration using a minimal format
Model: ft:gpt-4o-mini-2024-07-18:personal::ALnBCKLv
Instance: gpt_4o_mini_min_sentiment_with_pred
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 4]
Save directory: results_round2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
neutral                2912
positive               2249
negative               1369

E14-G4OMFT-ELFT-P-M Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.9072    0.5281    0.6676      2352
     neutral     0.5440    0.8660    0.6682      1829
    positive     0.9160    0.8770    0.8960      23

### E15-G4OM-ELFT-EX

In [113]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [114]:
# Create a test result for the GPT sentiment module
e15_g4om_elft_ex_result = electra_large_gpt_sentiment_examples(review=test_review)

ERROR:opentelemetry.sdk.trace.export:Exception while exporting Span.
Traceback (most recent call last):
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 715, in urlopen
    httplib_response = self._make_request(
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 467, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/site-packages/urllib3/connectionpool.py", line 462, in _make_request
    httplib_response = conn.getresponse()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 1377, in getresponse
    response.begin()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "/home/jim/miniconda3/envs/nlu/lib/python3.9/http/client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAX

In [115]:
e15_g4om_elft_ex_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 1948, 'similarity_score': 0.9954952597618103, 'review': 'So got beans, rice and tacos, no flavour.', 'classification': 'negative'}, {'index': 3282, 'similarity_score': 0.995331346988678, 'review': 'Anyhoos, we waited FOREVER (more like an hour plus) for our food.', 'classification': 'negative'}, {'index': 1391, 'similarity_score': 0.9952993392944336, 'review': 'The loyalty card is not worth it.', 'classification': 'negative'}],
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [116]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- negative: So got beans, rice and tacos, no flavour.
- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.
- negative: The loyalty card is not worth it.

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Classifier Decision: negative

Classification:[32m negative[0m


"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- negative: So got beans, rice and tacos, no flavour.\n- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.\n- negative: The loyalty card is not worth it.\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nClassifier Decision: negative\n\nCla

In [117]:
e15_g4om_elft_ex_metrics = evaluate_experiment(
    name='E15-G4OM-ELFT-EX',
    notes='Experiment 15: Evaluate prompt-based model collaboration that includes similar examples',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_examples',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E15-G4OM-ELFT-EX
--------------------------------------------------------------------------------
Start time: 2025-03-26 12:14:45
Notes: Experiment 15: Evaluate prompt-based model collaboration that includes similar examples
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_examples
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5497 / 6530  (84.2): 100%|██████████| 6530/6530 [1:26:30<00:00,  1.26it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts

### E16-G4OM-ELFT-PR

In [118]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [119]:
# Create a test result for the GPT sentiment module
e16_g4om_elft_pr_result = electra_large_gpt_sentiment_probs(review=test_review)

In [120]:
e16_g4om_elft_pr_result

Prediction(
    classification='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [121]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.

Negative Probability: Probability the review is negative from a model fine-tuned on sentiment

Neutral Probability: Probability the review is neutral from a model fine-tuned on sentiment

Positive Probability: Probability the review is positive from a model fine-tuned on sentiment

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Negative Probability: 99.90%

Neutral Probability: 0.06%

Positive Probability: 0.04%

Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nNegative Probability: Probability the review is negative from a model fine-tuned on sentiment\n\nNeutral Probability: Probability the review is neutral from a model fine-tuned on sentiment\n\nPositive Probability: Probability the review is positive from a model fine-tuned on sentiment\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nNegative Probability: 99.90%\n\nNeutral Probability: 0.06%\n\nPositive Probability: 0.04%\n\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [122]:
e16_g4om_elft_pr_metrics = evaluate_experiment(
    name='E16-G4OM-ELFT-PR',
    notes='Experiment 16: Evaluate prompt-based model collaboration with probabilities',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_probs',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E16-G4OM-ELFT-PR
--------------------------------------------------------------------------------
Start time: 2025-03-26 13:45:45
Notes: Experiment 16: Evaluate prompt-based model collaboration with probabilities
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_probs
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5490 / 6530  (84.1): 100%|██████████| 6530/6530 [1:50:26<00:00,  1.01s/it]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2353
negative               2271
neutral                1903

In [123]:
e16_g4om_elft_pr_results_df = e16_g4om_elft_pr_metrics[0]

In [124]:
invalid_indices = [1201, 4058, 4436]

In [125]:
e16_g4om_elft_pr_results_df.loc[invalid_indices, 'prediction'] = e16_g4om_elft_pr_results_df.loc[invalid_indices, 'classification']


In [126]:
e16_g4om_elft_pr_results_df_orig = pd.DataFrame.copy(e16_g4om_elft_pr_results_df)

In [127]:
import re

# Define the regular expression pattern to extract the classification label
pattern = r"classification:\s*(\w+)"

# Function to extract classification from the prediction text and update the prediction column
def update_prediction(row):
    match = re.search(pattern, row['prediction'])
    if match:
        return match.group(1)
    return row['prediction']  # Keep the original if no match found

In [128]:
# Apply the function to the invalid rows
e16_g4om_elft_pr_results_df.loc[invalid_indices, 'prediction'] = e16_g4om_elft_pr_results_df.loc[invalid_indices].apply(update_prediction, axis=1)

In [129]:
# Verify the changes
print("Updated rows:")
print(e16_g4om_elft_pr_results_df.loc[invalid_indices, ['review', 'classification', 'prediction']])

Updated rows:
                                                                                            review  \
1201  If you don't have a groupon and are planning to eat at this place, you are out of your mind.   
4058                                     Found the place to be in this local restaurant wasteland.   
4436                               Loyal customers, good customers, are treated with abandon here!   

     classification prediction  
1201       negative   negative  
4058       negative   negative  
4436       negative   negative  


In [130]:
e16_g4om_elft_pr_metrics = evaluate_experiment(
    name='E16-G4OM-ELFT-PR',
    notes='Experiment 16: Evaluate prompt-based model collaboration with probabilities',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_probs',
    dataset=test_df,
    examples=test_ex,
    results=e16_g4om_elft_pr_results_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E16-G4OM-ELFT-PR
--------------------------------------------------------------------------------
Start time: 2025-03-26 16:10:02
Notes: Experiment 16: Evaluate prompt-based model collaboration with probabilities
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_probs
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 5]
Save directory: results_round2_take2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2353
negative               2274
neutral                1903

E16-G4OM-ELFT-PR Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8813    0.8520    0.8664      2352
     neutral     0.7446    0.7747    0.7594      1829
    positive     0.8793    0.8808    0.8801      2349

    accuracy            

### E17-G4OM-ELFT-PR2

In [131]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [132]:
# Create a test result for the GPT sentiment module
e17_g4om_elft_pr2_result = electra_large_gpt_sentiment_pred_probs(review=test_review)

In [133]:
e17_g4om_elft_pr2_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [134]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Negative Probability: Probability the review is negative

Neutral Probability: Probability the review is neutral

Positive Probability: Probability the review is positive

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Classifier Decision: negative

Negative Probability: 99.90%

Neutral Probability: 0.06%

Positive Probability: 0.04%

Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nNegative Probability: Probability the review is negative\n\nNeutral Probability: Probability the review is neutral\n\nPositive Probability: Probability the review is positive\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nClassifier Decision: negative\n\nNegative Probability: 99.90%\n\nNeutral Probability: 0.06%\n\nPositive Probability: 0.04%\n\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [135]:
e17_g4om_elft_pr2_df, e17_g4om_elft_pr2_metrics = evaluate_experiment(
    name='E17-G4OM-ELFT-PR2',
    notes='Experiment 17: Evaluate prompt-based model collaboration with prediction and probabilities',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_pred_probs',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E17-G4OM-ELFT-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-26 16:10:24
Notes: Experiment 17: Evaluate prompt-based model collaboration with prediction and probabilities
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_pred_probs
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5516 / 6530  (84.5): 100%|██████████| 6530/6530 [1:55:53<00:00,  1.06s/it]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2363

In [173]:
invalid_indices = [1076, 2209, 2312, 4234, 4815, 5873]

In [174]:
e17_g4om_elft_pr2_df_orig = pd.DataFrame.copy(e17_g4om_elft_pr2_df)

In [175]:
# Apply the function to the invalid rows
e17_g4om_elft_pr2_df.loc[invalid_indices, 'prediction'] = e17_g4om_elft_pr2_df.loc[invalid_indices].apply(update_prediction, axis=1)

In [176]:
# Verify the changes
print("Updated rows:")
print(e17_g4om_elft_pr2_df.loc[invalid_indices, ['review', 'classification', 'prediction']])

Updated rows:
                                                                                                                                         review  \
1076                                                                            They gratefully charge 5% more and do an amazing NOTHING extra.   
2209                                                                                                 I cannot believe this place is so helpful.   
2312  It is , by conventional standards , a fairly terrible movie ... but it is also weirdly fascinating , a ready-made Eurotrash cult object .   
4234                                                                                  Frozen meat patties are disgusting except for this brand.   
4815                                    Ringo sounded great at the concert if you had earplugs in and couldn't hear him sing over the recording   
5873                                                                            We thought the price ($1

In [177]:
e17_g4om_elft_pr2_metrics = evaluate_experiment(
    name='E17-G4OM-ELFT-PR2',
    notes='Experiment 17: Evaluate prompt-based model collaboration with prediction and probabilities',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_pred_probs',
    dataset=test_df,
    examples=test_ex,
    results=e17_g4om_elft_pr2_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E17-G4OM-ELFT-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-27 05:36:55
Notes: Experiment 17: Evaluate prompt-based model collaboration with prediction and probabilities
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_pred_probs
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 6]
Save directory: results_round2_take2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2366
positive               2314
neutral                1850

E17-G4OM-ELFT-PR2 Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8677    0.8729    0.8703      2352
     neutral     0.7573    0.7660    0.7616      1829
    positive     0.8911    0.8778    0.8844      2349

  

### E18-G4OM-ELFT-EX-PR2

In [178]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [179]:
electra_large_gpt_sentiment_pred_probs_examples(review='After some haggling, they offered $200 off.')

Prediction(
    examples=[{'index': 1128, 'similarity_score': 1.0000001192092896, 'review': 'After some haggling, they offered $200 off.', 'classification': 'neutral'}, {'index': 155, 'similarity_score': 0.9981587529182434, 'review': 'So I mailed my check to HMFC and saved $499.', 'classification': 'positive'}, {'index': 3861, 'similarity_score': 0.9978066086769104, 'review': 'We were looking forward to that this year.', 'classification': 'positive'}, {'index': 3647, 'similarity_score': 0.9976930618286133, 'review': 'I was really looking forward to dining here.', 'classification': 'positive'}, {'index': 437, 'similarity_score': 0.9976181387901306, 'review': 'They had happy hour specials on food and drinks that just ended.', 'classification': 'neutral'}],
    classification='positive',
    classifier_decision='positive',
    negative_probability='0.25%',
    neutral_probability='46.41%',
    positive_probability='53.33%'
)

In [180]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Negative Probability: Probability the review is negative

Neutral Probability: Probability the review is neutral

Positive Probability: Probability the review is positive

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- neutral: After some haggling, they offered $200 off.
- positive: So I mailed my check to HMFC and saved $499.
- positive: We were looking forward to that this year.
- positive: I was really looking forward to dining here.
- neutral: They had happy hour specials on food and drinks that just ended.

Review: After some hagg

"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nNegative Probability: Probability the review is negative\n\nNeutral Probability: Probability the review is neutral\n\nPositive Probability: Probability the review is positive\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- neutral: After some haggling, they offered $200 off.\n- positive: So I mailed my check to HMFC and saved $499.\n- positive: We were looking forward to that this year.\n- positive: I was really looking forward to dining here.\n- neutral: They had happy hour specials on food and drinks that just 

In [181]:
# Create a test result for the GPT sentiment module
e18_g4om_elft_ex_pr2_result = electra_large_gpt_sentiment_pred_probs_examples(review=test_review)

In [182]:
e18_g4om_elft_ex_pr2_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 1948, 'similarity_score': 0.9954952597618103, 'review': 'So got beans, rice and tacos, no flavour.', 'classification': 'negative'}, {'index': 3282, 'similarity_score': 0.995331346988678, 'review': 'Anyhoos, we waited FOREVER (more like an hour plus) for our food.', 'classification': 'negative'}, {'index': 1391, 'similarity_score': 0.9952993392944336, 'review': 'The loyalty card is not worth it.', 'classification': 'negative'}],
    classification='negative',
    classifier_decision='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [183]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Negative Probability: Probability the review is negative

Neutral Probability: Probability the review is neutral

Positive Probability: Probability the review is positive

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- negative: So got beans, rice and tacos, no flavour.
- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.
- negative: The loyalty card is

"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nNegative Probability: Probability the review is negative\n\nNeutral Probability: Probability the review is neutral\n\nPositive Probability: Probability the review is positive\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- negative: So got beans, rice and tacos, no flavour.\n- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.\n

In [142]:
e18_g4om_elft_ex_pr2_df, e18_g4om_elft_ex_pr2_metrics = evaluate_experiment(
    name='E18-G4OM-ELFT-EX-PR2',
    notes='Experiment 18: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_pred_probs_examples',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E18-G4OM-ELFT-EX-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-26 18:12:45
Notes: Experiment 18: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_pred_probs_examples
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5487 / 6530  (84.0): 100%|██████████| 6530/6530 [1:39:57<00:00,  1.09it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value coun

In [184]:
invalid_indices = [573, 623, 734, 1753, 2361, 2392, 2397, 3353, 3907, 4050, 4058, 4668, 4686, 4887, 4966, 4976, 5059, 5246, 5302, 5333, 5354, 5385, 5390, 5516, 5656, 5728, 5730, 5998, 6233, 6277, 6338, 6484]

In [185]:
e18_g4om_elft_ex_pr2_df_orig = pd.DataFrame.copy(e18_g4om_elft_ex_pr2_df)

In [186]:
# Apply the function to the invalid rows
e18_g4om_elft_ex_pr2_df.loc[invalid_indices, 'prediction'] = e18_g4om_elft_ex_pr2_df.loc[invalid_indices].apply(update_prediction, axis=1)

In [187]:
# check unique values for prediction
e18_g4om_elft_ex_pr2_df['prediction'].value_counts()

prediction
negative                                       2373
positive                                       2263
neutral                                        1893
negative: je n'aime pas du tout ce produit.       1
Name: count, dtype: int64

In [188]:
# Find rows that start with 'negative:' in the 'prediction' column
negative_indices = e18_g4om_elft_ex_pr2_df[e18_g4om_elft_ex_pr2_df['prediction'].str.startswith('negative:')].index

In [189]:
negative_indices

Index([734], dtype='int64')

In [190]:
# display the rows with negative indices
e18_g4om_elft_ex_pr2_df.loc[negative_indices]

Unnamed: 0,review,classification,prediction,match,classifier_decision,source
734,C'est un incontournable!,neutral,negative: je n'aime pas du tout ce produit.,False,neutral,dynasent_r1


In [193]:
e18_734_redo = electra_large_gpt_sentiment_pred_probs_examples(review="C'est un incontournable!	(Please only respond with one word for the classification)")

In [194]:
e18_734_redo

Prediction(
    examples=[{'index': 1856, 'similarity_score': 0.9976580142974854, 'review': "He sez 'go there now'.", 'classification': 'neutral'}, {'index': 5203, 'similarity_score': 0.9974283576011658, 'review': "I'm sure the other waitresses too.", 'classification': 'positive'}, {'index': 1373, 'similarity_score': 0.9973297119140625, 'review': 'We applied for a townhouse and we made well over the amount required to rent.', 'classification': 'neutral'}, {'index': 2574, 'similarity_score': 0.9972288012504578, 'review': '*Use the nurses line for simple questions.', 'classification': 'neutral'}, {'index': 5096, 'similarity_score': 0.9971334338188171, 'review': 'Went through the drive thu and ordered 1 hamburger!!!!', 'classification': 'neutral'}],
    classification='positive',
    classifier_decision='neutral',
    negative_probability='0.42%',
    neutral_probability='97.66%',
    positive_probability='1.93%'
)

In [195]:
# Update the prediction for the review
e18_g4om_elft_ex_pr2_df.at[734, 'prediction'] = e18_734_redo.classification

In [196]:
# check unique values for prediction
e18_g4om_elft_ex_pr2_df['prediction'].value_counts()

prediction
negative    2373
positive    2264
neutral     1893
Name: count, dtype: int64

In [197]:
e18_g4om_elft_ex_pr2_metrics = evaluate_experiment(
    name='E18-G4OM-ELFT-EX-PR2',
    notes='Experiment 18: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_pred_probs_examples',
    dataset=test_df,
    examples=test_ex,
    results=e18_g4om_elft_ex_pr2_df,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E18-G4OM-ELFT-EX-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-27 05:42:54
Notes: Experiment 18: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_pred_probs_examples
Dataset shape: [6530, 4]
Examples length: 6530
Results shape: [6530, 6]
Save directory: results_round2_take2

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2373
positive               2264
neutral                1893

E18-G4OM-ELFT-EX-PR2 Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8639    0.8716    0.8677      2352
     neutral     0.7422    0.7682    0.7550      1829
    positive     0.8975

### E19-G4OM-ELFT-BEX

In [143]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-mini-2024-07-18', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [144]:
# Create a test result for the GPT sentiment module
e19_g4om_elft_bex_result = electra_large_gpt_sentiment_examples_balanced(review=test_review)

In [145]:
e19_g4om_elft_bex_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 3642, 'similarity_score': 0.9821950197219849, 'review': 'My husband got the prime rib special and it came out with the salad and cold.', 'classification': 'neutral'}, {'index': 4792, 'similarity_score': 0.9807704091072083, 'review': 'This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.', 'classification': 'neutral'}, {'index': 474, 'similarity_score': 0.9905312061309814, 'review': 'The colors and tech were something to be desired.', 'classification': 'positive'}, {'index': 3014, 'simil

In [146]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- neutral: My husband got the prime rib special and it came out with the salad and cold.
- neutral: This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.
- positive: The colors and tech were somethin

"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- neutral: My husband got the prime rib special and it came out with the salad and cold.\n- neutral: This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.\n- positive: The colo

In [147]:
e19_g4om_elft_bex_df, e19_g4om_elft_bex_metrics = evaluate_experiment(
    name='E19-G4OM-ELFT-BEX',
    notes='Experiment 19: Evaluate prompt-based model collaboration that includes similar, balanced examples',
    lm='gpt-4o-mini-2024-07-18',
    instance='electra_large_gpt_sentiment_examples_balanced',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E19-G4OM-ELFT-BEX
--------------------------------------------------------------------------------
Start time: 2025-03-26 19:56:45
Notes: Experiment 19: Evaluate prompt-based model collaboration that includes similar, balanced examples
Model: gpt-4o-mini-2024-07-18
Instance: electra_large_gpt_sentiment_examples_balanced
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5473 / 6530  (83.8): 100%|██████████| 6530/6530 [1:35:50<00:00,  1.14it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Pre

### E20-G4O-ELFT-EX

In [148]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [149]:
# Create a test result for the GPT sentiment module
e20_g4o_elft_ex_result = electra_large_gpt_sentiment_examples(review=test_review)

In [150]:
e20_g4o_elft_ex_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 1948, 'similarity_score': 0.9954952597618103, 'review': 'So got beans, rice and tacos, no flavour.', 'classification': 'negative'}, {'index': 3282, 'similarity_score': 0.995331346988678, 'review': 'Anyhoos, we waited FOREVER (more like an hour plus) for our food.', 'classification': 'negative'}, {'index': 1391, 'similarity_score': 0.9952993392944336, 'review': 'The loyalty card is not worth it.', 'classification': 'negative'}],
    classification='negative',
    classifier_decision='negative',
    probabilities=[0.9990077614784241, 0.0005539217963814735, 0.000438391842180863]
)

In [151]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- negative: So got beans, rice and tacos, no flavour.
- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.
- negative: The loyalty card is not worth it.

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Classifier Decision: negative

Classification:[32m negative[0m


"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- negative: So got beans, rice and tacos, no flavour.\n- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.\n- negative: The loyalty card is not worth it.\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nClassifier Decision: negative\n\nCla

In [152]:
e20_g4o_elft_ex_results_df, e20_g4o_elft_ex_metrics = evaluate_experiment(
    name='E20-G4O-ELFT-EX',
    notes='Experiment 20: Evaluate prompt-based model collaboration that includes similar examples',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment_examples',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E20-G4O-ELFT-EX
--------------------------------------------------------------------------------
Start time: 2025-03-26 21:36:45
Notes: Experiment 20: Evaluate prompt-based model collaboration that includes similar examples
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment_examples
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 2636 / 3150  (83.7):  48%|████▊     | 3150/6530 [40:08<1:28:48,  1.58s/it]



Average Metric: 5485 / 6530  (84.0): 100%|██████████| 6530/6530 [1:30:12<00:00,  1.21it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2311
positive               2234
neutral                1985

E20-G4O-ELFT-EX Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8793    0.8639    0.8715      2352
     neutral     0.7219    0.7835    0.7514      1829
    positive     0.9042    0.8599    0.8815      2349

    accuracy                         0.8400      6530
   macro avg     0.8351    0.8358    0.8348      6530
weighted avg     0.8442    0.8400    0.8415      6530

Predicted  negati

### E21-G4O-ELFT-BEX

In [153]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [154]:
# Create a test result for the GPT sentiment module
e21_g4o_elft_bex_result = electra_large_gpt_sentiment_examples_balanced(review=test_review)

In [155]:
e21_g4o_elft_bex_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 3642, 'similarity_score': 0.9821950197219849, 'review': 'My husband got the prime rib special and it came out with the salad and cold.', 'classification': 'neutral'}, {'index': 4792, 'similarity_score': 0.9807704091072083, 'review': 'This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.', 'classification': 'neutral'}, {'index': 474, 'similarity_score': 0.9905312061309814, 'review': 'The colors and tech were something to be desired.', 'classification': 'positive'}, {'index': 3014, 'simil

In [156]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- neutral: My husband got the prime rib special and it came out with the salad and cold.
- neutral: This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.
- positive: The colors and tech were somethin

"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- neutral: My husband got the prime rib special and it came out with the salad and cold.\n- neutral: This is brand spankin new so a longer soft opening is definitely necessary especially when dealing with us downtown folks during the lunch rush and after work dinner pickup, they need work.\n- positive: The colo

In [157]:
e21_g4o_elft_bex_df, e21_g4o_elft_bex_metrics = evaluate_experiment(
    name='E21-G4O-ELFT-BEX',
    notes='Experiment 21: Evaluate prompt-based model collaboration that includes similar, balanced examples',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment_examples_balanced',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E21-G4O-ELFT-BEX
--------------------------------------------------------------------------------
Start time: 2025-03-26 23:11:45
Notes: Experiment 21: Evaluate prompt-based model collaboration that includes similar, balanced examples
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment_examples_balanced
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5478 / 6530  (83.9): 100%|██████████| 6530/6530 [1:17:57<00:00,  1.40it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values
probabilities          6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Predictio

### E22-G4O-ELFT-PR

In [158]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [159]:
# Create a test result for the GPT sentiment module
e22_g4o_elft_pr_result = electra_large_gpt_sentiment_probs(review=test_review)

In [160]:
e22_g4o_elft_pr_result

Prediction(
    classification='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [161]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.

Negative Probability: Probability the review is negative from a model fine-tuned on sentiment

Neutral Probability: Probability the review is neutral from a model fine-tuned on sentiment

Positive Probability: Probability the review is positive from a model fine-tuned on sentiment

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Negative Probability: 99.90%

Neutral Probability: 0.06%

Positive Probability: 0.04%

Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nNegative Probability: Probability the review is negative from a model fine-tuned on sentiment\n\nNeutral Probability: Probability the review is neutral from a model fine-tuned on sentiment\n\nPositive Probability: Probability the review is positive from a model fine-tuned on sentiment\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nNegative Probability: 99.90%\n\nNeutral Probability: 0.06%\n\nPositive Probability: 0.04%\n\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [162]:
e22_g4o_elft_pr_results_df, e22_g4o_elft_pr_metrics = evaluate_experiment(
    name='E22-G4O-ELFT-PR',
    notes='Experiment 22: Evaluate prompt-based model collaboration with probabilities',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment_probs',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E22-G4O-ELFT-PR
--------------------------------------------------------------------------------
Start time: 2025-03-27 00:29:45
Notes: Experiment 22: Evaluate prompt-based model collaboration with probabilities
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment_probs
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5477 / 6530  (83.9): 100%|██████████| 6530/6530 [1:38:22<00:00,  1.11it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2328
negative               2294
neutral                1908

E22-

### E23-G4O-ELFT-PR2

In [163]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [164]:
# Create a test result for the GPT sentiment module
e23_g4o_elft_pr2_result = electra_large_gpt_sentiment_pred_probs(review=test_review)

In [165]:
e23_g4o_elft_pr2_result

Prediction(
    classification='negative',
    classifier_decision='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [166]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Negative Probability: Probability the review is negative

Neutral Probability: Probability the review is neutral

Positive Probability: Probability the review is positive

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Review: Those 2 drinks are part of the HK culture and has years of history. It is so bad.

Classifier Decision: negative

Negative Probability: 99.90%

Neutral Probability: 0.06%

Positive Probability: 0.04%

Classification:[32m negative[0m





"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nNegative Probability: Probability the review is negative\n\nNeutral Probability: Probability the review is neutral\n\nPositive Probability: Probability the review is positive\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nReview: Those 2 drinks are part of the HK culture and has years of history. It is so bad.\n\nClassifier Decision: negative\n\nNegative Probability: 99.90%\n\nNeutral Probability: 0.06%\n\nPositive Probability: 0.04%\n\nClassification:\x1b[32m negative\x1b[0m\n\n\n"

In [167]:
e23_g4o_elft_pr2_df, e23_g4o_elft_pr2_metrics = evaluate_experiment(
    name='E23-G4O-ELFT-PR2',
    notes='Experiment 23: Evaluate prompt-based model collaboration with prediction and probabilities',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment_pred_probs',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E23-G4O-ELFT-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-27 02:12:46
Notes: Experiment 23: Evaluate prompt-based model collaboration with prediction and probabilities
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment_pred_probs
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 5492 / 6530  (84.1): 100%|██████████| 6530/6530 [1:37:38<00:00,  1.11it/s]  

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
negative               2309
posit

### E24-G4O-ELFT-EX-PR2

In [168]:
# Switch to the correct language model and disable experimental mode
lm = dspy.OpenAI(model='gpt-4o-2024-08-06', api_key=openai_key, max_tokens=8192, temperature=temperature)
dspy.settings.configure(lm=lm, experimental=False, seed=random_seed)
dsp.settings.show_guidelines = True

In [169]:
# Create a test result for the GPT sentiment module
e24_g4o_elft_ex_pr2_result = electra_large_gpt_sentiment_pred_probs_examples(review=test_review)

In [170]:
e24_g4o_elft_ex_pr2_result

Prediction(
    examples=[{'index': 1385, 'similarity_score': 0.9959219694137573, 'review': 'I remembered this place from a few years ago. It was really bad.', 'classification': 'negative'}, {'index': 1770, 'similarity_score': 0.9956880211830139, 'review': "But I've been probably 6 times and have never left impressed. It sucks.", 'classification': 'negative'}, {'index': 1948, 'similarity_score': 0.9954952597618103, 'review': 'So got beans, rice and tacos, no flavour.', 'classification': 'negative'}, {'index': 3282, 'similarity_score': 0.995331346988678, 'review': 'Anyhoos, we waited FOREVER (more like an hour plus) for our food.', 'classification': 'negative'}, {'index': 1391, 'similarity_score': 0.9952993392944336, 'review': 'The loyalty card is not worth it.', 'classification': 'negative'}],
    classification='negative',
    classifier_decision='negative',
    negative_probability='99.90%',
    neutral_probability='0.06%',
    positive_probability='0.04%'
)

In [171]:
lm.inspect_history(n=1)




Classify the sentiment of a review as either 'negative', 'neutral', or 'positive'.

---

Follow the following format.

Examples: A list of examples that demonstrate different sentiment classes.

Review: The review text to classify.

Classifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.

Negative Probability: Probability the review is negative

Neutral Probability: Probability the review is neutral

Positive Probability: Probability the review is positive

Classification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')

---

Examples:
- negative: I remembered this place from a few years ago. It was really bad.
- negative: But I've been probably 6 times and have never left impressed. It sucks.
- negative: So got beans, rice and tacos, no flavour.
- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.
- negative: The loyalty card is

"\n\n\nClassify the sentiment of a review as either 'negative', 'neutral', or 'positive'.\n\n---\n\nFollow the following format.\n\nExamples: A list of examples that demonstrate different sentiment classes.\n\nReview: The review text to classify.\n\nClassifier Decision: The sentiment classification proposed by a model fine-tuned on sentiment.\n\nNegative Probability: Probability the review is negative\n\nNeutral Probability: Probability the review is neutral\n\nPositive Probability: Probability the review is positive\n\nClassification: One word representing the sentiment classification: 'negative', 'neutral', or 'positive' (do not repeat the field name, do not use 'mixed')\n\n---\n\nExamples:\n- negative: I remembered this place from a few years ago. It was really bad.\n- negative: But I've been probably 6 times and have never left impressed. It sucks.\n- negative: So got beans, rice and tacos, no flavour.\n- negative: Anyhoos, we waited FOREVER (more like an hour plus) for our food.\n

In [172]:
e24_g4o_elft_ex_pr2_df, e24_g4o_elft_ex_pr2_metrics = evaluate_experiment(
    name='E24-G4O-ELFT-EX-PR2',
    notes='Experiment 24: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples',
    lm='gpt-4o-2024-08-06',
    instance='electra_large_gpt_sentiment_pred_probs_examples',
    dataset=test_df,
    examples=test_ex,
    temperature=temperature,
    random_seed=random_seed
)

--------------------------------------------------------------------------------
Experiment: E24-G4O-ELFT-EX-PR2
--------------------------------------------------------------------------------
Start time: 2025-03-27 03:54:46
Notes: Experiment 24: Evaluate prompt-based model collaboration with prediction, probabilities, and similar examples
Model: gpt-4o-2024-08-06
Instance: electra_large_gpt_sentiment_pred_probs_examples
Dataset shape: [6530, 4]
Examples length: 6530
Save directory: results_round2_take2
Temperature: 0.1
Random seed: 123

Running evaluation...
Average Metric: 2154 / 2574  (83.7):  39%|███▉      | 2574/6530 [30:48<48:14,  1.37it/s]  



Average Metric: 5478 / 6530  (83.9): 100%|██████████| 6530/6530 [1:17:25<00:00,  1.41it/s]

Columns in results DataFrame:
review                 6530 values
classification         6530 values
prediction             6530 values
match                  6530 values
classifier_decision    6530 values

Source value counts:
dynasent_r1            3600
sst_local              2210
dynasent_r2             720

Prediction value counts:
positive               2319
negative               2295
neutral                1916

E24-G4O-ELFT-EX-PR2 Multi-Class Classification Report

              precision    recall  f1-score   support

    negative     0.8749    0.8537    0.8642      2352
     neutral     0.7375    0.7726    0.7546      1829
    positive     0.8870    0.8757    0.8813      2349

    accuracy                         0.8389      6530
   macro avg     0.8331    0.8340    0.8334      6530
weighted avg     0.8408    0.8389    0.8397      6530

Predicted  negative  neutral  positive
Actual     