In [1]:

# Updated imports for new LLMs and libraries
import warnings
import os

warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
 )

# API setup for LLMs (Groq, dotenv)
from groq import Groq
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Always bind `groq_client` so later cells can test it instead of hitting a
# NameError deep inside the retry loop when the key is missing.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if groq_client is None:
    print("⚠️ GROQ_API_KEY is not set - LLM calls will fail until it is configured")

# Shared plot styling for every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Updated setup complete for new LLM sentiment analysis")

✓ Updated setup complete for new LLM sentiment analysis


## 1. Load Dataset

In [2]:
# Read the "all annotators agree" split of the dataset.
# Each usable line has the form "<sentence>@<label>"; the label follows the LAST "@".
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

sentences = []
sentiments = []

# NOTE(review): errors="ignore" silently drops bytes that are not valid UTF-8 -
# confirm the file's real encoding if non-ASCII characters matter.
with open(data_path, "r", encoding="utf-8", errors="ignore") as handle:
    for raw_line in handle:
        text, separator, label = raw_line.strip().rpartition("@")
        if not separator:
            # No "@" on this line, so there is no label to read.
            continue
        sentences.append(text)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

# Show a reproducible handful of rows
banner = "=" * 80
print("\n" + banner)
print("Sample sentences:")
print(banner)
display(df.sample(5, random_state=42))

Dataset loaded: 2264 sentences

Sentiment distribution:
true_sentiment
neutral     1391
positive     570
negative     303
Name: count, dtype: int64

Sample sentences:


Unnamed: 0,sentence,true_sentiment
1755,The contract value amounts to EUR 2.4 million .,neutral
1281,Kemira shares closed at ( x20ac ) 16.66 ( $ 2...,neutral
350,The company slipped to an operating loss of EU...,negative
420,According to Atria 's President and CEO Matti ...,positive
56,"In 2009 , Fiskars ' cash flow from operating a...",positive


## 2. Chain-of-Thought Prompt Design

**Prompt Strategy**: Stepwise reasoning with explicit JSON output format.

In [3]:
def create_cot_prompt(sentence):
    """
    Build the Chain-of-Thought prompt for one financial sentence.

    The prompt asks the model to reason in five steps and to finish with a
    JSON object holding "sentiment", "confidence" and "rationale".

    Args:
        sentence: The financial statement to classify; it is inserted verbatim
            between double quotes on the last line of the prompt.

    Returns:
        The complete prompt text as a string.
    """
    # Doubled braces are literal braces; {sentence} is the only placeholder.
    template = """You are a financial sentiment analysis expert. Analyze the following financial statement step-by-step.

Classify the sentiment of the following financial statement as either "positive", "negative", or "neutral" from an investor's perspective.

Think through this systematically:
Step 1: Identify the key financial metrics, events, or indicators mentioned in the statement.
Step 2: List any positive signals (growth, profit increases, expansions, etc.).
Step 3: List any negative signals (losses, declines, challenges, etc.).
Step 4: Evaluate the net impact on stock price from an investor's perspective.
Step 5: Based on your analysis, classify the sentiment.

Classification guidelines:
- Positive: Good news for stock price (revenue increase, profit growth, etc.)
- Negative: Bad news for stock price (losses, declining sales, etc.)
- Neutral: No clear impact on stock price or mixed signals

IMPORTANT: Provide your final answer in this exact JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief summary of your step-by-step reasoning"
}}

Financial Statement: "{sentence}"
"""
    return template.format(sentence=sentence)

## 3. Model Inference Functions

In [4]:
def call_llama(prompt, model_name, temperature=0.0, max_retries=3, max_tokens=500):
    """Send a single-turn chat request through the Groq client and return the reply text.

    Despite the name, this works for whichever model id is passed in
    ``model_name``; the notebook uses it for every experiment.

    Args:
        prompt: Text sent as the only user message.
        model_name: Model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Total number of attempts before giving up.
        max_tokens: Completion token cap forwarded to the API.

    Returns:
        The content of the first choice's message, or None when the client is
        not configured or every attempt raised.
    """
    # Look the client up dynamically: the setup cell may not have bound it
    # (missing API key), and retrying with backoff cannot fix that.
    client = globals().get("groq_client")
    if client is None:
        print("Groq client is not configured (is GROQ_API_KEY set?); skipping request")
        return None

    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model_name,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any failure is reported and retried so one bad
            # request does not abort a whole experiment run.
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, ... (no sleep after the last try).
                time.sleep(2**attempt)
    return None

def parse_response(response_text):
    """Extract the sentiment verdict from a model reply.

    Tries, in order:
      1. JSON inside a Markdown code fence,
      2. the whole reply as JSON,
      3. the span from the first "{" to the last "}" (a reply that wraps a bare
         JSON object in step-by-step prose),
      4. an explicit ``"sentiment": "<label>"`` field anywhere in the text
         (covers truncated or malformed JSON),
      5. a keyword heuristic on the raw text.

    Args:
        response_text: Raw text returned by the model; may be None or empty.

    Returns:
        A dict, normally with "sentiment", "confidence" and "rationale" keys,
        or None when nothing usable was found. A string "sentiment" value is
        stripped and lower-cased. Text-based fallbacks use confidence 0.5 and
        rationale "Parsed from text".
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    if not isinstance(response_text, str) or not response_text.strip():
        print("Parse error: empty or non-text response")
        return None

    # Candidate JSON payloads, most specific first.
    candidates = []
    fenced = re.search(r"```(?:json)?\s*(.*?)```", response_text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1).strip())
    candidates.append(response_text.strip())
    first_brace = response_text.find("{")
    last_brace = response_text.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(response_text[first_brace : last_brace + 1])

    last_error = "no JSON object found"
    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            last_error = str(e)
            continue
        if isinstance(result, dict):
            sentiment = result.get("sentiment")
            if isinstance(sentiment, str):
                # Normalise so "Positive" / " neutral " match the label set.
                result["sentiment"] = sentiment.strip().lower()
            return result
        # Valid JSON but not an object (e.g. a bare string): callers need a dict.
        last_error = "JSON payload is not an object"

    print("Parse error: {}".format(last_error[:100]))
    print("Raw response was:\n{}".format(response_text))

    # Prefer an explicit sentiment field over loose keyword matching: reasoning
    # text routinely mentions both "positive" and "negative" signals.
    field = re.search(
        r'"sentiment"\s*:\s*"\s*(positive|negative|neutral)\s*"',
        response_text,
        re.IGNORECASE,
    )
    if field:
        return {
            "sentiment": field.group(1).lower(),
            "confidence": 0.5,
            "rationale": "Parsed from text",
        }

    # Last resort: keyword heuristic on the whole reply.
    response_lower = response_text.lower()
    if "positive" in response_lower and "negative" not in response_lower:
        label = "positive"
    elif "negative" in response_lower:
        label = "negative"
    elif "neutral" in response_lower:
        label = "neutral"
    else:
        return None
    return {
        "sentiment": label,
        "confidence": 0.5,
        "rationale": "Parsed from text",
    }

# Last statement of the cell: prints a confirmation once both helpers above are defined.
print("✓ Model inference functions defined")

✓ Model inference functions defined


## 4. Run Experiments

In [5]:
# Updated experiment runs for new LLMs
test_df = df.head(100).copy()  # First 100 rows of the dataset (not a random sample)

RESULT_COLUMNS = ["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"]


def run_cot_experiment(data, model_name, progress_label, pause_seconds=0.5):
    """Classify every sentence in ``data`` with one model and collect the results.

    Every input row yields exactly one output row. When the API call returns
    nothing, or the reply cannot be parsed, the row is kept with
    predicted_sentiment "error" so the sample count stays accurate.

    Args:
        data: DataFrame with "sentence" and "true_sentiment" columns.
        model_name: Model identifier passed to ``call_llama``.
        progress_label: Text shown next to the progress bar.
        pause_seconds: Delay after each sentence to space out requests.

    Returns:
        DataFrame with the columns listed in ``RESULT_COLUMNS``.
    """
    records = []
    for _, row in tqdm(data.iterrows(), total=len(data), desc=progress_label):
        record = {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "API error",
        }
        response = call_llama(create_cot_prompt(row["sentence"]), model_name=model_name)
        if response:
            parsed = parse_response(response)
            if parsed and isinstance(parsed, dict):
                # Normalise the label so "Positive" still counts as a valid prediction.
                record["predicted_sentiment"] = str(parsed.get("sentiment", "unknown")).strip().lower()
                record["confidence"] = parsed.get("confidence", 0)
                record["rationale"] = parsed.get("rationale", "")
            else:
                record["rationale"] = "Parse error"
        records.append(record)
        time.sleep(pause_seconds)
    return pd.DataFrame(records, columns=RESULT_COLUMNS)


# E7: Updated GPT OSS 20B (Chain-of-Thought)
print("Running E7: Updated GPT OSS 20B (Chain-of-Thought)...")
e7_df = run_cot_experiment(test_df, "openai/gpt-oss-20b", "E7 Progress")
print(f"\n✓ E7 completed: {len(e7_df)} predictions")
display(e7_df.head())

# E8: Updated GPT OSS 120B (Chain-of-Thought)
print("Running E8: Updated GPT OSS 120B (Chain-of-Thought)...")
e8_df = run_cot_experiment(test_df, "openai/gpt-oss-120b", "E8 Progress")
print(f"\n✓ E8 completed: {len(e8_df)} predictions")
display(e8_df.head())

# E9: Updated Llama-3.3-70B (Chain-of-Thought)
print("Running E9: Updated Llama-3.3-70B (Chain-of-Thought)...")
e9_df = run_cot_experiment(test_df, "llama-3.3-70b-versatile", "E9 Progress")
print(f"\n✓ E9 completed: {len(e9_df)} predictions")
display(e9_df.head())

Running E7: Updated GPT OSS 20B (Chain-of-Thought)...


E7 Progress:   1%|          | 1/100 [00:01<02:06,  1.28s/it]

Attempt 1/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199846, Requested 367. Please try again in 1m32.015999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Attempt 2/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199844, Requested 367. Please try again in 1m31.152s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Attempt 3/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4v

E7 Progress:   2%|▏         | 2/100 [00:05<04:33,  2.79s/it]

Attempt 1/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199837, Requested 359. Please try again in 1m24.672s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Attempt 2/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199835, Requested 359. Please try again in 1m23.808s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Attempt 3/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc

E7 Progress:   3%|▎         | 3/100 [00:08<05:09,  3.19s/it]

Attempt 1/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199829, Requested 351. Please try again in 1m17.759999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Attempt 2/3 failed: Error code: 429 - {'error': {'message': 'Rate limit reached for model `openai/gpt-oss-20b` in organization `org_01kf8cn3b9fm4vzg7zcc3qwwhk` service tier `on_demand` on tokens per day (TPD): Limit 200000, Used 199826, Requested 351. Please try again in 1m16.464s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


E7 Progress:   3%|▎         | 3/100 [00:11<05:55,  3.67s/it]


KeyboardInterrupt: 

## 5. Calculate Metrics

In [None]:
# Updated metrics calculation for new LLMs
def _empty_metrics(exp_name, total_samples):
    """Build the all-zero metrics row used when an experiment has nothing to score."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
    }
    for label in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{label}_{stat}"] = 0
    return metrics


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics for one experiment.

    Only rows whose predicted_sentiment is "positive", "negative" or "neutral"
    are scored; anything else (e.g. "error", "unknown") is excluded.

    Args:
        df: Result frame with "true_sentiment" and "predicted_sentiment" columns.
        exp_name: Experiment label stored under the "Experiment" key.

    Returns:
        Tuple ``(metrics, cm, valid_df)``:
          - metrics: dict of overall and per-class scores,
          - cm: 3x3 integer confusion matrix ordered positive, negative, neutral,
          - valid_df: copy of the rows that were scored.
        When nothing can be scored, metrics are all zero, cm is an integer zero
        matrix and valid_df is empty but still carries the result columns, so
        downstream plotting and reporting cells can index it.
    """
    labels = ["positive", "negative", "neutral"]

    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        empty_valid = pd.DataFrame(
            columns=["sentence", "true_sentiment", "predicted_sentiment", "confidence", "rationale"]
        )
        # Integer zeros: the heatmap cell formats counts with fmt="d".
        return _empty_metrics(exp_name, len(df)), np.zeros((3, 3), dtype=int), empty_valid

    valid_df = df[df["predicted_sentiment"].isin(labels)].copy()
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics(exp_name, len(df)), np.zeros((3, 3), dtype=int), valid_df

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # NOTE: the averaged scores are computed without `labels`, so they only
    # average over classes present in y_true or y_pred, while the per-class
    # scores below always cover all three labels.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
    }

    precision_per_class = precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
    recall_per_class = recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0)
    f1_per_class = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0)

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return metrics, cm, valid_df

# Score each experiment; every call yields (metrics dict, confusion matrix, scored rows)
e7_metrics, e7_cm, e7_valid = calculate_metrics(e7_df, "E7: Updated GPT OSS 20B (CoT)")
e8_metrics, e8_cm, e8_valid = calculate_metrics(e8_df, "E8: Updated GPT OSS 120B (CoT)")
e9_metrics, e9_cm, e9_valid = calculate_metrics(e9_df, "E9: Updated Llama-3.3-70B (CoT)")

# One row per experiment; show only the headline scores, rounded to 4 decimals
metrics_df = pd.DataFrame([e7_metrics, e8_metrics, e9_metrics])
headline_columns = ["Experiment", "Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
rule = "=" * 80

print("\n" + rule)
print("CHAIN-OF-THOUGHT PERFORMANCE COMPARISON")
print(rule)
display(metrics_df[headline_columns].round(4))

In [None]:
# Detailed classification reports
for df_result, exp_name in [
    (e7_valid, "E7: GPT OSS 20B (CoT)"),
    (e8_valid, "E8: GPT OSS 120B (CoT)"),
    (e9_valid, "E9: Llama-3.3-70B (CoT)"),
]:
    print("\n" + "=" * 80)
    print(f"CLASSIFICATION REPORT: {exp_name}")
    print("=" * 80)

    # An experiment with no scorable rows may come back as an empty frame
    # (possibly without columns); skip it instead of raising.
    has_columns = {"true_sentiment", "predicted_sentiment"}.issubset(df_result.columns)
    if df_result.empty or not has_columns:
        print("No valid predictions to report.")
        continue

    print(
        classification_report(
            df_result["true_sentiment"],
            df_result["predicted_sentiment"],
            labels=["positive", "negative", "neutral"],
            target_names=["Positive", "Negative", "Neutral"],
            zero_division=0,  # classes with no support/predictions score 0 without warnings
        )
    )

## 6. Visualize Results

In [None]:
# Confidence analysis: do the models report higher confidence when they are right?
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (df_result, title) in enumerate(
    [(e7_valid, "GPT OSS 20B (CoT)"), (e8_valid, "GPT OSS 120B (CoT)"), (e9_valid, "Llama-3.3-70B (CoT)")]
):
    ax = axes[idx]
    ax.set_xlabel("Confidence Score", fontsize=11, weight="bold")
    ax.set_ylabel("Frequency", fontsize=11, weight="bold")
    ax.grid(alpha=0.3)

    # An experiment with no scorable rows may come back as an empty frame
    # (possibly without columns); label the panel instead of raising.
    required = {"true_sentiment", "predicted_sentiment", "confidence"}
    if df_result.empty or not required.issubset(df_result.columns):
        ax.set_title(f"{title}\nNo valid predictions", fontsize=11, weight="bold")
        continue

    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )

    # Models occasionally return confidence as text; coerce to numbers and
    # drop anything unparseable so the histogram and means stay well-defined.
    confidence = pd.to_numeric(df_result["confidence"], errors="coerce")
    correct_conf = confidence[df_result["correct"]].dropna()
    incorrect_conf = confidence[~df_result["correct"]].dropna()

    ax.hist(
        [correct_conf, incorrect_conf],
        bins=20,
        label=["Correct", "Incorrect"],
        alpha=0.7,
        color=["green", "red"],
    )
    ax.set_title(
        f"{title}\nMean Conf: Correct={correct_conf.mean():.3f}, Incorrect={incorrect_conf.mean():.3f}",
        fontsize=11,
        weight="bold",
    )
    ax.legend()

plt.suptitle(
    "Confidence Distribution: Correct vs Incorrect Predictions",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("cot_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Side-by-side grouped bars: headline metrics (left) and per-class F1 (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

width = 0.25
experiment_series = [
    (e7_metrics, "GPT OSS 20B (CoT)"),
    (e8_metrics, "GPT OSS 120B (CoT)"),
    (e9_metrics, "Llama-3.3-70B (CoT)"),
]


def _grouped_bars(ax, metric_keys, tick_labels):
    """Draw one bar group per metric key, with one bar per experiment in each group."""
    positions = np.arange(len(metric_keys))
    for offset, (scores, legend_label) in enumerate(experiment_series):
        heights = [scores[key] for key in metric_keys]
        ax.bar(positions + offset * width, heights, width, label=legend_label, alpha=0.8)
    # Centre the tick under the middle bar of each three-bar group.
    ax.set_xticks(positions + width)
    ax.set_xticklabels(tick_labels)
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis="y", alpha=0.3)


# Left panel: overall metrics
metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
_grouped_bars(axes[0], metrics_to_plot, metrics_to_plot)
axes[0].set_xlabel("Metrics", fontsize=12, weight="bold")
axes[0].set_ylabel("Score", fontsize=12, weight="bold")
axes[0].set_title(
    "Overall Performance Comparison (Chain-of-Thought)", fontsize=14, weight="bold"
)

# Right panel: F1 for each sentiment class
classes = ["Positive", "Negative", "Neutral"]
_grouped_bars(axes[1], [f"{c}_F1" for c in classes], classes)
axes[1].set_xlabel("Sentiment Class", fontsize=12, weight="bold")
axes[1].set_ylabel("F1 Score", fontsize=12, weight="bold")
axes[1].set_title("Per-Class F1 Scores (Chain-of-Thought)", fontsize=14, weight="bold")

plt.tight_layout()
plt.savefig("cot_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

# Confusion matrices (rows = true label, columns = predicted label)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for idx, (cm, title) in enumerate(
    [
        (e7_cm, "E7: GPT OSS 20B (CoT)"),
        (e8_cm, "E8: GPT OSS 120B (CoT)"),
        (e9_cm, "E9: Llama-3.3-70B (CoT)"),
    ]
):
    # fmt="d" only accepts integers; an experiment with no valid predictions
    # may supply a float zero matrix, so cast the counts explicitly.
    counts = np.asarray(cm).astype(int)
    sns.heatmap(
        counts,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=axes[idx],
        cbar_kws={"label": "Count"},
    )
    axes[idx].set_title(title, fontsize=12, weight="bold")
    axes[idx].set_ylabel("True Label", fontsize=11, weight="bold")
    axes[idx].set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Chain-of-Thought Sentiment Analysis",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("cot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 7. Save Results

In [None]:
# Save detailed results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Single source of truth for file names, so the saved files and the printed
# summary cannot drift apart.
csv_outputs = {
    f"e7_GPT_OSS_20B_cot_{timestamp}.csv": e7_df,
    f"e8_GPT_OSS_120B_cot_{timestamp}.csv": e8_df,
    f"e9_Llama-3.3-70B_cot_{timestamp}.csv": e9_df,
    # Metrics summary (one row per experiment)
    f"cot_metrics_summary_{timestamp}.csv": metrics_df,
}
for filename, frame in csv_outputs.items():
    frame.to_csv(filename, index=False)

# Figures are written by the visualisation cells; list only those that exist.
figure_outputs = [
    "cot_confidence_analysis.png",
    "cot_performance_comparison.png",
    "cot_confusion_matrices.png",
]

print(f"\n✓ Chain-of-Thought results saved with timestamp: {timestamp}")
print("\nFiles created:")
for filename in csv_outputs:
    print(f"  - {filename}")
for filename in figure_outputs:
    if os.path.exists(filename):
        print(f"  - {filename}")

## 8. Key Findings

### Summary of Chain-of-Thought Results:

1. **Model Performance Ranking**
   - Compare accuracy and F1 scores across the three models
   - Identify which model performs best in the chain-of-thought setting

2. **Class-Specific Performance**
   - Analyze which sentiment class is hardest to classify
   - Check if neutral class causes most confusion

3. **Error Analysis**
   - Review misclassified examples
   - Identify common error patterns

4. **Confidence Calibration**
   - Examine relationship between confidence scores and accuracy
   - Identify overconfident or underconfident predictions