In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/sidharthrai/Documents/LJMU - MS AI ML/LJMU TrackThesis v2/venv/bin/python3.13 -m pip install --upgrade pip[0m


In [2]:
# Suppress deprecation warnings
import warnings
import os

warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

from groq import Groq
from dotenv import load_dotenv

load_dotenv()

if os.getenv("GROQ_API_KEY"):
    groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Setup complete")

✓ Setup complete


## 1. Load Dataset

In [3]:
data_path = (
    "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"
)

sentences = []
sentiments = []

with open(data_path, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()
        if "@" in line:
            parts = line.rsplit("@", 1)
            if len(parts) == 2:
                sentences.append(parts[0])
                sentiments.append(parts[1])

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})
print(f"Dataset loaded: {len(df)} sentences")
print(f"\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

Dataset loaded: 2264 sentences

Sentiment distribution:
true_sentiment
neutral     1391
positive     570
negative     303
Name: count, dtype: int64


## 2. Tree-of-Thought Prompt Design

**Multi-Path Reasoning**:
- Path 1: Consider "positive" hypothesis
- Path 2: Consider "negative" hypothesis  
- Path 3: Consider "neutral" hypothesis
- Evaluation: Score each path's evidence strength
- Selection: Choose the most supported hypothesis

In [4]:
def create_tot_prompt(sentence):
    """
    Creates a Tree-of-Thought prompt with multi-path exploration.
    """
    prompt = f"""You are a financial sentiment analysis expert. Analyze this statement using a tree-of-thought approach.

Financial Statement:
"{sentence}"

TASK: Explore three possible sentiment classifications and select the best one.

---
PATH 1: Hypothesis = POSITIVE
Consider if this statement represents positive news for investors.
- What evidence supports this being positive?
- What evidence contradicts this being positive?
- Confidence score (0-1) for this hypothesis:

PATH 2: Hypothesis = NEGATIVE
Consider if this statement represents negative news for investors.
- What evidence supports this being negative?
- What evidence contradicts this being negative?
- Confidence score (0-1) for this hypothesis:

PATH 3: Hypothesis = NEUTRAL
Consider if this statement has no clear market impact.
- What evidence supports this being neutral?
- What evidence contradicts this being neutral?
- Confidence score (0-1) for this hypothesis:

---
FINAL DECISION:
Based on evaluating all three paths, select the hypothesis with the strongest evidence.

Provide your final answer in this exact JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Explanation of why this hypothesis was selected over the others",
    "path_scores": {{
        "positive": 0.0-1.0,
        "negative": 0.0-1.0,
        "neutral": 0.0-1.0
    }}
}}
"""
    return prompt


# Test prompt
test_sentence = (
    "The company reported mixed results with revenue up 10% but margins declining."
)
print("=" * 80)
print("TREE-OF-THOUGHT PROMPT EXAMPLE")
print("=" * 80)
print(create_tot_prompt(test_sentence))

TREE-OF-THOUGHT PROMPT EXAMPLE
You are a financial sentiment analysis expert. Analyze this statement using a tree-of-thought approach.

Financial Statement:
"The company reported mixed results with revenue up 10% but margins declining."

TASK: Explore three possible sentiment classifications and select the best one.

---
PATH 1: Hypothesis = POSITIVE
Consider if this statement represents positive news for investors.
- What evidence supports this being positive?
- What evidence contradicts this being positive?
- Confidence score (0-1) for this hypothesis:

PATH 2: Hypothesis = NEGATIVE
Consider if this statement represents negative news for investors.
- What evidence supports this being negative?
- What evidence contradicts this being negative?
- Confidence score (0-1) for this hypothesis:

PATH 3: Hypothesis = NEUTRAL
Consider if this statement has no clear market impact.
- What evidence supports this being neutral?
- What evidence contradicts this being neutral?
- Confidence score (0-

## 3. Model Inference Functions

In [8]:
def call_llama(prompt, model_name, temperature=0.0):
    max_retries = 3
    for attempt in range(max_retries):
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model_name,
                temperature=temperature,
                max_tokens=1500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2**attempt)
                continue
            return None
    return None


def parse_response(response_text):
    """Parse JSON with path scores from ToT response"""
    try:
        if "```json" in response_text:
            json_str = response_text.split("```json")[1].split("```")[0].strip()
        elif "{" in response_text:
            start = response_text.find("{")
            end = response_text.rfind("}") + 1
            json_str = response_text[start:end]
        else:
            json_str = response_text.strip()

        result = json.loads(json_str)
        return result
    except:
        response_lower = response_text.lower()
        if "positive" in response_lower and "negative" not in response_lower:
            return {
                "sentiment": "positive",
                "confidence": 0.5,
                "rationale": "Parsed",
                "path_scores": {},
            }
        elif "negative" in response_lower:
            return {
                "sentiment": "negative",
                "confidence": 0.5,
                "rationale": "Parsed",
                "path_scores": {},
            }
        elif "neutral" in response_lower:
            return {
                "sentiment": "neutral",
                "confidence": 0.5,
                "rationale": "Parsed",
                "path_scores": {},
            }
        return None


print("✓ Inference functions defined")

✓ Inference functions defined


## 4. Run Experiments

In [9]:
# Updated experiment runs for new LLMs
test_df = df.head(100).copy()  # Use a sample of the dataset

# Updated Tree-of-Thought experiments for new LLMs
def run_tot_experiment(test_df, model_func, model_name, exp_id):
    print(f"Running {exp_id}: {model_name} (Tree-of-Thought)...")
    results = []

    for idx, row in tqdm(
        test_df.iterrows(), total=len(test_df), desc=f"{exp_id} Progress"
    ):
        prompt = create_tot_prompt(row["sentence"])
        response = model_func(prompt)

        if response:
            parsed = parse_response(response)
            if parsed:
                results.append(
                    {
                        "sentence": row["sentence"],
                        "true_sentiment": row["true_sentiment"],
                        "predicted_sentiment": parsed.get("sentiment", "unknown"),
                        "confidence": parsed.get("confidence", 0),
                        "rationale": parsed.get("rationale", ""),
                        "path_scores": str(parsed.get("path_scores", {})),
                        "full_response": response[:700],
                    }
                )

        time.sleep(0.5)  # Adjusted for new LLMs

    results_df = pd.DataFrame(results)
    print(f"\n✓ {exp_id} completed: {len(results_df)} predictions")
    return results_df

# Run Tree-of-Thought experiments with new LLMs
e10_df = run_tot_experiment(
    test_df, lambda p: call_llama(p, model_name="openai/gpt-oss-20b"), "GPT OSS 20B", "E10"
)
e11_df = run_tot_experiment(
    test_df, lambda p: call_llama(p, model_name="openai/gpt-oss-120b"), "GPT OSS 120B", "E11"
)
e12_df = run_tot_experiment(
    test_df, lambda p: call_llama(p, model_name="llama-3.3-70b-versatile"), "Llama-3.3-70B", "E12"
)

display(e10_df.head())
display(e11_df.head())
display(e12_df.head())

Running E10: GPT OSS 20B (Tree-of-Thought)...


E10 Progress: 100%|██████████| 100/100 [07:09<00:00,  4.29s/it]



✓ E10 completed: 51 predictions
Running E11: GPT OSS 120B (Tree-of-Thought)...


E11 Progress: 100%|██████████| 100/100 [08:20<00:00,  5.01s/it]



✓ E11 completed: 100 predictions
Running E12: Llama-3.3-70B (Tree-of-Thought)...


E12 Progress: 100%|██████████| 100/100 [07:02<00:00,  4.23s/it]


✓ E12 completed: 6 predictions





Unnamed: 0,sentence,true_sentiment,predicted_sentiment,confidence,rationale,path_scores,full_response
0,"According to Gran , the company has no plans t...",neutral,positive,0.6,The statement highlights that the company is e...,"{'positive': 0.6, 'negative': 0.2, 'neutral': ...","{\n ""sentiment"": ""positive"",\n ""confiden..."
1,"For the last quarter of 2010 , Componenta 's n...",positive,positive,0.85,The statement highlights a significant improve...,"{'positive': 0.85, 'negative': 0.1, 'neutral':...","{\n ""sentiment"": ""positive"",\n ""confiden..."
2,"In the third quarter of 2010 , net sales incre...",positive,positive,0.92,The statement reports a 5.2% increase in net s...,"{'positive': 0.92, 'negative': 0.04, 'neutral'...","{\n ""sentiment"": ""positive"",\n ""confiden..."
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive,positive,0.85,The statement reports a clear increase in oper...,"{'positive': 0.85, 'negative': 0.1, 'neutral':...","{\n ""sentiment"": ""positive"",\n ""confiden..."
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive,positive,0.8,The statement reports a clear increase in oper...,"{'positive': 0.8, 'negative': 0.05, 'neutral':...","{\n ""sentiment"": ""positive"",\n ""confiden..."


Unnamed: 0,sentence,true_sentiment,predicted_sentiment,confidence,rationale,path_scores,full_response
0,"According to Gran , the company has no plans t...",neutral,neutral,0.71,The statement mainly conveys factual informati...,"{'positive': 0.45, 'negative': 0.3, 'neutral':...",**PATH 1 – Hypothesis: POSITIVE** \n- **Evide...
1,"For the last quarter of 2010 , Componenta 's n...",positive,positive,0.68,The statement reports a more than doubling of ...,"{'positive': 0.68, 'negative': 0.22, 'neutral'...",**PATH 1 – Hypothesis = POSITIVE**\n\n*Evidenc...
2,"In the third quarter of 2010 , net sales incre...",positive,positive,0.71,The statement shows a clear increase in both t...,"{'positive': 0.71, 'negative': 0.22, 'neutral'...",**PATH 1 – Hypothesis = POSITIVE** \n\n- **Ev...
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive,positive,0.78,The statement reports a clear increase in oper...,"{'positive': 0.78, 'negative': 0.12, 'neutral'...","{\n ""sentiment"": ""positive"",\n ""confiden..."
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive,positive,0.78,The statement reports a year‑over‑year rise in...,"{'positive': 0.78, 'negative': 0.04, 'neutral'...",**PATH 1 – Hypothesis = POSITIVE** \n- **Evid...


Unnamed: 0,sentence,true_sentiment,predicted_sentiment,confidence,rationale,path_scores,full_response
0,"According to Gran , the company has no plans t...",neutral,positive,0.6,"The company's growth in Russia, despite not pl...","{'positive': 0.6, 'negative': 0.3, 'neutral': ...",To analyze the given financial statement using...
1,"For the last quarter of 2010 , Componenta 's n...",positive,positive,0.8,The significant increase in net sales and the ...,"{'positive': 0.8, 'negative': 0.2, 'neutral': ...",To analyze the given financial statement using...
2,"In the third quarter of 2010 , net sales incre...",positive,positive,0.9,The statement reports significant increases in...,"{'positive': 0.9, 'negative': 0.0, 'neutral': ...",To analyze the given financial statement using...
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive,positive,0.8,The statement indicates an increase in operati...,"{'positive': 0.8, 'negative': 0.1, 'neutral': ...",To analyze the given financial statement using...
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive,positive,0.8,The increase in operating profit and its signi...,"{'positive': 0.8, 'negative': 0.1, 'neutral': ...",To analyze the given financial statement using...


## 5. Calculate Metrics & Visualize

In [None]:
def calculate_metrics(df, exp_name):
    valid_df = df[
        df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
    ].copy()
    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    metrics = {
        "Experiment": exp_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro"),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro"),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro"),
    }

    cm = confusion_matrix(y_true, y_pred, labels=["positive", "negative", "neutral"])
    return metrics, cm, valid_df


e10_metrics, e10_cm, e10_valid = calculate_metrics(e10_df, "E10a: GPT-OSS-20B (ToT)")
e10b_metrics, e10b_cm, e10b_valid = calculate_metrics(
    e10b_df, "E10b: GPT-OSS-120B (ToT)"
)
e10c_metrics, e10c_cm, e10c_valid = calculate_metrics(
    e10c_df, "E10c: Llama-3.3-70B (ToT)"
)

metrics_df = pd.DataFrame([e10_metrics, e10b_metrics, e10c_metrics])

print("\n" + "=" * 80)
print("TREE-OF-THOUGHT PERFORMANCE COMPARISON")
print("=" * 80)
display(metrics_df.round(4))

KeyError: 'predicted_sentiment'

In [None]:
def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics"""
    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        return (
            {
                "Experiment": exp_name,
                "Total Samples": 0,
                "Valid Predictions": 0,
                "Accuracy": 0,
                "Macro-F1": 0,
                "Weighted-F1": 0,
                "Macro-Precision": 0,
                "Macro-Recall": 0,
                "Positive_Precision": 0,
                "Positive_Recall": 0,
                "Positive_F1": 0,
                "Negative_Precision": 0,
                "Negative_Recall": 0,
                "Negative_F1": 0,
                "Neutral_Precision": 0,
                "Neutral_Recall": 0,
                "Neutral_F1": 0,
            },
            np.zeros((3, 3)),
            pd.DataFrame(),
        )

    # Filter out errors
    valid_df = df[
        df["predicted_sentiment"].isin(["positive", "negative", "neutral"])
    ].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return (
            {
                "Experiment": exp_name,
                "Total Samples": len(df),
                "Valid Predictions": 0,
                "Accuracy": 0,
                "Macro-F1": 0,
                "Weighted-F1": 0,
                "Macro-Precision": 0,
                "Macro-Recall": 0,
                "Positive_Precision": 0,
                "Positive_Recall": 0,
                "Positive_F1": 0,
                "Negative_Precision": 0,
                "Negative_Recall": 0,
                "Negative_F1": 0,
                "Neutral_Precision": 0,
                "Neutral_Recall": 0,
                "Neutral_F1": 0,
            },
            np.zeros((3, 3)),
            pd.DataFrame(),
        )

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro"),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted"),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro"),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro"),
    }

    # Per-class metrics
    labels = ["positive", "negative", "neutral"]
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Calculate metrics
e10_metrics, e10_cm, e10_valid = calculate_metrics(e10_df, "E10: GPT-OSS-20B ToT")

print("\n" + "=" * 80)
print("TREE-OF-THOUGHT PERFORMANCE")
print("=" * 80)
print(f"\nExperiment: E10")
print(f"Accuracy: {e10_metrics['Accuracy']:.4f}")
print(f"Macro-F1: {e10_metrics['Macro-F1']:.4f}")
print(f"Macro-Precision: {e10_metrics['Macro-Precision']:.4f}")
print(f"Macro-Recall: {e10_metrics['Macro-Recall']:.4f}")
print(f"\nPer-class F1 Scores:")
print(f"  Positive: {e10_metrics['Positive_F1']:.4f}")
print(f"  Negative: {e10_metrics['Negative_F1']:.4f}")
print(f"  Neutral: {e10_metrics['Neutral_F1']:.4f}")

## 6. Save Results

In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

e10_df.to_csv(f"e10_GPT_OSS_20B_tot_{timestamp}.csv", index=False)
e10b_df.to_csv(f"e10b_GPT_OSS_120B_flash_tot_{timestamp}.csv", index=False)
e10c_df.to_csv(f"e10c_Llama_3.3_70B_tot_{timestamp}.csv", index=False)
metrics_df.to_csv(f"tot_metrics_summary_{timestamp}.csv", index=False)

print(f"\n✓ Tree-of-Thought results saved")

## 7. Key Insights

### Tree-of-Thought Analysis:

1. **Multi-Path Reasoning**: How does exploring all three sentiment hypotheses affect accuracy?
2. **Decision Quality**: Are ToT predictions more justified and explainable?
3. **Computational Cost**: Does the added complexity justify performance gains?
4. **Path Score Analysis**: Which sentiment hypotheses receive highest scores for which types of statements?