In [None]:
# Install required packages (if needed)
# !pip3 install pandas numpy matplotlib seaborn scikit-learn tqdm requests -q

In [None]:
# Ollama API setup and imports
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import json
import time
import re
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)
import numpy as np

# URL that call_ollama() POSTs generation requests to (local Ollama server).
OLLAMA_BASE_URL = "http://localhost:11434/api/generate"

# Plot defaults used by every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

# Setup banner: one entry per output line, printed in a single call.
_setup_rule = "=" * 80
print(
    "✓ Libraries imported successfully",
    "✓ Ollama will be used for LLM inference (local)",
    "\n" + _setup_rule,
    "SETUP COMPLETE - CHAIN-OF-THOUGHT INSIGHT GENERATION",
    _setup_rule,
    "Models configured:",
    "  • I7: Llama3.1:8b (Ollama - Local - Chain-of-Thought)",
    "  • I8: Qwen3:8b (Ollama - Local - Chain-of-Thought)",
    "  • I9: DeepSeek-R1:8b (Ollama - Local - Chain-of-Thought)",
    _setup_rule,
    sep="\n",
)

## 1. Load Dataset

In [None]:
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode strictly as UTF-8 and fall back to Latin-1 instead of using
# errors="ignore": "ignore" silently deletes every byte sequence that is not
# valid UTF-8, which changes the sentences that are later sent to the models.
try:
    with open(data_path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()
except UnicodeDecodeError:
    # Latin-1 maps every byte to a character, so this decode cannot fail.
    # NOTE(review): the file is presumably ISO-8859-1 encoded — confirm for this copy.
    with open(data_path, "r", encoding="latin-1") as f:
        raw_lines = f.readlines()

sentences = []
insights = []

# Each usable line has the form "<sentence>@<label>"; split on the LAST "@"
# so that an "@" inside the sentence itself is preserved.
for line in raw_lines:
    sentence_text, separator, label = line.strip().rpartition("@")
    if separator:
        sentences.append(sentence_text)
        insights.append(label)

df = pd.DataFrame({"sentence": sentences, "true_insight": insights})

print(f"Dataset loaded: {len(df)} sentences")
print("\nInsight distribution:")
print(df["true_insight"].value_counts())

print("\n" + "=" * 80)
print("Sample sentences:")
print("=" * 80)
# Cap the sample size so a very small (or empty) file does not raise in sample().
display(df.sample(min(5, len(df)), random_state=42))

## 2. Chain-of-Thought Prompt Design

**Reasoning Structure**:
- Step 1: Identify key financial metrics/events
- Step 2: Analyze positive indicators
- Step 3: Analyze negative indicators
- Step 4: Determine net impact from investor perspective
- Step 5: Classify insight with confidence

In [None]:
def create_cot_prompt(sentence):
    """
    Build the Chain-of-Thought classification prompt for one financial sentence.

    The sentence is embedded between double quotes, preceded by the task
    description and labelling guidelines, and followed by the JSON reply
    template (insight / confidence / rationale) the model is asked to return.
    """
    task_lines = [
        "You are a financial insight analysis expert. Use step-by-step reasoning to classify this financial statement.",
        "",
        'Task: Classify the following financial statement as "positive", "negative", or "neutral" from an investor\'s perspective, using stepwise reasoning.',
        "",
        "Guidelines:",
        "- Positive: Financial improvements, growth, profits, revenue increases",
        "- Negative: Financial declines, losses, revenue drops",
        "- Neutral: Factual statements with no clear financial impact",
        "",
        "Financial Statement:",
    ]
    quoted_statement = '"{}"'.format(sentence)
    reply_lines = [
        "",
        "Think step-by-step about the financial signals. Then, provide a final concise summary in the 'rationale' field of the JSON.",
        "Return ONLY strictly JSON in this exact format:",
        "{",
        '    "insight": "positive/negative/neutral",',
        '    "confidence": 0.0-1.0,',
        '    "rationale": "Step-by-step reasoning in one or two sentences"',
        "}",
        "",  # trailing empty entry -> the prompt ends with a newline
    ]
    return "\n".join(task_lines + [quoted_statement] + reply_lines)


# Render one example prompt so its exact wording can be inspected.
test_sentence = "Net sales increased by 18.5% to EUR 167.8 million."
_preview_rule = "=" * 80
print(
    _preview_rule,
    "CHAIN-OF-THOUGHT PROMPT EXAMPLE (INSIGHT GENERATION)",
    _preview_rule,
    create_cot_prompt(test_sentence),
    sep="\n",
)

## 3. Model Inference Functions

In [None]:
def call_ollama(model_name, prompt, temperature=0.0):
    """
    Send one non-streaming generation request to the local Ollama server.

    Args:
        model_name: Ollama model tag, e.g. "llama3.1:8b".
        prompt: Full prompt text to send.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The "response" string of the reply ("" if the field is absent), or
        None when every attempt failed (network error, non-200 status, or an
        undecodable / unexpected body).
    """
    max_retries = 3
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        # Ollama reads sampling parameters from "options"; a top-level
        # "temperature" key is not a request field and has no effect.
        "options": {"temperature": temperature},
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_BASE_URL, json=payload, timeout=150)
            if response.status_code == 200:
                body = response.json()
                if isinstance(body, dict):
                    return body.get("response", "")
        except (requests.RequestException, ValueError):
            # Connection/timeout errors and malformed JSON bodies are retried.
            pass

        # Exponential backoff (1s, 2s) before the next attempt, for both
        # raised errors and unsuccessful status codes.
        if attempt < max_retries - 1:
            time.sleep(2**attempt)

    return None


def call_llama(prompt, temperature=0.0):
    """Run `prompt` through the llama3.1:8b Ollama model (experiment I7)."""
    return call_ollama(
        model_name="llama3.1:8b",
        prompt=prompt,
        temperature=temperature,
    )


def call_qwen(prompt, temperature=0.0):
    """Run `prompt` through the qwen3:8b Ollama model (experiment I8)."""
    return call_ollama(
        model_name="qwen3:8b",
        prompt=prompt,
        temperature=temperature,
    )


def call_deepseek(prompt, temperature=0.0):
    """Run `prompt` through the deepseek-r1:8b Ollama model (experiment I9)."""
    return call_ollama(
        model_name="deepseek-r1:8b",
        prompt=prompt,
        temperature=temperature,
    )


def parse_response(response_text):
    """
    Extract the classification dict from a (possibly verbose) model reply.

    Parsing strategy, in order:
      1. Drop any <think>...</think> scratchpad so braces and label words
         inside it cannot be mistaken for the answer.
      2. Try to JSON-decode, in turn: the first fenced code block, the span
         from the first "{" to the last "}", and the whole text. The first
         candidate that decodes to a dict wins.
      3. If nothing decodes, fall back to a keyword scan of the text.

    A "sentiment" key is accepted as an alias for "insight", and a string
    label is lower-cased and stripped so that e.g. "Positive" is treated the
    same as "positive" downstream.

    Args:
        response_text: Raw text returned by the model (may be empty/None).

    Returns:
        A dict (normally with "insight", "confidence", "rationale"), or None
        if the text is empty or no label could be recovered.
    """
    if not response_text:
        return None

    # NOTE(review): assumes reasoning models wrap their scratchpad in
    # <think> tags inside the response text — confirm for the served models.
    text = re.sub(
        r"<think>.*?</think>", "", response_text, flags=re.DOTALL | re.IGNORECASE
    ).strip()
    if not text:
        # Nothing outside the scratchpad: fall back to the full reply.
        text = response_text

    candidates = []
    if "```json" in text:
        candidates.append(text.split("```json")[1].split("```")[0])
    elif "```" in text:
        candidates.append(text.split("```")[1])
    brace_span = re.search(r"\{.*\}", text, re.DOTALL)
    if brace_span:
        candidates.append(brace_span.group(0))
    candidates.append(text)

    for candidate in candidates:
        try:
            result = json.loads(candidate.strip())
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if not isinstance(result, dict):
            # Lists/strings/numbers are valid JSON but not a usable answer.
            continue
        if "sentiment" in result and "insight" not in result:
            result["insight"] = result["sentiment"]
        label = result.get("insight")
        if isinstance(label, str):
            result["insight"] = label.strip().lower()
        return result

    # Keyword fallback: "negative" wins whenever it appears; "positive" only
    # counts when "negative" is absent; "neutral" is checked last.
    text_lower = text.lower()
    if "positive" in text_lower and "negative" not in text_lower:
        return {"insight": "positive", "confidence": 0.5, "rationale": "Parsed from text"}
    elif "negative" in text_lower:
        return {"insight": "negative", "confidence": 0.5, "rationale": "Parsed from text"}
    elif "neutral" in text_lower:
        return {"insight": "neutral", "confidence": 0.5, "rationale": "Parsed from text"}
    return None


# Confirmation listing of the helpers defined in this cell.
for _helper_line in (
    "✓ Model inference functions defined",
    "  • call_llama() - Llama3.1:8b (I7)",
    "  • call_qwen() - Qwen3:8b (I8)",
    "  • call_deepseek() - DeepSeek-R1:8b (I9)",
    "  • parse_response() - JSON parser",
):
    print(_helper_line)

## 4. Run Experiments

In [None]:
# Evaluation subset: the first 200 rows of the dataset, copied so later edits
# cannot touch `df`. Drop the [:200] slice for a full run.
test_df = df.iloc[:200].copy()


def _make_result_record(row, predicted_insight, confidence, rationale):
    """Assemble one results row from a dataset row and a prediction outcome."""
    return {
        "sentence": row["sentence"],
        "true_insight": row["true_insight"],
        "predicted_insight": predicted_insight,
        "confidence": confidence,
        "rationale": rationale,
    }


def run_cot_experiment(test_df, model_func, model_name, exp_id):
    """
    Run the Chain-of-Thought prompt over every row of `test_df` with one model.

    Args:
        test_df: DataFrame with "sentence" and "true_insight" columns.
        model_func: Callable taking the prompt string and returning the
            model's response text, or a falsy value when the call failed.
        model_name: Human-readable model name, used only in the banner.
        exp_id: Experiment identifier (e.g. "I7"), used in the banner,
            progress bar and summary.

    Returns:
        DataFrame with one row per input sentence and the columns
        sentence, true_insight, predicted_insight, confidence, rationale.
        Failed calls and unparseable replies are recorded with
        predicted_insight == "error".
    """
    result_columns = ["sentence", "true_insight", "predicted_insight", "confidence", "rationale"]
    valid_labels = ["positive", "negative", "neutral"]

    print("=" * 80)
    print(f"Running {exp_id}: {model_name} (Chain-of-Thought)")
    print("=" * 80)
    results = []

    for _, row in tqdm(
        test_df.iterrows(), total=len(test_df), desc=f"{exp_id} Progress"
    ):
        response = model_func(create_cot_prompt(row["sentence"]))

        if not response:
            record = _make_result_record(row, "error", 0, "API call failed")
        else:
            parsed = parse_response(response)
            # Anything that is not a non-empty dict cannot be queried with
            # .get() and is treated as a parse failure instead of crashing.
            if not parsed or not isinstance(parsed, dict):
                record = _make_result_record(row, "error", 0, "Parse error")
            else:
                label = parsed.get("insight", "unknown")
                if isinstance(label, str):
                    # Case/whitespace variants ("Positive", " neutral ") are
                    # the same label and must not be dropped as invalid later.
                    label = label.strip().lower()
                record = _make_result_record(
                    row,
                    label,
                    parsed.get("confidence", 0),
                    parsed.get("rationale", ""),
                )
        results.append(record)

        time.sleep(0.1)

    # Passing the columns explicitly keeps the schema even when test_df is
    # empty, so the summary below (and downstream cells) do not hit KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)
    n_valid = int(results_df["predicted_insight"].isin(valid_labels).sum())
    n_errors = int((results_df["predicted_insight"] == "error").sum())
    print(f"\n✓ {exp_id} completed: {len(results_df)} predictions")
    print(f"  Valid predictions: {n_valid}")
    print(f"  Errors: {n_errors}")
    return results_df


# One run per model, all on the same test subset.
i7_df = run_cot_experiment(
    test_df=test_df, model_func=call_llama, model_name="Llama3.1:8b", exp_id="I7"
)
i8_df = run_cot_experiment(
    test_df=test_df, model_func=call_qwen, model_name="Qwen3:8b", exp_id="I8"
)
i9_df = run_cot_experiment(
    test_df=test_df, model_func=call_deepseek, model_name="DeepSeek-R1:8b", exp_id="I9"
)

# Quick look at the first few I7 predictions.
display(i7_df.head())

## 5. Calculate Metrics & Visualize

In [None]:
def _empty_metrics(exp_name, total_samples):
    """Metrics row used when an experiment produced no usable predictions."""
    return {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
        "MCC": 0,
    }


def calculate_metrics(df, exp_name):
    """
    Compute classification metrics for one experiment's results.

    Only rows whose predicted_insight is "positive", "negative" or "neutral"
    are scored; all other rows (e.g. "error", "unknown") are excluded.

    Args:
        df: Results DataFrame with "true_insight" and "predicted_insight".
        exp_name: Label stored in the "Experiment" field of the metrics.

    Returns:
        (metrics, cm, valid_df) where `metrics` is a dict of scores, `cm` is
        the 3x3 confusion matrix in the order positive/negative/neutral, and
        `valid_df` is the filtered frame that was scored. When nothing can be
        scored, the scores are 0, `cm` is an all-zero integer matrix and
        `valid_df` is an empty DataFrame.
    """
    labels = ["positive", "negative", "neutral"]

    if df.empty or "predicted_insight" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        # Integer dtype: this matrix is later drawn with fmt="d", which
        # rejects the float array that np.zeros produces by default.
        return _empty_metrics(exp_name, 0), np.zeros((3, 3), dtype=int), pd.DataFrame()

    valid_df = df[df["predicted_insight"].isin(labels)].copy()
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics(exp_name, len(df)), np.zeros((3, 3), dtype=int), pd.DataFrame()

    y_true = valid_df["true_insight"]
    y_pred = valid_df["predicted_insight"]

    # zero_division=0 yields the same scores as the default but without an
    # UndefinedMetricWarning when a class is never predicted / never present.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return metrics, cm, valid_df


_metrics_rule = "=" * 80
print("\n" + _metrics_rule, "CALCULATING METRICS", _metrics_rule, sep="\n")

# Score each experiment: (metrics dict, confusion matrix, scored rows).
i7_metrics, i7_cm, i7_valid = calculate_metrics(df=i7_df, exp_name="I7: Llama3.1:8b (CoT)")
i8_metrics, i8_cm, i8_valid = calculate_metrics(df=i8_df, exp_name="I8: Qwen3:8b (CoT)")
i9_metrics, i9_cm, i9_valid = calculate_metrics(df=i9_df, exp_name="I9: DeepSeek-R1:8b (CoT)")

# One row per experiment.
metrics_df = pd.DataFrame([i7_metrics, i8_metrics, i9_metrics])

print(
    "\n" + _metrics_rule,
    "CHAIN-OF-THOUGHT INSIGHT GENERATION PERFORMANCE COMPARISON",
    _metrics_rule,
    sep="\n",
)
_summary_columns = ["Experiment", "Valid Predictions", "Accuracy", "Macro-F1", "MCC"]
display(metrics_df[_summary_columns].round(4))

In [None]:
# Metrics computed above. Optional: Confusion matrix visualization
# One heatmap per experiment, side by side. Tick labels follow the
# positive/negative/neutral order used when the matrices were built.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]
for idx, (cm, title) in enumerate([
    (i7_cm, "I7: Llama3.1:8b"),
    (i8_cm, "I8: Qwen3:8b"),
    (i9_cm, "I9: DeepSeek-R1:8b"),
]):
    # fmt="d" only formats integers; cast so a float placeholder matrix
    # (returned when an experiment has no valid predictions) does not raise.
    cm_counts = np.asarray(cm).astype(int)
    sns.heatmap(cm_counts, annot=True, fmt="d", cmap="Greens", xticklabels=labels, yticklabels=labels, ax=axes[idx])
    axes[idx].set_title(title, fontsize=12, weight="bold")
    axes[idx].set_ylabel("True Label", fontsize=11)
    axes[idx].set_xlabel("Predicted Label", fontsize=11)
plt.suptitle("Confusion Matrices - Chain-of-Thought Insight Generation", fontsize=14, weight="bold", y=1.02)
plt.tight_layout()
plt.savefig("cot_insight_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 6. Save Results

In [None]:
# A single timestamp shared by all output files of this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# (frame, filename stem) pairs; each is written as "<stem>_<timestamp>.csv".
for _frame, _stem in (
    (i7_df, "i7_llama3.1_8b_cot_insight"),
    (i8_df, "i8_qwen3_8b_cot_insight"),
    (i9_df, "i9_deepseek_r1_8b_cot_insight"),
    (metrics_df, "cot_insight_metrics_summary"),
):
    _frame.to_csv(f"{_stem}_{timestamp}.csv", index=False)

print("\n✓ Chain-of-Thought Insight Generation results saved")