In [None]:
# Install required packages (if needed)
# !pip3 install pandas numpy matplotlib seaborn scikit-learn tqdm requests -q

In [None]:
# Ollama API setup and imports
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import json
import time
import re
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np

# Report that the environment is ready and list the models under evaluation.
for message in (
    "✓ Libraries imported successfully",
    "✓ Ollama will be used for LLM inference",
    "\n" + "=" * 80,
    "SETUP COMPLETE - FEW-SHOT INSIGHT GENERATION",
    "=" * 80,
    "Models configured:",
    "  • I4: Llama3.1:8b (Ollama - Local - Few-Shot)",
    "  • I5: Qwen3:8b (Ollama - Local - Few-Shot)",
    "  • I6: DeepSeek-R1:8b (Ollama - Local - Few-Shot)",
    "=" * 80,
):
    print(message)

# Plot defaults shared by every figure in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (14, 6)})

## 1. Load Dataset

In [None]:
# Load the 100% agreement dataset (highest quality)
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode strictly as UTF-8 first. The previous errors="ignore" silently deleted
# every byte that was not valid UTF-8 (e.g. accented letters in company names),
# corrupting the sentences sent to the models. If strict decoding fails, re-read
# the file as Latin-1, which maps every byte to a character and so loses nothing.
# NOTE(review): FinancialPhraseBank v1.0 is commonly distributed as ISO-8859-1 —
# confirm the encoding of this particular copy.
try:
    with open(data_path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()
except UnicodeDecodeError:
    with open(data_path, "r", encoding="latin-1") as f:
        raw_lines = f.readlines()

sentences = []
insights = []

for line in raw_lines:
    line = line.strip()
    if "@" not in line:
        # Lines without the "sentence@label" separator carry no label; skip them.
        continue
    # Split on the LAST "@" so that "@" characters inside the sentence survive.
    sentence, label = line.rsplit("@", 1)
    sentences.append(sentence)
    insights.append(label)

df = pd.DataFrame({"sentence": sentences, "true_insight": insights})

print(f"Dataset loaded: {len(df)} sentences")
print(f"\nInsight distribution:")
print(df["true_insight"].value_counts())

# Display sample
print("\n" + "=" * 80)
print("Sample sentences:")
print("=" * 80)
display(df.sample(5, random_state=42))

## 2. Few-Shot Examples

Carefully curated examples (2 positive, 2 negative, 1 neutral) representing typical financial insight patterns.

In [None]:
# Hand-picked demonstrations shown to every model: two positive, two negative
# and one neutral statement, each with its label and a short rationale.
FEW_SHOT_EXAMPLES = [
    dict(
        sentence="Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007.",
        insight="positive",
        rationale="Operating profit increased significantly, indicating improved financial performance.",
    ),
    dict(
        sentence="Net sales increased by 18.5% to EUR 167.8 million compared to the previous year.",
        insight="positive",
        rationale="Strong revenue growth of 18.5% signals business expansion and market success.",
    ),
    dict(
        sentence="The company reported a net loss of EUR 2.5 million compared to a profit of EUR 1.2 million in the previous quarter.",
        insight="negative",
        rationale="Shift from profit to loss represents deteriorating financial health.",
    ),
    dict(
        sentence="Sales decreased by 15% year-over-year due to weakening demand in key markets.",
        insight="negative",
        rationale="Significant sales decline indicates business challenges and market difficulties.",
    ),
    dict(
        sentence="The company announced the appointment of a new chief financial officer effective next month.",
        insight="neutral",
        rationale="Executive appointment is routine corporate news without clear financial impact.",
    ),
]

# Echo the examples so the reader can see exactly what the models are shown.
print("Few-Shot Examples for INSIGHT GENERATION:")
print("=" * 80)
for position, example in enumerate(FEW_SHOT_EXAMPLES, start=1):
    label = example["insight"].upper()
    print(f"\nExample {position} [{label}]:")
    print(f"Sentence: {example['sentence']}")
    print(f"Insight: {example['rationale']}")

## 3. Few-Shot Prompt Design

In [None]:
def create_few_shot_prompt(sentence, examples=None):
    """
    Build the few-shot classification prompt for one financial sentence.

    Args:
        sentence: The statement the model should classify.
        examples: Optional list of dicts with "sentence", "insight" and
            "rationale" keys to use as demonstrations. Defaults to the
            notebook-level FEW_SHOT_EXAMPLES.

    Returns:
        The full prompt string: task description, guidelines, the labeled
        examples (if any), the target sentence, and the expected JSON schema.
    """
    if examples is None:
        examples = FEW_SHOT_EXAMPLES

    examples_text = ""
    for i, ex in enumerate(examples, 1):
        examples_text += f"""\nExample {i}:
Sentence: "{ex["sentence"]}"
Analysis:
{{
    "insight": "{ex["insight"]}",
    "confidence": 0.95,
    "rationale": "{ex["rationale"]}"
}}
"""

    # Derive the announced count from the actual examples so the prompt never
    # claims "5 examples" when a different number is supplied; drop the section
    # entirely when there are no demonstrations.
    if examples:
        noun = "example" if len(examples) == 1 else "examples"
        examples_section = (
            f"Here are {len(examples)} {noun} to learn from:\n{examples_text}\n\n"
        )
    else:
        examples_section = ""

    prompt = f"""You are a financial insight analysis expert.

Classify the following financial statement as "positive", "negative", or "neutral" from an investor's perspective.

Guidelines:
- Positive: Good news for stock price (revenue increase, profit growth, expansion)
- Negative: Bad news for stock price (losses, declining sales, setbacks)
- Neutral: No clear impact or mixed signals

{examples_section}Now classify this new statement:
Sentence: "{sentence}"

Provide your response in JSON format:
{{
    "insight": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief explanation"
}}
"""
    return prompt


# Preview the first 1000 characters of a rendered prompt as a sanity check.
test_sentence = "The company's quarterly revenue exceeded analyst expectations by 12%."
prompt_preview = create_few_shot_prompt(test_sentence)[:1000] + "..."
print(
    "=" * 80,
    "FEW-SHOT PROMPT EXAMPLE (INSIGHT GENERATION)",
    "=" * 80,
    prompt_preview,
    sep="\n",
)

## 4. Model Inference Functions

In [None]:
# Ollama API configuration
OLLAMA_BASE_URL = "http://localhost:11434/api/generate"


def call_ollama(model_name, prompt, temperature=0.0):
    """Send a single non-streaming generation request to the local Ollama server.

    Args:
        model_name: Ollama model tag, e.g. "llama3.1:8b".
        prompt: Full prompt text to send.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The generated text (the "response" field of the reply, "" if absent),
        or None when every attempt failed.
    """
    max_retries = 3
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,
        # Sampling parameters must be nested under "options"; a top-level
        # "temperature" key is not part of the /api/generate request schema,
        # so the requested temperature was previously never applied.
        "options": {"temperature": temperature},
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(OLLAMA_BASE_URL, json=payload, timeout=120)
            if response.status_code == 200:
                return response.json().get("response", "")
        except (requests.RequestException, ValueError):
            # Network failure, timeout, or an undecodable body: treat like a
            # non-200 status and fall through to the retry/backoff below.
            pass

        # Exponential backoff (1s, 2s) between attempts; no sleep after the last.
        if attempt < max_retries - 1:
            time.sleep(2**attempt)

    return None


def call_llama(prompt, temperature=0.0):
    """Run `prompt` through Llama3.1:8b on Ollama (experiment I4)."""
    return call_ollama(
        model_name="llama3.1:8b",
        prompt=prompt,
        temperature=temperature,
    )


def call_qwen(prompt, temperature=0.0):
    """Run `prompt` through Qwen3:8b on Ollama (experiment I5)."""
    return call_ollama(
        model_name="qwen3:8b",
        prompt=prompt,
        temperature=temperature,
    )


def call_deepseek(prompt, temperature=0.0):
    """Run `prompt` through DeepSeek-R1:8b on Ollama (experiment I6)."""
    return call_ollama(
        model_name="deepseek-r1:8b",
        prompt=prompt,
        temperature=temperature,
    )


def _extract_json_candidate(text):
    """Return the substring of `text` most likely to be the JSON answer object."""
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        # Content after the first fence (up to the next fence, if there is one).
        text = text.split("```", 2)[1]
    # Narrow to the outermost braces so fence language tags ("python", "JSON")
    # or surrounding chatter do not break json.loads.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    return match.group(0) if match else text.strip()


def parse_response(response_text):
    """Parse a model reply into a prediction dict.

    Handles markdown code fences, conversational filler around the JSON, and
    `<think>...</think>` reasoning blocks that precede the answer.

    Args:
        response_text: Raw text returned by the model (may be None or empty).

    Returns:
        A dict, normally with "insight", "confidence" and "rationale" keys.
        When the reply contains a JSON object it is returned (with a
        "sentiment" key copied to "insight" if needed, and a string "insight"
        lower-cased and stripped). Otherwise a keyword fallback yields a dict
        with confidence 0.5, or None when no label word is present.
    """
    if not response_text:
        return None

    # Drop reasoning blocks: they often contain braces and mention several
    # labels, which would mislead both the JSON extraction and the fallback.
    visible_text = re.sub(
        r"<think>.*?</think>", "", response_text, flags=re.DOTALL | re.IGNORECASE
    )
    if not visible_text.strip():
        # Nothing but reasoning was returned; use the full text as a last resort.
        visible_text = response_text

    try:
        result = json.loads(_extract_json_candidate(visible_text))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        result = None

    # Only a JSON object is a usable prediction. Bare strings, numbers or lists
    # used to be returned as-is and crashed callers that call `.get()` on them.
    if isinstance(result, dict):
        if "sentiment" in result and "insight" not in result:
            result["insight"] = result["sentiment"]
        if isinstance(result.get("insight"), str):
            # "Positive" / " neutral " must count as valid labels downstream.
            result["insight"] = result["insight"].strip().lower()
        return result

    response_lower = visible_text.lower()
    if "positive" in response_lower and "negative" not in response_lower:
        return {"insight": "positive", "confidence": 0.5, "rationale": "Parsed from text"}
    elif "negative" in response_lower:
        return {"insight": "negative", "confidence": 0.5, "rationale": "Parsed from text"}
    elif "neutral" in response_lower:
        return {"insight": "neutral", "confidence": 0.5, "rationale": "Parsed from text"}
    return None


# Summarise the helpers defined in this cell.
print(
    "✓ Model inference functions defined",
    "  • call_llama() - Llama3.1:8b (I4)",
    "  • call_qwen() - Qwen3:8b (I5)",
    "  • call_deepseek() - DeepSeek-R1:8b (I6)",
    "  • parse_response() - JSON parser",
    sep="\n",
)

## 5. Run Experiments

### I4: Llama3.1:8b (Few-Shot)

In [None]:
# Evaluation subset shared by all three experiments
# (remove .head(200) for full run)
test_df = df.head(200).copy()

# I4: Llama3.1:8b
print("=" * 80, "Running I4: Llama3.1:8b (Few-Shot)", "=" * 80, sep="\n")
i4_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="I4 Progress"):
    raw_output = call_llama(create_few_shot_prompt(row["sentence"]))
    # Parse only when the API returned non-empty text.
    parsed = parse_response(raw_output) if raw_output else None

    # Every record starts from the input row; the outcome fields follow.
    record = {
        "sentence": row["sentence"],
        "true_insight": row["true_insight"],
    }
    if parsed:
        record["predicted_insight"] = parsed.get("insight", "unknown")
        record["confidence"] = parsed.get("confidence", 0)
        record["rationale"] = parsed.get("rationale", "")
    else:
        record["predicted_insight"] = "error"
        record["confidence"] = 0
        # Distinguish "model answered but unparseable" from "no answer at all".
        record["rationale"] = "Parse error" if raw_output else "API call failed"
    i4_results.append(record)

    time.sleep(0.5)

i4_df = pd.DataFrame(i4_results)
i4_is_valid = i4_df["predicted_insight"].isin(["positive", "negative", "neutral"])
i4_is_error = i4_df["predicted_insight"] == "error"
print(f"\n✓ I4 completed: {len(i4_df)} predictions")
print(f"  Valid predictions: {int(i4_is_valid.sum())}")
print(f"  Errors: {int(i4_is_error.sum())}")
display(i4_df.head())

### I5: Qwen3:8b (Few-Shot)

In [None]:
# I5: Qwen3:8b
print("=" * 80, "Running I5: Qwen3:8b (Few-Shot)", "=" * 80, sep="\n")
i5_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="I5 Progress"):
    raw_output = call_qwen(create_few_shot_prompt(row["sentence"]))
    # Parse only when the API returned non-empty text.
    parsed = parse_response(raw_output) if raw_output else None

    # Every record starts from the input row; the outcome fields follow.
    record = {
        "sentence": row["sentence"],
        "true_insight": row["true_insight"],
    }
    if parsed:
        record["predicted_insight"] = parsed.get("insight", "unknown")
        record["confidence"] = parsed.get("confidence", 0)
        record["rationale"] = parsed.get("rationale", "")
    else:
        record["predicted_insight"] = "error"
        record["confidence"] = 0
        # Distinguish "model answered but unparseable" from "no answer at all".
        record["rationale"] = "Parse error" if raw_output else "API call failed"
    i5_results.append(record)

    time.sleep(0.5)

i5_df = pd.DataFrame(i5_results)
i5_is_valid = i5_df["predicted_insight"].isin(["positive", "negative", "neutral"])
i5_is_error = i5_df["predicted_insight"] == "error"
print(f"\n✓ I5 completed: {len(i5_df)} predictions")
print(f"  Valid predictions: {int(i5_is_valid.sum())}")
print(f"  Errors: {int(i5_is_error.sum())}")
display(i5_df.head())

### I6: DeepSeek-R1:8b (Few-Shot)

In [None]:
# I6: DeepSeek-R1:8b
print("=" * 80, "Running I6: DeepSeek-R1:8b (Few-Shot)", "=" * 80, sep="\n")
i6_results = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="I6 Progress"):
    raw_output = call_deepseek(create_few_shot_prompt(row["sentence"]))
    # Parse only when the API returned non-empty text.
    parsed = parse_response(raw_output) if raw_output else None

    # Every record starts from the input row; the outcome fields follow.
    record = {
        "sentence": row["sentence"],
        "true_insight": row["true_insight"],
    }
    if parsed:
        record["predicted_insight"] = parsed.get("insight", "unknown")
        record["confidence"] = parsed.get("confidence", 0)
        record["rationale"] = parsed.get("rationale", "")
    else:
        record["predicted_insight"] = "error"
        record["confidence"] = 0
        # Distinguish "model answered but unparseable" from "no answer at all".
        record["rationale"] = "Parse error" if raw_output else "API call failed"
    i6_results.append(record)

    time.sleep(0.5)

i6_df = pd.DataFrame(i6_results)
i6_is_valid = i6_df["predicted_insight"].isin(["positive", "negative", "neutral"])
i6_is_error = i6_df["predicted_insight"] == "error"
print(f"\n✓ I6 completed: {len(i6_df)} predictions")
print(f"  Valid predictions: {int(i6_is_valid.sum())}")
print(f"  Errors: {int(i6_is_error.sum())}")
display(i6_df.head())

## 6. Calculate Metrics

In [None]:
def _empty_metrics(exp_name, total_samples):
    """Build the all-zero metrics record used when there is nothing to score."""
    record = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
    }
    for name in (
        "Accuracy",
        "Macro-F1",
        "Weighted-F1",
        "Macro-Precision",
        "Macro-Recall",
        "MCC",
    ):
        record[name] = 0
    for label in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            record[f"{label}_{stat}"] = 0
    return record


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics (including MCC) for one experiment.

    Only rows whose `predicted_insight` is "positive", "negative" or "neutral"
    are scored; other rows (errors, unknown labels) count towards
    "Total Samples" but not "Valid Predictions".

    Args:
        df: Results frame with "true_insight" and "predicted_insight" columns.
        exp_name: Display name stored in the "Experiment" field.

    Returns:
        A `(metrics, confusion_matrix, valid_df)` tuple. `metrics` is a dict of
        overall and per-class scores, `confusion_matrix` is a 3x3 integer array
        ordered positive/negative/neutral, and `valid_df` holds the scored rows.
        When there is nothing to score, the metrics are all zero, the matrix is
        an integer zero matrix, and `valid_df` is an empty DataFrame.
    """
    labels = ["positive", "negative", "neutral"]

    if df.empty or "predicted_insight" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        # dtype=int: the plotting cell formats cells with fmt="d", which
        # raises on the float matrix np.zeros returns by default.
        return (
            _empty_metrics(exp_name, 0),
            np.zeros((3, 3), dtype=int),
            pd.DataFrame(),
        )

    valid_df = df[df["predicted_insight"].isin(labels)].copy()

    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return (
            _empty_metrics(exp_name, len(df)),
            np.zeros((3, 3), dtype=int),
            pd.DataFrame(),
        )

    y_true = valid_df["true_insight"]
    y_pred = valid_df["predicted_insight"]

    # zero_division=0 yields the same scores as the default but without the
    # UndefinedMetricWarning when a class is never predicted; it also matches
    # the per-class calls below.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "MCC": matthews_corrcoef(y_true, y_pred),
    }

    # Per-class scores in a fixed label order so columns line up across runs.
    per_class = {
        "Precision": precision_score(
            y_true, y_pred, labels=labels, average=None, zero_division=0
        ),
        "Recall": recall_score(
            y_true, y_pred, labels=labels, average=None, zero_division=0
        ),
        "F1": f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0),
    }
    for i, label in enumerate(labels):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{label.capitalize()}_{stat}"] = per_class[stat][i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score each experiment, then show the comparison tables.
print("\n" + "=" * 80, "CALCULATING METRICS", "=" * 80, sep="\n")

i4_metrics, i4_cm, i4_valid = calculate_metrics(i4_df, "I4: Llama3.1:8b (Few-Shot)")
i5_metrics, i5_cm, i5_valid = calculate_metrics(i5_df, "I5: Qwen3:8b (Few-Shot)")
i6_metrics, i6_cm, i6_valid = calculate_metrics(i6_df, "I6: DeepSeek-R1:8b (Few-Shot)")

metrics_df = pd.DataFrame([i4_metrics, i5_metrics, i6_metrics])

# One (heading, columns) pair per table, rendered in this order.
for heading, columns in (
    (
        "FEW-SHOT INSIGHT GENERATION PERFORMANCE COMPARISON",
        ["Experiment", "Valid Predictions", "Accuracy", "Macro-F1", "MCC"],
    ),
    (
        "DETAILED METRICS",
        ["Experiment", "Macro-Precision", "Macro-Recall", "Weighted-F1"],
    ),
    (
        "PER-CLASS F1 SCORES",
        ["Experiment", "Positive_F1", "Negative_F1", "Neutral_F1"],
    ),
):
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)
    display(metrics_df[columns].round(4))

## 7. Visualize Results

In [None]:
# Metrics and confusion matrices (i4_cm, i5_cm, i6_cm) computed in cell above.

In [None]:
# Confusion matrices: one heatmap per experiment, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]
panels = [
    (i4_cm, "I4: Llama3.1:8b"),
    (i5_cm, "I5: Qwen3:8b"),
    (i6_cm, "I6: DeepSeek-R1:8b"),
]

for ax, (cm, title) in zip(axes, panels):
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Greens",
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title(title, fontsize=12, weight="bold")
    # Rows are the ground truth, columns the model output.
    ax.set_ylabel("True Label", fontsize=11, weight="bold")
    ax.set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Few-Shot Insight Generation",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("few_shot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 8. Save Results

In [None]:
# Write every result table to a CSV tagged with the current run's timestamp.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

exports = {
    f"i4_llama3.1_8b_few_shot_insight_{timestamp}.csv": i4_df,
    f"i5_qwen3_8b_few_shot_insight_{timestamp}.csv": i5_df,
    f"i6_deepseek_r1_8b_few_shot_insight_{timestamp}.csv": i6_df,
    f"few_shot_insight_metrics_summary_{timestamp}.csv": metrics_df,
}
for filename, frame in exports.items():
    frame.to_csv(filename, index=False)

print(f"\n✓ Results saved with timestamp: {timestamp}")

## 9. Key Insights

### Few-Shot Learning Impact:

1. **Learning from Examples**: How much does performance improve over the zero-shot baseline?
2. **Model Comparison**: Which model benefits most from examples?
3. **Example Quality**: How do curated examples influence predictions?
4. **Confidence Calibration**: Are models more confident with examples?