In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [None]:
# Suppress deprecation warnings
import warnings
import os

# Only FutureWarning instances whose originating module name matches
# 'google.generativeai' are ignored by this filter.
warnings.filterwarnings('ignore', category=FutureWarning, module='google.generativeai')


# Fix SSL/TLS certificate verification for gRPC (required for Google Gemini API on macOS)
# NOTE(review): these variables are set before `google.generativeai` is imported
# further down, presumably so gRPC picks them up when it initialises — confirm
# that an empty roots-file path and the 'HIGH' cipher list are what is needed.
os.environ['GRPC_DEFAULT_SSL_ROOTS_FILE_PATH'] = ''
os.environ['GRPC_SSL_CIPHER_SUITES'] = 'HIGH'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

# Load variables from a .env file (if any) so the API keys can be read below.
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
else:
    print("⚠️ GOOGLE_API_KEY is not set - Gemini calls will fail")

# Always bind `groq_client`, even without a key, so later cells see a defined
# name instead of depending on whether this branch happened to run.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if groq_client is None:
    print("⚠️ GROQ_API_KEY is not set - Llama calls will fail")

sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Setup complete")

## 1. Load Dataset

In [None]:
data_path = (
    "../../FinancialPhraseBank_Analysis/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"
)


def _read_phrasebank_lines(path):
    """Return all lines of `path`, decoded as UTF-8 or, failing that, Latin-1.

    Decoding with ``errors="ignore"`` silently deletes every byte that is not
    valid UTF-8, which alters the sentences sent to the models. Strict UTF-8
    is tried first (identical result for a valid UTF-8 file); if it fails the
    file is re-read as Latin-1, which maps every byte to a character so
    nothing is dropped.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.readlines()
    except UnicodeDecodeError:
        with open(path, "r", encoding="latin-1") as f:
            return f.readlines()


sentences = []
sentiments = []

for line in _read_phrasebank_lines(data_path):
    # Each record is "<sentence>@<label>". Split on the LAST "@" so an "@"
    # inside the sentence itself stays part of the sentence.
    sentence, separator, label = line.strip().rpartition("@")
    if separator:  # lines without any "@" are not records and are skipped
        sentences.append(sentence)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
print(f"\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

## 2. Chain-of-Thought Prompt Design

**Reasoning Structure**:
- Step 1: Identify key financial metrics/events
- Step 2: Analyze positive indicators
- Step 3: Analyze negative indicators  
- Step 4: Determine net impact on investor sentiment
- Step 5: Classify sentiment with confidence

In [None]:
def create_cot_prompt(sentence):
    """
    Build the Chain-of-Thought prompt for a single financial statement.

    The statement is quoted inside a fixed template that walks the model
    through five reasoning steps and then asks for a JSON object with the
    keys "sentiment", "confidence" and "rationale".
    """
    # Plain template filled via str.format; doubled braces are literal braces.
    template = """You are a financial sentiment analysis expert. Analyze the following financial statement step-by-step.

Financial Statement:
"{sentence}"

Think through this systematically:

Step 1: Identify the key financial metrics, events, or indicators mentioned in the statement.
Step 2: List any positive signals (growth, profit increases, expansions, etc.).
Step 3: List any negative signals (losses, declines, challenges, etc.).
Step 4: Evaluate the net impact on stock price from an investor's perspective.
Step 5: Based on your analysis, classify the sentiment.

Classification guidelines:
- Positive: Clear good news for stock price
- Negative: Clear bad news for stock price
- Neutral: No clear impact or mixed signals

IMPORTANT: Provide your final answer in this exact JSON format:
{{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief summary of your step-by-step reasoning"
}}
"""
    return template.format(sentence=sentence)


# Render the prompt for one sample sentence as a visual sanity check
test_sentence = "Net sales increased by 18.5% to EUR 167.8 million."
divider = "=" * 80
print(divider, "CHAIN-OF-THOUGHT PROMPT EXAMPLE", divider, sep="\n")
print(create_cot_prompt(test_sentence))

## 3. Model Inference Functions

In [None]:
def call_gemini(
    prompt, model_name="gemini-2.0-flash-exp", temperature=0.0, max_retries=3
):
    """Send `prompt` to a Gemini model and return the reply text.

    Args:
        prompt: Text sent to the model.
        model_name: Gemini model identifier passed to `genai.GenerativeModel`.
        temperature: Sampling temperature for the generation config.
        max_retries: Total number of attempts before giving up.

    Returns:
        `response.text` of the first successful attempt, or None when every
        attempt raised. The call is best-effort: exceptions are never
        propagated, but the last one is printed so failures are visible.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=temperature,
                    max_output_tokens=1000,  # More tokens for reasoning
                ),
            )
            return response.text
        except Exception as e:  # deliberate catch-all: one bad call must not stop the run
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # back off 1s, 2s, ... between attempts

    if last_error is not None:
        print(
            f"⚠️ Gemini call failed after {max_retries} attempt(s): "
            f"{type(last_error).__name__}: {last_error}"
        )
    return None


def call_llama(
    prompt, temperature=0.0, model="llama-3.3-70b-versatile", max_retries=3
):
    """Send `prompt` to a Groq-hosted chat model and return the reply text.

    Args:
        prompt: Text sent as the single user message.
        temperature: Sampling temperature for the completion.
        model: Model identifier passed to the Groq chat-completions call.
        max_retries: Total number of attempts before giving up.

    Returns:
        The content of the first choice of the first successful attempt, or
        None when every attempt raised. The call is best-effort: exceptions
        are never propagated, but the last one is printed so failures are
        visible.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                temperature=temperature,
                max_tokens=1000,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:  # deliberate catch-all: one bad call must not stop the run
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # back off 1s, 2s, ... between attempts

    if last_error is not None:
        print(
            f"⚠️ Llama call failed after {max_retries} attempt(s): "
            f"{type(last_error).__name__}: {last_error}"
        )
    return None


def parse_response(response_text):
    """Extract the final sentiment verdict from a (possibly verbose) CoT reply.

    Strategy:
      1. Scan the text for JSON objects (fenced or not) and use the last one
         that has a "sentiment" key. Braces inside the reasoning text are
         skipped instead of breaking the parse.
      2. Otherwise look for an explicit declaration such as
         ``sentiment: positive`` / ``sentiment is negative`` and use the last
         one found.
      3. Otherwise, if exactly one of the three labels is mentioned anywhere,
         use it. A reply that mentions several labels (typical for CoT, which
         lists both positive and negative signals) is ambiguous and yields
         None rather than a guess.

    Args:
        response_text: Raw model reply.

    Returns:
        A dict containing at least "sentiment" (lower-cased and stripped when
        it is a string). Dicts built from the text fallbacks also carry
        "confidence" 0.5 and "rationale" "Parsed from text". Returns None
        when nothing usable is found or the input is empty / not a string.
    """
    import re  # local import: the notebook's import cell does not load `re`

    if not isinstance(response_text, str) or not response_text.strip():
        return None

    # 1. Structured path: try to decode a JSON value at every "{".
    decoder = json.JSONDecoder()
    verdict = None
    cursor = response_text.find("{")
    while cursor != -1:
        try:
            candidate, stop = decoder.raw_decode(response_text, cursor)
        except ValueError:
            # Not valid JSON from this brace; move on to the next one.
            cursor = response_text.find("{", cursor + 1)
            continue
        if isinstance(candidate, dict) and "sentiment" in candidate:
            verdict = candidate  # keep overwriting: the last verdict wins
        cursor = response_text.find("{", stop)

    if verdict is not None:
        label = verdict["sentiment"]
        if isinstance(label, str):
            verdict["sentiment"] = label.strip().lower()
        return verdict

    # 2. Explicit textual declaration. The negative lookahead ignores an echo
    #    of the prompt template ("positive/negative/neutral").
    lowered = response_text.lower()
    declared = re.findall(
        r"sentiment[\s\"'*:=]*(?:is[\s\"'*:=]*)?(positive|negative|neutral)\b(?!\s*/)",
        lowered,
    )
    if declared:
        label = declared[-1]
    else:
        # 3. A single unambiguous label mention.
        mentioned = [
            name for name in ("positive", "negative", "neutral") if name in lowered
        ]
        if len(mentioned) != 1:
            return None
        label = mentioned[0]

    return {
        "sentiment": label,
        "confidence": 0.5,
        "rationale": "Parsed from text",
    }


print("✓ Inference functions defined")

## 4. Run Experiments

In [None]:
# Evaluation subset: an independent copy of the first 100 rows of the dataset
# (taken in file order, not sampled at random)
test_df = df.iloc[:100].copy()


def run_cot_experiment(test_df, model_func, model_name, exp_id, delay=0.5):
    """Run one Chain-of-Thought experiment over every row of `test_df`.

    Args:
        test_df: Frame with "sentence" and "true_sentiment" columns.
        model_func: Callable taking a prompt string and returning the model's
            reply text, or a falsy value on failure.
        model_name: Human-readable model label (used in the progress output).
        exp_id: Short experiment identifier, e.g. "E7".
        delay: Seconds to sleep after each sentence.

    Returns:
        DataFrame with columns sentence, true_sentiment, predicted_sentiment,
        confidence, rationale, full_response - one row per sentence that
        produced a parseable reply. The columns are present even when no
        sentence succeeded, so downstream cells can index them safely.
    """
    print(f"Running {exp_id}: {model_name} (Chain-of-Thought)...")
    columns = [
        "sentence",
        "true_sentiment",
        "predicted_sentiment",
        "confidence",
        "rationale",
        "full_response",
    ]
    results = []
    skipped = 0  # sentences with no reply or an unparseable reply

    for _, row in tqdm(
        test_df.iterrows(), total=len(test_df), desc=f"{exp_id} Progress"
    ):
        prompt = create_cot_prompt(row["sentence"])
        response = model_func(prompt)
        parsed = parse_response(response) if response else None

        if isinstance(parsed, dict) and parsed:
            sentiment = parsed.get("sentiment", "unknown")
            if isinstance(sentiment, str):
                # Normalise so "Positive" / " positive " still match the
                # lower-case labels the metrics step filters on.
                sentiment = sentiment.strip().lower()
            results.append(
                {
                    "sentence": row["sentence"],
                    "true_sentiment": row["true_sentiment"],
                    "predicted_sentiment": sentiment,
                    "confidence": parsed.get("confidence", 0),
                    "rationale": parsed.get("rationale", ""),
                    "full_response": response[:500],  # Store reasoning for analysis
                }
            )
        else:
            skipped += 1

        time.sleep(delay)

    results_df = pd.DataFrame(results, columns=columns)
    print(f"\n✓ {exp_id} completed: {len(results_df)} predictions")
    if skipped:
        print(
            f"⚠️ {exp_id}: {skipped} of {len(test_df)} sentences skipped "
            "(no response or unparseable output)"
        )
    return results_df


# Run all three experiments
# NOTE(review): E7 is labelled "Gemini Pro" but is sent to the same
# "gemini-2.0-flash-exp" model as E8, so E7 and E8 are two runs of one model
# rather than a Pro-vs-Flash comparison — confirm the intended E7 model ID.
e7_df = run_cot_experiment(
    test_df, lambda p: call_gemini(p, "gemini-2.0-flash-exp"), "Gemini Pro", "E7"
)
e8_df = run_cot_experiment(
    test_df, lambda p: call_gemini(p, "gemini-2.0-flash-exp"), "Gemini Flash", "E8"
)
e9_df = run_cot_experiment(test_df, call_llama, "Llama-3.3-70B", "E9")

# Preview the first E7 predictions.
display(e7_df.head())

## 5. Calculate Metrics & Visualize

In [None]:
def calculate_metrics(df, exp_name):
    """Compute headline classification metrics for one experiment.

    NOTE: a later cell in this notebook defines `calculate_metrics` again
    with additional per-class metrics; that later definition replaces this
    one once its cell has run.

    Args:
        df: Experiment results with "true_sentiment" and
            "predicted_sentiment" columns (may be empty).
        exp_name: Label stored under the "Experiment" key.

    Returns:
        (metrics, cm, valid_df): a dict with Experiment, Accuracy, Macro-F1,
        Macro-Precision and Macro-Recall; the 3x3 confusion matrix in
        positive/negative/neutral order; and the rows whose prediction is one
        of those three labels. When there are no such rows the metrics are 0,
        the matrix is all zeros and the frame is empty.
    """
    labels = ["positive", "negative", "neutral"]

    # An all-failed run can yield a frame with no rows or no columns at all;
    # treat both like "nothing valid" instead of raising.
    if df.empty or "predicted_sentiment" not in df.columns:
        valid_df = pd.DataFrame()
    else:
        valid_df = df[df["predicted_sentiment"].isin(labels)].copy()

    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        metrics = {
            "Experiment": exp_name,
            "Accuracy": 0,
            "Macro-F1": 0,
            "Macro-Precision": 0,
            "Macro-Recall": 0,
        }
        return metrics, np.zeros((3, 3)), pd.DataFrame()

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 yields the same scores as the default while silencing
    # the undefined-metric warning for classes that were never predicted.
    metrics = {
        "Experiment": exp_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
    }

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return metrics, cm, valid_df


# Score each experiment, then collect the metric dicts into one table
cot_runs = [
    (e7_df, "E7: Gemini Pro (CoT)"),
    (e8_df, "E8: Gemini Flash (CoT)"),
    (e9_df, "E9: Llama-3.3-70B (CoT)"),
]
(
    (e7_metrics, e7_cm, e7_valid),
    (e8_metrics, e8_cm, e8_valid),
    (e9_metrics, e9_cm, e9_valid),
) = [calculate_metrics(frame, label) for frame, label in cot_runs]

metrics_df = pd.DataFrame([e7_metrics, e8_metrics, e9_metrics])

rule = "=" * 80
print("\n" + rule, "CHAIN-OF-THOUGHT PERFORMANCE COMPARISON", rule, sep="\n")
display(metrics_df.round(4))

In [None]:
def _empty_metrics_result(exp_name, total_samples):
    """Build the all-zero (metrics, confusion matrix, frame) result triple."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
    }
    # Same key order as the populated result: per label, Precision/Recall/F1.
    for label in ("Positive", "Negative", "Neutral"):
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{label}_{stat}"] = 0
    return metrics, np.zeros((3, 3)), pd.DataFrame()


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics for one experiment.

    Args:
        df: Experiment results with "true_sentiment" and
            "predicted_sentiment" columns (may be empty or column-less).
        exp_name: Label stored under the "Experiment" key.

    Returns:
        (metrics, cm, valid_df): a dict with sample counts, accuracy,
        macro/weighted aggregates and per-class precision/recall/F1; the 3x3
        confusion matrix in positive/negative/neutral order; and the rows
        whose prediction is one of those three labels. When there is nothing
        valid to score, every metric is 0, the matrix is all zeros and the
        frame is empty.
    """
    labels = ["positive", "negative", "neutral"]

    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        return _empty_metrics_result(exp_name, 0)

    # Filter out errors (anything that is not one of the three labels)
    valid_df = df[df["predicted_sentiment"].isin(labels)].copy()

    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return _empty_metrics_result(exp_name, len(df))

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 on the aggregates matches the per-class calls below:
    # same scores as the default, without the undefined-metric warning.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(
            y_true, y_pred, average="weighted", zero_division=0
        ),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
    }

    # Per-class metrics, returned by sklearn in the order given by `labels`
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score each experiment with the detailed metrics function
cot_runs = [
    (e7_df, "E7: Gemini Pro CoT"),
    (e8_df, "E8: Gemini Flash CoT"),
    (e9_df, "E9: Llama CoT"),
]
(
    (e7_metrics, e7_cm, e7_valid),
    (e8_metrics, e8_cm, e8_valid),
    (e9_metrics, e9_cm, e9_valid),
) = [calculate_metrics(frame, label) for frame, label in cot_runs]

# Side-by-side comparison table (one row per experiment)
metrics_df = pd.DataFrame([e7_metrics, e8_metrics, e9_metrics])

rule = "=" * 80
print("\n" + rule, "CHAIN-OF-THOUGHT PERFORMANCE COMPARISON", rule, sep="\n")

# Show only the headline columns; the per-class values stay in metrics_df
headline_columns = [
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "Macro-Precision",
    "Macro-Recall",
]
display(metrics_df[headline_columns].round(4))

## 6. Save Results

In [None]:
# One shared timestamp so all files from this run sort and group together
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# File-name stem -> frame to export; each is written to the working directory
exports = {
    "e7_gemini_pro_cot": e7_df,
    "e8_gemini_flash_cot": e8_df,
    "e9_llama_cot": e9_df,
    "cot_metrics_summary": metrics_df,
}
for stem, frame in exports.items():
    frame.to_csv(f"{stem}_{timestamp}.csv", index=False)

print("\n✓ Chain-of-Thought results saved")