In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [None]:
# Suppress deprecation warnings
import warnings
import os

# Registered before `google.generativeai` is imported further down, so the
# filter is already active when that package is loaded.
warnings.filterwarnings('ignore', category=FutureWarning, module='google.generativeai')

# gRPC TLS settings (workaround for certificate errors with the Google Gemini
# API on macOS). Set before the Gemini SDK is imported below.
os.environ['GRPC_DEFAULT_SSL_ROOTS_FILE_PATH'] = ''
os.environ['GRPC_SSL_CIPHER_SUITES'] = 'HIGH'

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
import ssl
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

# Fix SSL certificate verification issue on macOS
# NOTE(review): this swaps the default HTTPS context factory for the
# *unverified* one for the whole process, i.e. certificate checks are turned
# off for every client that relies on `ssl._create_default_https_context`.
# API keys are sent from this notebook, so prefer repairing the local trust
# store and deleting this override once that is done.
ssl._create_default_https_context = ssl._create_unverified_context

# API setup
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

# Load environment variables (reads a local .env file if one is present)
load_dotenv()

# Configure APIs
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)

# Always bind `groq_client`: without a key the name is explicitly None instead
# of undefined, and a client left over from a previous run of this cell cannot
# linger after the key has been removed.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None

# Set plot style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Libraries imported successfully")
print("✓ SSL certificate verification disabled for macOS compatibility")
print(f"✓ Google API configured: {bool(GOOGLE_API_KEY)}")
print(f"✓ Groq API configured: {bool(GROQ_API_KEY)}")

# Make a missing key impossible to overlook: the API helpers return None on
# failure, so an absent key otherwise only shows up as empty result tables.
missing_keys = [
    key_name
    for key_name, key_value in (
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if not key_value
]
if missing_keys:
    print(
        f"⚠ Missing API key(s): {', '.join(missing_keys)} - "
        "the corresponding experiments will not produce predictions"
    )

## 1. Load Dataset

In [None]:
# Load the 100% agreement dataset
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode strictly instead of silently discarding undecodable bytes (which
# removes characters from the sentences). If the file is not valid UTF-8, fall
# back to Latin-1, which maps every byte to a character.
try:
    with open(data_path, "r", encoding="utf-8") as f:
        raw_lines = list(f)
except UnicodeDecodeError:
    with open(data_path, "r", encoding="latin-1") as f:
        raw_lines = list(f)

sentences = []
sentiments = []

# Each usable line has the form "<sentence>@<label>"; split on the LAST "@"
# so that an "@" inside the sentence itself stays part of the sentence.
for line in raw_lines:
    text, separator, label = line.strip().rpartition("@")
    if separator:
        sentences.append(text)
        sentiments.append(label)

df = pd.DataFrame({"sentence": sentences, "sentiment": sentiments})

print(f"Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["sentiment"].value_counts())

# Display sample
print("\n" + "=" * 80)
print("Sample sentences for risk assessment:")
print("=" * 80)
# Cap the sample size so a very small (or empty) file does not raise.
display(df.sample(min(5, len(df)), random_state=42))

## 2. Zero-Shot Risk Assessment Prompt Design

**Prompt Strategy**: Assess financial risk level based on statement content without examples.

**Risk Mapping**:
- **Low**: Positive news, growth, strong performance
- **Medium**: Neutral or mixed signals, stable conditions
- **High**: Negative news, declining performance
- **Critical**: Severe financial distress, losses, bankruptcy risk

In [None]:
def create_zero_shot_risk_prompt(sentence):
    """
    Build the zero-shot risk-assessment prompt for one financial statement.

    The prompt contains a role line, the task, the four risk-level guidelines,
    the quoted statement and the JSON answer template. It contains no worked
    examples.

    Args:
        sentence: The statement to assess; it is interpolated between double
            quotes exactly as given.

    Returns:
        The complete prompt text, ending with a newline.
    """
    # Role and task description
    task_section = (
        "You are a financial risk assessment expert.\n"
        "\n"
        'Analyze the following financial statement and assess the risk level as one of: "low", "medium", "high", or "critical".\n'
        "\n"
    )

    # Definitions of the four risk levels
    guideline_section = (
        "Risk Level Guidelines:\n"
        "- **Low Risk**: Strong financial performance, revenue growth, profitability increases, positive market position\n"
        "- **Medium Risk**: Stable performance with no significant changes, mixed signals, or neutral developments\n"
        "- **High Risk**: Declining performance, revenue drops, reduced profitability, concerning market conditions\n"
        "- **Critical Risk**: Severe financial distress, major losses, bankruptcy risk, existential threats to the business\n"
        "\n"
    )

    # The statement under assessment, wrapped in double quotes
    statement_section = f'Financial Statement:\n"{sentence}"\n\n'

    # JSON template the model is asked to fill in
    format_section = (
        "Provide your response in the following JSON format:\n"
        "{\n"
        '    "risk_level": "low/medium/high/critical",\n'
        '    "confidence": 0.0-1.0,\n'
        '    "rationale": "Brief explanation in one sentence",\n'
        '    "key_indicators": "Main factors influencing the risk assessment"\n'
        "}\n"
    )

    return task_section + guideline_section + statement_section + format_section


# Render the prompt for one sample statement as a visual sanity check
test_sentence = "The company reported significant losses and announced layoffs affecting 30% of the workforce."
banner = "=" * 80
print(banner)
print("ZERO-SHOT RISK ASSESSMENT PROMPT EXAMPLE")
print(banner)
print(create_zero_shot_risk_prompt(test_sentence))

## 3. Model Inference Functions

In [None]:
def call_gemini(
    prompt,
    model_name="gemini-2.0-flash-exp",
    temperature=0.0,
    max_retries=3,
    max_output_tokens=600,
):
    """Send ``prompt`` to a Gemini model and return the generated text.

    Every attempt that raises is retried after an exponential back-off
    (1 s, 2 s, ...) until ``max_retries`` attempts have been made. When the
    last attempt fails too, a ``RuntimeWarning`` describing the final error is
    emitted so the failure is visible instead of being silently swallowed.

    Args:
        prompt: Prompt text passed to ``generate_content``.
        model_name: Name handed to ``genai.GenerativeModel``.
        temperature: Sampling temperature for the generation config.
        max_retries: Maximum number of attempts.
        max_output_tokens: Output token cap for the generation config.

    Returns:
        The response text, or ``None`` if no attempt succeeded.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=temperature,
                    max_output_tokens=max_output_tokens,
                ),
            )
            return response.text
        except Exception as e:
            last_error = e
            # Back off before the next attempt; no sleep after the final one.
            if attempt < max_retries - 1:
                time.sleep(2**attempt)

    if last_error is not None:
        warnings.warn(
            f"Gemini call failed after {max_retries} attempt(s) for model "
            f"'{model_name}': {type(last_error).__name__}: {last_error}",
            RuntimeWarning,
            stacklevel=2,
        )
    return None


def call_llama(
    prompt,
    temperature=0.0,
    model_name="llama-3.3-70b-versatile",
    max_retries=3,
    max_tokens=600,
):
    """Send ``prompt`` to a Llama model through the Groq chat-completions API.

    Every attempt that raises is retried after an exponential back-off
    (1 s, 2 s, ...) until ``max_retries`` attempts have been made. A
    ``RuntimeWarning`` is emitted when the call cannot be made or when the last
    attempt fails, so failures are visible instead of being silently swallowed.

    Args:
        prompt: Text sent as the single user message.
        temperature: Sampling temperature.
        model_name: Groq model identifier.
        max_retries: Maximum number of attempts.
        max_tokens: Completion token cap.

    Returns:
        The content of the first choice's message, or ``None`` if the client is
        not configured or no attempt succeeded.
    """
    # `groq_client` is a notebook-level global that is only created when an API
    # key is available. Look it up defensively and fail fast, instead of
    # sleeping through retries that can never succeed.
    client = globals().get("groq_client")
    if client is None:
        warnings.warn(
            "Groq client is not configured (is GROQ_API_KEY set?); skipping call.",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    last_error = None
    for attempt in range(max_retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model_name,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            last_error = e
            # Back off before the next attempt; no sleep after the final one.
            if attempt < max_retries - 1:
                time.sleep(2**attempt)

    if last_error is not None:
        warnings.warn(
            f"Groq call failed after {max_retries} attempt(s) for model "
            f"'{model_name}': {type(last_error).__name__}: {last_error}",
            RuntimeWarning,
            stacklevel=2,
        )
    return None


def parse_risk_response(response_text):
    """Extract a risk-assessment dict from a model reply.

    Parsing strategy, in order:

    1. Parse the contents of a fenced code block (``` or ```json) as JSON, or
       the whole reply when there is no fence.
    2. Parse the span from the first "{" to the last "}" as JSON, which covers
       replies that wrap the JSON object in prose.
    3. Keyword fallback: look for a risk level mentioned as a whole word
       (most severe level first) and return a placeholder assessment with
       confidence 0.5.

    Only JSON *objects* are accepted. A string ``risk_level`` is normalised to
    stripped lower case so that e.g. "High" is treated as "high".

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A dict describing the assessment, or ``None`` when the input is not a
        non-empty string or no risk level can be identified.
    """
    if not isinstance(response_text, str) or not response_text.strip():
        return None

    # Candidate 1: fenced block contents, otherwise the whole reply.
    if "```json" in response_text:
        json_str = response_text.split("```json")[1].split("```")[0].strip()
    elif "```" in response_text:
        json_str = response_text.split("```")[1].strip()
    else:
        json_str = response_text.strip()
    candidates = [json_str]

    # Candidate 2: the outermost {...} span anywhere in the reply.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        candidates.append(response_text[start : end + 1])

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(result, dict):
            level = result.get("risk_level")
            if isinstance(level, str):
                result["risk_level"] = level.strip().lower()
            return result

    # Fallback: keyword search on whole words, so that "low" is not found
    # inside "following" or "below", nor "high" inside "highlight".
    response_lower = response_text.lower()
    words = set(
        "".join(ch if ch.isalnum() else " " for ch in response_lower).split()
    )
    mentioned = {
        "critical": "critical" in words,
        "high": "high" in words and "risk" in response_lower,
        "medium": "medium" in words,
        "low": "low" in words,
    }
    for level in ("critical", "high", "medium", "low"):
        if mentioned[level]:
            return {
                "risk_level": level,
                "confidence": 0.5,
                "rationale": "Parsed from text",
                "key_indicators": "N/A",
            }
    return None


print("✓ Model inference functions defined")

## 4. Run Experiments

### E11: Gemini 2.5 Pro (Zero-Shot Risk Assessment)

In [None]:
# For testing, use a sample of the dataset (remove .head(100) for full run)
test_df = df.head(100).copy()  # Remove .head(100) for full dataset

# E11: Gemini 2.5 Pro
# NOTE(review): this experiment is labelled "Gemini 2.5 Pro", but the call
# below requests "gemini-2.0-flash-exp" - confirm which model is intended
# before interpreting E11 as a Gemini Pro result.
print("Running E11: Gemini 2.5 Pro (Zero-Shot Risk Assessment)...")
e11_results = []

for _row_id, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E11 Progress"):
    prompt = create_zero_shot_risk_prompt(row["sentence"])
    response = call_gemini(prompt, model_name="gemini-2.0-flash-exp")
    parsed = parse_risk_response(response) if response else None

    if isinstance(parsed, dict) and parsed:
        assessment = {
            # Normalise casing/whitespace so "High" counts as "high" downstream.
            "predicted_risk": str(parsed.get("risk_level", "unknown")).strip().lower(),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
            "key_indicators": parsed.get("key_indicators", ""),
        }
    else:
        # Keep one row per input sentence even when the API call or the parsing
        # failed, so failures are counted instead of silently disappearing.
        assessment = {
            "predicted_risk": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
            "key_indicators": "N/A",
        }

    e11_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["sentiment"],
            **assessment,
        }
    )

    time.sleep(0.5)  # Rate limiting

# Explicit columns keep the frame usable by later cells even with zero rows.
e11_df = pd.DataFrame(
    e11_results,
    columns=[
        "sentence",
        "true_sentiment",
        "predicted_risk",
        "confidence",
        "rationale",
        "key_indicators",
    ],
)
# Models occasionally return the confidence as text; coerce so that means work.
e11_df["confidence"] = pd.to_numeric(e11_df["confidence"], errors="coerce")
print(f"\n✓ E11 completed: {len(e11_df)} risk assessments")
display(e11_df.head())

### E12: Gemini 2.5 Flash (Zero-Shot Risk Assessment)

In [None]:
# E12: Gemini 2.5 Flash
# NOTE(review): this experiment is labelled "Gemini 2.5 Flash", but the call
# below requests "gemini-2.0-flash-exp", the same model name the E11 cell
# passes - confirm the intended model, otherwise E11 and E12 are not a
# comparison between two different models.
print("Running E12: Gemini 2.5 Flash (Zero-Shot Risk Assessment)...")
e12_results = []

for _row_id, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E12 Progress"):
    prompt = create_zero_shot_risk_prompt(row["sentence"])
    response = call_gemini(prompt, model_name="gemini-2.0-flash-exp")
    parsed = parse_risk_response(response) if response else None

    if isinstance(parsed, dict) and parsed:
        assessment = {
            # Normalise casing/whitespace so "High" counts as "high" downstream.
            "predicted_risk": str(parsed.get("risk_level", "unknown")).strip().lower(),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
            "key_indicators": parsed.get("key_indicators", ""),
        }
    else:
        # Keep one row per input sentence even when the API call or the parsing
        # failed, so failures are counted instead of silently disappearing.
        assessment = {
            "predicted_risk": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
            "key_indicators": "N/A",
        }

    e12_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["sentiment"],
            **assessment,
        }
    )

    time.sleep(0.5)  # Rate limiting

# Explicit columns keep the frame usable by later cells even with zero rows.
e12_df = pd.DataFrame(
    e12_results,
    columns=[
        "sentence",
        "true_sentiment",
        "predicted_risk",
        "confidence",
        "rationale",
        "key_indicators",
    ],
)
# Models occasionally return the confidence as text; coerce so that means work.
e12_df["confidence"] = pd.to_numeric(e12_df["confidence"], errors="coerce")
print(f"\n✓ E12 completed: {len(e12_df)} risk assessments")
display(e12_df.head())

### E13: Llama-3.3-70B-Versatile (Zero-Shot Risk Assessment)

In [None]:
# E13: Llama-3.3-70B
print("Running E13: Llama-3.3-70B (Zero-Shot Risk Assessment)...")
e13_results = []

for _row_id, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E13 Progress"):
    prompt = create_zero_shot_risk_prompt(row["sentence"])
    response = call_llama(prompt)
    parsed = parse_risk_response(response) if response else None

    if isinstance(parsed, dict) and parsed:
        assessment = {
            # Normalise casing/whitespace so "High" counts as "high" downstream.
            "predicted_risk": str(parsed.get("risk_level", "unknown")).strip().lower(),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
            "key_indicators": parsed.get("key_indicators", ""),
        }
    else:
        # Keep one row per input sentence even when the API call or the parsing
        # failed, so failures are counted instead of silently disappearing.
        assessment = {
            "predicted_risk": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
            "key_indicators": "N/A",
        }

    e13_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["sentiment"],
            **assessment,
        }
    )

    time.sleep(0.5)  # Rate limiting

# Explicit columns keep the frame usable by later cells even with zero rows.
e13_df = pd.DataFrame(
    e13_results,
    columns=[
        "sentence",
        "true_sentiment",
        "predicted_risk",
        "confidence",
        "rationale",
        "key_indicators",
    ],
)
# Models occasionally return the confidence as text; coerce so that means work.
e13_df["confidence"] = pd.to_numeric(e13_df["confidence"], errors="coerce")
print(f"\n✓ E13 completed: {len(e13_df)} risk assessments")
display(e13_df.head())

## 5. Analyze Risk Distribution

In [None]:
# Risk distribution analysis
# Fixed level order with one colour per level. The counts are re-indexed to
# this order below; `value_counts()` alone sorts by frequency, which would pair
# the colours with whichever level happens to be most common and give every
# subplot a different x-axis order.
risk_levels = ["low", "medium", "high", "critical"]
risk_colors = ["green", "yellow", "orange", "red"]
experiments = [
    (e11_df, "E11: Gemini Pro"),
    (e12_df, "E12: Gemini Flash"),
    (e13_df, "E13: Llama-3.3-70B"),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (df_result, title) in zip(axes, experiments):
    valid_df = df_result[df_result["predicted_risk"].isin(risk_levels)]
    # Levels with no predictions are shown as zero-height bars.
    risk_counts = (
        valid_df["predicted_risk"].value_counts().reindex(risk_levels, fill_value=0)
    )

    ax.bar(risk_counts.index, risk_counts.values, color=risk_colors)
    ax.set_title(f"{title}\nRisk Distribution", fontsize=12, weight="bold")
    ax.set_xlabel("Risk Level", fontsize=11)
    ax.set_ylabel("Count", fontsize=11)
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("zero_shot_risk_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

# Print statistics
for df_result, exp_name in experiments:
    valid_df = df_result[df_result["predicted_risk"].isin(risk_levels)]
    print(f"\n{exp_name} - Risk Level Distribution:")
    # Counts include non-valid labels (e.g. "error") so failures stay visible.
    print(df_result["predicted_risk"].value_counts())
    # Average over valid predictions only: error rows carry a placeholder
    # confidence of 0 and would otherwise drag the mean down.
    avg_confidence = pd.to_numeric(valid_df["confidence"], errors="coerce").mean()
    print(f"Average Confidence: {avg_confidence:.3f}")

## 6. Sentiment-Risk Correlation Analysis

In [None]:
# Analyze how sentiment correlates with assessed risk
def analyze_sentiment_risk_mapping(df_result, exp_name):
    """
    Cross-tabulate true sentiment against predicted risk level, in percent.

    Expected mapping:
    - positive sentiment -> low/medium risk
    - neutral sentiment -> medium risk
    - negative sentiment -> high/critical risk

    Rows whose ``predicted_risk`` is not one of low/medium/high/critical are
    ignored. The table is printed and returned.

    Args:
        df_result: DataFrame with ``true_sentiment`` and ``predicted_risk``
            columns.
        exp_name: Experiment label used in the printed header.

    Returns:
        DataFrame indexed by true sentiment whose columns are always
        ``low, medium, high, critical`` (in that order). Each cell is the share
        of that sentiment's valid predictions, in percent; levels that were
        never predicted are 0.
    """
    risk_order = ["low", "medium", "high", "critical"]
    valid_df = df_result[df_result["predicted_risk"].isin(risk_order)].copy()

    percentages = (
        pd.crosstab(
            valid_df["true_sentiment"], valid_df["predicted_risk"], normalize="index"
        )
        * 100
    )
    # crosstab orders columns alphabetically and omits levels that never occur;
    # force one fixed severity order so tables from different experiments are
    # directly comparable.
    mapping = percentages.reindex(columns=risk_order, fill_value=0.0)

    print(f"\n{'=' * 80}")
    print(f"{exp_name}: Sentiment → Risk Mapping (%)")
    print(f"{'=' * 80}")
    print(mapping.round(2))

    return mapping


# Build (and print) the sentiment → risk table for each experiment, in order
e11_mapping, e12_mapping, e13_mapping = [
    analyze_sentiment_risk_mapping(result_frame, label)
    for result_frame, label in (
        (e11_df, "E11: Gemini Pro"),
        (e12_df, "E12: Gemini Flash"),
        (e13_df, "E13: Llama-3.3-70B"),
    )
]

## 7. Visualize Sentiment-Risk Heatmaps

In [None]:
# Create heatmaps for sentiment-risk correlation: one panel per experiment
heatmap_inputs = [
    (e11_mapping, "E11: Gemini Pro"),
    (e12_mapping, "E12: Gemini Flash"),
    (e13_mapping, "E13: Llama-3.3-70B"),
]
heatmap_style = dict(
    annot=True,
    fmt=".1f",
    cmap="RdYlGn_r",
    cbar_kws={"label": "Percentage (%)"},
    vmin=0,
    vmax=100,
)

fig, axes = plt.subplots(1, 3, figsize=(20, 5))

for ax, (mapping, title) in zip(axes, heatmap_inputs):
    # Shared 0-100 colour scale so the three panels are visually comparable.
    sns.heatmap(mapping, ax=ax, **heatmap_style)
    ax.set_title(f"{title}\nSentiment → Risk Mapping", fontsize=12, weight="bold")
    ax.set_xlabel("Predicted Risk Level", fontsize=11)
    ax.set_ylabel("True Sentiment", fontsize=11)

fig.suptitle(
    "Sentiment to Risk Level Correlation Analysis", fontsize=14, weight="bold", y=1.02
)
fig.tight_layout()
fig.savefig("zero_shot_sentiment_risk_heatmaps.png", dpi=300, bbox_inches="tight")
plt.show()

## 8. Save Results

In [None]:
# Save detailed results: one CSV per experiment, all sharing one timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

detail_outputs = [
    (e11_df, f"e11_gemini_pro_zero_shot_risk_{timestamp}.csv"),
    (e12_df, f"e12_gemini_flash_zero_shot_risk_{timestamp}.csv"),
    (e13_df, f"e13_llama_zero_shot_risk_{timestamp}.csv"),
]
for result_frame, csv_name in detail_outputs:
    result_frame.to_csv(csv_name, index=False)

# Save summary statistics: one row per experiment
valid_levels = ["low", "medium", "high", "critical"]
summary_data = []
for df_result, exp_name in [
    (e11_df, "E11: Gemini Pro"),
    (e12_df, "E12: Gemini Flash"),
    (e13_df, "E13: Llama-3.3-70B"),
]:
    # Only rows carrying one of the four recognised risk labels count as valid.
    is_valid = df_result["predicted_risk"].isin(valid_levels)
    valid_df = df_result[is_valid]
    valid_risk = valid_df["predicted_risk"]

    summary_row = {
        "Experiment": exp_name,
        "Total_Samples": len(df_result),
        "Valid_Predictions": len(valid_df),
        "Avg_Confidence": valid_df["confidence"].mean(),
        "Low_Risk_Count": valid_risk.eq("low").sum(),
        "Medium_Risk_Count": valid_risk.eq("medium").sum(),
        "High_Risk_Count": valid_risk.eq("high").sum(),
        "Critical_Risk_Count": valid_risk.eq("critical").sum(),
    }
    summary_data.append(summary_row)

summary_df = pd.DataFrame(summary_data)
summary_name = f"zero_shot_risk_summary_{timestamp}.csv"
summary_df.to_csv(summary_name, index=False)

# Report everything this notebook wrote (CSVs from this cell, PNGs from the
# plotting cells).
created_files = [csv_name for _frame, csv_name in detail_outputs]
created_files += [
    summary_name,
    "zero_shot_risk_distribution.png",
    "zero_shot_sentiment_risk_heatmaps.png",
]

print(f"\n✓ Results saved with timestamp: {timestamp}")
print("\nFiles created:")
for created in created_files:
    print(f"  - {created}")

## 9. Key Findings

### Analysis Questions:

1. **Risk Assessment Consistency**
   - Do models agree on risk levels?
   - Which model is most conservative/aggressive in risk assessment?

2. **Sentiment-Risk Correlation**
   - Does negative sentiment correctly map to high/critical risk?
   - Does positive sentiment correctly map to low/medium risk?

3. **Model Confidence**
   - Which model shows highest confidence in risk assessments?
   - Is a model's confidence correlated with how clear-cut the statement's sentiment is?

4. **Edge Cases**
   - Which statements cause disagreement between models?
   - Neutral statements - do they map to medium risk as expected?