In [12]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn google-generativeai groq python-dotenv tqdm -q

In [13]:
# Standard-library modules needed before any third-party import in this cell.
import os
import warnings

# Hide FutureWarning notices raised from the google.generativeai package.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="google.generativeai",
)

# gRPC-related environment variables, exported before the Gemini client library is
# imported further down in this cell (intended as a macOS certificate workaround).
os.environ.update(
    {
        "GRPC_DEFAULT_SSL_ROOTS_FILE_PATH": "",
        "GRPC_SSL_CIPHER_SUITES": "HIGH",
    }
)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
import ssl
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

# API setup
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

# Load environment variables (API keys and optional switches) from a local .env file
load_dotenv()

# TLS certificate verification stays ON by default. Replacing the default HTTPS
# context with an unverified one affects every urllib-based request made by this
# kernel and exposes the API keys to interception, so it is now an explicit opt-in
# (set ALLOW_INSECURE_SSL=1 in the environment or .env) instead of unconditional.
# NOTE(review): the captured gRPC output still shows CERTIFICATE_VERIFY_FAILED with
# the bypass active, so it does not appear to help the Gemini calls — confirm before
# enabling it. `genai.configure(..., transport="rest")` may be worth trying instead.
ALLOW_INSECURE_SSL = os.getenv("ALLOW_INSECURE_SSL", "").strip().lower() in {"1", "true", "yes"}
if ALLOW_INSECURE_SSL:
    ssl._create_default_https_context = ssl._create_unverified_context

# Configure APIs
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)

# Always bind groq_client so later cells never hit a NameError when the key is absent.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None

# Set plot style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (14, 6)

print("✓ Libraries imported successfully")
if ALLOW_INSECURE_SSL:
    print("✓ SSL certificate verification disabled for macOS compatibility")
else:
    print("✓ SSL certificate verification enabled")
print(f"✓ Google API configured: {bool(GOOGLE_API_KEY)}")
print(f"✓ Groq API configured: {bool(GROQ_API_KEY)}")

✓ Libraries imported successfully
✓ SSL certificate verification disabled for macOS compatibility
✓ Google API configured: True
✓ Groq API configured: True


## 1. Load Dataset

In [14]:
# Load the 100% agreement dataset (highest quality)
data_path = "../../DatasetAnalysis_FinancialPhraseBank/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt"

# Decode without losing characters: decoding as UTF-8 with errors="ignore" silently
# deletes every byte that is not valid UTF-8, which alters the sentences sent to the
# models. Try strict UTF-8 first and fall back to Latin-1, which can decode any byte.
# NOTE(review): the upstream corpus is presumably ISO-8859-1 encoded — confirm for this copy.
with open(data_path, "rb") as f:
    raw_bytes = f.read()
try:
    raw_text = raw_bytes.decode("utf-8")
except UnicodeDecodeError:
    raw_text = raw_bytes.decode("latin-1")

sentences = []
sentiments = []

# Each record is "<sentence>@<label>". Split on the LAST "@" so that a sentence
# containing "@" stays intact. Splitting the text on "\n" only (not splitlines())
# avoids treating control characters such as U+0085 as line breaks.
for line in raw_text.split("\n"):
    sentence, separator, label = line.strip().rpartition("@")
    if separator:
        sentences.append(sentence)
        sentiments.append(label.strip())

df = pd.DataFrame({"sentence": sentences, "true_sentiment": sentiments})

# Fail fast on a malformed or empty file instead of producing misleading metrics later.
assert not df.empty, f"No labelled sentences found in {data_path}"
unexpected_labels = set(df["true_sentiment"]) - {"positive", "negative", "neutral"}
assert not unexpected_labels, f"Unexpected sentiment labels: {sorted(unexpected_labels)}"

print(f"Dataset loaded: {len(df)} sentences")
print("\nSentiment distribution:")
print(df["true_sentiment"].value_counts())

# Display sample
print("\n" + "=" * 80)
print("Sample sentences:")
print("=" * 80)
display(df.sample(5, random_state=42))

Dataset loaded: 2264 sentences

Sentiment distribution:
true_sentiment
neutral     1391
positive     570
negative     303
Name: count, dtype: int64

Sample sentences:


Unnamed: 0,sentence,true_sentiment
1755,The contract value amounts to EUR 2.4 million .,neutral
1281,Kemira shares closed at ( x20ac ) 16.66 ( $ 2...,neutral
350,The company slipped to an operating loss of EU...,negative
420,According to Atria 's President and CEO Matti ...,positive
56,"In 2009 , Fiskars ' cash flow from operating a...",positive


## 2. Zero-Shot Prompt Design

**Prompt Strategy**: Simple, direct instruction with no examples. Enforces strict JSON output format.

In [15]:
def create_zero_shot_prompt(sentence):
    """
    Build the zero-shot classification prompt for a single financial sentence.

    The prompt contains the task description, the three label definitions, the
    sentence itself (wrapped in double quotes) and the JSON answer schema.
    No worked examples are included.
    """
    prompt_lines = [
        "You are a financial sentiment analysis expert.",
        "",
        'Classify the sentiment of the following financial statement as either "positive", "negative", or "neutral" from an investor\'s perspective.',
        "",
        "Consider:",
        "- Positive: Good news for stock price (revenue increase, profit growth, etc.)",
        "- Negative: Bad news for stock price (losses, declining sales, etc.)",
        "- Neutral: No clear impact on stock price or mixed signals",
        "",
        "Financial Statement:",
        f'"{sentence}"',
        "",
        "Provide your response in the following JSON format:",
        "{",
        '    "sentiment": "positive/negative/neutral",',
        '    "confidence": 0.0-1.0,',
        '    "rationale": "Brief explanation in one sentence"',
        "}",
        "",  # the trailing empty item yields the final newline after the closing brace
    ]
    return "\n".join(prompt_lines)


# Render the prompt for one sample sentence so its wording can be inspected.
test_sentence = "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007."
rule = "=" * 80
for banner_line in (rule, "ZERO-SHOT PROMPT EXAMPLE", rule):
    print(banner_line)
print(create_zero_shot_prompt(test_sentence))

ZERO-SHOT PROMPT EXAMPLE
You are a financial sentiment analysis expert.

Classify the sentiment of the following financial statement as either "positive", "negative", or "neutral" from an investor's perspective.

Consider:
- Positive: Good news for stock price (revenue increase, profit growth, etc.)
- Negative: Bad news for stock price (losses, declining sales, etc.)
- Neutral: No clear impact on stock price or mixed signals

Financial Statement:
"Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007."

Provide your response in the following JSON format:
{
    "sentiment": "positive/negative/neutral",
    "confidence": 0.0-1.0,
    "rationale": "Brief explanation in one sentence"
}



## 3. Model Inference Functions

In [16]:
def call_gemini(prompt, model_name="gemini-2.0-flash-exp", temperature=0.0, max_retries=3):
    """Call the Gemini API and return the reply text, retrying on failure.

    Args:
        prompt: Prompt text sent to the model.
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.
        temperature: Sampling temperature for the generation config.
        max_retries: Total number of attempts before giving up.

    Returns:
        The response text, or ``None`` if every attempt raised. In that case the
        last error is reported through a ``RuntimeWarning`` so failures are
        visible instead of being swallowed silently.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=temperature,
                    max_output_tokens=500,
                ),
            )
            return response.text
        except Exception as e:  # deliberately broad: any failure is retried, then reported
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # Exponential backoff: 1s, 2s, ...

    reason = (
        f"{type(last_error).__name__}: {last_error}"
        if last_error is not None
        else "no attempt was made"
    )
    warnings.warn(
        f"Gemini call failed after {max_retries} attempt(s) with model '{model_name}': {reason}",
        RuntimeWarning,
        stacklevel=2,
    )
    return None


def call_llama(prompt, temperature=0.0, model_name="llama-3.3-70b-versatile", max_retries=3):
    """Call a Llama model through the Groq chat-completions API, retrying on failure.

    Args:
        prompt: Prompt text, sent as a single user message.
        temperature: Sampling temperature.
        model_name: Groq model identifier (defaults to the model used so far).
        max_retries: Total number of attempts before giving up.

    Returns:
        The content of the first choice, or ``None`` if every attempt raised.
        In that case the last error is reported through a ``RuntimeWarning``
        so failures are visible instead of being swallowed silently.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model_name,
                temperature=temperature,
                max_tokens=500,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:  # deliberately broad: any failure is retried, then reported
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(2**attempt)  # Exponential backoff: 1s, 2s, ...

    reason = (
        f"{type(last_error).__name__}: {last_error}"
        if last_error is not None
        else "no attempt was made"
    )
    warnings.warn(
        f"Groq call failed after {max_retries} attempt(s) with model '{model_name}': {reason}",
        RuntimeWarning,
        stacklevel=2,
    )
    return None


def parse_response(response_text):
    """Extract a sentiment prediction from a raw model reply.

    Parsing strategy, in order:
      1. JSON: the contents of a ``` fenced block, then the outermost
         ``{...}`` span, then the whole reply. The first candidate that decodes
         to a dict wins. Its "sentiment" is stripped and lower-cased and its
         "confidence" is coerced to float (``None`` if not numeric).
      2. An explicit statement such as ``"sentiment": "negative"`` or
         ``sentiment is neutral`` (covers truncated JSON).
      3. A reply that mentions exactly one of the three labels.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A dict with at least a "sentiment" key when a JSON object was decoded or a
        label could be determined; ``None`` for empty/non-string input or when the
        reply is ambiguous (e.g. it mentions several labels without stating which
        one applies). Fallback results carry confidence 0.5 and the rationale
        "Parsed from text".
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    if not isinstance(response_text, str) or not response_text.strip():
        return None
    text = response_text.strip()

    # --- 1. JSON candidates, most specific first -------------------------------
    candidates = []
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1).strip())
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace : last_brace + 1])
    candidates.append(text)

    for candidate in candidates:
        try:
            payload = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(payload, dict):
            continue  # callers use .get(), so only JSON objects are usable
        result = dict(payload)
        if "sentiment" in result:
            # Normalise so "Positive" / " neutral " survive the downstream label filter.
            result["sentiment"] = str(result["sentiment"]).strip().lower()
        if "confidence" in result:
            try:
                result["confidence"] = float(result["confidence"])
            except (TypeError, ValueError):
                result["confidence"] = None
        return result

    # --- 2. Explicit "sentiment: <label>" statement ----------------------------
    # The negative lookahead rejects an echo of the template "positive/negative/neutral".
    keyed = re.search(
        r'sentiment["\']?\s*(?::|=|is)\s*["\']?(positive|negative|neutral)(?![\w/])',
        text,
        flags=re.IGNORECASE,
    )
    if keyed:
        guess = keyed.group(1).lower()
    else:
        # --- 3. Exactly one label mentioned anywhere in the reply --------------
        mentioned = [
            label
            for label in ("positive", "negative", "neutral")
            if re.search(rf"\b{label}\b", text, flags=re.IGNORECASE)
        ]
        guess = mentioned[0] if len(mentioned) == 1 else None

    if guess is None:
        return None
    return {
        "sentiment": guess,
        "confidence": 0.5,
        "rationale": "Parsed from text",
    }


# Confirms that this cell ran, i.e. the inference helpers above are now defined.
print("✓ Model inference functions defined")

✓ Model inference functions defined


## 4. Run Experiments

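### E1: Gemini 2.5 Pro (Zero-Shot)

> **Note:** the cell below currently calls `gemini-2.0-flash-exp` (see its `model_name` argument), so its results are not from Gemini 2.5 Pro until that argument is changed.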

In [17]:
# For testing, use a sample of the dataset (remove .head(100) for full run)
test_df = df.head(100).copy()  # Remove .head(100) for full dataset

# E1: Gemini 2.5 Pro
# NOTE(review): the experiment is labelled "Gemini 2.5 Pro" but the request below goes
# to "gemini-2.0-flash-exp" — confirm which model is intended before comparing results.
print("Running E1: Gemini 2.5 Pro (Zero-Shot)...")
e1_results = []
e1_api_failures = 0

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E1 Progress"):
    prompt = create_zero_shot_prompt(row["sentence"])
    response = call_gemini(
        prompt, model_name="gemini-2.0-flash-exp"
    )  # Using available model
    parsed = parse_response(response) if response else None

    if parsed:
        prediction = {
            "predicted_sentiment": parsed.get("sentiment", "unknown"),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
        }
    else:
        # Keep one row per input sentence even when the API call itself failed,
        # so sample counts stay honest and the frame always has its columns.
        if not response:
            e1_api_failures += 1
        prediction = {
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
        }

    e1_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            **prediction,
        }
    )

    time.sleep(0.5)  # Rate limiting

e1_df = pd.DataFrame(e1_results)
print(f"\n✓ E1 completed: {len(e1_df)} predictions")
if e1_api_failures:
    print(f"⚠️ E1: {e1_api_failures} of {len(test_df)} API calls returned no response")
display(e1_df.head())

Running E1: Gemini 2.5 Pro (Zero-Shot)...


E1 Progress:   0%|          | 0/100 [00:00<?, ?it/s]I0000 00:00:1768734802.831363 18756116 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734803.679019 18756120 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734803.698051 18756116 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734803.711998 18756118 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734803.882289 18756116 ssl_trans

KeyboardInterrupt: 

I0000 00:00:1768734867.269632 18756118 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734867.271832 18756119 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734867.273174 18756119 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734867.274278 18756119 ssl_transport_security.cc:1884] Handshake failed with error SSL_ERROR_SSL: error:1000007d:SSL routines:OPENSSL_internal:CERTIFICATE_VERIFY_FAILED: unable to get local issuer certificate
I0000 00:00:1768734867.279298 18756121 ssl_transport_security.cc:1884] Handshake failed with error S

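### E2: Gemini 2.5 Flash (Zero-Shot)

> **Note:** the cell below currently calls `gemini-2.0-flash-exp`, the same model the E1 cell calls, so E1 and E2 are identical runs until one of the `model_name` arguments is changed.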

In [None]:
# E2: Gemini 2.5 Flash
# NOTE(review): the experiment is labelled "Gemini 2.5 Flash" but the request below goes
# to "gemini-2.0-flash-exp", the same model the E1 cell calls — confirm the intended model.
print("Running E2: Gemini 2.5 Flash (Zero-Shot)...")
e2_results = []
e2_api_failures = 0

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E2 Progress"):
    prompt = create_zero_shot_prompt(row["sentence"])
    response = call_gemini(prompt, model_name="gemini-2.0-flash-exp")
    parsed = parse_response(response) if response else None

    if parsed:
        prediction = {
            "predicted_sentiment": parsed.get("sentiment", "unknown"),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
        }
    else:
        # Keep one row per input sentence even when the API call itself failed,
        # so sample counts stay honest and the frame always has its columns.
        if not response:
            e2_api_failures += 1
        prediction = {
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
        }

    e2_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            **prediction,
        }
    )

    time.sleep(0.5)  # Rate limiting

e2_df = pd.DataFrame(e2_results)
print(f"\n✓ E2 completed: {len(e2_df)} predictions")
if e2_api_failures:
    print(f"⚠️ E2: {e2_api_failures} of {len(test_df)} API calls returned no response")
display(e2_df.head())

### E3: Llama-3.3-70B-Versatile (Zero-Shot)

In [None]:
# E3: Llama-3.3-70B
print("Running E3: Llama-3.3-70B (Zero-Shot)...")
e3_results = []
e3_api_failures = 0

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="E3 Progress"):
    prompt = create_zero_shot_prompt(row["sentence"])
    response = call_llama(prompt)
    parsed = parse_response(response) if response else None

    if parsed:
        prediction = {
            "predicted_sentiment": parsed.get("sentiment", "unknown"),
            "confidence": parsed.get("confidence", 0),
            "rationale": parsed.get("rationale", ""),
        }
    else:
        # Keep one row per input sentence even when the API call itself failed,
        # so sample counts stay honest and the frame always has its columns.
        if not response:
            e3_api_failures += 1
        prediction = {
            "predicted_sentiment": "error",
            "confidence": 0,
            "rationale": "Parse error" if response else "API error",
        }

    e3_results.append(
        {
            "sentence": row["sentence"],
            "true_sentiment": row["true_sentiment"],
            **prediction,
        }
    )

    time.sleep(0.5)  # Rate limiting

e3_df = pd.DataFrame(e3_results)
print(f"\n✓ E3 completed: {len(e3_df)} predictions")
if e3_api_failures:
    print(f"⚠️ E3: {e3_api_failures} of {len(test_df)} API calls returned no response")
display(e3_df.head())

## 5. Calculate Metrics

In [None]:
# Fixed label order shared by the per-class metrics and the confusion matrix.
_SENTIMENT_LABELS = ["positive", "negative", "neutral"]

# Columns every per-experiment result frame is expected to carry.
_PREDICTION_COLUMNS = [
    "sentence",
    "true_sentiment",
    "predicted_sentiment",
    "confidence",
    "rationale",
]


def _empty_metrics(exp_name, total_samples):
    """Return the all-zero metrics row used when an experiment has nothing to score."""
    metrics = {
        "Experiment": exp_name,
        "Total Samples": total_samples,
        "Valid Predictions": 0,
        "Accuracy": 0,
        "Macro-F1": 0,
        "Weighted-F1": 0,
        "Macro-Precision": 0,
        "Macro-Recall": 0,
    }
    for label in _SENTIMENT_LABELS:
        for stat in ("Precision", "Recall", "F1"):
            metrics[f"{label.capitalize()}_{stat}"] = 0
    return metrics


def calculate_metrics(df, exp_name):
    """Calculate all evaluation metrics for one experiment.

    Args:
        df: Result frame with "true_sentiment" and "predicted_sentiment" columns.
        exp_name: Display name stored under the "Experiment" key.

    Returns:
        A ``(metrics, cm, valid_df)`` tuple:
          * ``metrics`` – dict with sample counts, accuracy, macro/weighted scores
            and per-class precision/recall/F1;
          * ``cm`` – 3x3 integer confusion matrix in positive/negative/neutral order;
          * ``valid_df`` – copy of the rows whose prediction is one of the three labels.

        When there is nothing to score, a warning is printed and the metrics are all
        zero. The matrix is then an integer zero matrix (so it can still be drawn
        with ``fmt="d"``) and ``valid_df`` is an empty frame that still has its
        columns (so later cells can index it).
    """
    # Check if dataframe is empty or missing required columns
    if df.empty or "predicted_sentiment" not in df.columns:
        print(f"⚠️ Warning: {exp_name} has no valid predictions!")
        return (
            _empty_metrics(exp_name, len(df)),
            np.zeros((3, 3), dtype=int),
            pd.DataFrame(columns=_PREDICTION_COLUMNS),
        )

    # Filter out errors and any label outside the expected set
    valid_df = df[df["predicted_sentiment"].isin(_SENTIMENT_LABELS)].copy()

    # Check if we have valid predictions
    if valid_df.empty:
        print(f"⚠️ Warning: {exp_name} has no valid predictions after filtering!")
        return (
            _empty_metrics(exp_name, len(df)),
            np.zeros((3, 3), dtype=int),
            valid_df,  # empty, but keeps the original columns
        )

    y_true = valid_df["true_sentiment"]
    y_pred = valid_df["predicted_sentiment"]

    # zero_division=0 yields the same values as the default but without the
    # UndefinedMetricWarning when a class is never predicted.
    metrics = {
        "Experiment": exp_name,
        "Total Samples": len(df),
        "Valid Predictions": len(valid_df),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro-F1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "Weighted-F1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        "Macro-Precision": precision_score(
            y_true, y_pred, average="macro", zero_division=0
        ),
        "Macro-Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
    }

    # Per-class metrics
    labels = _SENTIMENT_LABELS
    precision_per_class = precision_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    for i, label in enumerate(labels):
        metrics[f"{label.capitalize()}_Precision"] = precision_per_class[i]
        metrics[f"{label.capitalize()}_Recall"] = recall_per_class[i]
        metrics[f"{label.capitalize()}_F1"] = f1_per_class[i]

    # Confusion Matrix (rows = true label, columns = predicted label)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return metrics, cm, valid_df


# Score each experiment with the shared metric routine.
e1_metrics, e1_cm, e1_valid = calculate_metrics(e1_df, "E1: Gemini Pro")
e2_metrics, e2_cm, e2_valid = calculate_metrics(e2_df, "E2: Gemini Flash")
e3_metrics, e3_cm, e3_valid = calculate_metrics(e3_df, "E3: Llama-3.3-70B")

# One row per experiment, one column per metric.
metrics_df = pd.DataFrame([e1_metrics, e2_metrics, e3_metrics])

# Headline columns shown in the comparison table.
summary_columns = [
    "Experiment",
    "Accuracy",
    "Macro-F1",
    "Macro-Precision",
    "Macro-Recall",
]
banner = "=" * 80

print(f"\n{banner}")
print("ZERO-SHOT PERFORMANCE COMPARISON")
print(banner)
display(metrics_df.loc[:, summary_columns].round(4))

## 6. Visualize Results

In [None]:
# Grouped bar charts: headline metrics on the left, per-class F1 on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
overall_ax, per_class_ax = axes[0], axes[1]

width = 0.25
experiments = [
    (e1_metrics, "Gemini Pro"),
    (e2_metrics, "Gemini Flash"),
    (e3_metrics, "Llama-3.3-70B"),
]

# --- Left panel: overall metrics ---
metrics_to_plot = ["Accuracy", "Macro-F1", "Macro-Precision", "Macro-Recall"]
x = np.arange(len(metrics_to_plot))

for i, (metrics, label) in enumerate(experiments):
    values = [metrics[m] for m in metrics_to_plot]
    overall_ax.bar(x + i * width, values, width, label=label, alpha=0.8)

overall_ax.set_xlabel("Metrics", fontsize=12, weight="bold")
overall_ax.set_ylabel("Score", fontsize=12, weight="bold")
overall_ax.set_title(
    "Overall Performance Comparison (Zero-Shot)", fontsize=14, weight="bold"
)
overall_ax.set_xticks(x + width)  # centre each tick under the middle bar of its group
overall_ax.set_xticklabels(metrics_to_plot)
overall_ax.legend()
overall_ax.set_ylim([0, 1])
overall_ax.grid(axis="y", alpha=0.3)

# --- Right panel: per-class F1 scores ---
classes = ["Positive", "Negative", "Neutral"]
x2 = np.arange(len(classes))

for i, (metrics, label) in enumerate(experiments):
    values = [metrics[f"{c}_F1"] for c in classes]
    per_class_ax.bar(x2 + i * width, values, width, label=label, alpha=0.8)

per_class_ax.set_xlabel("Sentiment Class", fontsize=12, weight="bold")
per_class_ax.set_ylabel("F1 Score", fontsize=12, weight="bold")
per_class_ax.set_title("Per-Class F1 Scores (Zero-Shot)", fontsize=14, weight="bold")
per_class_ax.set_xticks(x2 + width)
per_class_ax.set_xticklabels(classes)
per_class_ax.legend()
per_class_ax.set_ylim([0, 1])
per_class_ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig("zero_shot_performance_comparison.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
labels = ["Positive", "Negative", "Neutral"]

for idx, (cm, title) in enumerate(
    [
        (e1_cm, "E1: Gemini Pro"),
        (e2_cm, "E2: Gemini Flash"),
        (e3_cm, "E3: Llama-3.3-70B"),
    ]
):
    # fmt="d" only formats integers: a float matrix (e.g. the all-zero placeholder
    # returned for an experiment without valid predictions) would raise ValueError.
    cm_counts = np.asarray(cm).astype(int)
    sns.heatmap(
        cm_counts,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=axes[idx],
        cbar_kws={"label": "Count"},
    )
    axes[idx].set_title(title, fontsize=12, weight="bold")
    axes[idx].set_ylabel("True Label", fontsize=11, weight="bold")
    axes[idx].set_xlabel("Predicted Label", fontsize=11, weight="bold")

plt.suptitle(
    "Confusion Matrices - Zero-Shot Sentiment Analysis",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("zero_shot_confusion_matrices.png", dpi=300, bbox_inches="tight")
plt.show()

## 7. Detailed Classification Reports

In [None]:
# Print detailed classification reports
for df_result, exp_name in [
    (e1_valid, "E1: Gemini Pro"),
    (e2_valid, "E2: Gemini Flash"),
    (e3_valid, "E3: Llama-3.3-70B"),
]:
    print("\n" + "=" * 80)
    print(f"CLASSIFICATION REPORT: {exp_name}")
    print("=" * 80)

    # An experiment without valid predictions yields an empty frame (possibly without
    # any columns); report that instead of failing on the column lookup below.
    has_columns = {"true_sentiment", "predicted_sentiment"} <= set(df_result.columns)
    if df_result.empty or not has_columns:
        print("No valid predictions available for this experiment.")
        continue

    print(
        classification_report(
            df_result["true_sentiment"],
            df_result["predicted_sentiment"],
            labels=["positive", "negative", "neutral"],
            target_names=["Positive", "Negative", "Neutral"],
            zero_division=0,  # a class that never occurs scores 0 without a warning
        )
    )

## 8. Save Results

In [None]:
# Persist per-experiment predictions and the metrics summary under one shared timestamp.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# (frame, target filename) pairs, written in this order; the last one is the summary.
csv_outputs = [
    (e1_df, f"e1_gemini_pro_zero_shot_{timestamp}.csv"),
    (e2_df, f"e2_gemini_flash_zero_shot_{timestamp}.csv"),
    (e3_df, f"e3_llama_zero_shot_{timestamp}.csv"),
    (metrics_df, f"zero_shot_metrics_summary_{timestamp}.csv"),
]
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name, index=False)

# Figures saved by the earlier plotting cells; listed here for completeness only.
figure_outputs = [
    "zero_shot_performance_comparison.png",
    "zero_shot_confusion_matrices.png",
]

print(f"\n✓ Results saved with timestamp: {timestamp}")
print("\nFiles created:")
for created_name in [csv_name for _frame, csv_name in csv_outputs] + figure_outputs:
    print(f"  - {created_name}")

## 9. Key Findings

### Summary of Zero-Shot Results:

1. **Model Performance Ranking**
   - Compare accuracy and F1 scores across the three models
   - Identify which model performs best in zero-shot setting

2. **Class-Specific Performance**
   - Analyze which sentiment class is hardest to classify
   - Check if neutral class causes most confusion

3. **Error Analysis**
   - Review misclassified examples
   - Identify common error patterns

4. **Confidence Calibration**
   - Examine relationship between confidence scores and accuracy
   - Identify overconfident or underconfident predictions

In [None]:
# Confidence analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 5))


def _format_mean(values):
    """Format the mean of a confidence series, or 'n/a' when the series is empty."""
    return f"{values.mean():.3f}" if len(values) else "n/a"


for idx, (df_result, title) in enumerate(
    [(e1_valid, "Gemini Pro"), (e2_valid, "Gemini Flash"), (e3_valid, "Llama-3.3-70B")]
):
    axes[idx].set_xlabel("Confidence Score", fontsize=11, weight="bold")
    axes[idx].set_ylabel("Frequency", fontsize=11, weight="bold")
    axes[idx].grid(alpha=0.3)

    # An experiment without valid predictions yields an empty frame (possibly without
    # any columns); label the panel instead of failing on the column lookups below.
    required = {"true_sentiment", "predicted_sentiment", "confidence"}
    if df_result.empty or not required <= set(df_result.columns):
        axes[idx].set_title(f"{title}\nNo valid predictions", fontsize=11, weight="bold")
        continue

    # Kept as a column because later analysis may read it; re-running is idempotent.
    df_result["correct"] = (
        df_result["true_sentiment"] == df_result["predicted_sentiment"]
    )

    # Models occasionally return non-numeric confidences; coerce them to NaN and
    # drop them so the histogram and the means stay well defined.
    confidence = pd.to_numeric(df_result["confidence"], errors="coerce")
    correct_conf = confidence[df_result["correct"]].dropna()
    incorrect_conf = confidence[~df_result["correct"]].dropna()

    axes[idx].hist(
        [correct_conf, incorrect_conf],
        bins=20,
        label=["Correct", "Incorrect"],
        alpha=0.7,
        color=["green", "red"],
    )
    axes[idx].set_title(
        f"{title}\nMean Conf: Correct={_format_mean(correct_conf)}, Incorrect={_format_mean(incorrect_conf)}",
        fontsize=11,
        weight="bold",
    )
    axes[idx].legend()

plt.suptitle(
    "Confidence Distribution: Correct vs Incorrect Predictions",
    fontsize=14,
    weight="bold",
    y=1.02,
)
plt.tight_layout()
plt.savefig("zero_shot_confidence_analysis.png", dpi=300, bbox_inches="tight")
plt.show()