In [1]:
# LLM Stability and Internal Consistency Research Pipeline
import json
import os
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
import pandas as pd
import numpy as np
from collections import defaultdict
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.stats import ttest_ind
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMER_AVAILABLE = True
    print("Sentence transformers available, using semantic similarity.")
except ImportError:
    SENTENCE_TRANSFORMER_AVAILABLE = False
    print("Sentence transformers not available, using TF-IDF similarity.")

Sentence transformers available, using semantic similarity.


In [2]:
# Configuration & Setup
class Config:
    """Experiment-wide settings: target models, endpoint, sampling options and output paths."""

    # Ollama model tags that every query is sent to.
    MODELS = ["deepseek-r1:7b", "llama3:latest"]
    # Text-generation endpoint of the Ollama server.
    OLLAMA_URL = "http://localhost:11434/api/generate"
    # Number of repeated generations per (query, model) pair.
    NUM_RUNS = 3
    # Default sampling options used when a caller passes no explicit params.
    GENERATION_PARAMS = {
        "temperature": 0.7,
        "top_p": 0.9,
        "max_tokens": 512,
        "stream": False,
    }

    # Output tree: every artefact lives under OUTPUT_DIR.
    OUTPUT_DIR = Path("experiment_outputs")
    RAW_RESPONSES_DIR = OUTPUT_DIR.joinpath("raw_responses")
    CLAIMS_DIR = OUTPUT_DIR.joinpath("extracted_claims")
    ANALYSIS_DIR = OUTPUT_DIR.joinpath("analysis")
    PLOTS_DIR = OUTPUT_DIR.joinpath("plots")

    @classmethod
    def setup_directories(cls):
        """Create the output directory tree (existing directories are left as they are)."""
        targets = (
            cls.OUTPUT_DIR,
            cls.RAW_RESPONSES_DIR,
            cls.CLAIMS_DIR,
            cls.ANALYSIS_DIR,
            cls.PLOTS_DIR,
        )
        for target in targets:
            target.mkdir(parents=True, exist_ok=True)
        print("Directories created.")
# Create the output directory tree as soon as the configuration cell runs.
Config.setup_directories()

Directories created.


In [3]:
# Query Dataset Definition
# Structure: category name -> concept name -> list of five related questions.
# The position of a question inside its list is what the collection loop records
# as `query_idx`.
# NOTE(review): several concepts mix opposite polarities (e.g. "Does capital
# punishment deter crime?" vs "Does the death penalty fail to deter criminals?"),
# and some mix open questions with yes/no questions, so a perfectly consistent
# model is still expected to give different stance labels across those entries.
# Keep this in mind when interpreting stance changes across query_idx.
QUERY_DATASET = {
    "FACTUAL_EXPLANATORY": {
        "Inflation": [
            "Is inflation always harmful to an economy?",
            "Does inflation have only negative effects?",
            "Can inflation ever be beneficial?",
            "Are there situations where inflation helps economic growth?",
            "Is inflation necessarily bad for all economies?"],
        "Capital Punishment": [
            "Does capital punishment deter crime?",
            "Is the death penalty an effective deterrent?",
            "Can capital punishment reduce crime rates?",
            "Is there evidence that executions prevent crime?",
            "Does the death penalty fail to deter criminals?"],
        "Climate Change": [
            "Is climate change primarily caused by human activity?",
            "Are humans the main contributors to climate change?",
            "Does climate change occur mostly due to natural cycles?",
            "Is anthropogenic climate change scientifically accepted?",
            "Can climate change happen without human involvement?"]},
    "DEFINITION_STABILITY": {
        "Democracy": [
            "What is democracy?",
            "How would you define democracy?",
            "What does democracy mean in political theory?",
            "Is democracy simply majority rule?",
            "Can a system be democratic without elections?"],
        "Artificial Intelligence": [
            "What is artificial intelligence?",
            "How is AI defined in computer science?",
            "Does AI require human-level intelligence?",
            "Is machine learning the same as AI?",
            "Can rule-based systems be considered AI?"],
        "Free Will": [
            "What is free will?",
            "How do philosophers define free will?",
            "Is free will compatible with determinism?",
            "Do humans truly have free will?",
            "Can free will exist in a deterministic universe?"]},
    "ADVICE_POLICY": {
        "Career Advice": [
            "Is it better to specialize early in one field?",
            "Should people keep their career options open?",
            "Is early specialization risky?",
            "Can focusing on one skill early improve career success?",
            "Should graduates avoid specializing too soon?"],
        "Exercise & Health": [
            "Is exercising every day healthy?",
            "Should people work out daily?",
            "Can daily exercise be harmful?",
            "Is rest necessary for physical fitness?",
            "Should workouts be done without rest days?"],
        "Studying Late at Night": [
            "Is studying late at night effective?",
            "Does studying at night improve learning?",
            "Is night-time studying harmful?",
            "Are people more productive when studying late?",
            "Should students avoid studying at night?"]},
    "TEMPORAL_CAUSAL": {
        "Remote Work": [
            "Is remote work more productive than office work?",
            "Has remote work increased productivity?",
            "Does working from home reduce efficiency?",
            "Is remote work better for long-term productivity?",
            "Will remote work remain productive in the future?"],
        "Social Media": [
            "Does social media harm mental health?",
            "Is social media linked to increased anxiety?",
            "Can social media improve mental well-being?",
            "Does social media have net negative effects?",
            "Is social media beneficial for mental health in some cases?"]},
    "LOGICAL_EDGE_CASE": {
        "Lying": [
            "Is lying always wrong?",
            "Can lying ever be morally justified?",
            "Is it unethical to lie to protect someone?",
            "Are there exceptions to the rule against lying?",
            "Should honesty always override consequences?"],
        "Rules & Exceptions": [
            "Should rules always be followed?",
            "Are exceptions necessary for rules?",
            "Can strict rules cause harm?",
            "Is rule-breaking sometimes justified?",
            "Should rules be flexible?"]},
    "TECHNICAL_CONCEPT": {
        "Overfitting": [
            "What is overfitting in machine learning?",
            "How does overfitting occur?",
            "Is overfitting always bad?",
            "Can overfitting ever be useful?",
            "How does overfitting relate to generalization?"],
        "Gradient Descent": [
            "What is gradient descent?",
            "How does gradient descent optimize models?",
            "Is gradient descent guaranteed to find the global minimum?",
            "Can gradient descent fail?",
            "Why does gradient descent sometimes converge slowly?"]},
    "META_REASONING": {
        "Uncertainty": [
            "Can uncertainty be eliminated?",
            "Is uncertainty always undesirable?",
            "Can uncertainty be beneficial?",
            "Should decisions avoid uncertainty?",
            "How should uncertainty be handled in decision-making?"],
        "Knowledge Limits": [
            "Can humans know everything?",
            "Are there limits to human knowledge?",
            "Is complete knowledge possible?",
            "Does science have ultimate limits?",
            "Can all truths be discovered?"]}}
def get_total_queries():
    """Return how many query strings QUERY_DATASET holds across all categories and concepts."""
    return sum(
        len(concept_queries)
        for category in QUERY_DATASET.values()
        for concept_queries in category.values())
# Report dataset size: total number of queries and number of distinct concepts.
print(f"Query dataset loaded: {get_total_queries()} total queries across {len([c for cat in QUERY_DATASET.values() for c in cat])} concepts")

Query dataset loaded: 85 total queries across 17 concepts


In [4]:
# LLM Interaction Module
class OllamaClient:
    """Small HTTP client for an Ollama server's text-generation endpoint.

    Attributes:
        base_url: URL that prompts are POSTed to.
        request_count: Number of generate() calls attempted; it is incremented
            before the request is sent, so failed requests are counted too.
    """

    def __init__(self, base_url: str = Config.OLLAMA_URL):
        self.base_url = base_url
        self.request_count = 0

    def generate(self, model: str, prompt: str, params: Dict = None) -> Dict[str, Any]:
        """Send one non-streaming generation request and return a result record.

        Args:
            model: Model tag placed in the request payload.
            prompt: Prompt text.
            params: Optional sampling options. Only "temperature", "top_p" and
                "max_tokens" are read ("max_tokens" is sent as "num_predict");
                a "stream" entry is ignored because the payload always sets
                "stream" to False. Defaults to Config.GENERATION_PARAMS.

        Returns:
            A dict with keys "response", "model", "prompt", "elapsed_time",
            "success", "error" and "timestamp". This method does not raise on
            request failure: on any exception "success" is False, "response"
            is "" and "error" holds a message.
        """
        if params is None:
            params = Config.GENERATION_PARAMS.copy()
        payload = {
            "model": model, "prompt": prompt, "stream": False,
            "options": {
                "temperature": params.get("temperature", 0.7),
                "top_p": params.get("top_p", 0.9),
                "num_predict": params.get("max_tokens", 512)}}
        self.request_count += 1
        start_time = time.time()
        try:
            response = requests.post(self.base_url, json=payload, timeout=120)
            response.raise_for_status()
            elapsed_time = time.time() - start_time
            text = response.json().get("response", "")
        except requests.exceptions.Timeout:
            return self._result(model, prompt, error="Request timeout")
        except Exception as e:
            # Deliberately broad: HTTP errors, connection failures and malformed
            # JSON bodies are all reported as a failed record instead of raising.
            return self._result(model, prompt, error=str(e))
        return self._result(model, prompt, response=text, elapsed_time=elapsed_time)

    @staticmethod
    def _result(model: str, prompt: str, response: str = "",
                elapsed_time: float = None, error: str = None) -> Dict[str, Any]:
        """Build the result record returned by generate(); success means error is None."""
        return {
            "response": response,
            "model": model,
            "prompt": prompt,
            "elapsed_time": elapsed_time,
            "success": error is None,
            "error": error,
            "timestamp": datetime.now().isoformat()}

    def _tags_url(self) -> str:
        """Derive the server's /api/tags URL from base_url.

        Everything before the first "/api/" in base_url is treated as the
        server root; if base_url has no "/api/" segment the whole URL (minus a
        trailing slash) is used as the root.
        """
        root, found, _ = self.base_url.partition("/api/")
        if not found:
            root = self.base_url.rstrip("/")
        return f"{root}/api/tags"

    def test_connection(self) -> bool:
        """Return True if the server this client points at answers /api/tags with HTTP 200.

        The probe targets the same host as base_url rather than a fixed
        localhost address, so the check matches where generate() sends
        its requests.
        """
        try:
            response = requests.get(self._tags_url(), timeout=5)
        except requests.exceptions.RequestException:
            return False
        return response.status_code == 200
# Create a client with the default endpoint and check up front that the server responds.
client = OllamaClient()
if client.test_connection():
    print("Ollama server is accessible")
else:
    print("Cannot connect to Ollama server. Make sure it's running on localhost:11434")

Ollama server is accessible


In [5]:
# Data Collection
def generate_experiment_id() -> str:
    """Return an experiment identifier built from the current local time (YYYYMMDD_HHMMSS)."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"
def collect_all_responses(client: OllamaClient, 
                         models: List[str] = Config.MODELS,
                         num_runs: int = Config.NUM_RUNS) -> pd.DataFrame:
    """Query every model `num_runs` times for every query in QUERY_DATASET.

    Iteration order is category -> concept -> query -> model -> run. One record
    is stored per request (failed requests included) and the full table is
    written to Config.RAW_RESPONSES_DIR as CSV and JSON once collection ends.

    Args:
        client: Object whose generate(model, query) returns a result record.
        models: Model tags to query.
        num_runs: Repetitions per (query, model) pair.

    Returns:
        DataFrame with one row per request.
    """
    experiment_id = generate_experiment_id()
    query_count = sum(
        len(queries)
        for category in QUERY_DATASET.values()
        for queries in category.values())
    total_iterations = query_count * len(models) * num_runs
    print(f"\nStarting data collection - Experiment ID: {experiment_id}")
    print(f"Total responses to collect: {total_iterations}")
    print("\n")

    # Lazily flatten the five nested levels into one stream of work items.
    schedule = (
        (category_name, concept_name, query_idx, query, model, run_idx)
        for category_name, concepts in QUERY_DATASET.items()
        for concept_name, queries in concepts.items()
        for query_idx, query in enumerate(queries)
        for model in models
        for run_idx in range(num_runs))

    records = []
    progress = tqdm(total=total_iterations, desc="Collecting responses")
    for category_name, concept_name, query_idx, query, model, run_idx in schedule:
        result = client.generate(model, query)
        records.append({
            "experiment_id": experiment_id,
            "category": category_name,
            "concept": concept_name,
            "query_idx": query_idx,
            "query": query,
            "model": model,
            "run_idx": run_idx,
            "response": result["response"],
            "success": result["success"],
            "error": result["error"],
            "elapsed_time": result["elapsed_time"],
            "timestamp": result["timestamp"]})
        progress.update(1)
        # Short pause between consecutive requests.
        time.sleep(0.1)
    progress.close()

    df = pd.DataFrame(records)
    output_file = Config.RAW_RESPONSES_DIR / f"responses_{experiment_id}.csv"
    json_file = Config.RAW_RESPONSES_DIR / f"responses_{experiment_id}.json"
    df.to_csv(output_file, index=False)
    df.to_json(json_file, orient="records", indent=2)
    print(f"\nData collection completed.")
    print(f"Saved to: {output_file}")
    print(f"Success rate: {df['success'].mean()*100:.1f}%")
    return df

In [6]:
# Claim Extraction Module
class ClaimExtractor:
    """Uses an LLM to break a free-text answer into claims, definitions and a stance label.

    The extraction model is asked to answer in a fixed CLAIMS / DEFINITIONS /
    STANCE / REASONING layout; the reply is then parsed line by line. Parsing
    tolerates markdown decoration on section headers (e.g. "**STANCE:**",
    "## CLAIMS:") and on definition terms (e.g. "- **Term**: ...").
    """

    # Canonical stance labels requested by the extraction prompt.
    VALID_STANCES = ("affirmative", "negative", "conditional", "neutral", "abstention")
    # Short synonyms that must match a whole word; as substrings they would fire
    # inside unrelated words ("no" inside "unknown", "yes" inside "eyes").
    _STANCE_WORDS = {"yes": "affirmative", "no": "negative"}
    # Synonyms matched as word prefixes ("affirms", "negated", "errors", ...).
    _STANCE_PREFIXES = {
        "positive": "affirmative",
        "affirm": "affirmative",
        "neg": "negative",
        "maybe": "conditional",
        "depends": "conditional",
        "unknown": "abstention",
        "unclear": "abstention",
        "error": "error"}
    _SECTIONS = ("claims", "definitions", "stance", "reasoning")
    _BULLETS = ("-", "•", "*")

    # The client annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, client: "OllamaClient", extraction_model: str = "llama3:latest"):
        self.client = client
        self.extraction_model = extraction_model

    def extract_claims(self, response_text: str, query: str) -> Dict[str, Any]:
        """Ask the extraction model to analyse one response and parse its reply.

        Args:
            response_text: The answer to analyse; interpolated verbatim into the prompt.
            query: The question that produced the answer; also interpolated verbatim.

        Returns:
            Dict with "claims" (list of str), "definitions" (term -> definition),
            "stance", "reasoning" and "raw_extraction". If the extraction request
            fails, "stance" is "error" and "reasoning" carries the error message.
        """
        extraction_prompt = f"""You are analyzing an AI response for research on consistency. Extract the following:

ORIGINAL QUERY: "{query}"

RESPONSE TO ANALYZE:
{response_text}

Please extract:
1. ATOMIC CLAIMS: List each distinct factual or logical statement (one per line, max 10)
2. KEY DEFINITIONS: Any explicit definitions provided, including those embedded in prose
3. STANCE: The overall position taken in response to the query

Format your response EXACTLY as follows:

CLAIMS:
- [claim 1]
- [claim 2]
...

DEFINITIONS:
- [term]: [definition]
...

STANCE: [MUST BE EXACTLY ONE OF: affirmative, negative, conditional, neutral, abstention]

REASONING: [brief explanation of stance classification]

IMPORTANT RULES:
- Extract definitions even if not in "term: definition" format
- STANCE must be exactly one of the five words listed
- Claims should be atomic and self-contained
- Include implicit claims if clearly implied"""

        result = self.client.generate(self.extraction_model, extraction_prompt)
        if not result["success"]:
            return {
                "claims": [],
                "definitions": {},
                "stance": "error",
                "reasoning": result["error"],
                "raw_extraction": ""}
        parsed = self._parse_extraction(result["response"])
        parsed["raw_extraction"] = result["response"]
        parsed["stance"] = self._validate_stance(parsed["stance"])
        return parsed

    def _validate_stance(self, stance: str) -> str:
        """Map a free-form stance string onto a canonical label.

        The string is split into alphabetic words, so punctuation and markdown
        around a label ("Conditional.", "[negative]", "**neutral**") do not
        prevent a match. The first canonical label found wins; otherwise the
        first recognised synonym is used; otherwise "neutral" is returned.
        "error" is passed through as "error".
        """
        words = "".join(ch if ch.isalpha() else " " for ch in stance.lower()).split()
        for word in words:
            if word in self.VALID_STANCES:
                return word
        for word in words:
            if word in self._STANCE_WORDS:
                return self._STANCE_WORDS[word]
            for prefix, label in self._STANCE_PREFIXES.items():
                if word.startswith(prefix):
                    return label
        return "neutral"

    def _match_header(self, line: str):
        """Return (section, inline_text) if `line` is a section header, else None.

        Leading "#" heading marks are ignored, and when the line opens with
        "**" all asterisks are dropped, so "**STANCE:** x" and "## STANCE: x"
        are recognised like "STANCE: x". Bulleted lines ("- Stance: ...",
        "* Stance: ...") are never treated as headers.
        """
        probe = line.lstrip("#").strip()
        if probe.startswith("**"):
            probe = probe.replace("*", "")
        head, colon, tail = probe.partition(":")
        section = head.strip().lower()
        if colon and section in self._SECTIONS:
            return section, tail.strip()
        return None

    def _parse_extraction(self, extraction_text: str) -> Dict[str, Any]:
        """Parse the extraction model's reply into claims, definitions, stance and reasoning.

        Returns:
            Dict with "claims", "definitions", "stance" (lower-cased raw text,
            "neutral" if no STANCE header is present) and "reasoning".
        """
        claims = []
        definitions = {}
        stance = "neutral"
        reasoning = ""
        current_section = None
        for raw_line in extraction_text.split("\n"):
            line = raw_line.strip()
            header = self._match_header(line)
            if header is not None:
                current_section, inline_text = header
                if current_section == "stance":
                    stance = inline_text.lower()
                elif current_section == "reasoning":
                    reasoning = inline_text
                continue
            if not line or line.startswith("#"):
                continue
            if current_section == "claims":
                # Only bulleted lines count; very short fragments are dropped.
                if line.startswith(self._BULLETS):
                    claim = line.lstrip("-•* ").strip()
                    if len(claim) > 10:
                        claims.append(claim)
            elif current_section == "definitions":
                entry = line.lstrip("-•* ").strip()
                if ":" in line:
                    term, _, definition = entry.partition(":")
                elif line.startswith(self._BULLETS) and " is " in entry:
                    # Fallback for prose-style bullets: "- X is Y".
                    term, _, definition = entry.partition(" is ")
                else:
                    continue
                # Drop markdown emphasis left around the term or definition.
                term = term.strip().strip("*").strip()
                definition = definition.strip().strip("*").strip()
                if term and definition:
                    definitions[term] = definition
            elif current_section == "reasoning":
                # Reasoning may continue over several lines.
                reasoning += " " + line
        return {
            "claims": claims,
            "definitions": definitions,
            "stance": stance,
            "reasoning": reasoning.strip()}
def extract_all_claims(df: pd.DataFrame, client: OllamaClient) -> pd.DataFrame:
    """Run claim extraction on every collected response and persist the result.

    Rows whose "success" flag is falsy are not sent to the extraction model;
    they receive an empty extraction with stance "error". The enriched table is
    written to Config.CLAIMS_DIR as CSV and JSON.

    Args:
        df: Collected responses; the columns "success", "response", "query"
            and "experiment_id" are read here.
        client: Client handed to ClaimExtractor.

    Returns:
        A new DataFrame containing the input columns plus the extraction columns.
    """
    extractor = ClaimExtractor(client)
    print(f"\nStarting claim extraction for {len(df)} responses")
    print("\n")

    records = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Extracting claims"):
        if row["success"]:
            extracted = extractor.extract_claims(row["response"], row["query"])
        else:
            extracted = {
                "claims": [],
                "definitions": {},
                "stance": "error",
                "reasoning": "Original response failed",
                "raw_extraction": ""}
        records.append({
            **row.to_dict(),
            "extracted_claims": extracted["claims"],
            "extracted_definitions": extracted["definitions"],
            "extracted_stance": extracted["stance"],
            "extraction_reasoning": extracted["reasoning"],
            "raw_extraction": extracted["raw_extraction"],
            "num_claims": len(extracted["claims"]),
            "num_definitions": len(extracted["definitions"])})
        # Short pause after each row.
        time.sleep(0.1)

    df_with_claims = pd.DataFrame(records)
    experiment_id = df["experiment_id"].iloc[0]
    claims_file = Config.CLAIMS_DIR / f"claims_{experiment_id}.csv"
    json_file = Config.CLAIMS_DIR / f"claims_{experiment_id}.json"
    df_with_claims.to_csv(claims_file, index=False)
    df_with_claims.to_json(json_file, orient="records", indent=2)

    print(f"\nClaim extraction completed.")
    print(f"Saved to: {claims_file}")
    print(f"Average claims per response: {df_with_claims['num_claims'].mean():.1f}")
    print(f"Average definitions per response: {df_with_claims['num_definitions'].mean():.1f}")
    print(f"\nStance distribution:")
    row_total = len(df_with_claims)
    for stance, count in df_with_claims['extracted_stance'].value_counts().items():
        print(f"  {stance}: {count} ({count/row_total*100:.1f}%)")
    return df_with_claims

In [7]:
# Stability Metrics Calculation
class SemanticSimilarityEngine:
    """Pairwise text similarity using sentence-transformer embeddings, or TF-IDF as fallback.

    The backend is chosen once at construction time from the module-level
    SENTENCE_TRANSFORMER_AVAILABLE flag and recorded in `self.method`
    ("transformer" or "tfidf").
    """

    def __init__(self):
        if SENTENCE_TRANSFORMER_AVAILABLE:
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.method = "transformer"
            print("Using sentence transformer embeddings")
        else:
            self.vectorizer = TfidfVectorizer(
                max_features=1000,
                ngram_range=(1, 2),
                stop_words='english')
            self.method = "tfidf"
            print("Using TF-IDF embeddings")

    def get_similarity_matrix(self, texts: List[str]) -> np.ndarray:
        """Return the cosine-similarity matrix of `texts`.

        Returns:
            An (n, n) array for n >= 2 texts. For fewer than two texts an empty
            array of shape (1, 0) is returned; callers can test `.size == 0`.
            In TF-IDF mode the vectorizer is refitted on `texts` on every call;
            if fitting or comparison fails, the identity matrix is returned
            (each text similar only to itself).
        """
        if not texts or len(texts) < 2:
            return np.array([[]])
        if self.method == "transformer":
            embeddings = self.model.encode(texts)
            return cosine_similarity(embeddings)
        try:
            vectors = self.vectorizer.fit_transform(texts)
            return cosine_similarity(vectors)
        except Exception:
            # Best-effort fallback, e.g. scikit-learn raises ValueError when no
            # terms survive stop-word removal. `Exception` (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            return np.eye(len(texts))

    def are_similar(self, text1: str, text2: str, threshold: float = 0.75) -> bool:
        """Return True if the similarity of the two texts is at least `threshold`."""
        sim_matrix = self.get_similarity_matrix([text1, text2])
        if sim_matrix.size == 0:
            return False
        # Cast so callers get a plain Python bool rather than a numpy bool.
        return bool(sim_matrix[0, 1] >= threshold)
class StabilityMetrics:
    """Stability metrics over a group of extracted responses.

    Provides the Claim Stability Score (CSS), the Stance Volatility Index (SVI)
    and the Definition Drift Score (DDS). Text comparisons go through
    `self.similarity_engine.get_similarity_matrix`.
    """

    def __init__(self):
        self.similarity_engine = SemanticSimilarityEngine()

    def claim_stability_score_semantic(self, claims_list: List[List[str]], 
                                      similarity_threshold: float = 0.75) -> Dict[str, float]:
        """Measure how much the claims of several responses repeat each other.

        Claims are lower-cased and stripped; claims of 10 characters or fewer
        are ignored. CSS is `1 - clusters / claims`, where clusters are formed
        by `_cluster_by_similarity`; `css_exact` uses exact string equality.

        Args:
            claims_list: One list of claim strings per response.
            similarity_threshold: Minimum similarity for two claims to share a cluster.

        Returns:
            Dict with "css", "css_exact", "avg_cluster_size",
            "num_unique_semantic_claims" and "total_claims". With fewer than
            two responses nothing is analysed and all values are 0; with fewer
            than two usable claims the scores are 0.0 and the counts reflect
            the claims found.
        """
        if not claims_list or len(claims_list) < 2:
            return {
                "css": 0.0,
                "css_exact": 0.0,
                "avg_cluster_size": 0.0,
                "num_unique_semantic_claims": 0,
                "total_claims": 0}
        all_claims = [
            claim.lower().strip()
            for claims in claims_list
            for claim in claims
            if claim and len(claim) > 10]
        if len(all_claims) < 2:
            return {
                "css": 0.0,
                "css_exact": 0.0,
                "avg_cluster_size": 0.0,
                "num_unique_semantic_claims": len(all_claims),
                "total_claims": len(all_claims)}
        css_exact = 1.0 - (len(set(all_claims)) / len(all_claims))
        similarity_matrix = self.similarity_engine.get_similarity_matrix(all_claims)
        clusters = self._cluster_by_similarity(
            all_claims, similarity_matrix, similarity_threshold)
        num_semantic_clusters = len(clusters)
        css_semantic = 1.0 - (num_semantic_clusters / len(all_claims))
        avg_cluster_size = len(all_claims) / num_semantic_clusters if num_semantic_clusters > 0 else 1.0
        return {
            "css": css_semantic,
            "css_exact": css_exact,
            "avg_cluster_size": avg_cluster_size,
            "num_unique_semantic_claims": num_semantic_clusters,
            "total_claims": len(all_claims)}

    def _cluster_by_similarity(self, texts: List[str], 
                              similarity_matrix: np.ndarray, 
                              threshold: float) -> List[List[int]]:
        """Greedily group text indices by similarity to a cluster's first member.

        Each not-yet-assigned index starts a new cluster and absorbs every
        later unassigned index whose similarity to it is at least `threshold`.
        Members are compared to the first member only, so the result depends
        on input order.
        """
        n = len(texts)
        assigned = [False] * n
        clusters = []
        for i in range(n):
            if assigned[i]:
                continue
            cluster = [i]
            assigned[i] = True
            for j in range(i + 1, n):
                if not assigned[j] and similarity_matrix[i, j] >= threshold:
                    cluster.append(j)
                    assigned[j] = True
            clusters.append(cluster)
        return clusters

    @staticmethod
    def _count_changes(sequence: List[str]) -> int:
        """Count positions where an element differs from its predecessor."""
        return sum(1 for previous, current in zip(sequence, sequence[1:]) if previous != current)

    def stance_volatility_index(self, stances: List[str], 
                               query_indices: List[int] = None) -> Dict[str, float]:
        """Fraction of adjacent stance pairs that differ, overall and per paraphrase.

        All three values depend on the order of `stances`, because only
        neighbouring entries are compared.
        NOTE(review): every label is compared as-is, so "error" stances from
        failed extractions count as stance changes — confirm that is intended.

        Args:
            stances: Stance labels in response order.
            query_indices: Optional paraphrase index per stance; enables the
                within/across breakdown.

        Returns:
            Dict with "svi_overall", "svi_within_paraphrase" (changes between
            consecutive runs of the same paraphrase) and "svi_across_paraphrase"
            (changes between the majority stances of consecutive paraphrases, in
            order of first appearance; ties go to the stance seen first).
        """
        if not stances or len(stances) < 2:
            return {
                "svi_overall": 0.0,
                "svi_within_paraphrase": 0.0,
                "svi_across_paraphrase": 0.0}
        svi_overall = self._count_changes(stances) / (len(stances) - 1)
        if query_indices is None:
            return {
                "svi_overall": svi_overall,
                "svi_within_paraphrase": 0.0,
                "svi_across_paraphrase": 0.0}
        paraphrase_groups = {}
        for stance, query_idx in zip(stances, query_indices):
            paraphrase_groups.setdefault(query_idx, []).append(stance)

        within_changes = 0
        within_transitions = 0
        majority_stances = []
        for group_stances in paraphrase_groups.values():
            if len(group_stances) > 1:
                within_changes += self._count_changes(group_stances)
                within_transitions += len(group_stances) - 1
            stance_counts = {}
            for s in group_stances:
                stance_counts[s] = stance_counts.get(s, 0) + 1
            majority_stances.append(max(stance_counts, key=stance_counts.get))
        svi_within = within_changes / within_transitions if within_transitions > 0 else 0.0
        if len(majority_stances) > 1:
            svi_across = self._count_changes(majority_stances) / (len(majority_stances) - 1)
        else:
            svi_across = 0.0
        return {
            "svi_overall": svi_overall,
            "svi_within_paraphrase": svi_within,
            "svi_across_paraphrase": svi_across}

    def definition_drift_score_semantic(self, definitions_list: List[Dict[str, str]], 
                                       similarity_threshold: float = 0.80) -> Dict[str, Any]:
        """Measure how much the definition of each term varies across responses.

        Terms are matched case-insensitively after stripping whitespace, so
        "Democracy" and "democracy " count as the same term. Only terms defined
        in at least two responses are analysed, in sorted order. Semantic drift
        is the mean pairwise `1 - similarity` (floored at 0); exact drift is
        `(distinct definitions - 1) / (definitions - 1)`.

        Args:
            definitions_list: One term -> definition dict per response.
            similarity_threshold: Accepted for interface compatibility; it is
                not used by the computation.

        Returns:
            Dict with "dds", "dds_exact" (means over analysed terms, 0.0 if
            none), "num_terms" and "terms_analyzed" (per-term details).
        """
        empty_result = {
            "dds": 0.0,
            "dds_exact": 0.0,
            "num_terms": 0,
            "terms_analyzed": []}
        if not definitions_list:
            return empty_result
        # Normalise term keys per response; if two keys collapse to the same
        # term within one response, the later one wins.
        normalized = [
            {term.strip().lower(): text for term, text in defs.items()}
            for defs in definitions_list]
        all_terms = sorted({term for defs in normalized for term in defs if term})
        if not all_terms:
            return empty_result

        drift_scores_semantic = []
        drift_scores_exact = []
        terms_analyzed = []
        for term in all_terms:
            term_defs = [defs.get(term, "").strip() for defs in normalized]
            term_defs = [d for d in term_defs if d]
            if len(term_defs) < 2:
                continue
            drift_exact = (len(set(term_defs)) - 1) / (len(term_defs) - 1)
            drift_scores_exact.append(drift_exact)
            similarity_matrix = self.similarity_engine.get_similarity_matrix(term_defs)
            # Upper triangle = each unordered pair once; clip guards against
            # similarities that exceed 1 by floating-point rounding.
            pair_rows, pair_cols = np.triu_indices(len(term_defs), k=1)
            pairwise = np.asarray(similarity_matrix)[pair_rows, pair_cols]
            drift_semantic = float(np.mean(np.clip(1.0 - pairwise, 0.0, None)))
            drift_scores_semantic.append(drift_semantic)
            terms_analyzed.append({
                "term": term,
                "num_definitions": len(term_defs),
                "drift_semantic": drift_semantic,
                "drift_exact": drift_exact})
        return {
            "dds": float(np.mean(drift_scores_semantic)) if drift_scores_semantic else 0.0,
            "dds_exact": float(np.mean(drift_scores_exact)) if drift_scores_exact else 0.0,
            "num_terms": len(terms_analyzed),
            "terms_analyzed": terms_analyzed}
def calculate_stability_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Compute CSS, SVI and DDS for every (model, concept) group and save them as CSV.

    Args:
        df: Responses with extraction columns; the columns read here are
            "model", "concept", "category", "query_idx", "response",
            "extracted_claims", "extracted_stance", "extracted_definitions",
            "num_claims" and "experiment_id".

    Returns:
        DataFrame with one row of metrics per (model, concept) pair.
    """
    print("\nCalculating stability metrics (with semantic similarity):")
    scorer = StabilityMetrics()
    rows = []
    grouped = df.groupby(["model", "concept"])
    for (model, concept), group in tqdm(grouped, desc="Computing metrics"):
        css = scorer.claim_stability_score_semantic(group["extracted_claims"].tolist())
        svi = scorer.stance_volatility_index(
            group["extracted_stance"].tolist(), group["query_idx"].tolist())
        dds = scorer.definition_drift_score_semantic(group["extracted_definitions"].tolist())
        claim_counts = group["num_claims"]
        rows.append({
            "model": model,
            "concept": concept,
            "category": group["category"].iloc[0],
            "claim_stability_score": css["css"],
            "claim_stability_score_exact": css["css_exact"],
            "avg_claim_cluster_size": css["avg_cluster_size"],
            "num_unique_semantic_claims": css["num_unique_semantic_claims"],
            "stance_volatility_index": svi["svi_overall"],
            "stance_volatility_within": svi["svi_within_paraphrase"],
            "stance_volatility_across": svi["svi_across_paraphrase"],
            "definition_drift_score": dds["dds"],
            "definition_drift_score_exact": dds["dds_exact"],
            "num_terms_with_definitions": dds["num_terms"],
            "avg_claims_per_response": claim_counts.mean(),
            "std_claims_per_response": claim_counts.std(),
            "avg_response_length": group["response"].str.len().mean(),
            "num_responses": len(group)})

    metrics_df = pd.DataFrame(rows)
    experiment_id = df["experiment_id"].iloc[0]
    metrics_file = Config.ANALYSIS_DIR / f"stability_metrics_{experiment_id}.csv"
    metrics_df.to_csv(metrics_file, index=False)
    print(f"\nMetrics calculated for {len(metrics_df)} concept-model pairs")
    print(f"Saved to: {metrics_file}")

    # Exact-match vs semantic variants of each metric, averaged over all pairs.
    print("\nMetric Comparison (Exact vs Semantic):")
    css_exact_mean = metrics_df['claim_stability_score_exact'].mean()
    css_semantic_mean = metrics_df['claim_stability_score'].mean()
    print(f"CSS (exact):    {css_exact_mean:.3f}")
    print(f"CSS (semantic): {css_semantic_mean:.3f}")
    print(f"Improvement:    {(css_semantic_mean - css_exact_mean):.3f}")
    dds_exact_mean = metrics_df['definition_drift_score_exact'].mean()
    dds_semantic_mean = metrics_df['definition_drift_score'].mean()
    print(f"\nDDS (exact):    {dds_exact_mean:.3f}")
    print(f"DDS (semantic): {dds_semantic_mean:.3f}")
    print(f"Improvement:    {(dds_exact_mean - dds_semantic_mean):.3f}")
    return metrics_df

In [8]:
# Visualization & Analysis
def create_stability_visualizations(metrics_df: pd.DataFrame,
                                   experiment_id: str):
    """Render the core stability bar charts and save them as PNG files.

    Five figures are written to ``Config.PLOTS_DIR``, each suffixed with
    ``experiment_id``: one grouped bar chart per metric (CSS, SVI, DDS) by
    concept and model, a three-panel per-model average comparison, and the
    average CSS per category and model.

    Args:
        metrics_df: One row per concept-model pair. The columns read here are
            ``concept``, ``model``, ``category``, ``claim_stability_score``,
            ``stance_volatility_index`` and ``definition_drift_score``.
        experiment_id: Identifier embedded in every output file name.
    """
    print("\n")
    print("Generating visualizations")

    # (metric column, palette, title, y-axis label, output file prefix)
    per_concept_charts = [
        ("claim_stability_score", "Set2",
         "Claim Stability Score by Concept and Model",
         "CSS (higher = more stable)", "css_by_concept"),
        ("stance_volatility_index", "Set1",
         "Stance Volatility Index by Concept and Model",
         "SVI (lower = more stable)", "svi_by_concept"),
        ("definition_drift_score", "Set3",
         "Definition Drift Score by Concept and Model",
         "DDS (lower = more stable)", "dds_by_concept"),
    ]
    for column, palette, title, y_label, file_prefix in per_concept_charts:
        plt.figure(figsize=(12, 6))
        sns.barplot(data=metrics_df, x="concept", y=column,
                    hue="model", palette=palette)
        plt.xticks(rotation=45, ha="right")
        plt.title(title, fontsize=14, weight="bold")
        plt.ylabel(y_label)
        plt.tight_layout()
        plt.savefig(Config.PLOTS_DIR / f"{file_prefix}_{experiment_id}.png", dpi=300)
        plt.close()

    # Per-model averages, one panel per metric: (metric column, palette, panel title)
    comparison_panels = [
        ("claim_stability_score", "Blues_d", "Average CSS"),
        ("stance_volatility_index", "Reds_d", "Average SVI"),
        ("definition_drift_score", "Greens_d", "Average DDS"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    model_summary = metrics_df.groupby("model").agg(
        {column: "mean" for column, _, _ in comparison_panels}
    ).reset_index()
    for axis, (column, palette, panel_title) in zip(axes, comparison_panels):
        sns.barplot(data=model_summary, x="model", y=column,
                    ax=axis, palette=palette)
        axis.set_title(panel_title)
        axis.set_ylabel("Score")
    plt.suptitle("Overall Model Stability Comparison", fontsize=16, weight="bold")
    plt.tight_layout()
    plt.savefig(Config.PLOTS_DIR / f"model_comparison_{experiment_id}.png", dpi=300)
    plt.close()

    # Average CSS per category, split by model
    plt.figure(figsize=(12, 6))
    category_avg = (
        metrics_df.groupby(["category", "model"])["claim_stability_score"]
        .mean()
        .reset_index()
    )
    sns.barplot(data=category_avg, x="category", y="claim_stability_score",
                hue="model", palette="viridis")
    plt.xticks(rotation=45, ha="right")
    plt.title("Claim Stability by Category", fontsize=14, weight="bold")
    plt.ylabel("Average CSS")
    plt.tight_layout()
    plt.savefig(Config.PLOTS_DIR / f"css_by_category_{experiment_id}.png", dpi=300)
    plt.close()

    print("Visualizations saved to:", Config.PLOTS_DIR)
def generate_summary_report(df_claims: pd.DataFrame, 
                           metrics_df: pd.DataFrame,
                           experiment_id: str):
    """Build a plain-text experiment summary, save it, print it and return it.

    The report is written to
    ``Config.ANALYSIS_DIR / summary_report_<experiment_id>.txt``.

    Args:
        df_claims: Per-response frame. The columns read here are ``success``,
            ``model``, ``concept``, ``category``, ``num_claims``,
            ``num_definitions`` and ``extracted_stance``.
        metrics_df: Per concept-model frame. The columns read here are
            ``model``, ``concept``, ``category``, ``claim_stability_score``,
            ``stance_volatility_index`` and ``definition_drift_score``.
        experiment_id: Identifier used in the report header and file name.

    Returns:
        The full report as a single newline-joined string.
    """
    report = []
    report.append("LLM Stability Research - Experimental Summary:")
    report.append(f"\nExperiment ID: {experiment_id}")
    report.append(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\n")
    report.append("Dataset Statistics:")
    report.append(f"Total responses collected: {len(df_claims)}")
    report.append(f"Successful responses: {df_claims['success'].sum()}")
    report.append(f"Success rate: {df_claims['success'].mean()*100:.1f}%")
    report.append(f"Models evaluated: {', '.join(df_claims['model'].unique())}")
    report.append(f"Concepts evaluated: {df_claims['concept'].nunique()}")
    report.append(f"Categories: {df_claims['category'].nunique()}")
    report.append("\n")
    report.append("Extraction Statistics:")
    report.append(f"Average claims per response: {df_claims['num_claims'].mean():.2f}")
    report.append(f"Average definitions per response: {df_claims['num_definitions'].mean():.2f}")
    report.append(f"Total unique stances observed: {df_claims['extracted_stance'].nunique()}")
    report.append("\n")
    report.append("Stability Metrics - Overall:")
    report.append(f"Average Claim Stability Score (CSS): {metrics_df['claim_stability_score'].mean():.3f}")
    report.append(f"Average Stance Volatility Index (SVI): {metrics_df['stance_volatility_index'].mean():.3f}")
    report.append(f"Average Definition Drift Score (DDS): {metrics_df['definition_drift_score'].mean():.3f}")
    report.append("\n")
    report.append("Model Comparison:")
    for model in metrics_df['model'].unique():
        model_data = metrics_df[metrics_df['model'] == model]
        report.append(f"\n{model}:")
        report.append(f"  CSS: {model_data['claim_stability_score'].mean():.3f} (±{model_data['claim_stability_score'].std():.3f})")
        report.append(f"  SVI: {model_data['stance_volatility_index'].mean():.3f} (±{model_data['stance_volatility_index'].std():.3f})")
        report.append(f"  DDS: {model_data['definition_drift_score'].mean():.3f} (±{model_data['definition_drift_score'].std():.3f})")
    report.append("\n")
    report.append("Top 5 Most Stable Concepts:")
    top_stable = metrics_df.nlargest(5, 'claim_stability_score')[['concept', 'model', 'claim_stability_score']]
    for _, row in top_stable.iterrows():
        report.append(f"{row['concept']} ({row['model']}): CSS = {row['claim_stability_score']:.3f}")
    report.append("\n")
    report.append("Top 5 Most Volatile Concepts:")
    top_volatile = metrics_df.nlargest(5, 'stance_volatility_index')[['concept', 'model', 'stance_volatility_index']]
    for _, row in top_volatile.iterrows():
        report.append(f"{row['concept']} ({row['model']}): SVI = {row['stance_volatility_index']:.3f}")
    report.append("\n")
    report.append("Category-Level Analysis:")
    for category in metrics_df['category'].unique():
        cat_data = metrics_df[metrics_df['category'] == category]
        report.append(f"\n{category}:")
        report.append(f"  Concepts: {cat_data['concept'].nunique()}")
        report.append(f"  Average CSS: {cat_data['claim_stability_score'].mean():.3f}")
        report.append(f"  Average SVI: {cat_data['stance_volatility_index'].mean():.3f}")
        report.append(f"  Average DDS: {cat_data['definition_drift_score'].mean():.3f}")
    report.append("\n")
    report.append("End of Report.")
    report_text = "\n".join(report)
    report_file = Config.ANALYSIS_DIR / f"summary_report_{experiment_id}.txt"
    # The report contains "±"; pin UTF-8 so the write does not depend on the
    # platform's locale encoding (which may be unable to encode it).
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report_text)
    print(report_text)
    print(f"\nReport saved to: {report_file}")
    return report_text

In [9]:
# Main Execution Pipeline
def run_complete_experiment():
    """Run the whole pipeline end to end and return its artefacts.

    The stages run in order: response collection, claim extraction, metric
    calculation, visualization and summary report. The Ollama connection is
    checked first and the run is abandoned if it fails.

    Returns:
        A dict with keys ``df_responses``, ``df_claims``, ``metrics_df`` and
        ``experiment_id`` on success; ``None`` if the server is unreachable,
        the user interrupts the run, or any stage raises.
    """
    import traceback

    print("\n")
    print("Starting Complete LLM Stability Experiment")
    expected_total = 17 * 5 * 2 * 3
    print(f"\nThis will collect {expected_total} responses total")
    print("\n")

    ollama = OllamaClient()
    if not ollama.test_connection():
        print("Cannot connect to Ollama server.")
        print("Please ensure Ollama is running: ollama serve")
        return None

    try:
        print("\nStep 1/5: Collecting model responses...")
        responses = collect_all_responses(ollama)

        print("\nStep 2/5: Extracting claims and definitions...")
        claims = extract_all_claims(responses, ollama)

        print("\nStep 3/5: Computing stability metrics...")
        metrics = calculate_stability_metrics(claims)

        print("\nStep 4/5: Generating visualizations...")
        run_id = claims['experiment_id'].iloc[0]
        create_stability_visualizations(metrics, run_id)

        print("\nStep 5/5: Creating summary report...")
        generate_summary_report(claims, metrics, run_id)

        print("\n")
        print("Experiment Complete.")
        print(f"\nAll results saved in: {Config.OUTPUT_DIR}")
        print(f"Experiment ID: {run_id}")
        print("\n")
    except KeyboardInterrupt:
        print("\n\nExperiment interrupted by user")
        return None
    except Exception as e:
        # Report the failure with a traceback instead of propagating it.
        print(f"\n\nERROR: Experiment failed with exception:")
        print(f"{type(e).__name__}: {e}")
        traceback.print_exc()
        return None

    return {
        'df_responses': responses,
        'df_claims': claims,
        'metrics_df': metrics,
        'experiment_id': run_id}

In [24]:
# Post-Experiment Analysis Utilities
def load_experiment_results(experiment_id: str) -> Dict[str, pd.DataFrame]:
    """Load whichever saved CSV artefacts exist for one experiment.

    Args:
        experiment_id: Identifier embedded in the artefact file names.

    Returns:
        A dict that may contain ``responses``, ``claims`` and ``metrics``
        frames. A key is omitted when its CSV file does not exist.
    """
    print(f"Loading experiment: {experiment_id}")

    # (result key, directory, file-name prefix, wording used in the log line)
    artefacts = (
        ('responses', Config.RAW_RESPONSES_DIR, 'responses', 'raw responses'),
        ('claims', Config.CLAIMS_DIR, 'claims', 'responses with claims'),
        ('metrics', Config.ANALYSIS_DIR, 'stability_metrics', 'metric rows'),
    )
    loaded = {}
    for key, directory, prefix, description in artefacts:
        csv_path = directory / f"{prefix}_{experiment_id}.csv"
        if not csv_path.exists():
            continue
        loaded[key] = pd.read_csv(csv_path)
        print(f"Loaded {len(loaded[key])} {description}")
    return loaded
def list_experiments() -> List[str]:
    """Return the sorted IDs of all experiments that have a raw-responses CSV.

    IDs are recovered from files named ``responses_<id>.csv`` in
    ``Config.RAW_RESPONSES_DIR``.

    Returns:
        Unique experiment IDs in ascending lexicographic order.
    """
    prefix = "responses_"
    # Strip only the leading prefix. str.replace would also delete any later
    # occurrence of "responses_" inside the ID, producing an ID that
    # load_experiment_results could not find on disk.
    return sorted({
        path.stem[len(prefix):]
        for path in Config.RAW_RESPONSES_DIR.glob(f"{prefix}*.csv")
    })
def analyze_concept_stability(df_claims: pd.DataFrame, concept: str, model: str = None):
    """Print a drill-down of stances and claims for one concept.

    Args:
        df_claims: Per-response frame. The columns read here are ``concept``,
            ``model``, ``query``, ``extracted_stance`` and
            ``extracted_claims`` (a list, or the string form of a list).
        concept: Concept name to filter on.
        model: Optional model name; when falsy, all models are included.

    Returns:
        None. All results are printed.
    """
    import ast

    def safe_parse_list(x):
        """Return ``x`` as a list; unparseable or non-list input yields ``[]``."""
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            try:
                parsed = ast.literal_eval(x)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # The documented failure modes of literal_eval; anything else
                # (e.g. KeyboardInterrupt) is deliberately not swallowed.
                return []
            # A string such as "42" parses fine but is not a list of claims.
            return parsed if isinstance(parsed, list) else []
        return []

    mask = df_claims['concept'] == concept
    if model:
        mask = mask & (df_claims['model'] == model)
    concept_data = df_claims[mask].copy()
    if len(concept_data) == 0:
        print(f"No data found for concept: {concept}")
        return
    print("\n")
    print(f"Concept Analysis: {concept}")
    if model:
        print(f"Model: {model}")
    print("\n")
    num_queries = concept_data['query'].nunique()
    # nunique() ignores missing values, so it can be 0 even with rows present.
    runs_per_query = len(concept_data) // num_queries if num_queries else 0
    print(f"Total responses: {len(concept_data)}")
    print(f"Unique queries: {num_queries}")
    print(f"Runs per query: {runs_per_query}")
    print("\nStance Distribution:")
    stance_counts = concept_data['extracted_stance'].value_counts()
    for stance, count in stance_counts.items():
        print(f"  {stance}: {count} ({count/len(concept_data)*100:.1f}%)")
    all_claims = []
    for claims_raw in concept_data['extracted_claims']:
        all_claims.extend(safe_parse_list(claims_raw))
    if all_claims:
        unique_claims = len(set(all_claims))
        print(f"\nTotal claims extracted: {len(all_claims)}")
        print(f"Unique claims: {unique_claims}")
        print(f"Claim repetition: {(1 - unique_claims/len(all_claims))*100:.1f}%")
    else:
        print("\nNo claims extracted")
    print("\n")
    print("Sample Responses:")
    print("\n")
    for _, row in concept_data.head(3).iterrows():
        print(f"\nQuery: {row['query']}")
        print(f"Stance: {row['extracted_stance']}")
        claims = safe_parse_list(row['extracted_claims'])
        if claims:
            print(f"Claims ({len(claims)}):")
            for claim in claims[:3]:
                print(f"  - {claim}")
            if len(claims) > 3:
                print(f"  ... and {len(claims)-3} more")
        print("-" * 30)
def compare_models_on_concept(df_claims: pd.DataFrame, concept: str):
    """Print per-model response statistics for a single concept.

    Args:
        df_claims: Per-response frame. The columns read here are ``concept``,
            ``model``, ``num_claims``, ``response`` and ``extracted_stance``.
        concept: Concept name to filter on.

    Returns:
        None. All results are printed.
    """
    concept_data = df_claims[df_claims['concept'] == concept]
    if len(concept_data) == 0:
        print(f"No data found for concept: {concept}")
        return
    print("\n")
    print(f"Model Comparison: {concept}")
    print("\n")
    for model in concept_data['model'].unique():
        model_data = concept_data[concept_data['model'] == model]
        print(f"\n{model}:")
        print(f"  Responses: {len(model_data)}")
        print(f"  Avg claims/response: {model_data['num_claims'].mean():.2f}")
        print(f"  Avg response length: {model_data['response'].str.len().mean():.0f} chars")
        stances = model_data['extracted_stance'].value_counts()
        if stances.empty:
            # value_counts drops missing values; with no stance at all there
            # is no dominant entry to index and consistency is undefined.
            print("  Dominant stance: n/a (no stances extracted)")
            print("  Stance consistency: n/a")
            continue
        print(f"  Dominant stance: {stances.index[0]} ({stances.iloc[0]} times)")
        unique_stances = model_data['extracted_stance'].nunique()
        print(f"  Stance consistency: {(1 - (unique_stances-1)/len(model_data))*100:.1f}%")
def export_for_paper(metrics_df: pd.DataFrame, experiment_id: str):
    """Write three summary tables as CSV files into ``Config.ANALYSIS_DIR``.

    * Table 1: mean and std of CSS / SVI / DDS per model.
    * Table 2: mean CSS / SVI / DDS per (category, model).
    * Table 3: the ten highest-CSS and ten lowest-CSS rows, labelled by rank.

    Args:
        metrics_df: Per concept-model metrics frame.
        experiment_id: Identifier embedded in each output file name.
    """
    print("\nExporting publication-ready data...")
    metric_columns = ['claim_stability_score',
                      'stance_volatility_index',
                      'definition_drift_score']

    # Table 1: per-model mean/std, with the two-level header flattened to
    # names such as "claim_stability_score_mean".
    per_model = metrics_df.groupby('model').agg(
        {column: ['mean', 'std'] for column in metric_columns}
    ).round(3)
    per_model.columns = ['_'.join(levels) for levels in per_model.columns]
    table1_path = Config.ANALYSIS_DIR / f"table1_model_comparison_{experiment_id}.csv"
    per_model.to_csv(table1_path)
    print(f"Table 1 saved: {table1_path}")

    # Table 2: per-category, per-model means.
    per_category = metrics_df.groupby(['category', 'model']).agg(
        {column: 'mean' for column in metric_columns}
    ).round(3)
    table2_path = Config.ANALYSIS_DIR / f"table2_category_breakdown_{experiment_id}.csv"
    per_category.to_csv(table2_path)
    print(f"Table 2 saved: {table2_path}")

    # Table 3: both extremes of the CSS ranking.
    extreme_columns = ['concept', 'model', 'claim_stability_score', 'stance_volatility_index']
    most_stable = metrics_df.nlargest(10, 'claim_stability_score')[extreme_columns]
    least_stable = metrics_df.nsmallest(10, 'claim_stability_score')[extreme_columns]
    table3_path = Config.ANALYSIS_DIR / f"table3_extreme_concepts_{experiment_id}.csv"
    extremes = pd.concat([
        most_stable.assign(rank='Most Stable'),
        least_stable.assign(rank='Least Stable'),
    ])
    extremes.to_csv(table3_path, index=False)
    print(f"Table 3 saved: {table3_path}")
    print("\nAll tables exported.")

In [11]:
results = run_complete_experiment()  # Main pipeline: runs all five steps; yields a dict of results, or None if the run fails or is interrupted



Starting Complete LLM Stability Experiment

This will collect 510 responses total



Step 1/5: Collecting model responses...

Starting data collection - Experiment ID: 20251217_163437
Total responses to collect: 510




Collecting responses:   0%|          | 0/510 [00:00<?, ?it/s]


Data collection completed.
Saved to: experiment_outputs/raw_responses/responses_20251217_163437.csv
Success rate: 100.0%

Step 2/5: Extracting claims and definitions...

Starting claim extraction for 510 responses




Extracting claims:   0%|          | 0/510 [00:00<?, ?it/s]


Claim extraction completed.
Saved to: experiment_outputs/extracted_claims/claims_20251217_163437.csv
Average claims per response: 6.4
Average definitions per response: 1.9

Stance distribution:
  neutral: 238 (46.7%)
  affirmative: 129 (25.3%)
  conditional: 115 (22.5%)
  negative: 27 (5.3%)
  abstention: 1 (0.2%)

Step 3/5: Computing stability metrics...

Calculating stability metrics (with semantic similarity):
Using sentence transformer embeddings


Computing metrics:   0%|          | 0/34 [00:00<?, ?it/s]


Metrics calculated for 34 concept-model pairs
Saved to: experiment_outputs/analysis/stability_metrics_20251217_163437.csv

Metric Comparison (Exact vs Semantic):
CSS (exact):    0.028
CSS (semantic): 0.453
Improvement:    0.425

DDS (exact):    0.968
DDS (semantic): 0.395
Improvement:    0.574

Step 4/5: Generating visualizations...


Generating visualizations
Visualizations saved to: experiment_outputs/plots

Step 5/5: Creating summary report...
LLM Stability Research - Experimental Summary:

Experiment ID: 20251217_163437
Date: 2025-12-17 21:23:39


Dataset Statistics:
Total responses collected: 510
Successful responses: 510
Success rate: 100.0%
Models evaluated: deepseek-r1:7b, llama3:latest
Concepts evaluated: 17
Categories: 7


Extraction Statistics:
Average claims per response: 6.42
Average definitions per response: 1.85
Total unique stances observed: 5


Stability Metrics - Overall:
Average Claim Stability Score (CSS): 0.453
Average Stance Volatility Index (SVI): 0.483
Average 

In [25]:
# Statistical Validation Module
def calculate_confidence_intervals(metrics_df: pd.DataFrame, 
                                  n_bootstrap: int = 1000,
                                  seed: int = None) -> pd.DataFrame:
    """Bootstrap 95% percentile confidence intervals for each model's mean metrics.

    For every model and each of CSS / SVI / DDS, the per-concept values are
    resampled with replacement ``n_bootstrap`` times, and the 2.5th and 97.5th
    percentiles of the resampled means form the interval.

    Args:
        metrics_df: Per concept-model frame with a ``model`` column and the
            three metric columns.
        n_bootstrap: Number of bootstrap resamples per model/metric.
        seed: Optional seed for a private random generator, making the
            intervals reproducible. When ``None`` (the default), NumPy's
            global random state is used, as before.

    Returns:
        One row per (model, metric) with columns ``model``, ``metric``,
        ``mean``, ``ci_lower``, ``ci_upper`` and ``ci_width``.
    """
    print("\n")
    print("Calculating confidence intervals (bootstrap)")
    print("\n")
    # A private RandomState keeps seeded runs reproducible without touching
    # the global state; unseeded runs still draw from np.random.
    rng = np.random if seed is None else np.random.RandomState(seed)
    ci_results = []
    for model in metrics_df['model'].unique():
        model_data = metrics_df[metrics_df['model'] == model]
        for metric in ['claim_stability_score', 'stance_volatility_index', 'definition_drift_score']:
            values = model_data[metric].values
            # Draw every resample in one call: one row per bootstrap replicate.
            resamples = rng.choice(values, size=(n_bootstrap, len(values)), replace=True)
            bootstrap_means = resamples.mean(axis=1)
            ci_lower = np.percentile(bootstrap_means, 2.5)
            ci_upper = np.percentile(bootstrap_means, 97.5)
            mean = np.mean(values)
            ci_results.append({
                'model': model,
                'metric': metric,
                'mean': mean,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper,
                'ci_width': ci_upper - ci_lower})
    ci_df = pd.DataFrame(ci_results)
    print("\nConfidence Intervals (95%):")
    for model in ci_df['model'].unique():
        print(f"\n{model}:")
        model_ci = ci_df[ci_df['model'] == model]
        for _, row in model_ci.iterrows():
            print(f"  {row['metric']}: {row['mean']:.3f} [{row['ci_lower']:.3f}, {row['ci_upper']:.3f}]")
    return ci_df
def test_model_differences(metrics_df: pd.DataFrame) -> pd.DataFrame:    
    print("\n")
    print("Testing statistical significance of model differences")
    print("\n")
    models = metrics_df['model'].unique()
    if len(models) != 2:
        print("Need exactly 2 models for comparison")
        return pd.DataFrame()
    model1, model2 = models[0], models[1]
    m1_data = metrics_df[metrics_df['model'] == model1]
    m2_data = metrics_df[metrics_df['model'] == model2]
    test_results = []
    for metric in ['claim_stability_score', 'stance_volatility_index', 'definition_drift_score']:
        values1 = m1_data[metric].values
        values2 = m2_data[metric].values        
        t_stat, p_value = ttest_ind(values1, values2)        
        pooled_std = np.sqrt((np.std(values1)**2 + np.std(values2)**2) / 2)
        cohens_d = (np.mean(values1) - np.mean(values2)) / pooled_std if pooled_std > 0 else 0        
        if p_value < 0.001:
            significance = "***"
        elif p_value < 0.01:
            significance = "**"
        elif p_value < 0.05:
            significance = "*"
        else:
            significance = "ns"
        test_results.append({
            'metric': metric,
            'model1': model1,
            'mean1': np.mean(values1),
            'model2': model2,
            'mean2': np.mean(values2),
            'difference': np.mean(values1) - np.mean(values2),
            't_statistic': t_stat,
            'p_value': p_value,
            'cohens_d': cohens_d,
            'significance': significance})
    results_df = pd.DataFrame(test_results)
    print(f"Comparing {model1} vs {model2}:")
    for _, row in results_df.iterrows():
        print(f"\n{row['metric']}:")
        print(f"  {model1}: {row['mean1']:.3f}")
        print(f"  {model2}: {row['mean2']:.3f}")
        print(f"  Difference: {row['difference']:.3f}")
        print(f"  p-value: {row['p_value']:.4f} {row['significance']}")
        print(f"  Effect size (Cohen's d): {row['cohens_d']:.3f}")
    print(f"\n")
    print("*** p<0.001, ** p<0.01, * p<0.05, ns = not significant")
    return results_df
def analyze_category_differences(metrics_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each category's stability metrics, ranked by mean CSS.

    Args:
        metrics_df: Per concept-model frame with ``category``, ``concept``
            and the three metric columns.

    Returns:
        One row per category with ``category``, ``num_concepts`` and the
        mean/std of CSS, SVI and DDS (``css_*``, ``svi_*``, ``dds_*``),
        sorted by ``css_mean`` in descending order.
    """
    print("\n")
    print("Category-level stability analysis")
    print("\n")

    # Output column prefix -> source metric column (order fixes column order).
    metric_by_prefix = {
        'css': 'claim_stability_score',
        'svi': 'stance_volatility_index',
        'dds': 'definition_drift_score',
    }
    rows = []
    for category in metrics_df['category'].unique():
        subset = metrics_df.loc[metrics_df['category'] == category]
        record = {
            'category': category,
            'num_concepts': subset['concept'].nunique(),
        }
        for prefix, column in metric_by_prefix.items():
            record[f'{prefix}_mean'] = subset[column].mean()
            record[f'{prefix}_std'] = subset[column].std()
        rows.append(record)

    ranked = pd.DataFrame(rows).sort_values('css_mean', ascending=False)
    print("Categories ranked by claim stability (CSS):")
    print(ranked[['category', 'css_mean', 'svi_mean', 'dds_mean']].to_string(index=False))
    return ranked

In [26]:
# Qualitative Analysis Module
def extract_paper_examples(df_claims, metrics_df, experiment_id, num_examples=10):
    """Print qualitative examples and save an index of them as JSON.

    Three example types are produced: the three concept-model pairs with the
    highest SVI, the three with the highest DDS, and a fixed "Inflation"
    case for the ``deepseek-r1:7b`` model. The index is written to
    ``Config.ANALYSIS_DIR / paper_examples_<experiment_id>.json``.

    Args:
        df_claims: Per-response frame. The columns read here are ``concept``,
            ``model``, ``query``, ``extracted_stance``, ``extracted_claims``
            and ``extracted_definitions`` (lists/dicts or their string form).
        metrics_df: Per concept-model metrics frame.
        experiment_id: Identifier embedded in the output file name.
        num_examples: Currently unused; the number of examples per type is
            fixed in the body.

    Returns:
        A list of dicts describing the selected examples.
    """
    import ast

    # The documented failure modes of ast.literal_eval on malformed input.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    def safe_parse_list(x):
        """Return ``x`` as a list; unparseable or non-list input yields ``[]``."""
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            try:
                parsed = ast.literal_eval(x)
            except parse_errors:
                return []
            return parsed if isinstance(parsed, list) else []
        return []

    def safe_parse_dict(x):
        """Return ``x`` as a dict; unparseable or non-dict input yields ``{}``."""
        if isinstance(x, dict):
            return x
        if isinstance(x, str):
            try:
                parsed = ast.literal_eval(x)
            except parse_errors:
                return {}
            return parsed if isinstance(parsed, dict) else {}
        return {}

    print("\n")
    print("Extracting Qualitative Examples")
    print("\n")
    examples = []
    print("Example Type 1: Stance Volatility\n")
    volatile = metrics_df.nlargest(3, 'stance_volatility_index')
    for _, row in volatile.iterrows():
        concept = row['concept']
        model = row['model']
        concept_data = df_claims[
            (df_claims['concept'] == concept) & 
            (df_claims['model'] == model)
        ].head(3)
        print(f"--- {concept} ({model}) - SVI: {row['stance_volatility_index']:.3f} ---\n")
        for _, resp_row in concept_data.iterrows():
            print(f"Query: {resp_row['query']}")
            print(f"Stance: {resp_row['extracted_stance']}")
            claims = safe_parse_list(resp_row['extracted_claims'])
            if claims:
                print("Sample claims:")
                for claim in claims[:2]:
                    print(f"  • {claim}")
            print()
        examples.append({
            'type': 'stance_volatility',
            'concept': concept,
            'model': model,
            'svi': row['stance_volatility_index']})
    print("\nExample Type 2: Definition Drift\n")
    drifty = metrics_df.nlargest(3, 'definition_drift_score')
    for _, row in drifty.iterrows():
        concept = row['concept']
        model = row['model']
        concept_data = df_claims[
            (df_claims['concept'] == concept) & 
            (df_claims['model'] == model)]
        print(f"--- {concept} ({model}) - DDS: {row['definition_drift_score']:.3f} ---")
        print("Definitions provided:\n")
        found_definitions = False
        for i, defs_raw in enumerate(concept_data['extracted_definitions'].head(3)):
            defs = safe_parse_dict(defs_raw)
            if defs:
                found_definitions = True
                print(f"Response {i+1}:")
                for term, definition in defs.items():
                    print(f"  {term}: {definition}")
                print()
        if not found_definitions:
            print("No explicit definitions found in responses.\n")
        examples.append({
            'type': 'definition_drift',
            'concept': concept,
            'model': model,
            'dds': row['definition_drift_score']})
    print("\nExample Type 3: Inflation Paradox\n")
    inflation_data = df_claims[
        (df_claims['concept'] == 'Inflation') & 
        (df_claims['model'] == 'deepseek-r1:7b')
    ].head(5)
    if len(inflation_data) > 0:
        print("--- Inflation (DeepSeek) ---")
        print("Stable factual claims + volatile stance\n")
        for _, row in inflation_data.iterrows():
            print(f"Query: {row['query']}")
            print(f"Stance: {row['extracted_stance']}")
            claims = safe_parse_list(row['extracted_claims'])
            if claims:
                print("Claims:")
                for claim in claims[:3]:
                    print(f"  • {claim}")
            print()
        examples.append({
            'type': 'inflation_paradox',
            'concept': 'Inflation',
            'model': 'deepseek-r1:7b'})
    examples_file = Config.ANALYSIS_DIR / f"paper_examples_{experiment_id}.json"
    with open(examples_file, 'w', encoding='utf-8') as f:
        json.dump(examples, f, indent=2)
    print(f"\nExamples saved: {examples_file}")
    print(f"Total examples: {len(examples)}\n")
    return examples

In [27]:
# Additional Visualizations
def create_additional_visualizations(metrics_df, experiment_id):
    """Save three supplementary figures to ``Config.PLOTS_DIR``.

    * A CSS-vs-SVI scatter plot per model, annotating points whose SVI
      exceeds 0.6 or whose CSS exceeds 0.55.
    * Side-by-side concept x model heatmaps of CSS and SVI.
    * A grouped bar chart ranking categories by CSS, with SVI and DDS shown
      as ``1 - value``.

    Args:
        metrics_df: Per concept-model metrics frame.
        experiment_id: Identifier embedded in each output file name.
    """
    print("\n")
    print("Creating additional visualizations")
    print("\n")

    # --- Scatter: claim stability against stance volatility ---
    plt.figure(figsize=(10, 8))
    for model_name in metrics_df['model'].unique():
        model_rows = metrics_df[metrics_df['model'] == model_name]
        plt.scatter(model_rows['claim_stability_score'],
                    model_rows['stance_volatility_index'],
                    label=model_name, s=100, alpha=0.7)
        # Only the extreme points get a text label.
        notable = model_rows[
            (model_rows['stance_volatility_index'] > 0.6)
            | (model_rows['claim_stability_score'] > 0.55)
        ]
        for _, point in notable.iterrows():
            plt.annotate(point['concept'],
                         (point['claim_stability_score'], point['stance_volatility_index']),
                         fontsize=8, alpha=0.7)
    plt.xlabel('Claim Stability Score (CSS)', fontsize=12)
    plt.ylabel('Stance Volatility Index (SVI)', fontsize=12)
    plt.title('Claim Stability vs Stance Volatility', fontsize=14, weight='bold')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(Config.PLOTS_DIR / f"css_vs_svi_scatter_{experiment_id}.png", dpi=300)
    plt.close()
    print("Created CSS vs SVI scatter plot")

    # --- Heatmaps: (metric column, colour map, colour-bar label, vmax, title) ---
    heatmap_specs = [
        ('claim_stability_score', 'YlGn', 'CSS', 0.7,
         'Claim Stability Score by Concept'),
        ('stance_volatility_index', 'YlOrRd', 'SVI', 0.8,
         'Stance Volatility Index by Concept'),
    ]
    # Both pivots are built before the figure is created.
    pivot_tables = [
        metrics_df.pivot(index='concept', columns='model', values=spec[0])
        for spec in heatmap_specs
    ]
    fig, axes = plt.subplots(1, 2, figsize=(14, 10))
    for axis, table, (_, cmap, bar_label, upper, title) in zip(axes, pivot_tables, heatmap_specs):
        sns.heatmap(table, annot=True, fmt='.2f', cmap=cmap, ax=axis,
                    cbar_kws={'label': bar_label}, vmin=0, vmax=upper)
        axis.set_title(title, fontsize=12, weight='bold')
    plt.tight_layout()
    plt.savefig(Config.PLOTS_DIR / f"heatmap_metrics_{experiment_id}.png", dpi=300)
    plt.close()
    print("Created metrics heatmap")

    # --- Category rankings: three bars per category, all oriented "higher is better" ---
    cat_ranks = metrics_df.groupby('category').agg({
        'claim_stability_score': 'mean',
        'stance_volatility_index': 'mean',
        'definition_drift_score': 'mean'
    }).sort_values('claim_stability_score', ascending=False)
    fig, ax = plt.subplots(figsize=(12, 6))
    positions = range(len(cat_ranks))
    width = 0.25
    # (horizontal offset, bar heights, legend label, colour)
    bar_groups = [
        (-width, cat_ranks['claim_stability_score'],
         'CSS (↑ better)', 'steelblue'),
        (0, 1 - cat_ranks['stance_volatility_index'],
         'Stance Stability (↑ better)', 'coral'),
        (width, 1 - cat_ranks['definition_drift_score'],
         'Definition Stability (↑ better)', 'mediumseagreen'),
    ]
    for offset, heights, label, color in bar_groups:
        ax.bar([pos + offset for pos in positions], heights,
               width, label=label, color=color)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Category Stability Rankings', fontsize=14, weight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(cat_ranks.index, rotation=45, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig(Config.PLOTS_DIR / f"category_rankings_{experiment_id}.png", dpi=300)
    plt.close()
    print("Created category rankings plot")
    print("\nAll additional visualizations created.\n")

In [28]:
# Final Comprehensive Summary
def generate_final_summary(df_claims, metrics_df, ci_df, test_results, experiment_id):
    """Build the final research summary, save it, print it and return it.

    The summary is written to
    ``Config.ANALYSIS_DIR / final_summary_<experiment_id>.txt``.

    Args:
        df_claims: Per-response frame; used for the response counts (its
            ``success`` column is read when present).
        metrics_df: Per concept-model metrics frame.
        ci_df: Confidence-interval frame; accepted but not used in the text.
        test_results: Frame of significance tests with ``metric``,
            ``difference``, ``p_value``, ``significance`` and ``cohens_d``
            columns; may be empty.
        experiment_id: Identifier used in the header and file name.

    Returns:
        The summary as a single newline-joined string.
    """
    summary = []
    summary.append("Comprehensive Research Summary:")
    summary.append("LLM Stability and Internal Consistency Study -")
    summary.append(f"\nExperiment ID: {experiment_id}")
    summary.append(f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    summary.append("\n")
    summary.append("Key Findings:")
    summary.append("\n1. Overall Stability")
    summary.append(f"   • Claim Stability Score (CSS): {metrics_df['claim_stability_score'].mean():.3f}")
    summary.append(f"     → LLMs maintain ~{metrics_df['claim_stability_score'].mean()*100:.1f}% semantic overlap")
    summary.append(f"   • Stance Volatility Index (SVI): {metrics_df['stance_volatility_index'].mean():.3f}")
    summary.append(f"     → Models flip stance in ~{metrics_df['stance_volatility_index'].mean()*100:.1f}% of cases")
    summary.append(f"   • Definition Drift Score (DDS): {metrics_df['definition_drift_score'].mean():.3f}")
    summary.append(f"     → Definitions drift by ~{metrics_df['definition_drift_score'].mean()*100:.1f}%")
    summary.append("\n2. Model Comparison")
    for model in metrics_df['model'].unique():
        model_data = metrics_df[metrics_df['model'] == model]
        summary.append(f"\n   {model}:")
        summary.append(f"   • CSS: {model_data['claim_stability_score'].mean():.3f} (±{model_data['claim_stability_score'].std():.3f})")
        summary.append(f"   • SVI: {model_data['stance_volatility_index'].mean():.3f} (±{model_data['stance_volatility_index'].std():.3f})")
        summary.append(f"   • DDS: {model_data['definition_drift_score'].mean():.3f} (±{model_data['definition_drift_score'].std():.3f})")
    summary.append("\n3. Statistical Significance")
    for _, row in test_results.iterrows():
        summary.append(f"\n   {row['metric']}:")
        summary.append(f"   • Difference: {row['difference']:.3f}")
        summary.append(f"   • p-value: {row['p_value']:.4f} {row['significance']}")
        summary.append(f"   • Effect size: {row['cohens_d']:.3f}")
    summary.append("\n4. Category Insights")
    cat_summary = metrics_df.groupby('category').agg({
        'claim_stability_score': 'mean',
        'stance_volatility_index': 'mean'
    }).sort_values('claim_stability_score', ascending=False)
    summary.append(f"\n   Most Stable: {cat_summary.index[0]} (CSS: {cat_summary.iloc[0]['claim_stability_score']:.3f})")
    summary.append(f"   Least Stable: {cat_summary.index[-1]} (CSS: {cat_summary.iloc[-1]['claim_stability_score']:.3f})")
    summary.append(f"   Stability Range: {(cat_summary.iloc[0]['claim_stability_score'] - cat_summary.iloc[-1]['claim_stability_score']):.3f}")
    summary.append("\n5. Inflation Paradox")
    inflation = metrics_df[metrics_df['concept'] == 'Inflation']
    if inflation.empty:
        # Without this guard the section would report "nan" for both metrics.
        summary.append("   • No metrics available for concept 'Inflation'")
    else:
        summary.append(f"   • High factual stability: CSS = {inflation['claim_stability_score'].mean():.3f}")
        summary.append(f"   • High stance volatility: SVI = {inflation['stance_volatility_index'].mean():.3f}")
    summary.append("\n")
    summary.append("Outputs Generated.")
    # Report the counts actually present in the data rather than a fixed
    # "510/510", which is only true for one particular run.
    total_responses = len(df_claims)
    if 'success' in df_claims.columns:
        successful = int(df_claims['success'].sum())
        summary.append(f"\nData collection: {successful}/{total_responses} responses")
    else:
        summary.append(f"\nData collection: {total_responses} responses")
    summary.append("Semantic metrics calculated.")
    summary.append("Statistical tests performed.")
    summary.append("Qualitative examples extracted.")
    summary.append("Publication-ready tables exported.")
    summary.append("Comprehensive visualizations created.")
    summary.append("\n")
    summary.append("End of Summary.")
    summary_text = "\n".join(summary)
    summary_file = Config.ANALYSIS_DIR / f"final_summary_{experiment_id}.txt"
    # The text contains "•", "→" and "±"; pin UTF-8 so the write does not
    # depend on the platform's locale encoding.
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(summary_text)
    print(summary_text)
    print(f"\nFinal summary saved to: {summary_file}\n")
    return summary_text

In [29]:
# Complete Post-Experiment Analysis
def run_complete_analysis(experiment_id=None):
    """Run the full post-experiment analysis pipeline for one experiment.

    Loads the stored results, parses the stringified list/dict columns of the
    claims table, runs the statistical analysis, prints concept deep dives,
    and triggers the example extraction, table export, visualization and
    final-summary steps. The three statistical tables are written as CSV
    files into ``Config.ANALYSIS_DIR``.

    Args:
        experiment_id: Identifier of the experiment to analyze. When ``None``
            the last entry returned by ``list_experiments()`` is used.

    Returns:
        dict with the keys ``df_claims``, ``metrics_df``, ``ci_df``,
        ``test_results``, ``category_analysis``, ``examples`` and
        ``experiment_id``.

    Raises:
        IndexError: if ``experiment_id`` is ``None`` and no experiments are
            listed. The exception type matches what indexing an empty listing
            raised before, so existing handlers keep working.
    """
    import ast

    # Exceptions ast.literal_eval is documented to raise on malformed input.
    # Catching only these lets KeyboardInterrupt/SystemExit propagate.
    literal_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    def parse_literal(value, expected_type):
        """Return `value` as `expected_type`, or an empty instance if it cannot be parsed."""
        if isinstance(value, expected_type):
            return value
        if isinstance(value, str):
            try:
                parsed = ast.literal_eval(value)
            except literal_errors:
                return expected_type()
            # A string that parses to some *other* literal (e.g. a dict in the
            # list column) is treated as unparseable instead of leaking through.
            if isinstance(parsed, expected_type):
                return parsed
        # Anything else (NaN, None, wrong literal type) falls back to empty.
        return expected_type()

    def safe_parse_list(x):
        return parse_literal(x, list)

    def safe_parse_dict(x):
        return parse_literal(x, dict)

    print("Complete Post-Experiment Analysis")
    if experiment_id is None:
        # Only consult the experiment listing when the caller did not choose one.
        experiments = list_experiments()
        if experiments is None or len(experiments) == 0:
            raise IndexError(
                "No experiments found; run an experiment first or pass experiment_id explicitly."
            )
        experiment_id = experiments[-1]
    print(f"\nLoading experiment: {experiment_id}")
    data = load_experiment_results(experiment_id)
    metrics_df = data['metrics']

    print("Parsing data structures...")
    # Build a new frame instead of overwriting columns on the loaded one, so
    # the object returned by load_experiment_results is left untouched.
    raw_claims = data['claims']
    df_claims = raw_claims.assign(
        extracted_claims=raw_claims['extracted_claims'].apply(safe_parse_list),
        extracted_definitions=raw_claims['extracted_definitions'].apply(safe_parse_dict),
    )
    print("Data structures parsed\n")

    print("\nStatistical Analysis")
    ci_df = calculate_confidence_intervals(metrics_df, n_bootstrap=1000)
    test_results = test_model_differences(metrics_df)
    category_analysis = analyze_category_differences(metrics_df)
    ci_df.to_csv(Config.ANALYSIS_DIR / f"confidence_intervals_{experiment_id}.csv", index=False)
    test_results.to_csv(Config.ANALYSIS_DIR / f"significance_tests_{experiment_id}.csv", index=False)
    category_analysis.to_csv(Config.ANALYSIS_DIR / f"category_analysis_{experiment_id}.csv", index=False)
    print("\nStatistical results saved")

    print("\nConcept Deep Dives")
    print("\n1. Inflation Paradox:")
    # NOTE(review): the model name is hard-coded here; confirm it should not
    # instead follow the models present in the loaded experiment.
    analyze_concept_stability(df_claims, "Inflation", model="deepseek-r1:7b")
    compare_models_on_concept(df_claims, "Inflation")
    print("\n2. Most Volatile:")
    most_volatile = metrics_df.nlargest(1, 'stance_volatility_index').iloc[0]
    analyze_concept_stability(df_claims, most_volatile['concept'], model=most_volatile['model'])
    print("\n3. Most Stable:")
    most_stable = metrics_df.nlargest(1, 'claim_stability_score').iloc[0]
    analyze_concept_stability(df_claims, most_stable['concept'], model=most_stable['model'])

    examples = extract_paper_examples(df_claims, metrics_df, experiment_id)
    print("\nExporting Publication Tables")
    export_for_paper(metrics_df, experiment_id)
    print("\nCreating Visualizations")
    create_additional_visualizations(metrics_df, experiment_id)
    print("\nGenerating Final Summary")
    # Called for its printing/file-writing; the returned text is not used here.
    generate_final_summary(df_claims, metrics_df, ci_df, test_results, experiment_id)
    print("\nComplete Analysis Finished.")
    return {
        'df_claims': df_claims, 'metrics_df': metrics_df, 'ci_df': ci_df, 'test_results': test_results,
        'category_analysis': category_analysis, 'examples': examples, 'experiment_id': experiment_id}

In [30]:
# Run the complete post-experiment analysis without an explicit experiment_id,
# so run_complete_analysis falls back to the last entry returned by
# list_experiments(); the returned dict is bound to analysis_results.
analysis_results = run_complete_analysis() # Comprehensive Pipeline

Complete Post-Experiment Analysis

Loading experiment: 20251217_163437
Loading experiment: 20251217_163437
Loaded 510 raw responses
Loaded 510 responses with claims
Loaded 34 metric rows
Parsing data structures...
Data structures parsed


Statistical Analysis


Calculating confidence intervals (bootstrap)



Confidence Intervals (95%):

deepseek-r1:7b:
  claim_stability_score: 0.442 [0.382, 0.500]
  stance_volatility_index: 0.508 [0.445, 0.576]
  definition_drift_score: 0.382 [0.311, 0.445]

llama3:latest:
  claim_stability_score: 0.464 [0.410, 0.515]
  stance_volatility_index: 0.458 [0.382, 0.542]
  definition_drift_score: 0.407 [0.374, 0.443]


Testing statistical significance of model differences


Comparing deepseek-r1:7b vs llama3:latest:

claim_stability_score:
  deepseek-r1:7b: 0.442
  llama3:latest: 0.464
  Difference: -0.023
  p-value: 0.5923 ns
  Effect size (Cohen's d): -0.191

stance_volatility_index:
  deepseek-r1:7b: 0.508
  llama3:latest: 0.458
  Difference: 0.050
  p-va