In [42]:
import os
import zipfile
import pandas as pd
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# === SIMILARITY FUNCTIONS ===
def levenshtein_similarity(a, b):
    """Return a character-level similarity ratio in [0, 1] for two strings.

    Despite the name, this is ``difflib.SequenceMatcher.ratio()`` (twice the
    number of matched characters divided by the combined length), not a true
    Levenshtein edit distance.

    ``autojunk`` is switched off on purpose: with the default heuristic, as
    soon as ``b`` has 200 or more characters, every character that makes up
    more than about 1% of ``b`` is ignored when looking for matches. For long
    search strings that is nearly every character, so the ratio collapses
    towards 0 even for near-identical inputs.

    Args:
        a: First string.
        b: Second string.

    Returns:
        float: 1.0 for identical strings (including two empty strings),
        0.0 when nothing matches.
    """
    return SequenceMatcher(None, a, b, autojunk=False).ratio()

def cosine_text_similarity(a, b):
    """Return the TF-IDF cosine similarity of two strings.

    A ``TfidfVectorizer`` with default settings is fitted on just the two
    inputs, and the cosine similarity of the two resulting rows is returned.

    Args:
        a: First string.
        b: Second string.

    Returns:
        float: The cosine similarity. 0.0 is returned when either string is
        blank or when the vectorizer finds no tokens at all in the pair
        (previously the latter raised ``ValueError: empty vocabulary``).
    """
    # A blank side always produces an all-zero vector, i.e. similarity 0.0;
    # answering directly also avoids the empty-vocabulary error when both
    # sides are blank.
    if not a.strip() or not b.strip():
        return 0.0
    try:
        tfidf = TfidfVectorizer().fit_transform([a, b])
    except ValueError:
        # Raised by scikit-learn when no token survives tokenization
        # (e.g. the inputs contain only punctuation or 1-character words).
        return 0.0
    return float(cosine_similarity(tfidf[0], tfidf[1])[0, 0])

In [46]:
########## OK - Compare search on Scopus #################
# Locate the reference string and GT (ground-truth) file.
# Everything assigned here is a module-level global that evaluate_llm() reads.
# base_dir = os.path.join(extract_path, "Model-driven engineering for digital twins")
# The data folder is resolved relative to the kernel's current working directory.
base_dir = os.path.join(os.getcwd(), "Model-based Trustworthiness Evaluation")

# Take the first entry directly inside base_dir with the expected suffix.
# The trailing [0] raises IndexError when no such file exists.
ref_path = [f for f in os.listdir(base_dir) if f.endswith("String.txt")][0]
gt_path = [f for f in os.listdir(base_dir) if f.endswith("scopus-Full.csv")][0]

# Reference search string, with surrounding whitespace removed.
with open(os.path.join(base_dir, ref_path), encoding='utf-8') as f:
    ref_string = f.read().strip()

# Ground-truth titles: missing values dropped, then stripped and lower-cased
# so that the comparison in evaluate_llm() is an exact match on normalized text.
gt_df = pd.read_csv(os.path.join(base_dir, gt_path))
gt_titles = set(gt_df['title'].dropna().str.strip().str.lower())

results = []
def evaluate_llm():
    """Score every LLM test run found under ``base_dir`` against the ground truth.

    Expects the layout ``base_dir/<LLM>/<test run>/`` and scores each run that
    contains both ``String.txt`` and ``Scopus_Search_results.csv``:

    * the generated search string is compared with ``ref_string`` using
      ``levenshtein_similarity`` and ``cosine_text_similarity``;
    * the retrieved titles (stripped, lower-cased, missing values dropped)
      are compared with ``gt_titles`` by exact set membership.

    Reads the module-level names ``base_dir``, ``ref_string`` and
    ``gt_titles``. Rows are collected in a local list, so calling the
    function repeatedly never duplicates rows.

    Returns:
        pandas.DataFrame: One row per scored run with the columns ``LLM``,
        ``Test``, ``Levenshtein``, ``Cosine``, ``Precision``, ``Recall``,
        ``F1``, ``TP``, ``FP`` and ``FN``. Runs whose CSV cannot be parsed or
        has no ``title`` column are reported on stdout and skipped.
    """
    rows = []
    for llm_dir in os.listdir(base_dir):
        llm_path = os.path.join(base_dir, llm_dir)
        if not os.path.isdir(llm_path):
            continue
        for test_dir in os.listdir(llm_path):
            test_path = os.path.join(llm_path, test_dir)
            str_file = os.path.join(test_path, "String.txt")
            csv_file = os.path.join(test_path, "Scopus_Search_results.csv")
            if not (os.path.isfile(str_file) and os.path.isfile(csv_file)):
                continue

            with open(str_file, encoding='utf-8') as f:
                gen_string = f.read().strip()

            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and say which file was skipped instead of dropping it silently.
            try:
                results_df = pd.read_csv(csv_file)
            except Exception as exc:
                print(f"Skipped {csv_file}: {exc}")
                continue
            if 'title' not in results_df.columns:
                print(f"Skipped {csv_file}: missing 'title' column")
                continue

            # astype(str) after dropna keeps the .str accessor usable even
            # when the column was parsed as a non-string dtype.
            titles = set(
                results_df['title'].dropna().astype(str).str.strip().str.lower()
            )

            tp = len(titles & gt_titles)
            fp = len(titles - gt_titles)
            fn = len(gt_titles - titles)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

            rows.append({
                "LLM": llm_dir,
                "Test": test_dir,
                "Levenshtein": levenshtein_similarity(ref_string, gen_string),
                "Cosine": cosine_text_similarity(ref_string, gen_string),
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
                "TP": tp,
                "FP": fp,
                "FN": fn
            })

    return pd.DataFrame(rows)

# Score all runs, print the table, and write it as CSV (no index column).
# The output path is relative, so the file lands in the current working directory.
df_results = evaluate_llm()
print(df_results)
df_results.to_csv("llm_metrics-scopus-full.csv", index=False)

              LLM                                               Test  \
0       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test01   
1       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test02   
2       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test03   
3       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test01   
4       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test02   
5       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test03   
6       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test01   
7       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test02   
8       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test03   
9       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test01   
10      Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test02   
11      Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test03   
12    Llama3.1-8B            00-Query-Agent-MDTE-llama3.1-0.0-Te

In [48]:
################## OK - Compare String-Results with selected papers ################
import os
import pandas as pd

# Define base folder where the structure is located
# (resolved relative to the kernel's current working directory).
base_dir = os.path.join(os.getcwd(), "Model-based Trustworthiness Evaluation")

# Locate reference string and GT dataset: first entry directly inside
# base_dir with the expected suffix (IndexError if there is none).
ref_path = [f for f in os.listdir(base_dir) if f.endswith("String.txt")][0]
gt_path = [f for f in os.listdir(base_dir) if f.endswith("Full - Scopus.csv")][0]

with open(os.path.join(base_dir, ref_path), encoding='utf-8') as f:
    ref_string = f.read().strip()

gt_df = pd.read_csv(os.path.join(base_dir, gt_path))

# Normalize GT titles and DOIs (strip + lower-case).
# Missing values are dropped BEFORE the string conversion: astype(str) on a
# NaN yields the literal text "nan", which would then sit in the lookup sets
# and "match" every retrieved record that also lacks a title or DOI.
# Assigning the shorter Series back aligns on the index, so rows without a
# value simply keep NaN in the *_norm column.
gt_df['title_norm'] = gt_df['title'].dropna().astype(str).str.strip().str.lower()
gt_df['doi_norm'] = gt_df['doi'].dropna().astype(str).str.strip().str.lower()
gt_titles = set(gt_df['title_norm'].dropna())
gt_dois = set(gt_df['doi_norm'].dropna())

results = []

def _normalized_column(frame, column):
    """Return ``frame[column]`` stripped and lower-cased, as a list with None for missing or blank cells."""
    if column not in frame.columns:
        return [None] * len(frame)
    cleaned = []
    for value in frame[column]:
        text = "" if pd.isna(value) else str(value).strip().lower()
        cleaned.append(text or None)
    return cleaned


def _paper_records(frame):
    """Return the unique ``(title, doi)`` pairs of ``frame``, ignoring rows that have neither."""
    records = set(zip(_normalized_column(frame, 'title'),
                      _normalized_column(frame, 'doi')))
    records.discard((None, None))
    return records


def _count_matched(records, other_records):
    """Count the records whose title or DOI also occurs in ``other_records``."""
    other_titles = {title for title, _ in other_records if title is not None}
    other_dois = {doi for _, doi in other_records if doi is not None}
    return sum(1 for title, doi in records
               if title in other_titles or doi in other_dois)


def evaluate_llm():
    """Score every LLM test run under ``base_dir`` against the selected papers.

    Expects the layout ``base_dir/<LLM>/<test run>/`` and scores each run that
    contains both ``String.txt`` and ``Scopus_Search_results.csv``.

    Matching is done per paper: a retrieved record and a ground-truth record
    match when their normalized title OR their normalized DOI is equal.
    Missing titles/DOIs never match anything. (Counting title strings and DOI
    strings in one pool, as before, counted a paper twice when both fields
    matched and let the text "nan" from missing values match itself.)

    * ``TP`` - retrieved records that match a ground-truth record
    * ``FP`` - retrieved records that match none
    * ``FN`` - ground-truth records matched by no retrieved record

    Reads the module-level names ``base_dir``, ``ref_string`` and ``gt_df``
    (its raw ``title`` and ``doi`` columns). Rows are collected in a local
    list, so repeated calls never duplicate rows.

    Returns:
        pandas.DataFrame: One row per scored run with the columns ``LLM``,
        ``Test``, ``Levenshtein``, ``Cosine``, ``Precision``, ``Recall``,
        ``F1``, ``TP``, ``FP`` and ``FN``. Runs whose CSV cannot be parsed, or
        that have neither a ``title`` nor a ``doi`` column, are reported on
        stdout and skipped.
    """
    gt_records = _paper_records(gt_df)
    rows = []

    for llm_dir in os.listdir(base_dir):
        llm_path = os.path.join(base_dir, llm_dir)
        if not os.path.isdir(llm_path):
            continue
        for test_dir in os.listdir(llm_path):
            test_path = os.path.join(llm_path, test_dir)
            str_file = os.path.join(test_path, "String.txt")
            csv_file = os.path.join(test_path, "Scopus_Search_results.csv")

            if not (os.path.isfile(str_file) and os.path.isfile(csv_file)):
                continue

            with open(str_file, encoding='utf-8') as f:
                gen_string = f.read().strip()

            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and report the skipped file instead of dropping it silently.
            try:
                results_df = pd.read_csv(csv_file)
            except Exception as exc:
                print(f"Skipped {csv_file}: {exc}")
                continue
            if 'title' not in results_df.columns and 'doi' not in results_df.columns:
                print(f"Skipped {csv_file}: no 'title' or 'doi' column")
                continue

            retrieved = _paper_records(results_df)

            # Combined match: title OR doi, evaluated per paper.
            tp = _count_matched(retrieved, gt_records)
            fp = len(retrieved) - tp
            fn = len(gt_records) - _count_matched(gt_records, retrieved)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

            rows.append({
                "LLM": llm_dir,
                "Test": test_dir,
                "Levenshtein": levenshtein_similarity(ref_string, gen_string),
                "Cosine": cosine_text_similarity(ref_string, gen_string),
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
                "TP": tp,
                "FP": fp,
                "FN": fn
            })

    return pd.DataFrame(rows)

# === Execute
# Score all runs, print the table, and write it as CSV (no index column).
# The output path is relative, so the file lands in the current working directory.
df_results = evaluate_llm()
print(df_results)
df_results.to_csv("llm_metrics.csv", index=False)

              LLM                                               Test  \
0       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test01   
1       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test02   
2       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-0.0-Test03   
3       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test01   
4       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test02   
5       Gemma2-2b           00-Query-Agent-MDTE-gemma2-2b-1.0-Test03   
6       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test01   
7       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test02   
8       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-0.0-Test03   
9       Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test01   
10      Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test02   
11      Gemma2-2b   00-Query-Agent-RAG-WEB-MDTE-gemma2-2b-1.0-Test03   
12    Llama3.1-8B            00-Query-Agent-MDTE-llama3.1-0.0-Te

# String Consistency

In [50]:
import os
import pandas as pd
from itertools import combinations
from pytextdist.vector_similarity import jaccard_similarity

# Root folder of the evaluation data, resolved relative to the kernel's
# current working directory; read as a global by the functions below.
base_dir = os.path.join(os.getcwd(), "Model-based Trustworthiness Evaluation")

def jaccard_set(a, b):
    """Return the Jaccard index |a & b| / |a | b| of two sets.

    Two empty sets are treated as identical and give 1.0.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)

def compute_consistency(sets):
    """Return the mean ``jaccard_set`` score over all unordered pairs of ``sets``.

    With fewer than two sets there is nothing to compare and 1.0 is returned.
    """
    scores = [jaccard_set(left, right) for left, right in combinations(sets, 2)]
    if scores:
        return sum(scores) / len(scores)
    return 1.0

def compute_string_consistency(strings):
    """Return the mean pairwise similarity of the given strings.

    Each unordered pair is scored with pytextdist's ``jaccard_similarity``
    called with ``n=2`` (presumably 2-gram based; confirm against the
    pytextdist documentation). With fewer than two strings 1.0 is returned.
    """
    scores = []
    for first, second in combinations(strings, 2):
        scores.append(jaccard_similarity(first, second, n=2))
    if not scores:
        return 1.0
    return sum(scores) / len(scores)

def compute_nested_consistency_metrics():
    """Compute, per LLM folder, how consistent its test runs are with each other.

    For every directory directly under the module-level ``base_dir``, all
    immediate sub-folders whose name contains ``"Test"`` are treated as test
    runs. A run is used only when both its ``String.txt`` and its
    ``Scopus_Search_results.csv`` load successfully.

    Returns:
        pandas.DataFrame: One row per LLM folder with at least one usable
        run, with the columns ``LLM``, ``String_Consistency`` (mean pairwise
        similarity of the query strings), ``Result_Consistency`` (mean
        pairwise Jaccard index of the normalized title sets), both rounded to
        4 decimals, and ``Num_Tests`` (the number of runs actually used).
        With a single usable run both consistency values are 1.0.
    """
    results = []

    # Loop over all LLM root folders (e.g., Llama3.1-8B)
    for llm_name in os.listdir(base_dir):
        llm_path = os.path.join(base_dir, llm_name)
        if not os.path.isdir(llm_path):
            continue

        # Gather all test runs (subfolders like ...-Test01, -Test02, -Test03)
        test_runs = [
            os.path.join(llm_path, subdir)
            for subdir in os.listdir(llm_path)
            if os.path.isdir(os.path.join(llm_path, subdir)) and "Test" in subdir
        ]

        string_variants = []
        title_sets = []

        for path in test_runs:
            str_file = os.path.join(path, "String.txt")
            csv_file = os.path.join(path, "Scopus_Search_results.csv")

            if not (os.path.isfile(str_file) and os.path.isfile(csv_file)):
                continue

            try:
                with open(str_file, encoding='utf-8') as f:
                    query = f.read().strip()

                df = pd.read_csv(csv_file)
                titles = set(df['title'].dropna().astype(str).str.strip().str.lower())
            except Exception as e:
                print(f"Skipped {path} due to error: {e}")
                continue

            # Record the run only once BOTH parts loaded, so the two lists
            # always describe the same runs (a failing CSV used to leave its
            # query string behind in string_variants).
            string_variants.append(query)
            title_sets.append(titles)

        if string_variants and title_sets:
            string_cons = compute_string_consistency(string_variants)
            result_cons = compute_consistency(title_sets)
            results.append({
                "LLM": llm_name,
                "String_Consistency": round(string_cons, 4),
                "Result_Consistency": round(result_cons, 4),
                # Runs that contributed to the scores, not folders found.
                "Num_Tests": len(title_sets)
            })

    return pd.DataFrame(results)

# === Run and Save
# Compute the per-LLM consistency table, print it, and write it as CSV
# (no index column) into the current working directory.
df_consistency = compute_nested_consistency_metrics()
print(df_consistency)
df_consistency.to_csv("llm_consistency_metrics.csv", index=False)

             LLM  String_Consistency  Result_Consistency  Num_Tests
0      Gemma2-2b              0.3003              0.1373         12
1    Llama3.1-8B              0.3427              0.1532         12
2  Mistral-Large              0.5647              0.3167         12


In [52]:
################ OK ###################
import os
import pandas as pd
from itertools import combinations
from pytextdist.vector_similarity import jaccard_similarity

# Root folder of the evaluation data, resolved relative to the kernel's
# current working directory; read as a global by the functions below.
base_dir = os.path.join(os.getcwd(), "Model-based Trustworthiness Evaluation")

def jaccard_set(a, b):
    """Jaccard index of two sets: size of the intersection over size of the union.

    Returns 1.0 when both sets are empty.
    """
    combined = a | b
    shared = a & b
    return len(shared) / len(combined) if combined else 1.0

def compute_consistency(sets):
    """Average ``jaccard_set`` over every unordered pair drawn from ``sets``.

    Returns 1.0 when there are fewer than two sets (no pair to compare).
    """
    pairwise = [jaccard_set(*pair) for pair in combinations(sets, 2)]
    return sum(pairwise) / len(pairwise) if pairwise else 1.0

def compute_string_consistency(strings):
    """Average pairwise similarity of ``strings``.

    Every unordered pair is passed to pytextdist's ``jaccard_similarity``
    with ``n=2`` (presumably 2-gram based; confirm against the pytextdist
    documentation). Returns 1.0 when fewer than two strings are given.
    """
    pairwise = [
        jaccard_similarity(left, right, n=2)
        for left, right in combinations(strings, 2)
    ]
    if pairwise:
        return sum(pairwise) / len(pairwise)
    return 1.0

def compute_nested_consistency_metrics():
    """Compute run-to-run consistency per LLM and per test configuration.

    For every directory directly under the module-level ``base_dir``, the
    tree is walked and each folder whose name contains ``"Test"`` is treated
    as a test run. Runs are grouped by the part of the folder name before
    ``-Test`` (so ``...-Test01``, ``...-Test02`` land in one group). A group
    is scored only when at least two of its runs load successfully, i.e.
    both ``String.txt`` and ``Scopus_Search_results.csv`` exist, parse, and
    the CSV has a ``title`` column.

    Progress and skipped runs are reported on stdout.

    Returns:
        pandas.DataFrame: One row per scored group with the columns ``LLM``,
        ``Group``, ``RAG_Type`` (``"RAG-WEB"`` when the group name contains
        that text, otherwise ``"No-RAG-WEB"``), ``String_Consistency``,
        ``Result_Consistency`` (both rounded to 4 decimals) and
        ``Num_Tests`` (the number of runs used for the scores).
    """
    results = []

    print(f"Scanning base directory: {base_dir}")
    for llm_name in os.listdir(base_dir):
        llm_path = os.path.join(base_dir, llm_name)
        if not os.path.isdir(llm_path):
            continue

        print(f"Exploring LLM folder: {llm_name}")
        test_runs = []
        for root, dirs, files in os.walk(llm_path):
            for d in dirs:
                if "Test" in d:
                    full_path = os.path.join(root, d)
                    print(f"  Found test folder: {full_path}")
                    test_runs.append(full_path)

        # Group by prefix before -TestXX
        grouped = {}
        for path in test_runs:
            folder = os.path.basename(path)
            key = folder.split("-Test")[0] if "-Test" in folder else folder
            grouped.setdefault(key, []).append(path)

        for group_key, paths in grouped.items():
            if len(paths) < 2:
                continue

            rag_type = "RAG-WEB" if "RAG-WEB" in group_key else "No-RAG-WEB"
            string_variants = []
            title_sets = []

            for path in paths:
                str_file = os.path.join(path, "String.txt")
                csv_file = os.path.join(path, "Scopus_Search_results.csv")

                if not (os.path.isfile(str_file) and os.path.isfile(csv_file)):
                    continue

                try:
                    with open(str_file, encoding='utf-8') as f:
                        query = f.read().strip()

                    df = pd.read_csv(csv_file)
                    if 'title' not in df.columns:
                        print(f"Missing 'title' column in: {csv_file}")
                        continue
                    titles = set(df['title'].dropna().astype(str).str.strip().str.lower())
                except Exception as e:
                    print(f"Error processing {path}: {e}")
                    continue

                # Record the run only once BOTH parts are known to be usable.
                # The query string used to be appended before the CSV was
                # checked, so String_Consistency could include runs that
                # Result_Consistency and Num_Tests did not.
                string_variants.append(query)
                title_sets.append(titles)

            # string_variants and title_sets always have the same length here.
            if len(title_sets) >= 2:
                string_cons = compute_string_consistency(string_variants)
                result_cons = compute_consistency(title_sets)
                results.append({
                    "LLM": llm_name,
                    "Group": group_key,
                    "RAG_Type": rag_type,
                    "String_Consistency": round(string_cons, 4),
                    "Result_Consistency": round(result_cons, 4),
                    "Num_Tests": len(title_sets)
                })

    return pd.DataFrame(results)

# === Run and Save
# Compute the per-group consistency table, print it, and write it as CSV
# (no index column) into the current working directory.
df_consistency = compute_nested_consistency_metrics()
print(df_consistency)
df_consistency.to_csv("llm_consistency_metrics_detailed.csv", index=False)

Scanning base directory: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation
Exploring LLM folder: Gemma2-2b
  Found test folder: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation\Gemma2-2b\00-Query-Agent-MDTE-gemma2-2b-0.0-Test01
  Found test folder: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation\Gemma2-2b\00-Query-Agent-MDTE-gemma2-2b-0.0-Test02
  Found test folder: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation\Gemma2-2b\00-Query-Agent-MDTE-gemma2-2b-0.0-Test03
  Found test folder: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation\Gemma2-2b\00-Query-Agent-MDTE-gemma2-2b-1.0-Test01
  Found test folder: C:\Users\vitto\Desktop\ALESSIO-AUTOMATIC-LLM\Results\Model-based Trustworthiness Evaluation\Gemma2-2b\00-Query-Agent-MDTE-gemma2-2b-1.0-Test02
  Found test folder: C:\Users\v