<a href="https://colab.research.google.com/github/jithsg/2-Pipeline/blob/main/taxonomy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import json
import zipfile
import io
import textwrap
from pathlib import Path
import os
import shutil
import math

# ----------------------------------------------------------------------
# 1. Configuration and Constants
# ----------------------------------------------------------------------

# Allowed labels for each taxonomy axis; the coverage, entropy and
# under-explored-pair computations are all measured against these lists.
# Axis keys: M = Model Type, A = Application Role, E = Environment,
# D = Data Type, L = Learning Regime, T = Threat Stage.
TAXONOMY_DEFS = dict(
    M=[
        'VAE', 'GAN', 'FLOW', 'DIFF',
        'AUTOREG/LLM', 'G-GEN', 'EBM/CONTR', 'HYB/RAG',
    ],
    A=[
        'DAUG', 'ADSIM', 'DEF', 'REP/UNSUP', 'PRIV', 'FED-GEN',
        'MULTI-MOD', 'TINT', 'XAI/CFEX', 'FORE', 'BENCH', 'ADAPT-IDS',
    ],
    E=['CLOUD', 'EDGE/IoT', '5G/6G', 'ICS/SCADA/OT', 'HYBRID'],
    D=['NET', 'LOG', 'HOST', 'GRAPH', 'BIN', 'MM'],
    L=['SUP', 'SEMI', 'UNSUP', 'SELF/CONTR', 'FED', 'CONT', 'FS'],
    T=['TRAIN', 'INFER', 'RB-SIM', 'MDR', 'ADAPT'],
)
# Axis keys in definition order (M, A, E, D, L, T).
AXES = [*TAXONOMY_DEFS]
# Root of everything this script writes, and the sub-folder holding the artifacts.
OUTPUT_BASE_DIR = Path("taxonomy_validation_reproduced")
OUTPUT_DIR = OUTPUT_BASE_DIR.joinpath("outputs")

# ----------------------------------------------------------------------
# 2. Data Loading (Using Embedded Coder A and B Data)
# ----------------------------------------------------------------------
# This block combines the Coder A and Coder B data provided previously
# to create the input dataframe for the analysis.

def load_coder_data():
    """Build the long-format label table for both coders.

    The two embedded CSV snippets (one per coder) are parsed, tagged with a
    ``coder`` column ('A' or 'B') and stacked. Placeholder metadata columns
    (``title``, ``year``, ``venue``, ``doi``) are filled with 'N/A'.

    Returns:
        DataFrame with columns ``paper_id, title, year, venue, doi, coder``
        followed by one column per taxonomy axis in ``AXES``.
    """

    # Labels assigned by coder A.
    coder_a_csv = """paper_id,M,A,E,D,L,T
Aidin_Ferdowsi.txt,GAN,FED-GEN,EDGE/IoT,NET,FED,INFER
Akim_Kotelnikov.txt,DIFF,PRIV,CLOUD,NET,UNSUP,TRAIN
Enyan_Dai.txt,FLOW,REP/UNSUP,CLOUD,GRAPH,UNSUP,INFER
Eunbi_Seo.txt,GAN,REP/UNSUP,EDGE/IoT,NET,UNSUP,INFER
Hang_Shen.txt,HYB/RAG,DAUG,CLOUD,NET,SUP,TRAIN
Mohamed_Amine_Merzouk.txt,DIFF,DEF,CLOUD,NET,UNSUP,INFER
Mohammad_Jamoos.txt,GAN,DAUG,CLOUD,NET,SUP,TRAIN
Nour_Alhussien.txt,DIFF,DEF,HYBRID,NET,UNSUP,ADAPT
Pardis_Sadatian_Moghaddam.txt,HYB/RAG,MULTI-MOD,EDGE/IoT,NET,SUP,TRAIN
Sahar_Aldhaheri.txt,GAN,TINT,CLOUD,NET,SELF/CONTR,RB-SIM
Sultan_Zavrak.txt,VAE,REP/UNSUP,HYBRID,NET,SEMI,INFER
Vikash_Kumar.txt,GAN,DAUG,CLOUD,NET,SUP,TRAIN
Xiang_LUO.txt,VAE,ADAPT-IDS,CLOUD,NET,SEMI,INFER
Xiao_Han.txt,AUTOREG/LLM,REP/UNSUP,HYBRID,LOG,SELF/CONTR,INFER
Yajun_Chen.txt,GAN,DAUG,EDGE/IoT,NET,SUP,TRAIN
Yang_Yang.txt,GAN,DAUG,CLOUD,MM,SUP,TRAIN
Yonas_Teweldemedhin_Gebrezgiher.txt,VAE,REP/UNSUP,EDGE/IoT,NET,UNSUP,INFER
Yue_Yang.txt,DIFF,DAUG,CLOUD,NET,SUP,TRAIN
Zhengfa_Li.txt,HYB/RAG,DAUG,CLOUD,NET,SUP,TRAIN
Zijie_Chen.txt,DIFF,DAUG,CLOUD,NET,SUP,TRAIN
"""

    # Labels assigned by coder B.
    coder_b_csv = """paper_id,M,A,E,D,L,T
Aidin_Ferdowsi.txt,GAN,FED-GEN,EDGE/IoT,NET,FED,INFER
Akim_Kotelnikov.txt,DIFF,DAUG,CLOUD,NET,UNSUP,TRAIN
Enyan_Dai.txt,FLOW,REP/UNSUP,CLOUD,MM,UNSUP,INFER
Eunbi_Seo.txt,GAN,REP/UNSUP,EDGE/IoT,NET,UNSUP,INFER
Hang_Shen.txt,GAN,DAUG,CLOUD,NET,SUP,TRAIN
Mohamed_Amine_Merzouk.txt,DIFF,ADSIM,CLOUD,NET,UNSUP,TRAIN
Mohammad_Jamoos.txt,GAN,DAUG,CLOUD,NET,SUP,TRAIN
Nour_Alhussien.txt,DIFF,ADSIM,CLOUD,NET,UNSUP,INFER
Pardis_Sadatian_Moghaddam.txt,HYB/RAG,MULTI-MOD,EDGE/IoT,NET,SUP,TRAIN
Sahar_Aldhaheri.txt,GAN,TINT,CLOUD,NET,SUP,RB-SIM
Sultan_Zavrak.txt,VAE,REP/UNSUP,HYBRID,NET,SEMI,INFER
Vikash_Kumar.txt,GAN,DAUG,CLOUD,NET,SUP,TRAIN
Xiang_LUO.txt,VAE,ADAPT-IDS,CLOUD,NET,SEMI,INFER
Xiao_Han.txt,AUTOREG/LLM,REP/UNSUP,CLOUD,LOG,SELF/CONTR,INFER
Yajun_Chen.txt,GAN,DAUG,EDGE/IoT,NET,SUP,TRAIN
Yang_Yang.txt,GAN,DAUG,CLOUD,MM,SUP,TRAIN
Yonas_Teweldemedhin_Gebrezgiher.txt,VAE,REP/UNSUP,5G/6G,NET,UNSUP,INFER
Yue_Yang.txt,DIFF,DAUG,CLOUD,NET,SUP,TRAIN
Zhengfa_Li.txt,HYB/RAG,DAUG,CLOUD,NET,SUP,TRAIN
Zijie_Chen.txt,DIFF,DAUG,CLOUD,NET,SUP,TRAIN"""

    # NOTE(review): the original notebook carried two loose annotations here,
    # "Akim_Kotelnikov->NET" and "Yonas_Teweldemedhin_Gebrezgiher: NET".
    # Their intent is not evident from this file; confirm before removing.

    # Parse each snippet and tag every row with the coder that produced it.
    per_coder_frames = [
        pd.read_csv(io.StringIO(csv_text)).assign(coder=coder_tag)
        for coder_tag, csv_text in (('A', coder_a_csv), ('B', coder_b_csv))
    ]
    stacked = pd.concat(per_coder_frames, ignore_index=True)

    # Bibliographic metadata is not available here; keep the columns as placeholders.
    placeholders = dict.fromkeys(['title', 'year', 'venue', 'doi'], 'N/A')
    stacked = stacked.assign(**placeholders)

    # Metadata first, then the coder tag, then the taxonomy axes.
    return stacked[['paper_id', *placeholders, 'coder', *AXES]]

# Start each run from a clean slate: drop any output tree left by a previous
# run (best effort, a failure only produces a warning), then recreate it.
if OUTPUT_BASE_DIR.is_dir():
    try:
        shutil.rmtree(OUTPUT_BASE_DIR)
    except OSError as err:
        print(f"Warning: Could not remove existing directory {OUTPUT_BASE_DIR}: {err}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Build the combined coder A / coder B label table.
print("Loading Coder A and B data...")
df = load_coder_data()

# ----------------------------------------------------------------------
# 3. Reconciliation and IRR Calculation
# ----------------------------------------------------------------------
# This step compares Coder A and B to calculate Kappa and determines the final labels.

def reconcile_and_calculate_irr(df):
    """Compute per-axis Cohen's Kappa between coders A and B and reconcile labels.

    Args:
        df: Long-format table with ``paper_id``, ``coder`` ('A' or 'B') and one
            column per axis in ``AXES``.

    Returns:
        A 3-tuple ``(df_reconciled, kappa_scores, macro_kappa)``:
        ``df_reconciled`` is indexed by ``paper_id`` with one column per axis
        holding coder A's label (missing labels as NaN); ``kappa_scores`` maps
        each axis to its Kappa (NaN when it could not be computed);
        ``macro_kappa`` is the mean of the non-NaN Kappas, or NaN if none exist.
    """
    # One row per (paper, coder); repeated entries keep their first occurrence.
    unique_rows = df.drop_duplicates(subset=['paper_id', 'coder'])

    # Wide layout: columns become (axis, coder) pairs so both coders line up per paper.
    wide = unique_rows.pivot(index='paper_id', columns='coder', values=AXES)

    kappa_scores = {}
    reconciled = pd.DataFrame(index=wide.index)

    for axis in AXES:
        # Missing labels are mapped to a sentinel string so they take part in
        # the agreement comparison like any other category.
        labels_a, labels_b = (
            wide[(axis, coder)].fillna('NaN_STR').astype(str)
            for coder in ('A', 'B')
        )

        # Inter-rater reliability for this axis; a failure is reported and recorded as NaN.
        try:
            kappa_scores[axis] = cohen_kappa_score(labels_a, labels_b)
        except Exception as e:
            print(f"Warning: Could not calculate Kappa for axis {axis}: {e}")
            kappa_scores[axis] = np.nan

        # Reconciliation policy: coder A wins every disagreement, so the
        # reconciled column is simply coder A's column.
        reconciled[axis] = labels_a

    # Macro Kappa = plain average over the axes that produced a number.
    usable = [score for score in kappa_scores.values() if not np.isnan(score)]
    macro_kappa = np.mean(usable) if usable else np.nan

    # Turn the sentinel back into real NaN for the downstream metrics.
    return reconciled.replace('NaN_STR', np.nan), kappa_scores, macro_kappa

print("Calculating IRR and reconciling data...")
df_reconciled, kappa_scores, macro_kappa = reconcile_and_calculate_irr(df)
# Number of distinct papers (one reconciled row per paper_id).
N_papers = len(df_reconciled)


# ----------------------------------------------------------------------
# 4. Metrics Calculation (Coverage, Entropy, Under-explored Pairs)
# ----------------------------------------------------------------------

# Data coverage: percentage of papers that carry a label on every one of the axes.
valid_papers = len(df_reconciled.dropna(subset=AXES))
if N_papers > 0:
    overall_data_coverage = (valid_papers / N_papers) * 100
else:
    overall_data_coverage = 0

def calculate_coverage(df_reconciled, taxonomy_defs=None):
    """Return, per axis, the percentage of defined categories that are actually used.

    Only labels that belong to the axis definition count towards coverage, so
    a typo or an out-of-taxonomy label cannot inflate the figure (or push it
    above 100 %). Missing values (NaN/None) are ignored.

    Args:
        df_reconciled: DataFrame with one column per taxonomy axis.
        taxonomy_defs: Optional mapping ``axis -> list of allowed labels``.
            Defaults to the module-level ``TAXONOMY_DEFS``.

    Returns:
        dict mapping each axis to its coverage percentage (0 for an axis
        whose definition is empty).
    """
    defs = TAXONOMY_DEFS if taxonomy_defs is None else taxonomy_defs

    coverage_metrics = {}
    for axis, categories in defs.items():
        total_categories = len(categories)
        if total_categories == 0:
            # Nothing can be covered when the axis defines no categories.
            coverage_metrics[axis] = 0
            continue
        # Intersect with the definition so only valid taxonomy labels are counted.
        used_categories = set(df_reconciled[axis].dropna()) & set(categories)
        coverage_metrics[axis] = (len(used_categories) / total_categories) * 100
    return coverage_metrics

# Per-axis taxonomy coverage (%), keyed by axis name.
coverage_metrics = calculate_coverage(df_reconciled)

def calculate_normalized_entropy(series, n_categories=None):
    """Return the normalized Shannon entropy of the labels in *series*.

    The entropy of the observed label distribution (computed with log base 2)
    is divided by the maximum possible entropy ``log2(n_categories)``, giving
    a unitless value that is 0 when a single label is used and 1 when all
    ``n_categories`` labels are equally frequent.

    Args:
        series: Labels for one axis; NaN entries are ignored.
        n_categories: Number of categories the axis defines. When omitted it
            is looked up as ``len(TAXONOMY_DEFS[series.name])``.

    Returns:
        The normalized entropy as a float; 0.0 for an empty series or when
        the axis defines at most one category.

    Raises:
        KeyError: if ``n_categories`` is omitted and ``series.name`` is not a
            key of ``TAXONOMY_DEFS``.
    """
    data = series.dropna()
    if data.empty:
        return 0.0

    if n_categories is None:
        n_categories = len(TAXONOMY_DEFS[series.name])
    if n_categories <= 1:
        # log2(1) == 0: there is no spread to normalize against.
        return 0.0

    probabilities = data.value_counts() / len(data)
    # value_counts() only yields labels that occur, so every p > 0 and log2 is finite.
    entropy = -np.sum(probabilities * np.log2(probabilities))

    normalized_entropy = float(entropy) / math.log2(n_categories)
    # A single observed label produces -0.0 above; clamp so callers never see "-0.0000".
    return max(0.0, normalized_entropy)

# Calculate Entropy for Axis M
# (entropy of the reconciled M labels, normalized by log2 of the number of M categories)
entropy_M = calculate_normalized_entropy(df_reconciled['M'])

def find_underexplored_pairs(df_reconciled, axis1='M', axis2='E', threshold=2, taxonomy_defs=None):
    """List category pairs across two axes that occur fewer than *threshold* times.

    A cross-tabulation of the two axes is expanded to every category defined
    for them, so combinations that never occur are reported with count 0.
    Rows with a missing label on either axis are ignored, and labels that are
    not part of the taxonomy definition are dropped by the expansion.

    Args:
        df_reconciled: DataFrame holding at least the columns *axis1* and *axis2*.
        axis1: Axis used for the rows of the cross-tabulation.
        axis2: Axis used for the columns of the cross-tabulation.
        threshold: Pairs with a count strictly below this value are reported.
        taxonomy_defs: Optional mapping ``axis -> list of allowed labels``.
            Defaults to the module-level ``TAXONOMY_DEFS``.

    Returns:
        List of strings formatted as ``"(<label1> × <label2>): <count>"``,
        ordered by the taxonomy order of *axis1*, then *axis2*. An empty list
        is returned (after printing a warning) if the expansion fails.
    """
    defs = TAXONOMY_DEFS if taxonomy_defs is None else taxonomy_defs
    df_clean = df_reconciled[[axis1, axis2]].dropna()

    # Observed co-occurrence counts (heatmap matrix).
    crosstab = pd.crosstab(df_clean[axis1], df_clean[axis2])

    # Expand to the full taxonomy grid; unseen categories get a count of 0.
    # Kept best-effort: any failure here degrades to "no pairs" with a warning.
    try:
        crosstab = crosstab.reindex(index=defs[axis1], columns=defs[axis2], fill_value=0)
    except Exception as e:
        print(f"Warning: Issue during reindexing {axis1}x{axis2} pairs: {e}.")
        return []

    # After reindexing, every row/column label is a defined taxonomy label.
    underexplored = []
    for idx in crosstab.index:
        for col in crosstab.columns:
            count = crosstab.loc[idx, col]
            if count < threshold:
                underexplored.append(f"({idx} × {col}): {count}")
    return underexplored

# Model-type × environment combinations that appear fewer than two times.
underexplored_ME = find_underexplored_pairs(df_reconciled, axis1='M', axis2='E', threshold=2)



# Locations of every artifact written below (all inside OUTPUT_DIR).
RECONCILED_CSV_PATH = OUTPUT_DIR.joinpath("taxonomy_labels_reconciled.csv")
RECONCILED_JSONL_PATH = OUTPUT_DIR.joinpath("classification.jsonl")
METRICS_CSV_PATH = OUTPUT_DIR.joinpath("taxonomy_metrics.csv")
FIG_M_PATH = OUTPUT_DIR.joinpath("fig_counts_M.png")
FIG_A_PATH = OUTPUT_DIR.joinpath("fig_counts_A.png")
README_PATH = OUTPUT_DIR.joinpath("README.md")
ZIP_PATH = OUTPUT_DIR.joinpath("taxonomy_validation_artifacts.zip")


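# ----------------------------------------------------------------------
# Artifact Export (reconciled labels, metrics, README, ZIP archive)
# ----------------------------------------------------------------------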

# Save Reconciled Data
# In the CSV every axis column carries a "_rec" suffix (M_rec, A_rec, ...);
# paper_id is written as the index column.
df_output_csv = df_reconciled.rename(columns={axis: f'{axis}_rec' for axis in AXES})
df_output_csv.to_csv(RECONCILED_CSV_PATH)

# Save JSONL (one record per paper, using the plain axis names)
df_reconciled.reset_index().to_json(RECONCILED_JSONL_PATH, orient='records', lines=True)

# Save Metrics CSV
# Kappa rows are emitted only when a macro Kappa exists. Coverage rows do not
# depend on Kappa and are always emitted, so they are not lost when the
# reliability computation fails.
has_kappa = not np.isnan(macro_kappa)
metrics_data = []
for axis in AXES:
    if has_kappa:
        metrics_data.append({
            'Axis': axis,
            'Metric': "Cohen's Kappa (κ)",
            'Value': kappa_scores.get(axis, np.nan)
        })
    metrics_data.append({
        'Axis': axis,
        'Metric': 'Coverage (%)',
        'Value': coverage_metrics.get(axis, np.nan)
    })
if has_kappa:
    metrics_data.append({'Axis': 'Overall', 'Metric': 'Macro-Average κ', 'Value': macro_kappa})

metrics_data.append({'Axis': 'M', 'Metric': 'Normalized Entropy H_norm(M)', 'Value': entropy_M})
metrics_data.append({'Axis': 'Overall', 'Metric': 'Data Coverage (%)', 'Value': overall_data_coverage})


df_metrics = pd.DataFrame(metrics_data)
df_metrics.to_csv(METRICS_CSV_PATH, index=False)

# Create README.md: a short, human-readable summary of the headline metrics.
readme_lines = [
    "# Taxonomy Validation Artifacts (Reproduced)",
    "",
    f"This directory contains the outputs of the taxonomy validation pipeline run on {N_papers} papers.",
    "",
    "## Key Metrics Summary:",
    f"- Macro-Average Kappa: {macro_kappa:.4f}",
    f"- Normalized Entropy H(M): {entropy_M:.4f}",
    f"- Data Coverage (All axes coded): {overall_data_coverage:.2f}%",
]
# The file ends with a single trailing newline.
readme_content = "\n".join(readme_lines) + "\n"
README_PATH.write_text(readme_content)

# Zip Artifacts
# Bundle whichever artifacts exist into one flat archive.
# NOTE(review): no code visible in this file writes FIG_M_PATH / FIG_A_PATH,
# so those two entries are skipped unless something else creates them.
artifacts = [
    RECONCILED_CSV_PATH,
    RECONCILED_JSONL_PATH,
    METRICS_CSV_PATH,
    FIG_M_PATH,
    FIG_A_PATH,
    README_PATH,
]

try:
    with zipfile.ZipFile(ZIP_PATH, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for artifact in artifacts:
            if not artifact.exists():
                continue
            # Store under the bare filename so the archive has no directory structure.
            archive.write(artifact, arcname=artifact.name)
    print(f"\nSuccessfully created ZIP archive: {ZIP_PATH}")
except Exception as exc:
    print(f"Error: Failed to create ZIP file: {exc}")

# ----------------------------------------------------------------------
# 7. Reporting (Console Summary and LaTeX Output)
# ----------------------------------------------------------------------

# Console Summary
print("="*60)
print("Taxonomy Validation Pipeline Reproduced")
print("="*60)
print(f"Processed {N_papers} papers. Data Coverage: {overall_data_coverage:.2f}%.")

# The reliability section is shown only when at least one axis produced a Kappa.
if not np.isnan(macro_kappa):
    print("\n--- Inter-Rater Reliability (Cohen's Kappa) ---")
    for axis, kappa in kappa_scores.items():
        print(f"Axis {axis}: {kappa:.4f}")
    print(f"Macro-Average Kappa: {macro_kappa:.4f}")

print("\n--- Taxonomy Coverage Metrics (%) ---")
for axis, coverage in coverage_metrics.items():
    print(f"Axis {axis}: {coverage:.2f}%")

print("\n--- Diversity Metrics ---")
# Normalized entropy is a ratio (H / H_max) and therefore unitless, so no "bits" suffix.
print(f"Normalized Entropy H(M): {entropy_M:.4f}")

# The "Count < 2" in this heading mirrors the threshold=2 used to build underexplored_ME.
print("\n--- Under-explored (M×E) Pairs (Count < 2) ---")
# Show at most the first 10 pairs, then summarize how many were left out.
for pair in underexplored_ME[:10]:
    print(f"- {pair}")
if len(underexplored_ME) > 10:
    print(f"... and {len(underexplored_ME) - 10} more.")

print("="*60)

# LaTeX Output Generation
def _join_axis_names(names):
    """Join names as English prose: 'A', 'A and B', or 'A, B, and C'; None if empty."""
    if not names:
        return None
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        # No comma between exactly two items.
        return f"{names[0]} and {names[1]}"
    return ", ".join(names[:-1]) + f", and {names[-1]}"


def generate_latex_output(N_papers, kappa_scores, macro_kappa, coverage_metrics, entropy_M):
    """Generate the IEEE-tone LaTeX paragraph and metrics table.

    Args:
        N_papers: Number of papers in the corpus.
        kappa_scores: Mapping axis -> Cohen's Kappa; must contain the keys
            'M', 'A', 'E', 'D', 'L' and 'T'.
        macro_kappa: Macro-averaged Kappa across axes.
        coverage_metrics: Mapping axis -> coverage percentage (same keys).
        entropy_M: Normalized entropy of axis M.

    Returns:
        Tuple ``(latex_paragraph, latex_table)`` of LaTeX source strings.

    Raises:
        KeyError: if one of the six axes is missing from ``kappa_scores`` or
            ``coverage_metrics``.
    """

    # Axes on which both coders agreed on every paper. isclose() avoids relying
    # on exact float equality; NaN scores never match.
    perfect_agreement_axes = [
        axis for axis, k in kappa_scores.items()
        if math.isclose(k, 1.0, abs_tol=1e-9)
    ]

    # Spell out full axis names for the one combination that has dedicated wording;
    # any other combination falls back to a list of axis letters.
    if set(perfect_agreement_axes) == {'E', 'D', 'L'}:
        perfect_agreement_str = "Environment (E), Data Type (D), and Learning Regime (L)"
    else:
        perfect_agreement_str = _join_axis_names(perfect_agreement_axes)

    # Coverage figures quoted in the prose
    coverage_L = coverage_metrics['L']
    coverage_M = coverage_metrics['M']

    # Literal LaTeX braces are doubled ({{ }}) inside the f-strings below.
    latex_paragraph = f"""
\\subsection{{Taxonomy Validation and Reliability}}
To validate the proposed taxonomy and ensure its consistent application, a dual-coder analysis was conducted on the corpus of {N_papers} papers (N={N_papers}). Two independent coders classified each paper across the six taxonomic axes. Inter-rater reliability was quantified using Cohen's Kappa ($\\kappa$), and the taxonomy's scope was evaluated via coverage percentage and normalized entropy H(M). The results, detailed in Table~\\ref{{tab:taxonomy_metrics_reproducible}}, demonstrate excellent overall reliability with a macro-averaged $\\kappa$ of {macro_kappa:.4f}. """

    if perfect_agreement_str:
        latex_paragraph += f"Perfect agreement ($\\kappa=1.0000$) was achieved for {perfect_agreement_str}, indicating high clarity and mutual exclusivity in these definitions. "

    # NOTE(review): the qualitative wording below ("excellent", "moderate",
    # "broad", "diverse") is fixed text and is NOT derived from the values
    # passed in; re-check it against the numbers whenever the data changes.
    latex_paragraph += f"Excellent agreement was observed for Model Type (M; $\\kappa={kappa_scores['M']:.4f}$) and Application Role (A; $\\kappa={kappa_scores['A']:.4f}$). The Threat Stage (T) axis exhibited moderate agreement ($\\kappa={kappa_scores['T']:.4f}$), primarily due to nuances in interpreting operational stages (e.g., INFER vs. MDR vs. ADAPT) from limited text. The taxonomy demonstrated broad coverage, utilizing {coverage_L:.2f}\\% of Learning Regimes and {coverage_M:.2f}\\% of Model Types. The normalized entropy for Axis M ($H_{{norm}}(M)={entropy_M:.4f}$) indicates a diverse utilization of generative models within the analyzed literature."""

    latex_table = f"""
\\begin{{table}}[h]
\\centering
\\caption{{Taxonomy Reliability (Cohen's Kappa) and Coverage Metrics}}
\\label{{tab:taxonomy_metrics_reproducible}}
\\begin{{tabular}}{{lcc}}
\\hline
\\textbf{{Axis}} & \\textbf{{Cohen's Kappa ($\\kappa$)}} & \\textbf{{Coverage (\\%)}} \\\\
\\hline
M (Model Type) & {kappa_scores['M']:.4f} & {coverage_metrics['M']:.2f} \\\\
A (Application Role) & {kappa_scores['A']:.4f} & {coverage_metrics['A']:.2f} \\\\
E (Environment) & {kappa_scores['E']:.4f} & {coverage_metrics['E']:.2f} \\\\
D (Data Type) & {kappa_scores['D']:.4f} & {coverage_metrics['D']:.2f} \\\\
L (Learning Regime) & {kappa_scores['L']:.4f} & {coverage_metrics['L']:.2f} \\\\
T (Threat Stage) & {kappa_scores['T']:.4f} & {coverage_metrics['T']:.2f} \\\\
\\hline
\\textbf{{Macro-Average $\\kappa$}} & \\multicolumn{{2}}{{c}}{{\\textbf{{{macro_kappa:.4f}}}}} \\\\
\\textbf{{Normalized Entropy $H_{{norm}}(M)$}} & \\multicolumn{{2}}{{c}}{{\\textbf{{{entropy_M:.4f}}}}} \\\\
\\hline
\\end{{tabular}}
\\end{{table}}
"""
    return textwrap.dedent(latex_paragraph), textwrap.dedent(latex_table)

latex_paragraph, latex_table = generate_latex_output(
    N_papers,
    kappa_scores,
    macro_kappa,
    coverage_metrics,
    entropy_M,
)

# Final Output Display: paragraph and table framed by start/end markers.
for chunk in (
    "\n--- LaTeX Output (Ready for Copy/Paste) ---",
    latex_paragraph,
    latex_table,
    "--- End LaTeX Output ---",
):
    print(chunk)