In [10]:
import os
import csv
import json
import re
from typing import List, Dict
from statistics import mean
from sklearn.metrics import cohen_kappa_score
import time
import requests
from io import StringIO


TARGET_CLASSES = ['Protocol', 'Organization']

# Small evaluation dataset (~12 DeFi-style sentences)
DATA_CSV = '''id,text,gold_terms
1,"We integrated Uniswap V3 into our pipeline and measured swaps per second; audits were provided by OpenZeppelin.","[{""term"": ""Uniswap"", ""class"": ""Protocol""}, {""term"": ""OpenZeppelin"", ""class"": ""Organization""}]"
2,"Aave reported an increase in TVL last quarter. The security review was done by CertiK.","[{""term"": ""Aave"", ""class"": ""Protocol""}, {""term"": ""CertiK"", ""class"": ""Organization""}]"
3,"Lido's staking contracts run on Ethereum and collaborations include partnerships with Figment.","[{""term"": ""Lido"", ""class"": ""Protocol""}, {""term"": ""Figment"", ""class"": ""Organization""}]"
4,"SushiSwap announced a grant from a16z to expand liquidity incentives.","[{""term"": ""SushiSwap"", ""class"": ""Protocol""}, {""term"": ""a16z"", ""class"": ""Organization""}]"
5,"Curve Finance focuses on stablecoin swaps; security audits by Trail of Bits increased confidence.","[{""term"": ""Curve Finance"", ""class"": ""Protocol""}, {""term"": ""Trail of Bits"", ""class"": ""Organization""}]"
6,"Balancer Labs introduced a new AMM curve; independent auditors included PeckShield.","[{""term"": ""Balancer Labs"", ""class"": ""Protocol""}, {""term"": ""PeckShield"", ""class"": ""Organization""}]"
7,"We indexed data from MakerDAO governance proposals and referenced audits from Quantstamp.","[{""term"": ""MakerDAO"", ""class"": ""Protocol""}, {""term"": ""Quantstamp"", ""class"": ""Organization""}]"
8,"1inch aggregates liquidity across protocols like Uniswap and SushiSwap; code review by OpenZeppelin was cited.","[{""term"": ""1inch"", ""class"": ""Protocol""}, {""term"": ""Uniswap"", ""class"": ""Protocol""}, {""term"": ""SushiSwap"", ""class"": ""Protocol""}, {""term"": ""OpenZeppelin"", ""class"": ""Organization""}]"
9,"Compound governance proposals were discussed at a conference sponsored by Consensys.","[{""term"": ""Compound"", ""class"": ""Protocol""}, {""term"": ""Consensys"", ""class"": ""Organization""}]"
10,"PancakeSwap runs on BSC and was audited by CertiK earlier this year.","[{""term"": ""PancakeSwap"", ""class"": ""Protocol""}, {""term"": ""CertiK"", ""class"": ""Organization""}]"
11,"OpenZeppelin and Trail of Bits often publish findings about vulnerabilities in DeFi protocols.","[{""term"": ""OpenZeppelin"", ""class"": ""Organization""}, {""term"": ""Trail of Bits"", ""class"": ""Organization""}]"
12,"DeFi protocols such as Aave, Compound, and MakerDAO rely on audits by Quantstamp and PeckShield.","[{""term"": ""Aave"", ""class"": ""Protocol""}, {""term"": ""Compound"", ""class"": ""Protocol""}, {""term"": ""MakerDAO"", ""class"": ""Protocol""}, {""term"": ""Quantstamp"", ""class"": ""Organization""}, {""term"": ""PeckShield"", ""class"": ""Organization""}]"
'''

# write dataset file
with open('assignment3_eval_dataset.csv', 'w', encoding='utf-8') as f:
    f.write(DATA_CSV)

KNOWN = set([
    'uniswap','aave','lido','sushiswap','curve finance','curve','balancer labs','1inch','compound','makerdao','pancakeswap',
    'openzeppelin','certik','trail of bits','peckshield','quantstamp','figment','a16z','consensys'
])

def rule_based_extractor(text: str) -> List[Dict]:
    results = []
    lowered = text.lower()
    for t in KNOWN:
        if t in lowered:
            pattern = re.compile(re.escape(t), re.IGNORECASE)
            for m in pattern.finditer(text):
                term = text[m.start():m.end()]
                if t in ['openzeppelin','certik','trail of bits','peckshield','quantstamp','consensys','figment','a16z']:
                    cls = 'Organization'
                else:
                    cls = 'Protocol'
                results.append({'term': term, 'class': cls})

    # detect multiword with regex (Capitalized words followed by known suffixes)
    for m in re.finditer(r"\b([A-Z][a-z0-9]+(?:\s+(?:Labs|Finance|DAO|Swap|Group|Foundation|Inc|LLC))+)\b", text):
        term = m.group(1)
        cls = 'Protocol'
        if any(k.lower() in term.lower() for k in ['labs','foundation','inc','llc','consensys','openzeppelin','certik','trail of bits','peckshield','quantstamp']):
            cls = 'Organization'
        results.append({'term': term, 'class': cls})

    # deduplicate
    seen = set()
    dedup = []
    for r in results:
        key = (r['term'].strip(), r['class'])
        if key not in seen:
            dedup.append(r)
            seen.add(key)
    return dedup


def _call_gemini_api(prompt: str, response_schema: dict, system_instruction: str = None, retries=5) -> str:
    """Helper function to call the Gemini API with structured JSON output."""

    api_key = os.environ.get("GEMINI_API_KEY", "xxxxxx_Wlb_ZfRtWqw")

    model = "gemini-2.5-flash-preview-09-2025"
    base_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
    api_url = f"{base_url}?key={api_key}"

    contents = [{"role": "user", "parts": [{"text": prompt}]}]

    payload = {
        "contents": contents,
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": response_schema,
            "temperature": 0.0
        }
    }

    if system_instruction:
        payload["systemInstruction"] = {"parts": [{"text": system_instruction}]}

    headers = {'Content-Type': 'application/json'}

    for i in range(retries):
        try:
            response = requests.post(api_url, headers=headers, data=json.dumps(payload))

            response.raise_for_status()

            result = response.json()

            if result and result.get('candidates') and result['candidates'][0].get('content'):
                return result['candidates'][0]['content']['parts'][0]['text']

            raise ValueError("Gemini returned unexpected content structure or empty response.")

        except requests.exceptions.HTTPError as e:
            error_details = response.text if response is not None else str(e)
            if i < retries - 1:
                delay = 2 ** i
                time.sleep(delay)
                continue
            else:

                raise RuntimeError(f"Gemini API HTTP request failed after {retries} retries. Status: {response.status_code}. Details: {error_details}")

        except Exception as e:

            if i < retries - 1:
                delay = 2 ** i
                time.sleep(delay)
                continue
            else:
                raise RuntimeError(f"Failed to process Gemini response after {retries} retries. Error: {e}")

    return "[]"

def llm_based_extractor(text: str) -> List[Dict]:
    """Extracts entities using the Gemini API with structured JSON output."""

    schema = {
        "type": "ARRAY",
        "items": {
            "type": "OBJECT",
            "properties": {
                "term": {"type": "STRING", "description": "The exact substring from the text."},
                "class": {"type": "STRING", "enum": TARGET_CLASSES, "description": "The class of the entity, either 'Protocol' or 'Organization'."}
            },
            "required": ["term", "class"],
        }
    }

    system_prompt = (
        "You are a specialized DeFi Entity Extractor. Your task is to identify and extract entities "
        "from the text that fit the classes 'Protocol' (DeFi platforms/projects) or 'Organization' "
        "(auditors, venture capital firms, traditional companies). "
        "Return a JSON array where the 'term' field is the exact substring found in the text. "
        "Do not invent terms or modify the capitalization."
    )

    prompt = f"Text to analyze:\n{text}\n\nExtract all entities matching the classes 'Protocol' and 'Organization'."

    try:
        content = _call_gemini_api(prompt, schema, system_instruction=system_prompt)
        j = json.loads(content)
        out = []
        for entry in j:
            if 'term' in entry and 'class' in entry:
                out.append({'term': entry['term'], 'class': entry['class'], 'span': None})
        return out
    except Exception as e:

        return []


def load_dataset(path='assignment3_eval_dataset.csv') -> List[Dict]:
    items = []

    f = StringIO(DATA_CSV)

    reader = csv.DictReader(f)
    for row in reader:
        gold = json.loads(row['gold_terms']) if row['gold_terms'] else []
        items.append({'id': row['id'], 'text': row['text'], 'gold': gold})
    return items


def evaluate_extractor(extractor_fn, dataset, verbose=False):
    """Evaluates an extractor function against the gold standard."""
    TP = FP = FN = 0
    per_doc_results = []
    for item in dataset:
        text = item['text']
        gold_terms = set((g['term'].strip(), g['class']) for g in item['gold'])
        try:
            extracted = extractor_fn(text)
        except Exception as e:
            print(f"Extractor error on Doc {item['id']}: {e}")
            extracted = []

        ext_set = set((e['term'].strip(), e['class']) for e in extracted)
        tp = len(gold_terms & ext_set)
        fp = len(ext_set - gold_terms)
        fn = len(gold_terms - ext_set)
        TP += tp; FP += fp; FN += fn
        per_doc_results.append({'id': item['id'], 'tp': tp, 'fp': fp, 'fn': fn, 'gold': gold_terms, 'extracted': ext_set})
        if verbose:
            print(f"Doc {item['id']}: TP={tp} FP={fp} FN={fn}")

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
    return {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, 'per_doc': per_doc_results}


def llm_as_judge(text: str, extracted: List[Dict]) -> List[Dict]:
    """Judges extracted terms using the Gemini API with structured JSON output."""

    schema = {
        "type": "ARRAY",
        "items": {
            "type": "OBJECT",
            "properties": {
                "term": {"type": "STRING"},
                "class": {"type": "STRING", "enum": TARGET_CLASSES},
                "judgment": {"type": "STRING", "enum": ["Correct", "Incorrect"]},
                "explanation": {"type": "STRING", "description": "Short justification for the judgment."}
            },
            "required": ["term", "class", "judgment", "explanation"]
        }
    }

    items_text = '\n'.join([f"{i+1}. '{e['term']}' (class: {e['class']})" for i, e in enumerate(extracted)])

    system_prompt = (
        "You are a precise annotator for DeFi extractions. Given the original text and a list of extracted terms, "
        "decide for each extraction whether it is 'Correct' (the term is indeed an instance of the provided class and appears verbatim in the text) "
        "or 'Incorrect'. Respond strictly with the requested JSON array."
    )

    prompt = (
        f"Text:\n{text}\n\n"
        f"Extractions:\n{items_text}\n\n"
        "Provide your judgment (Correct/Incorrect) for each extraction."
    )

    try:
        content = _call_gemini_api(prompt, schema, system_instruction=system_prompt)
        j = json.loads(content)
        return j
    except Exception as e:

        return []

def compute_agreement_with_gold(dataset, extractor_fn, judge_fn=None):
    """Computes agreement between the extracted labels and gold labels."""
    gold_labels = []
    judge_labels = []

    for item in dataset:
        text = item['text']
        gold_terms = set((g['term'].strip(), g['class']) for g in item['gold'])
        try:
            extracted = extractor_fn(text)
        except Exception as e:
            extracted = []

        if judge_fn is not None:
            try:
                judge_results = judge_fn(text, extracted)
            except Exception as e:
                judge_results = []
        else:
            judge_results = [{'term': e['term'], 'class': e['class'], 'judgment': 'Correct'} for e in extracted]

        for jr in judge_results:
            term_class = (jr['term'].strip(), jr['class'])

            # 1 = positive, 0 = negative
            gold = 1 if term_class in gold_terms else 0
            judge = 1 if jr['judgment'].lower().startswith('c') else 0

            gold_labels.append(gold)
            judge_labels.append(judge)

    if len(gold_labels) == 0:
        return {'accuracy': None, 'kappa': None, 'n': 0}

    accuracy = sum(1 for g,j in zip(gold_labels, judge_labels) if g==j)/len(gold_labels)
    try:
        kappa = cohen_kappa_score(gold_labels, judge_labels)
    except Exception:
        kappa = None

    return {'accuracy': accuracy, 'kappa': kappa, 'n': len(gold_labels)}


if __name__ == '__main__':



    print('Loading dataset...')
    ds = load_dataset()
    print(f'Loaded {len(ds)} documents.\n')

    print('Evaluating rule-based extractor...')
    rule_results = evaluate_extractor(rule_based_extractor, ds, verbose=True)
    print('\nRule-based results:')
    print(json.dumps({k: rule_results[k] for k in ['precision','recall','f1','TP','FP','FN']}, indent=2))

    try:
        import requests
        _GEMINI_AVAILABLE = True
    except ImportError:
        _GEMINI_AVAILABLE = False

    if _GEMINI_AVAILABLE:
        print('\nrequests library detected. Running LLM-based extractor (Gemini API).')
        try:
            llm_results = evaluate_extractor(llm_based_extractor, ds)
            print('LLM extractor results:')
            print(json.dumps({k: llm_results[k] for k in ['precision','recall','f1','TP','FP','FN']}, indent=2))

            print('\nRunning LLM-as-judge to evaluate agreement with gold labels (may be slow)...')
            agreement = compute_agreement_with_gold(ds, llm_based_extractor, judge_fn=llm_as_judge)
            print('Agreement:', agreement)

        except RuntimeError as e:

            print(f'Error running LLM experiments: {e}')
        except Exception as e:
            print(f'An unexpected error occurred during LLM experiments: {e}')

    else:
        print('\nrequests library not available. Skipping LLM-based extractor experiments.')

Loading dataset...
Loaded 12 documents.

Evaluating rule-based extractor...
Doc 1: TP=2 FP=0 FN=0
Doc 2: TP=2 FP=0 FN=0
Doc 3: TP=2 FP=0 FN=0
Doc 4: TP=2 FP=0 FN=0
Doc 5: TP=2 FP=1 FN=0
Doc 6: TP=2 FP=2 FN=0
Doc 7: TP=2 FP=0 FN=0
Doc 8: TP=4 FP=0 FN=0
Doc 9: TP=2 FP=0 FN=0
Doc 10: TP=2 FP=0 FN=0
Doc 11: TP=2 FP=0 FN=0
Doc 12: TP=5 FP=0 FN=0

Rule-based results:
{
  "precision": 0.90625,
  "recall": 1.0,
  "f1": 0.9508196721311475,
  "TP": 29,
  "FP": 3,
  "FN": 0
}

requests library detected. Running LLM-based extractor (Gemini API).
LLM extractor results:
{
  "precision": 0.88,
  "recall": 0.7586206896551724,
  "f1": 0.8148148148148148,
  "TP": 22,
  "FP": 3,
  "FN": 7
}

Running LLM-as-judge to evaluate agreement with gold labels (may be slow)...
Agreement: {'accuracy': 0.9230769230769231, 'kappa': np.float64(0.0), 'n': 26}
