In [1]:
import os

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip uninstall -y bitsandbytes
!pip install --no-cache-dir bitsandbytes
!pip install --upgrade accelerate

[0mCollecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvi

In [3]:
pip install np faiss-cpu sentence-transformers evaluate peft rouge bert_score

Collecting np
  Downloading np-1.0.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)

In [4]:
#!/usr/bin/env python
import argparse
import json
import time
import re
import os
import csv
import numpy as np
import torch
import faiss
import matplotlib.pyplot as plt
import evaluate

from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from peft import LoraConfig, get_peft_model
from rouge import Rouge
from tqdm import tqdm



# ========================================================
# 1. Mount Google Drive
# ========================================================

from google.colab import drive
drive.mount('/content/drive')


file_path = "/content/drive/My Drive/Dissertation/cuad_qa_dataset.json"


# --- Helper Functions ---


def preprocess_paragraph(paragraph):
    cleaned = re.sub(r"\n+", " ", paragraph)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned



def build_faiss_index(context, embedding_model, chunk_size=512, overlap=128):
    """
    Splits the context into fixed-length overlapping chunks,
    preprocesses them, and builds a FAISS index.
    """
    # Create overlapping chunks from the text
    chunks = []
    start = 0
    while start < len(context):
        chunk = context[start : start + chunk_size]
        chunks.append(chunk)
        start += (chunk_size - overlap)  # advance by chunk_size minus the overlap

    preprocessed_chunks = [preprocess_paragraph(chunk) for chunk in chunks if chunk.strip()]

    # Compute embeddings for all chunks
    chunk_embeddings = embedding_model.encode(preprocessed_chunks, convert_to_tensor=False)
    chunk_embeddings_np = np.array(chunk_embeddings).astype("float32")
    # Normalize embeddings for cosine similarity (or for L2 index)
    norms = np.linalg.norm(chunk_embeddings_np, axis=1, keepdims=True)
    chunk_embeddings_np = chunk_embeddings_np / norms
    embedding_dim = chunk_embeddings_np.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(chunk_embeddings_np)
    print(f"FAISS Initialised: Indexed {index.ntotal} chunks.")

    preprocessed_paragraphs = preprocessed_chunks

    return index, preprocessed_paragraphs



# def build_faiss_index(context, embedding_model):
#     """Splits the context into paragraphs, preprocesses them, and builds a FAISS index."""
#     raw_paragraphs = context.split("\n\n")
#     preprocessed_paragraphs = [preprocess_paragraph(p) for p in raw_paragraphs if p.strip()]
#     # Compute embeddings for all paragraphs
#     paragraph_embeddings = embedding_model.encode(preprocessed_paragraphs, convert_to_tensor=False)
#     paragraph_embeddings_np = np.array(paragraph_embeddings).astype("float32")
#     # Normalize embeddings for cosine similarity (if using L2, normalized vectors can be compared)
#     norms = np.linalg.norm(paragraph_embeddings_np, axis=1, keepdims=True)
#     paragraph_embeddings_np = paragraph_embeddings_np / norms
#     embedding_dim = paragraph_embeddings_np.shape[1]
#     index = faiss.IndexFlatL2(embedding_dim)
#     index.add(paragraph_embeddings_np)
#     print(f"FAISS Initialised: Indexed {index.ntotal} paragraphs.")
#     return index, preprocessed_paragraphs

def retrieve_context(question, embedding_model, index, preprocessed_paragraphs, k=3, threshold=1):
    """Encodes the question and retrieves top k similar paragraphs. Uses a threshold to decide confidence."""
    question_embedding = embedding_model.encode([question]).astype("float32")
    question_embedding /= np.linalg.norm(question_embedding, axis=1, keepdims=True)
    distances, indices = index.search(question_embedding, k)
    best_distance = distances[0][0]
    if best_distance > threshold:
        print("\nLow confidence detected based on FAISS score.")
        return None  # Indicates low confidence

    selected_paragraphs = [preprocessed_paragraphs[i] for i in indices[0]]
    return "\n\n".join(selected_paragraphs)

def generate_answer(question, retrieved_context, tokenizer, model, device, gen_max_length=50):
    """Builds the final prompt and uses the generative model to produce an answer."""
    final_prompt = (
        "Answer the question in one or two sentences. "
        "Do not repeat the question or the context; only provide the final answer.\n"
        f"Question: {question}\n"
        f"Context: {retrieved_context}\n"
        "Final Answer:"
    )
    inputs = tokenizer(final_prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    torch.cuda.empty_cache()
    generated_ids = model.generate(**inputs, max_new_tokens=gen_max_length)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    if "Final Answer:" in generated_text:
        answer = generated_text.split("Final Answer:")[-1].strip()
    else:
        answer = generated_text.strip()
    return answer

def evaluate_generative_predictions(test_examples, tokenizer, model, sbert_model,bertscore_metric, rouge_metric, device, embedding_model):
    results_details = []
    semantic_scores = []
    bert_scores = []
    rouge_scores = []
    times = []

    for entry in tqdm(test_examples, desc="Evaluating test examples"):
        ex_id = entry["id"]
        question = entry["question"]
        contract_context = entry["context"]
        gold_text = entry["answers"]["text"][0].strip() if entry["answers"]["text"] else ""

        start_time = time.time()
        # Build FAISS index for the current contract context
        index, paragraphs = build_faiss_index(contract_context, embedding_model)

        # Retrieve context from the current contract context
        retrieved_context = retrieve_context(question, embedding_model, index, paragraphs)

        if retrieved_context is None:
            # If retrieval fails, we set the prediction as an empty string
            pred_text = ""
            duration = time.time() - start_time
            times.append(duration)
        else:
            # Build final prompt using the retrieved context
            final_prompt = (
                "Answer the question in one or two sentences. "
                "Do not repeat the question or the context; only provide the final answer.\n"
                f"Question: {question}\n"
                f"Context: {retrieved_context}\n"
                "Final Answer:"
            )

            inputs = tokenizer(final_prompt, return_tensors="pt", max_length=1024, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            generated_ids = model.generate(**inputs, max_length=1024)
            duration = time.time() - start_time
            times.append(duration)
            pred_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
            # Extract only the final answer part
            if "Final Answer:" in pred_text:
              pred_text = pred_text.split("Final Answer:")[-1].strip()



        # Evaluate metrics with proper empty-answer handling.
        if gold_text == "" and pred_text == "":
            sem_sim = 1.0
            b_score = 1.0
            r_score = 1.0
        elif gold_text == "" or pred_text == "":
            sem_sim = 0.0
            b_score = 0.0
            r_score = 0.0
        else:
            # Compute semantic similarity using cosine similarity of SBERT embeddings.
            emb_gold = sbert_model.encode(gold_text, convert_to_tensor=True)
            emb_pred = sbert_model.encode(pred_text, convert_to_tensor=True)
            sem_sim = float(util.pytorch_cos_sim(emb_gold, emb_pred).item())

            # Compute BERTScore using the evaluate library.
            b_result = bertscore_metric.compute(predictions=[pred_text], references=[gold_text], lang="en")
            b_score = b_result["f1"][0]

            # Compute ROUGE-L score.
            r = rouge_metric.get_scores(pred_text, gold_text)
            r_score = r[0]["rouge-l"]["f"]

        semantic_scores.append(sem_sim)
        bert_scores.append(b_score)
        rouge_scores.append(r_score)

        results_details.append({
            "id": ex_id,
            "question": question,
            "gold_answer": gold_text,
            "model_answer": pred_text,
            "semantic_similarity": sem_sim,
            "bert_score": b_score,
            "rouge_l_f1": r_score,
            "eval_time_sec": duration
        })

    overall_results = {
        "avg_semantic_similarity": float(np.mean(semantic_scores)),
        "avg_bert_score": float(np.mean(bert_scores)),
        "avg_rouge_l_f1": float(np.mean(rouge_scores)),
        "avg_eval_time_sec": float(np.mean(times))
    }

    return overall_results, results_details

def save_results_to_csv(details, filename):
    if details:
        keys = details[0].keys()
        with open(filename, "w", newline="") as f:
            dict_writer = csv.DictWriter(f, keys)
            dict_writer.writeheader()
            dict_writer.writerows(details)
        print(f"Saved evaluation details to {filename}")
    else:
        print("No details to save.")


def plot_model_comparison(aggregated_results, save_dir="./"):
    models = [r["model"] for r in aggregated_results]
    avg_rouge = [r["avg_rouge_l_f1"] for r in aggregated_results]
    avg_semantic = [r["avg_semantic_similarity"] for r in aggregated_results]
    avg_bert = [r["avg_bert_score"] for r in aggregated_results]
    avg_time = [r.get("avg_eval_time_sec", 0) for r in aggregated_results]

    x = np.arange(len(models))
    width = 0.25  # Width for each bar

    # Plot metrics comparison
    plt.figure(figsize=(10, 6))
    plt.bar(x - width, avg_rouge, width, label="ROUGE-L")
    plt.bar(x, avg_semantic, width, label="Semantic Similarity")
    plt.bar(x + width, avg_bert, width, label="BERTScore")
    plt.xlabel("Model")
    plt.ylabel("Metric Score")
    plt.title("Evaluation Metrics Comparison")
    plt.xticks(x, models, rotation=45)
    plt.legend()
    plt.tight_layout()
    # Save the figure to a file
    model_comparison_path = os.path.join(save_dir, "model_comparison.png")
    plt.savefig(model_comparison_path)
    plt.show()
    print(f"Saved model comparison plot to {model_comparison_path}")

    # Plot evaluation time per model
    plt.figure(figsize=(10, 6))
    plt.bar(x, avg_time, width=0.5, color="skyblue")
    plt.xlabel("Model")
    plt.ylabel("Average Evaluation Time (sec)")
    plt.title("Evaluation Time per Model")
    plt.xticks(x, models, rotation=45)
    plt.tight_layout()
    time_plot_path = os.path.join(save_dir, "evaluation_time.png")
    plt.savefig(time_plot_path)
    plt.show()
    print(f"Saved evaluation time plot to {time_plot_path}")

def plot_line_metrics(details, save_dir="./"):
    # Extract metric values for each example
    indices = list(range(len(details)))
    semantic_scores = [d["semantic_similarity"] for d in details]
    bert_scores = [d["bert_score"] for d in details]
    rouge_scores = [d["rouge_l_f1"] for d in details]

    plt.figure(figsize=(12, 6))
    plt.plot(indices, semantic_scores, label="Semantic Similarity", marker='o', linestyle='-')
    plt.plot(indices, bert_scores, label="BERTScore", marker='o', linestyle='-')
    plt.plot(indices, rouge_scores, label="ROUGE-L", marker='o', linestyle='-')

    plt.xlabel("Example Index")
    plt.ylabel("Score")
    plt.title("Evaluation Metrics per Example")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    line_plot_path = os.path.join(save_dir, "line_metrics.png")
    plt.savefig(line_plot_path)
    plt.show()


# --- Main Pipeline ---

def main():
    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # generative model and tokenizer for Falcon-7B
    model_name = "tiiuae/falcon-7b"  # Updated model name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    print("✅ Falcon-7B Loaded")



    # Initialize embedding model for retrieval (SBERT)
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Initialize ROUGE metric for evaluation
    rouge_metric = Rouge()

    # Load the BERTScore metric
    bertscore_metric = evaluate.load("bertscore")

    # Load your dataset (assuming JSON format with a "test" split)
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    # For demonstration, we use a small subset
    test_examples = raw_data["train"][:500]
    print(f"Loaded {len(test_examples)} test examples.")


    # Evaluate generative predictions over the test examples
    overall_results, details = evaluate_generative_predictions(
        test_examples, tokenizer, model, embedding_model, bertscore_metric, rouge_metric, device, embedding_model
    )

    print("\nOverall Evaluation Results:")
    print(overall_results)

    # Save overall evaluation results to a JSON file
    with open("./overall_evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(overall_results, f, indent=4)
    print("Saved overall evaluation results to overall_evaluation_results.json")

    # Save detailed per-example results to CSV
    save_results_to_csv(details, "./detailed_evaluation_results.csv")
    # For comparison, aggregate results from different models


    aggregated_results = []
    overall_results["model"] = "falcon-7b"
    aggregated_results.append(overall_results)
    # Plot and save bar charts for overall metrics
    plot_model_comparison(aggregated_results, save_dir="./")

    # Plot and save the line graph for per-example metrics
    plot_line_metrics(details, save_dir="./")

if __name__ == "__main__":
    main()

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

✅ Falcon-7B Loaded


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Loaded 500 test examples.


Evaluating test examples:   0%|          | 1/500 [00:00<07:31,  1.10it/s]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


FAISS Initialised: Indexed 142 chunks.

Low confidence detected based on FAISS score.
FAISS Initialised: Indexed 142 chunks.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating test examples:   0%|          | 2/500 [01:12<5:50:58, 42.29s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


FAISS Initialised: Indexed 142 chunks.


Evaluating test examples:   1%|          | 3/500 [02:12<6:58:30, 50.52s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


FAISS Initialised: Indexed 142 chunks.


Evaluating test examples:   1%|          | 4/500 [03:12<7:29:43, 54.40s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


FAISS Initialised: Indexed 142 chunks.


Evaluating test examples:   1%|          | 5/500 [04:13<7:46:40, 56.57s/it]Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


FAISS Initialised: Indexed 142 chunks.


Evaluating test examples:   1%|          | 5/500 [04:28<7:22:49, 53.68s/it]


KeyboardInterrupt: 