In [None]:
# pip install -q datasets transformers sentence-transformers scikit-learn faiss-cpu nltk rouge_score evaluate protobuf

In [None]:
import os
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Hugging Face libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline as hf_pipeline
from transformers import AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Scikit-learn for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLTK for sentence tokenization (optional, but can be useful for chunking)
import nltk
# Make sure the Punkt sentence-tokenizer data is available locally.
# nltk.data.find() reports a missing resource by raising LookupError, so that
# is the exception that has to trigger the download fallback.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Evaluation
from evaluate import load as load_metric

# Suppress Hugging Face informational messages
# Quiet Hugging Face logging (accepted levels: error, warning, info, debug)
# and switch off tokenizer-level parallelism.
os.environ.update({
    "TRANSFORMERS_VERBOSITY": "error",
    "TOKENIZERS_PARALLELISM": "false",
})

# --- 0. Setup & Data Loading/Preprocessing ---
print("--- 0. Setup & Data Loading/Preprocessing ---")

# Fetch the train split of the eqanun corpus and show a quick summary.
# Any failure is reported and stops the run.
print("Loading eqanun dataset...")
try:
    dataset = load_dataset("allmalab/eqanun", split="train")
    # To experiment on a subset, narrow the dataset here, e.g. with
    # dataset.select(range(1000)) for the first 1000 documents.
    print(f"Dataset loaded. Number of documents: {len(dataset)}")
    print("Dataset features:", dataset.features)
    print("Example document:", dataset[0])
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")
    print("Please ensure you have internet connectivity and the dataset name is correct.")
    exit()

# Document Preprocessing and Chunking
# Legal documents can be very long. We need to chunk them.
# A simple strategy: split by paragraphs or fixed-size chunks.
# Let's try fixed-size overlapping chunks for simplicity.

MAX_CHUNK_LENGTH = 512  # Max tokens per chunk (conservative for model context)
CHUNK_OVERLAP = 64      # Tokens shared by two consecutive chunks

def chunk_document(doc_id, text, tokenizer_for_counting, max_length=MAX_CHUNK_LENGTH, overlap=CHUNK_OVERLAP):
    """Split ``text`` into fixed-size, overlapping windows of tokens.

    The tokenizer is used only to measure and cut the text: the document is
    encoded without special tokens, sliced into windows of ``max_length``
    tokens that advance by ``max_length - overlap`` tokens, and every window
    is decoded back to a string.

    Args:
        doc_id: Identifier of the source document; chunk ids are built as
            ``f"{doc_id}_{n}"`` with ``n`` counting from 0.
        text: Document text to split.
        tokenizer_for_counting: Object exposing
            ``encode(text, add_special_tokens=False)`` and ``decode(tokens)``.
        max_length: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated at the start of the next chunk;
            must satisfy ``0 <= overlap < max_length``.

    Returns:
        A list of dicts with keys ``doc_id``, ``original_doc_id``, ``text``
        and ``start_char_original``. The list is empty when the text encodes
        to no tokens.

    Raises:
        ValueError: If ``max_length`` or ``overlap`` would produce a
            non-positive stride (the loop could never finish) or would skip
            tokens between chunks.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not 0 <= overlap < max_length:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_length, got overlap={overlap}, max_length={max_length}"
        )

    tokens = tokenizer_for_counting.encode(text, add_special_tokens=False)
    stride = max_length - overlap
    chunks = []
    for chunk_index, start in enumerate(range(0, len(tokens), stride)):
        end = min(start + max_length, len(tokens))
        chunk_text = tokenizer_for_counting.decode(tokens[start:end])
        chunks.append({
            "doc_id": f"{doc_id}_{chunk_index}",  # Unique ID for chunk
            "original_doc_id": doc_id,
            "text": chunk_text,
            # Approximate only: str.find returns the first verbatim match and
            # -1 when the decoded text differs from the source (for example
            # when decoding changes the spacing around punctuation).
            "start_char_original": text.find(chunk_text),
        })
        if end == len(tokens):
            # This window already reaches the last token; a further window
            # would only repeat the overlapping tail.
            break
    return chunks

# Use a basic tokenizer for chunking purposes (e.g., from SentenceTransformer)
# or any BERT tokenizer. This is just for estimating token counts for splitting.
# Using a multilingual model tokenizer is a good general choice.
chunk_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

print("Chunking documents...")
all_chunks = []
for doc_index, record in enumerate(tqdm(dataset)):
    # Fall back to a positional id when the record carries no 'id' field.
    record_id = record.get('id', f"doc_{doc_index}")
    body = record.get('text')
    # Only non-blank strings are chunked; anything else is reported and skipped.
    if not (body and isinstance(body, str) and body.strip()):
        print(f"Warning: Document ID {record_id} has empty or invalid text. Skipping.")
        continue
    all_chunks.extend(chunk_document(record_id, body, chunk_tokenizer))


if not all_chunks:
    print("No valid chunks were created. Exiting.")
    print("This might happen if the 'text' field in your dataset is consistently empty or not a string.")
    exit()

print(f"Total documents: {len(dataset)}. Total chunks created: {len(all_chunks)}")
if all_chunks:
    print("Example chunk:", all_chunks[0])

# Parallel views of the corpus: plain chunk texts for the retrievers, and the
# full chunk dicts (doc_id, original_doc_id, ...) for reporting results.
corpus_texts = [entry['text'] for entry in all_chunks]
corpus_metadata = all_chunks


--- 0. Setup & Data Loading/Preprocessing ---
Loading eqanun dataset...
Dataset loaded. Number of documents: 50989
Dataset features: {'text': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None)}
Example document: {'text': 'Azərbaycan Respublikasının İnzibati Xətalar Məcəlləsində dəyişiklik edilməsi haqqında\nAZƏRBAYCAN RESPUBLİKASININ QANUNU\nAzərbaycan Respublikasının Milli Məclisi Azərbaycan Respublikası Konstitusiyasının 94-cü maddəsinin I hissəsinin 17-ci bəndini rəhbər tutaraq\nqərara alır:\nMaddə 1.\nAzərbaycan Respublikasının İnzibati Xətalar Məcəlləsinin\n(Azərbaycan Respublikasının Qanunvericilik Toplusu, 2016, № 2 (I kitab), maddə 202, № 3, maddələr 397, 403, 429, № 4, maddələr 631, 647, 654, № 5, maddələr 835, 846, № 6, maddələr 997, 1010, № 7, maddələr 1247, 1249, № 10, maddə 1608, № 11, maddələr 1769, 1774, 1781, 1783, 1786, 1788, № 12, maddələr 1984, 2000, 2009, 2024, 2049; 2017, № 1, maddə 21, № 2, maddələr 139, 147, 152, 162, № 3, maddələr 331, 344, № 5

  0%|          | 0/50989 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1575 > 512). Running this sequence through the model will result in indexing errors


Total documents: 50989. Total chunks created: 285300
Example chunk: {'doc_id': '37260_0', 'original_doc_id': '37260', 'text': 'Azərbaycan Respublikasının İnzibati Xətalar Məcəlləsində dəyişiklik edilməsi haqqında AZƏRBAYCAN RESPUBLİKASININ QANUNU Azərbaycan Respublikasının Milli Məclisi Azərbaycan Respublikası Konstitusiyasının 94 - cü maddəsinin I hissəsinin 17 - ci bəndini rəhbər tutaraq qərara alır : Maddə 1. Azərbaycan Respublikasının İnzibati Xətalar Məcəlləsinin ( Azərbaycan Respublikasının Qanunvericilik Toplusu, 2016, № 2 ( I kitab ), maddə 202, № 3, maddələr 397, 403, 429, № 4, maddələr 631, 647, 654, № 5, maddələr 835, 846, № 6, maddələr 997, 1010, № 7, maddələr 1247, 1249, № 10, maddə 1608, № 11, maddələr 1769, 1774, 1781, 1783, 1786, 1788, № 12, maddələr 1984, 2000, 2009, 2024, 2049 ; 2017, № 1, maddə 21, № 2, maddələr 139, 147, 152, 162, № 3, maddələr 331, 344, № 5, maddələr 698, 701, 734, 749, 754, № 6, maddələr 1020, 1033, 1036, № 7, maddələr 1273, 1296, 1297, 1299 ; Azə

In [None]:
# --- 1. Document Retrieval System ---
print("\n--- 1. Document Retrieval System ---")

# --- 1.a Sparse Retrieval (TF-IDF) ---
print("\n--- 1.a Sparse Retrieval (TF-IDF) ---")
# No stop-word list; a token is any run of one or more word characters.
tfidf_vectorizer = TfidfVectorizer(stop_words=None, token_pattern=r"(?u)\b\w+\b")
print("Fitting TF-IDF Vectorizer...")
try:
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_texts)
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

    def retrieve_tfidf(query, top_k=5):
        """Return the ``top_k`` chunks most similar to ``query`` under TF-IDF.

        Returns a pair ``(chunks, scores)``: the chunk metadata dicts ordered
        by descending cosine similarity and their matching scores. Both lists
        are empty when nothing is indexed or ``top_k`` is not positive.
        Chunks with zero similarity are not filtered out.
        """
        if tfidf_matrix.shape[0] == 0:  # nothing indexed
            return [], []

        query_vector = tfidf_vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

        # Never ask for more results than there are indexed chunks.
        limit = min(top_k, len(similarities))
        if limit > 0:
            # Full descending sort, then keep the leading `limit` positions.
            ranked = np.argsort(similarities)[::-1][:limit]
        else:
            ranked = []

        hits = [corpus_metadata[idx] for idx in ranked]
        hit_scores = [similarities[idx] for idx in ranked]
        return hits, hit_scores

except ValueError as fit_error:
    print(f"TF-IDF Error: {fit_error}. This might happen if corpus_texts is empty or all documents are trivial.")
    tfidf_matrix = None  # lets later code detect that TF-IDF is unavailable

    def retrieve_tfidf(query, top_k=5):
        """Stand-in used when fitting failed; always returns no results."""
        print("TF-IDF retriever not initialized due to previous error.")
        return [], []


# --- 1.b Dense Retrieval (Sentence-BERT) ---
print("\n--- 1.b Dense Retrieval (Sentence-BERT) ---")
# retrieve_sbert() calls torch.topk, so torch has to be in scope; it is
# imported here because the file's top-level imports do not include it.
import torch

# Using a multilingual model is good for Azerbaijani
# Other options: 'paraphrase-multilingual-MiniLM-L12-v2', 'distiluse-base-multilingual-cased-v1'
sbert_model_name = 'paraphrase-multilingual-mpnet-base-v2'
print(f"Loading Sentence-BERT model: {sbert_model_name}...")
try:
    sbert_model = SentenceTransformer(sbert_model_name)

    # Encode the corpus (this can take time for large corpora)
    print("Encoding corpus with Sentence-BERT (this might take a while)...")
    if corpus_texts:  # only encode if there are texts
        corpus_embeddings = sbert_model.encode(corpus_texts, convert_to_tensor=True, show_progress_bar=True)
        print(f"Corpus embeddings shape: {corpus_embeddings.shape}")
    else:
        print("Corpus is empty. Skipping SBERT encoding.")
        corpus_embeddings = None


    def retrieve_sbert(query, top_k=5):
        """Return the ``top_k`` chunks closest to ``query`` in embedding space.

        The query is encoded with the same Sentence-BERT model as the corpus
        and compared to every corpus embedding by cosine similarity.

        Returns a pair ``(chunks, scores)``: chunk metadata dicts ordered by
        descending similarity and the matching scores as Python floats. Both
        lists are empty when no embeddings exist or ``top_k`` is not positive.
        """
        if corpus_embeddings is None or corpus_embeddings.shape[0] == 0:
            return [], []
        query_embedding = sbert_model.encode(query, convert_to_tensor=True)
        # One row of similarities: the query against every corpus chunk.
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]

        # Never ask for more results than there are encoded chunks.
        actual_top_k = min(top_k, len(cos_scores))

        if actual_top_k > 0:
            top_results = torch.topk(cos_scores, k=actual_top_k)
            top_k_indices = top_results.indices.tolist()
            retrieved_scores = top_results.values.tolist()
        else:
            top_k_indices = []
            retrieved_scores = []

        retrieved_docs_data = [corpus_metadata[i] for i in top_k_indices]
        return retrieved_docs_data, retrieved_scores

except Exception as e:
    print(f"Error initializing or using Sentence-BERT: {e}")
    # Leave both globals as None so later code can detect the failure.
    sbert_model = None
    corpus_embeddings = None
    def retrieve_sbert(query, top_k=5):
        """Stand-in used when setup failed; always returns no results."""
        print("Sentence-BERT retriever not initialized due to previous error.")
        return [], []


# --- Test Retrieval ---
# English: "What are the rules for termination of an employment contract?"
sample_query_az = "Əmək müqaviləsinin ləğv edilməsi qaydaları hansılardır?"
print(f"\n--- Testing Retrieval with query: '{sample_query_az}' ---")

# Show the top three hits of each retriever that was successfully set up.
print("\nTF-IDF Retrieval Results:")
if tfidf_matrix is None or tfidf_matrix.shape[0] == 0:
    print("  TF-IDF retrieval skipped due to earlier errors or empty corpus.")
else:
    tfidf_docs, tfidf_scores = retrieve_tfidf(sample_query_az, top_k=3)
    for hit, hit_score in zip(tfidf_docs, tfidf_scores):
        print(f"  Score: {hit_score:.4f} (ID: {hit['doc_id']})")
        print(f"  Text: {hit['text'][:200]}...\n")

print("\nSentence-BERT Retrieval Results:")
if not sbert_model or corpus_embeddings is None or corpus_embeddings.shape[0] == 0:
    print("  Sentence-BERT retrieval skipped due to earlier errors or empty corpus.")
else:
    sbert_docs, sbert_scores = retrieve_sbert(sample_query_az, top_k=3)
    for hit, hit_score in zip(sbert_docs, sbert_scores):
        print(f"  Score: {hit_score:.4f} (ID: {hit['doc_id']})")
        print(f"  Text: {hit['text'][:200]}...\n")



--- 1. Document Retrieval System ---

--- 1.a Sparse Retrieval (TF-IDF) ---
Fitting TF-IDF Vectorizer...
TF-IDF matrix shape: (285300, 381068)

--- 1.b Dense Retrieval (Sentence-BERT) ---
Loading Sentence-BERT model: paraphrase-multilingual-mpnet-base-v2...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding corpus with Sentence-BERT (this might take a while)...


Batches:   0%|          | 0/8916 [00:00<?, ?it/s]

Corpus embeddings shape: torch.Size([285300, 768])

--- Testing Retrieval with query: 'Əmək müqaviləsinin ləğv edilməsi qaydaları hansılardır?' ---

TF-IDF Retrieval Results:
  Score: 0.2793 (ID: 46943_95)
  Text: ##bsa, onda istər işçi, istərsə işəgötürən tərəfindən və istərsə də tərəflərin iradəsindən asılı olmayan hallarda bu Məcəllənin 68, 69, 70, 73, 74 və 75 - ci maddələrində nəzərdə tutulan əsaslara və q...

  Score: 0.2672 (ID: 46943_45)
  Text: sazişin tərəfləri ona müvafiq dəyişikliklər edə bilərlər. Maddə 41. Kollektiv sazişin yerinə yetirilməsinə nəzarət 1. Kollektiv sazişin yerinə yetirilməsinə nəzarəti tərəflər və müvafiq icra hakimiyyə...

  Score: 0.2591 (ID: 16137_129)
  Text: istifadə olunan qablar bioloji müayinədən və isitmə əməliyyatından keçirilirmi? Bəli ( ) Xeyr ( ) 11. 55. Hisə vermədə istifadə olunan materiallar hansılardır? Odun.......................................


Sentence-BERT Retrieval Results:
  Score: 0.8248 (ID: 54192_6)
  Text: tərəfindən işə baxıl

In [None]:
# --- 2. Retrieval-Augmented Generation (RAG) Pipeline ---
print("\n--- 2. RAG Pipeline ---")

# Pick the retriever for the RAG pipeline: Sentence-BERT when it is usable,
# otherwise TF-IDF. rag_retriever stays None when neither was built, which
# the later stages check for before running.
rag_retriever = None
if sbert_model and corpus_embeddings is not None and corpus_embeddings.shape[0] > 0:
    print("Using Sentence-BERT for RAG pipeline retrieval.")
    rag_retriever = retrieve_sbert
elif tfidf_matrix is not None and tfidf_matrix.shape[0] > 0:
    print("Sentence-BERT unavailable or corpus empty. Using TF-IDF for RAG pipeline retrieval.")
    rag_retriever = retrieve_tfidf
else:
    print("No retrievers available. RAG pipeline cannot function.")

def rag_pipeline_retrieve_and_augment(question, retriever_fn, top_k_retrieval=3):
    """Retrieve chunks for ``question`` and join them into one context string.

    Args:
        question: Query text, passed unchanged to the retriever.
        retriever_fn: Callable ``(query, top_k=...) -> (chunks, scores)``
            whose chunks are dicts with a ``'text'`` key, or ``None``.
        top_k_retrieval: Number of chunks requested from the retriever.

    Returns:
        ``(context, chunks)``. ``context`` is the chunk texts joined by a
        blank line. Without a retriever the result is ``("", [])``; when the
        retriever finds nothing it is
        ``("No relevant documents found.", [])``.

    No length limit is applied here; the answer functions are responsible
    for truncating the context to fit their models.
    """
    if retriever_fn is None:
        print("No retriever function available for RAG.")
        return "", []

    # Scores are not needed for building the context.
    hits, _scores = retriever_fn(question, top_k=top_k_retrieval)
    if not hits:
        return "No relevant documents found.", []

    return "\n\n".join(hit['text'] for hit in hits), hits


# --- 2.a Answer Processing: Extractive Approach ---
print("\n--- 2.a Answer Processing: Extractive Approach ---")
# A multilingual SQuAD-style checkpoint is used because the corpus is
# Azerbaijani. Other candidates: 'deepset/xlm-roberta-large-squad2',
# 'deepset/bert-base-multilingual-uncased-squad2' (or the English-only
# 'bert-large-uncased-whole-word-masking-finetuned-squad').
extractive_qa_model_name = "deepset/xlm-roberta-base-squad2"
print(f"Loading Extractive QA model: {extractive_qa_model_name}...")
# Stays None when loading fails, so later code can skip extractive answers.
extractive_qa_pipeline = None
try:
    extractive_qa_pipeline = hf_pipeline(
        "question-answering",
        model=extractive_qa_model_name,
        tokenizer=extractive_qa_model_name,
    )
except Exception as load_error:
    print(f"Could not load extractive QA model: {load_error}")

def answer_extractive(question, context):
    """Extract an answer span for ``question`` from ``context``.

    Returns the question-answering pipeline's result on success. When the
    pipeline was not loaded, the context is empty or is the
    "No relevant documents found." sentinel, or the pipeline raises, a dict
    with an explanatory ``"answer"`` message and ``"score": 0`` is returned
    instead of raising.
    """
    if not extractive_qa_pipeline:
        return {"answer": "Extractive QA model not loaded.", "score": 0}

    if not context or context == "No relevant documents found.":
        return {"answer": "No context provided for extractive QA.", "score": 0}

    try:
        # Answers are capped at 100 tokens.
        return extractive_qa_pipeline(
            question=question,
            context=context,
            max_answer_len=100,
            truncation=True,
        )
    except Exception as qa_error:
        # Report the failure and hand back a placeholder so callers continue.
        print(f"Error during extractive QA: {qa_error}")
        return {"answer": f"Error in extractive QA: {str(qa_error)}", "score": 0}


# --- 2.b Answer Processing: Generative Approach ---
print("\n--- 2.b Answer Processing: Generative Approach ---")
# Multilingual T5 variant used for free-form answers.
generative_model_name = "google/mt5-small"
print(f"Loading Generative QA model: {generative_model_name}...")
try:
    # Both parts are bound together, so a failure in either leaves neither set.
    generative_tokenizer, generative_model = (
        AutoTokenizer.from_pretrained(generative_model_name),
        AutoModelForSeq2SeqLM.from_pretrained(generative_model_name),
    )
except Exception as load_error:
    print(f"Could not load generative QA model: {load_error}")
    generative_tokenizer = generative_model = None

def answer_generative(question, context, max_length=150):
    """Generate a free-form answer to ``question`` given ``context``.

    Args:
        question: Question text.
        context: Retrieved context; empty or the
            "No relevant documents found." sentinel means there is none.
        max_length: Maximum length passed to ``generate`` for the answer.

    Returns:
        The decoded answer string, or an explanatory message when the model
        is not loaded, no context is available, or generation raises.
    """
    if not (generative_model and generative_tokenizer):
        return "Generative QA model not loaded."
    if not context or context == "No relevant documents found.":
        return "No context provided for generative QA."

    # Azerbaijani prompt: "Sual" = question, "Kontekst" = context,
    # "Cavab" = answer. mT5 was not trained on this exact layout; it is a
    # common way to present a question with its context to a seq2seq model.
    prompt = f"Sual: {question} Kontekst: {context} Cavab:"

    # The input is cut to 512 tokens, so with a long context the end of the
    # prompt (including the trailing "Cavab:") may be truncated away.
    encoded = generative_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    try:
        # Beam search with four beams, stopping early once beams finish.
        generated = generative_model.generate(
            encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )
        return generative_tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as gen_error:
        print(f"Error during generative QA: {gen_error}")
        return f"Error in generative QA: {str(gen_error)}"

# --- Test RAG Pipeline ---
print(f"\n--- Testing RAG Pipeline with query: '{sample_query_az}' ---")

# Run the sample query through retrieval and both answer strategies.
if not rag_retriever:
    print("RAG pipeline cannot be tested as no retriever is available.")
else:
    augmented_context, retrieved_docs_for_rag = rag_pipeline_retrieve_and_augment(
        sample_query_az,
        rag_retriever,
        top_k_retrieval=3,
    )

    print(f"\nAugmented Context (first 500 chars):\n{augmented_context[:500]}...\n")
    print(f"Retrieved {len(retrieved_docs_for_rag)} documents for RAG.")

    print("Extractive Answer:")
    ext_answer = answer_extractive(sample_query_az, augmented_context)
    print(f"  Answer: {ext_answer.get('answer', 'N/A')}")
    print(f"  Score: {ext_answer.get('score', 0):.4f}")

    print("\nGenerative Answer:")
    gen_answer = answer_generative(sample_query_az, augmented_context)
    print(f"  Answer: {gen_answer}")



--- 2. RAG Pipeline ---
Using Sentence-BERT for RAG pipeline retrieval.

--- 2.a Answer Processing: Extractive Approach ---
Loading Extractive QA model: deepset/xlm-roberta-base-squad2...


config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Could not load extractive QA model: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


--- 2.b Answer Processing: Generative Approach ---
Loading Generative QA model: google/mt5-small...


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Could not load generative QA model: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


--- Testing RAG Pipeline with query: 'Əmək müqaviləsinin ləğv edilməsi qaydaları hansılardır?' ---

Augmented Context (first 500 chars):
tərəfindən işə baxılması ona əsaslanır ki, əmək haqqının ödənilməsi mülkiyyətin əldə edilməsinin [UNK] qanuni gözlənti [UNK] sini təşkil edir. Bununla bağlı mübahisələndirilən ödəmə müqavilə və ya normativ aktla müəyyən edilməli və işəgötürənin müvafiq öhdəlikləri yaranmış olmalıdır. Əmək Məcəlləsinin 172 - ci maddəsinin 5 - ci hissəsinə əsasən, əmək haqqının verilməsi işəgötürənin təqsiri üzündən gecikdirildikdə və bu hal fərdi əmək mübahisəsi yaratmayıbsa, hər gecikdirilmiş gün üçü

In [None]:
# --- 3. Evaluation ---
print("\n--- 3. Evaluation ---")

# Evaluation needs (question, gold chunk ids, gold extractive answer, gold
# generative answer) tuples. No such annotations exist for eqanun, so a single
# mock example is assembled here.
# THIS IS A SIMPLIFIED DEMONSTRATION. Proper evaluation needs a curated dataset.
#
# The "gold" chunk ids are the top two hits of the earlier retrieval test:
# Sentence-BERT's when available, otherwise TF-IDF's, otherwise none.
# NOTE: because the gold ids are taken from a retriever's own output, that
# retriever's scores below are favourable by construction.
gold_seed_docs = locals().get('sbert_docs') or locals().get('tfidf_docs') or []
mock_relevant_doc_ids_for_query1 = [hit['doc_id'] for hit in gold_seed_docs[:2]]


eval_data = [
    {
        "question": sample_query_az,
        # Placeholder gold labels and answers, written by hand.
        "gold_retrieved_doc_ids": mock_relevant_doc_ids_for_query1,
        "gold_answer_extractive": "Əmək müqaviləsinə xitam verilməsi üçün əsaslar Əmək Məcəlləsinin müvafiq maddələrində göstərilmişdir.",
        "gold_answer_generative": "Əmək müqaviləsinin ləğvi, tərəflərin razılığı, müddətin bitməsi və ya qanunvericilikdə nəzərdə tutulmuş digər əsaslarla mümkündür.",
        # Several acceptable references can be listed for ROUGE.
        "gold_references_generative": [
            "Əmək müqaviləsinin ləğvi, tərəflərin razılığı, müddətin bitməsi və ya qanunvericilikdə nəzərdə tutulmuş digər əsaslarla mümkündür.",
            "İşəgötürən və ya işçi tərəfindən əmək müqaviləsinə qanunda göstərilən qaydada xitam verilə bilər.",
        ],
    },
    # Further questions can be appended for a more robust evaluation.
]

# --- 3.a Evaluate Retrieval Component ---
print("\n--- 3.a Evaluate Retrieval Component ---")
# Metrics: Precision@K, Recall@K and MRR; each needs the relevant chunks per query.

def evaluate_retrieval(retriever_fn, eval_data, top_k_eval=5):
    """Score a retriever with Precision@K, Recall@K and MRR.

    Args:
        retriever_fn: Callable ``(query, top_k=...) -> (chunks, scores)``
            whose chunks are dicts with a ``'doc_id'`` key; a falsy value
            means no retriever is available.
        eval_data: Iterable of dicts with ``"question"`` and
            ``"gold_retrieved_doc_ids"``. Items without gold ids are skipped.
        top_k_eval: Number of results requested per query.

    Returns:
        Dict with keys ``"P@K"``, ``"R@K"`` and ``"MRR"`` holding the mean
        over scored queries (0 when nothing was scored). Precision divides
        by the number of distinct retrieved ids.
    """
    if not retriever_fn:
        print("Retriever function not available for evaluation.")
        return {"P@K": 0, "R@K": 0, "MRR": 0}

    per_query = {"P@K": [], "R@K": [], "MRR": []}

    for example in tqdm(eval_data, desc="Evaluating Retrieval"):
        query = example["question"]
        gold_ids = set(example["gold_retrieved_doc_ids"])
        if not gold_ids:
            # Nothing to compare against for this question.
            continue

        ranked_docs, _ = retriever_fn(query, top_k=top_k_eval)
        ranked_ids = [entry['doc_id'] for entry in ranked_docs]
        unique_ids = set(ranked_ids)

        hit_count = len(gold_ids & unique_ids)
        per_query["P@K"].append(hit_count / len(unique_ids) if unique_ids else 0)
        per_query["R@K"].append(hit_count / len(gold_ids))

        # Reciprocal rank of the first relevant result (ranks are 1-based);
        # 0.0 when no relevant chunk was returned.
        first_hit_rank = next(
            (rank for rank, doc_id in enumerate(ranked_ids, start=1) if doc_id in gold_ids),
            None,
        )
        per_query["MRR"].append(1.0 / first_hit_rank if first_hit_rank else 0.0)

    return {
        metric: np.mean(values) if values else 0
        for metric, values in per_query.items()
    }

# Evaluate TF-IDF Retriever
# Each retriever is evaluated only if it was built and at least one
# evaluation item carries gold chunk ids.
if not (tfidf_matrix is not None and tfidf_matrix.shape[0] > 0 and any(d['gold_retrieved_doc_ids'] for d in eval_data)):
    print("\nSkipping TF-IDF Retriever evaluation (no gold data or retriever not initialized).")
else:
    print("\nEvaluating TF-IDF Retriever:")
    tfidf_eval_results = evaluate_retrieval(retrieve_tfidf, eval_data, top_k_eval=5)
    for metric_name, metric_value in tfidf_eval_results.items():
        print(f"  {metric_name}: {metric_value:.4f}")


if not (sbert_model and corpus_embeddings is not None and corpus_embeddings.shape[0] > 0 and any(d['gold_retrieved_doc_ids'] for d in eval_data)):
    print("\nSkipping SBERT Retriever evaluation (no gold data or retriever not initialized).")
else:
    print("\nEvaluating Sentence-BERT Retriever:")
    sbert_eval_results = evaluate_retrieval(retrieve_sbert, eval_data, top_k_eval=5)
    for metric_name, metric_value in sbert_eval_results.items():
        print(f"  {metric_name}: {metric_value:.4f}")


# --- 3.b Evaluate End-to-End Open-Domain QA ---
print("\n--- 3.b Evaluate End-to-End Open-Domain QA ---")
# Metrics: Exact Match (EM) and F1 for extractive answers, ROUGE for generative ones.

# Helper for EM/F1 (from SQuAD evaluation script, simplified)
def normalize_answer(s):
    """Lower-case ``s``, strip punctuation and collapse whitespace.

    Adapted from the SQuAD normalization. The article-removal step is left
    out because Azerbaijani has no articles such as 'a', 'an' or 'the'.

    Two adjustments make it work for non-English text:

    * ``str.lower()`` turns "İ" into "i" followed by a combining dot
      (U+0307); that pair is folded to a plain "i" so "İşçi" and "işçi"
      compare equal.
    * Besides ``string.punctuation``, every character in a Unicode
      punctuation category (quotes such as « », dashes, ...) is removed.
    """
    import string
    import unicodedata

    ascii_punctuation = set(string.punctuation)
    lowered = s.lower().replace("i\u0307", "i")
    without_punctuation = ''.join(
        ch for ch in lowered
        if ch not in ascii_punctuation and not unicodedata.category(ch).startswith("P")
    )
    return ' '.join(without_punctuation.split())

def f1_score(prediction, ground_truth):
    """Return the token-level F1 between ``prediction`` and ``ground_truth``.

    Both strings are normalized and split on whitespace. Overlap is counted
    per occurrence (multiset intersection), as in the SQuAD evaluation, so
    repeated tokens are matched as many times as they appear on both sides.
    Returns 0.0 when the two share no token.
    """
    from collections import Counter

    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    # Counter & Counter keeps, for each token, the smaller of the two counts.
    common = sum((Counter(prediction_tokens) & Counter(ground_truth_tokens)).values())
    if common == 0:
        return 0.0
    precision = 1.0 * common / len(prediction_tokens)
    recall = 1.0 * common / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both strings are identical after normalization."""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))

# ROUGE metric loader
# Stays None when ROUGE cannot be loaded; generative evaluation checks for it.
rouge_metric = None
try:
    rouge_metric = load_metric("rouge")
except Exception as rouge_error:
    print(f"Failed to load ROUGE metric: {rouge_error}. Generative evaluation will be skipped.")

# Evaluate Extractive QA
# End-to-end extractive evaluation: retrieve, extract, compare with the gold
# answer using Exact Match and token F1.
if not (extractive_qa_pipeline and rag_retriever):
    print("\nSkipping Extractive QA System evaluation (pipeline or retriever not initialized).")
else:
    print("\nEvaluating Extractive QA System:")
    extractive_ems = []
    extractive_f1s = []
    for item in tqdm(eval_data, desc="Evaluating Extractive QA"):
        question = item["question"]
        gold_answer = item["gold_answer_extractive"]

        context, _ = rag_pipeline_retrieve_and_augment(question, rag_retriever)
        # A missing or blank context counts as an empty prediction.
        has_context = context != "No relevant documents found." and bool(context.strip())
        if has_context:
            predicted_answer_obj = answer_extractive(question, context)
        else:
            predicted_answer_obj = {"answer": ""}

        predicted_answer = predicted_answer_obj.get('answer', "")

        extractive_ems.append(exact_match_score(predicted_answer, gold_answer))
        extractive_f1s.append(f1_score(predicted_answer, gold_answer))

    avg_em = np.mean(extractive_ems) if extractive_ems else 0
    avg_f1 = np.mean(extractive_f1s) if extractive_f1s else 0
    print(f"  Exact Match (EM): {avg_em:.4f}")
    print(f"  F1 Score: {avg_f1:.4f}")


# Evaluate Generative QA
# End-to-end generative evaluation: retrieve, generate, score with ROUGE.
if not (generative_model and generative_tokenizer and rag_retriever and rouge_metric):
    print("\nSkipping Generative QA System evaluation (model, retriever, or ROUGE metric not initialized).")
else:
    print("\nEvaluating Generative QA System:")
    all_predictions_gen = []
    all_references_gen = []  # one list of reference answers per prediction

    for item in tqdm(eval_data, desc="Evaluating Generative QA"):
        question = item["question"]
        # Use the list of references when given, else the single gold answer.
        gold_references = item.get("gold_references_generative", [item["gold_answer_generative"]])

        context, _ = rag_pipeline_retrieve_and_augment(question, rag_retriever)
        # A missing or blank context counts as an empty prediction.
        has_context = context != "No relevant documents found." and bool(context.strip())
        predicted_answer = answer_generative(question, context) if has_context else ""

        all_predictions_gen.append(predicted_answer)
        all_references_gen.append(gold_references)

    if not (all_predictions_gen and all_references_gen):
        print("  Not enough data to compute ROUGE scores.")
    else:
        rouge_results = rouge_metric.compute(predictions=all_predictions_gen, references=all_references_gen)
        print(f"  ROUGE Scores: ")
        for rouge_name, rouge_value in rouge_results.items():
            print(f"    {rouge_name}: {rouge_value:.4f}")


print("\n--- End of Part 2: Open-Domain Question Answering System ---")

# Closing demonstration: run one fresh question through the whole pipeline.
if not rag_retriever:
    print("\nSkipping example query as RAG retriever is not available.")
else:
    print("\n--- Example Query for Full RAG System ---")
    # English: "What documents are required for divorce?"
    new_question_az = "Boşanma üçün hansı sənədlər tələb olunur?"

    print(f"Question: {new_question_az}")

    # Step 1: retrieve supporting chunks and build the context.
    context_for_new_q, retrieved_docs_new_q = rag_pipeline_retrieve_and_augment(
        new_question_az,
        rag_retriever,
        top_k_retrieval=3,
    )
    print(f"\nRetrieved {len(retrieved_docs_new_q)} documents. Context (first 200 chars): {context_for_new_q[:200]}...")

    # Step 2: extractive answer, when that model is loaded.
    if extractive_qa_pipeline:
        ext_ans_new_q = answer_extractive(new_question_az, context_for_new_q)
        print(f"\nExtractive Answer: {ext_ans_new_q.get('answer', 'N/A')} (Score: {ext_ans_new_q.get('score', 0):.2f})")

    # Step 3: generative answer, when that model is loaded.
    if generative_model:
        gen_ans_new_q = answer_generative(new_question_az, context_for_new_q)
        print(f"\nGenerative Answer: {gen_ans_new_q}")


--- 3. Evaluation ---

--- 3.a Evaluate Retrieval Component ---

Evaluating TF-IDF Retriever:


Evaluating Retrieval:   0%|          | 0/1 [00:00<?, ?it/s]

  P@K: 0.0000
  R@K: 0.0000
  MRR: 0.0000

Evaluating Sentence-BERT Retriever:


Evaluating Retrieval:   0%|          | 0/1 [00:00<?, ?it/s]

  P@K: 0.4000
  R@K: 1.0000
  MRR: 1.0000

--- 3.b Evaluate End-to-End Open-Domain QA ---


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


Skipping Extractive QA System evaluation (pipeline or retriever not initialized).

Skipping Generative QA System evaluation (model, retriever, or ROUGE metric not initialized).

--- End of Part 2: Open-Domain Question Answering System ---

--- Example Query for Full RAG System ---
Question: Boşanma üçün hansı sənədlər tələb olunur?

Retrieved 3 documents. Context (first 200 chars): 12. Boşanma prosesində ər - arvadın ümumi əmlakının bölünməsi məsələsi də həll olunarsa, məhkəmə hər şeydən əvvəl hansı əmlakın bölünməli olmasını, bu əmlakın nə vaxt əldə edilməsini, ümumi mülkiyyətd...
