# 📘 Hindi QA using FastText

In [None]:
# тЬЕ Install Required Libraries
!pip install -q nltk rouge-score scikit-learn fasttext

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ[0m [32m73.4/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


In [None]:
# тЬЕ Imports
import json, csv, re, gzip
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
# Mount Google Drive so the files stored there become reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; the relative paths used later
# in the notebook ("Dataset/...", "Embeddings/...") are resolved against it.
import os
os.chdir("/content/drive/MyDrive/M Tech/Sem3")

Mounted at /content/drive


In [None]:
# Load and flatten the SQuAD-style Hindi QA dataset
def load_and_flatten(filepath):
    """Read a nested QA JSON file and return one flat record per usable question.

    The JSON document is walked as ``data['domains'][*]['contexts'][*]``; each
    context entry provides a ``context`` string and a ``qas`` list whose items
    carry ``id``, ``question`` and ``answer``.

    Args:
        filepath: Path of the UTF-8 encoded JSON file.

    Returns:
        A list of dicts with the keys ``id``, ``context``, ``question``,
        ``answer_text`` and ``answer_start``. ``answer_start`` is the character
        offset of the first occurrence of the answer inside its context.
        Questions whose answer does not occur verbatim in the context are
        left out.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        dataset = json.load(handle)

    flattened = []
    entries = (entry for domain in dataset['domains'] for entry in domain['contexts'])
    for entry in entries:
        passage = entry['context']
        for qa in entry['qas']:
            query, gold = qa['question'], qa['answer']
            offset = passage.find(gold)
            # str.find reports a missing answer as -1; only located answers are kept.
            if offset >= 0:
                record = {
                    'id': qa['id'],
                    'context': passage,
                    'question': query,
                    'answer_text': gold,
                    'answer_start': offset,
                }
                flattened.append(record)
    return flattened

In [None]:
# Load FastText embeddings
def load_fasttext_vectors(path, vocab_limit=500000):
    """Load word vectors from a text-format vector file into a dict.

    The first line of the file is always skipped (it is treated as the header
    line); every following line is read as ``<word> <v1> <v2> ...`` separated
    by whitespace.

    Args:
        path: Path of the vector file. A name ending in ``.gz`` is opened
            with gzip, anything else as plain text; both are read as UTF-8.
        vocab_limit: Maximum number of lines read after the header.
            ``None`` reads the whole file.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array. Blank lines
        and lines that hold a word but no vector components are skipped.

    Raises:
        ValueError: if a vector component cannot be parsed as a number.
    """
    embeddings = {}
    open_func = gzip.open if path.endswith('.gz') else open
    with open_func(path, 'rt', encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for i, line in enumerate(f):
            if vocab_limit is not None and i >= vocab_limit:
                break
            parts = line.split()
            if len(parts) < 2:
                # Blank line, or a word without components: nothing usable to store.
                continue
            embeddings[parts[0]] = np.array(parts[1:], dtype='float32')
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

In [None]:
# Hindi sentence tokenizer
def hindi_sent_tokenize(text):
    """Split text into sentences at whitespace that follows a sentence terminator.

    A sentence terminator is the Devanagari danda (U+0964), ``!`` or ``?``.
    The terminator stays attached to the sentence it ends; the whitespace
    after it is dropped.

    Args:
        text: The text to split.

    Returns:
        List of non-empty, stripped sentences in their original order
        (empty list for empty or whitespace-only input).
    """
    # The danda is spelled as an escape so the pattern stays correct even if
    # this file is re-encoded; a mis-decoded literal would silently stop
    # matching Hindi sentence ends.
    sentence_endings = re.compile(r'(?<=[\u0964!?])\s+')
    sentences = sentence_endings.split(text.strip())
    return [s.strip() for s in sentences if s.strip()]

In [None]:
# Sentence to vector
def sentence_to_vec(sentence, embeddings, dim=300):
    """Average the embeddings of the whitespace-separated words of a sentence.

    Args:
        sentence: Text to embed; it is stripped and split on whitespace.
        embeddings: Mapping from word to vector.
        dim: Length of the zero vector returned when no word is known.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``embeddings``, or ``np.zeros(dim)`` when none of the words is found.
    """
    known_vectors = []
    for token in sentence.strip().split():
        if token in embeddings:
            known_vectors.append(embeddings[token])

    if not known_vectors:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

In [None]:
# Pick the context sentence closest to the question as the answer
def get_best_answer(context, question, embeddings, dim=300):
    """Return the context sentence whose averaged embedding best matches the question.

    The context is split with ``hindi_sent_tokenize``; the question and every
    sentence are embedded with ``sentence_to_vec`` and compared by cosine
    similarity.

    Args:
        context: Passage to search.
        question: Question text.
        embeddings: Mapping from word to vector.
        dim: Vector length handed to ``sentence_to_vec``.

    Returns:
        The sentence with the highest similarity to the question (the first
        one on ties), or ``""`` when no sentence has a non-zero vector, which
        includes a context that yields no sentences.
    """
    candidates = hindi_sent_tokenize(context)
    question_vec = sentence_to_vec(question, embeddings, dim)
    candidate_vecs = [sentence_to_vec(candidate, embeddings, dim) for candidate in candidates]

    # Without a single non-zero sentence vector there is nothing to rank.
    if all(not np.any(vec) for vec in candidate_vecs):
        return ""

    similarity_row = cosine_similarity([question_vec], candidate_vecs)[0]
    winner = int(np.argmax(similarity_row))
    return candidates[winner]

In [None]:
# Evaluation metrics
class _WhitespaceTokenizer:
    """Tokenizer for ``rouge_score`` that splits on whitespace and nothing else."""

    def tokenize(self, text):
        return text.split()


def calculate_metrics(true_answer, predicted_answer):
    """Score a predicted answer against the reference answer.

    All three metrics work on whitespace-separated tokens.

    Args:
        true_answer: Reference answer text.
        predicted_answer: Predicted answer text.

    Returns:
        dict with the keys ``"BLEU"`` (smoothed sentence-level BLEU),
        ``"ROUGE-L"`` (ROUGE-L F-measure) and ``"F1"`` (F1 over the sets of
        distinct tokens of both texts).
    """
    smooth = SmoothingFunction().method1
    bleu = sentence_bleu([true_answer.split()], predicted_answer.split(), smoothing_function=smooth)
    # rouge_score's default tokenizer keeps only ASCII letters and digits, so
    # Devanagari text would be discarded before scoring. A whitespace tokenizer
    # keeps ROUGE-L on the same tokens as BLEU and F1. The stemmer option is
    # dropped: it is an English stemmer and only applies to the default tokenizer.
    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_WhitespaceTokenizer())
    rouge_l = scorer.score(true_answer, predicted_answer)['rougeL'].fmeasure
    true_words = set(true_answer.split())
    pred_words = set(predicted_answer.split())
    common = true_words.intersection(pred_words)
    precision = len(common) / len(pred_words) if pred_words else 0
    recall = len(common) / len(true_words) if true_words else 0
    f1 = 2 * precision * recall / (precision + recall + 1e-8) if (precision + recall) > 0 else 0
    return {"BLEU": bleu, "ROUGE-L": rouge_l, "F1": f1}

In [None]:
# Predict and save results + metrics
def predict_and_evaluate(samples, embeddings, save_prefix="predictions_fasttext", dim=300):
    """Predict an answer for every sample, score it, and save everything to disk.

    For each sample the answer is chosen with ``get_best_answer`` and scored
    with ``calculate_metrics``. All rows are written to ``<save_prefix>.json``
    and ``<save_prefix>.csv``; the average scores are printed at the end.
    Every 100th sample is echoed as a progress indicator.

    Args:
        samples: Sequence of dicts with the keys ``id``, ``question``,
            ``context`` and ``answer_text``.
        embeddings: Mapping from word to vector.
        save_prefix: Path prefix (without extension) of the two output files.
        dim: Vector length handed to ``get_best_answer``.

    Returns:
        List with one dict per sample holding ``id``, ``question``,
        ``context``, ``true_answer``, ``predicted_answer``, ``bleu``,
        ``rouge_l`` and ``f1``.
    """
    # Declared up front so the CSV header can be written even when there are
    # no rows (previously the header was read from the first row).
    fieldnames = ["id", "question", "context", "true_answer",
                  "predicted_answer", "bleu", "rouge_l", "f1"]
    predictions = []
    for i, sample in enumerate(samples):
        question, context, true_answer = sample['question'], sample['context'], sample['answer_text']
        predicted_answer = get_best_answer(context, question, embeddings, dim)
        scores = calculate_metrics(true_answer, predicted_answer)
        predictions.append({
            "id": sample['id'],
            "question": question,
            "context": context,
            "true_answer": true_answer,
            "predicted_answer": predicted_answer,
            "bleu": scores["BLEU"],
            "rouge_l": scores["ROUGE-L"],
            "f1": scores["F1"]
        })
        if i % 100 == 0:
            print(f"[{i}/{len(samples)}] Q: {question}\n→ Predicted: {predicted_answer}\n")
    # Save to JSON
    with open(f"{save_prefix}.json", "w", encoding="utf-8") as jf:
        json.dump(predictions, jf, indent=2, ensure_ascii=False)
    # Save to CSV
    with open(f"{save_prefix}.csv", "w", encoding="utf-8", newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(predictions)
    print("\n📊 Average Scores:")
    if predictions:
        avg_bleu = np.mean([row["bleu"] for row in predictions])
        avg_rouge = np.mean([row["rouge_l"] for row in predictions])
        avg_f1 = np.mean([row["f1"] for row in predictions])
        print(f"BLEU: {avg_bleu:.4f}, ROUGE-L: {avg_rouge:.4f}, F1: {avg_f1:.4f}")
    else:
        # A mean over zero samples is undefined; say so instead of printing nan.
        print("No samples to evaluate.")
    return predictions

In [None]:
# Load dataset and embeddings
test_data = load_and_flatten("Dataset/test-A-gold.json")
embeddings = load_fasttext_vectors("Embeddings/cc.hi.300.vec")

# Run prediction + metrics. The result is bound to a name so the cell does not
# echo the complete prediction list (every context included) as its output;
# the same rows are already saved to the JSON and CSV files.
test_predictions = predict_and_evaluate(test_data, embeddings, save_prefix="Embeddings/val_predictions_fasttext")


Loaded 500000 word vectors.
[0/362] Q: рдкрдВрдЪрдЧрдВрдЧрд╛ рдХреБрдВрдб рд╡рд╛рд░рд╛рдгрд╕реА рдореЗрдВ рдХрд╣рд╛рдБ рд╕реНрдерд┐рдд рд╣реИ?
тЖТ Predicted: рдкрдВрдЪрдЧрдВрдЧрд╛ рдХреБрдВрдб рд╡рд╛рд░рд╛рдгрд╕реА рдХреЗ рдкрд╡рд┐рддреНрд░ рдкрдВрдЪрдЧрдВрдЧрд╛ рдШрд╛рдЯ рдХреЗ рдкрд╛рд╕ рд╕реНрдерд┐рдд рд╣реИред

[100/362] Q: рд╢реНрд░реА рддрд╛рд░рдХреЗрд╢реНрд╡рд░ рдорд╣рд╛рджреЗрд╡ рдордВрджрд┐рд░ рд╡рд╛рд░рд╛рдгрд╕реА рд░реЗрд▓рд╡реЗ рд╕реНрдЯреЗрд╢рди (рдХреИрдВрдЯ) рд╕реЗ рдХрд┐рддрдиреА рджреВрд░ рд╣реИ?
тЖТ Predicted: рд╢реНрд░реА рддрд╛рд░рдХреЗрд╢реНрд╡рд░ рдорд╣рд╛рджреЗрд╡ рдордВрджрд┐рд░ рд╡рд╛рд░рд╛рдгрд╕реА рд░реЗрд▓рд╡реЗ рд╕реНрдЯреЗрд╢рди (рдХреИрдВрдЯ) рд╕реЗ рд▓рдЧрднрдЧ 5.1 рдХрд┐рд▓реЛрдореАрдЯрд░ рджреВрд░ рд╣реИред

[200/362] Q: рд╢реНрд░реА рдХрд╛рд▓ рднреИрд░рд╡ рдордВрджрд┐рд░ рдореЗрдВ рдХреНрдпрд╛ рдЪреЭрд╛рдпрд╛ рдЬрд╛рддрд╛ рд╣реИ?
тЖТ Predicted: рд╢реНрд░реА рдХрд╛рд▓ рднреИрд░рд╡ рдордВрджрд┐рд░ рдореЗрдВ рдЪрдврд╝рд╛рд╡реЗ рдХреЗ рд░реВрдк рдореЗрдВ 

[{'id': 'kund_748',
  'question': 'рдкрдВрдЪрдЧрдВрдЧрд╛ рдХреБрдВрдб рд╡рд╛рд░рд╛рдгрд╕реА рдореЗрдВ рдХрд╣рд╛рдБ рд╕реНрдерд┐рдд рд╣реИ?',
  'context': 'рдкрдВрдЪрдЧрдВрдЧрд╛ рдХреБрдВрдб рд╡рд╛рд░рд╛рдгрд╕реА рдХреЗ рдкрд╡рд┐рддреНрд░ рдкрдВрдЪрдЧрдВрдЧрд╛ рдШрд╛рдЯ рдХреЗ рдкрд╛рд╕ рд╕реНрдерд┐рдд рд╣реИред рдпрд╣ рд╕реНрдерд╛рди рдзрд╛рд░реНрдорд┐рдХ рдФрд░ рдРрддрд┐рд╣рд╛рд╕рд┐рдХ рдорд╣рддреНрд╡ рд░рдЦрддрд╛ рд╣реИ, рдХреНрдпреЛрдВрдХрд┐ рдЗрд╕реЗ рдкрд╛рдБрдЪ рдкреНрд░рдореБрдЦ рдирджрд┐рдпреЛрдВ рдЧрдВрдЧрд╛, рдпрдореБрдирд╛, рд╕рд░рд╕реНрд╡рддреА, рдХрд┐рд░рдгрд╛ рдФрд░ рдзреВрддрдкрд╛рдкрд╛ рдХреЗ рдкреНрд░рддреАрдХрд╛рддреНрдордХ рд╕рдВрдЧрдо рд╕реНрдерд▓ рдХреЗ рд░реВрдк рдореЗрдВ рдорд╛рдиреНрдпрддрд╛ рдкреНрд░рд╛рдкреНрдд рд╣реИред рдХреБрдВрдб рдХреЗ рдкрд╛рд╕ рдХрдИ рдкреБрд░рд╛рдиреЗ рдордВрджрд┐рд░ рдореМрдЬреВрдж рд╣реИрдВ, рдЬреЛ рдЗрд╕реЗ рддреАрд░реНрдердпрд╛рддреНрд░рд┐рдпреЛрдВ рдХреЗ рд▓рд┐рдП рдПрдХ рдкрд╡рд┐рддреНрд░ рд╕реНрдерд▓ рдмрдирд╛рддреЗ рд╣реИрдВред

In [None]:
def evaluate_only(samples, embeddings, dim=300):
    """Predict an answer for every sample and print the average scores.

    Nothing is saved and nothing is returned. Answers are chosen with
    ``get_best_answer``; each one is scored with BLEU, ROUGE-L and a
    token-set F1, all on whitespace-separated tokens. A sample whose
    reference or prediction has no tokens scores 0 on all three metrics.

    Args:
        samples: Iterable of dicts with the keys ``context``, ``question``
            and ``answer_text``.
        embeddings: Mapping from word to vector.
        dim: Vector length handed to ``get_best_answer``.
    """
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from rouge_score import rouge_scorer

    class _WhitespaceTokenizer:
        """Tokenizer for ``rouge_score`` that splits on whitespace and nothing else."""

        def tokenize(self, text):
            return text.split()

    # Built once instead of once per sample.
    smooth = SmoothingFunction().method1
    # rouge_score's default tokenizer keeps only ASCII letters and digits, so
    # Devanagari text would be discarded before scoring. A whitespace tokenizer
    # keeps ROUGE-L on the same tokens as BLEU and F1.
    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_WhitespaceTokenizer())

    def calculate_metrics(true_answer, predicted_answer):
        true_tokens = true_answer.split()
        pred_tokens = predicted_answer.split()
        if not true_tokens or not pred_tokens:
            return {"BLEU": 0.0, "ROUGE-L": 0.0, "F1": 0.0}
        bleu = sentence_bleu([true_tokens], pred_tokens, smoothing_function=smooth)
        rouge_l = scorer.score(true_answer, predicted_answer)['rougeL'].fmeasure
        true_words = set(true_tokens)
        pred_words = set(pred_tokens)
        common = true_words.intersection(pred_words)
        precision = len(common) / len(pred_words)
        recall = len(common) / len(true_words)
        # The epsilon keeps the division defined when there is no overlap.
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        return {"BLEU": bleu, "ROUGE-L": rouge_l, "F1": f1}

    bleu_scores, rouge_scores, f1_scores = [], [], []

    for sample in samples:
        context = sample["context"]
        question = sample["question"]
        true_answer = sample["answer_text"]
        pred_answer = get_best_answer(context, question, embeddings, dim)
        scores = calculate_metrics(true_answer, pred_answer)
        bleu_scores.append(scores["BLEU"])
        rouge_scores.append(scores["ROUGE-L"])
        f1_scores.append(scores["F1"])

    # Final averages
    print("📊 Average Scores:")
    if not bleu_scores:
        # A mean over zero samples is undefined; say so instead of printing nan.
        print("No samples to evaluate.")
        return
    print(f"BLEU:     {np.mean(bleu_scores):.4f}")
    print(f"ROUGE-L:  {np.mean(rouge_scores):.4f}")
    print(f"F1 Score: {np.mean(f1_scores):.4f}")


In [None]:
# Reuse the test data and FastText embeddings if an earlier cell already loaded
# them (parsing the vector file is slow); load them only when this cell is run
# on its own in a fresh kernel.
if "test_data" not in globals():
    test_data = load_and_flatten("Dataset/test-A-gold.json")
if "embeddings" not in globals():
    embeddings = load_fasttext_vectors("Embeddings/cc.hi.300.vec")

# Just print evaluation scores
evaluate_only(test_data, embeddings)

Loaded 500000 word vectors.
ЁЯУК Average Scores:
BLEU:     0.8770
ROUGE-L:  0.4604
F1 Score: 0.9180
