# ЁЯУШ Hindi QA using FastText

In [None]:
# тЬЕ Install Required Libraries
!pip install -q nltk rouge-score scikit-learn fasttext

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ[0m [32m73.4/73.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


In [None]:
# тЬЕ Imports
import json, csv, re, gzip
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/M Tech/Sem3")

Mounted at /content/drive


In [None]:
# тЬЕ Load and Flatten SQuAD-style Hindi QA Dataset
def load_and_flatten(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    samples = []
    for domain in data['domains']:
        for ctx in domain['contexts']:
            context = ctx['context']
            for qa in ctx['qas']:
                question = qa['question']
                answer_text = qa['answer']
                answer_start = context.find(answer_text)
                if answer_start == -1:
                    continue
                samples.append({
                    'id': qa['id'],
                    'context': context,
                    'question': question,
                    'answer_text': answer_text,
                    'answer_start': answer_start
                })
    return samples

In [None]:
# тЬЕ Load FastText Embeddings
def load_fasttext_vectors(path, vocab_limit=500000):
    embeddings = {}
    open_func = gzip.open if path.endswith('.gz') else open
    with open_func(path, 'rt', encoding='utf-8') as f:
        next(f)
        for i, line in enumerate(f):
            if i >= vocab_limit:
                break
            parts = line.strip().split()
            word = parts[0]
            vec = np.array(parts[1:], dtype='float32')
            embeddings[word] = vec
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

In [None]:
# тЬЕ Hindi Sentence Tokenizer
def hindi_sent_tokenize(text):
    sentence_endings = re.compile(r'(?<=[ред!?])\s+')
    sentences = sentence_endings.split(text.strip())
    return [s.strip() for s in sentences if s.strip()]

In [None]:
# тЬЕ Sentence to Vector
def sentence_to_vec(sentence, embeddings, dim=300):
    words = sentence.strip().split()
    vecs = [embeddings[word] for word in words if word in embeddings]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

In [None]:
# тЬЕ Get Best Matching Sentence as Answer
def get_best_answer(context, question, embeddings, dim=300):
    sentences = hindi_sent_tokenize(context)
    q_vec = sentence_to_vec(question, embeddings, dim)
    s_vecs = [sentence_to_vec(s, embeddings, dim) for s in sentences]
    if not any(np.any(v) for v in s_vecs):
        return ""
    sims = cosine_similarity([q_vec], s_vecs)[0]
    return sentences[int(np.argmax(sims))]

In [None]:
# тЬЕ Evaluation Metrics
def calculate_metrics(true_answer, predicted_answer):
    smooth = SmoothingFunction().method1
    bleu = sentence_bleu([true_answer.split()], predicted_answer.split(), smoothing_function=smooth)
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l = scorer.score(true_answer, predicted_answer)['rougeL'].fmeasure
    true_words = set(true_answer.split())
    pred_words = set(predicted_answer.split())
    common = true_words.intersection(pred_words)
    precision = len(common) / len(pred_words) if pred_words else 0
    recall = len(common) / len(true_words) if true_words else 0
    f1 = 2 * precision * recall / (precision + recall + 1e-8) if (precision + recall) > 0 else 0
    return {"BLEU": bleu, "ROUGE-L": rouge_l, "F1": f1}

In [None]:
# тЬЕ Predict and Save Results + Metrics
def predict_and_evaluate(samples, embeddings, save_prefix="predictions_fasttext", dim=300):
    predictions = []
    all_bleu, all_rouge, all_f1 = [], [], []
    for i, sample in enumerate(samples):
        question, context, true_answer = sample['question'], sample['context'], sample['answer_text']
        predicted_answer = get_best_answer(context, question, embeddings, dim)
        scores = calculate_metrics(true_answer, predicted_answer)
        predictions.append({
            "id": sample['id'],
            "question": question,
            "context": context,
            "true_answer": true_answer,
            "predicted_answer": predicted_answer,
            "bleu": scores["BLEU"],
            "rouge_l": scores["ROUGE-L"],
            "f1": scores["F1"]
        })
        all_bleu.append(scores["BLEU"])
        all_rouge.append(scores["ROUGE-L"])
        all_f1.append(scores["F1"])
        if i % 100 == 0:
            print(f"[{i}/{len(samples)}] Q: {question}\nтЖТ Predicted: {predicted_answer}\n")
    # Save to JSON
    with open(f"{save_prefix}.json", "w", encoding="utf-8") as jf:
        json.dump(predictions, jf, indent=2, ensure_ascii=False)
    # Save to CSV
    with open(f"{save_prefix}.csv", "w", encoding="utf-8", newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=predictions[0].keys())
        writer.writeheader()
        writer.writerows(predictions)
    print("\nЁЯУК Average Scores:")
    print(f"BLEU: {np.mean(all_bleu):.4f}, ROUGE-L: {np.mean(all_rouge):.4f}, F1: {np.mean(all_f1):.4f}")
    return predictions

In [None]:
# Load dataset and embeddings
val_data = load_and_flatten("Dataset/validation.json")
embeddings = load_fasttext_vectors("Embeddings/cc.hi.300.vec")

# Run prediction + metrics
predict_and_evaluate(val_data, embeddings, save_prefix="Embeddings/val_predictions_fasttext")


Loaded 500000 word vectors.
[0/395] Q: рдХреНрдпрд╛ рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪрдиреЗ рдХреЗ рд▓рд┐рдП рд╕рд╛рд░реНрд╡рдЬрдирд┐рдХ рдкрд░рд┐рд╡рд╣рди рдЙрдкрд▓рдмреНрдз рд╣реИ?
тЖТ Predicted: рд╡рд╛рд░рд╛рдгрд╕реА рд╢рд╣рд░ рдХреЗ рдкреНрд░рдореБрдЦ рд╕реНрдерд╛рдиреЛрдВ рд╕реЗ рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪрдиреЗ рдХреЗ рд▓рд┐рдП рд╡рд┐рднрд┐рдиреНрди рд╕рд╛рд░реНрд╡рдЬрдирд┐рдХ рдкрд░рд┐рд╡рд╣рди рд╡рд┐рдХрд▓реНрдк рдЙрдкрд▓рдмреНрдз рд╣реИрдВред

[100/395] Q: рдЕрдЧрд╕реНрддреНрдпреЗрд╢реНрд╡рд░ рдордВрджрд┐рд░ рдмрдирд╛рд░рд╕ рд░реЗрд▓рд╡реЗ рд╕реНрдЯреЗрд╢рди рд╕реЗ рдХрд┐рддрдиреА рджреВрд░реА рдкрд░ рд╣реИ?
тЖТ Predicted: рдЕрдЧрд╕реНрддреНрдпреЗрд╢реНрд╡рд░ рдордВрджрд┐рд░ рдмрдирд╛рд░рд╕ рд░реЗрд▓рд╡реЗ рд╕реНрдЯреЗрд╢рди рд╕реЗ 5 рдХрд┐рд▓реЛрдореАрдЯрд░ рдХреА рджреВрд░реА рдкрд░ рд╕реНрдерд┐рдд рд╣реИред

[200/395] Q: рдХреНрдпрд╛ рджреБрд░реНрдЧ рд╡рд┐рдирд╛рдпрдХ рдордВрджрд┐рд░ рдореЗрдВ рдЧрдгреЗрд╢ рдЪрддреБрд░реНрдереА рдХр

[{'id': 'kund_980',
  'question': 'рдХреНрдпрд╛ рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪрдиреЗ рдХреЗ рд▓рд┐рдП рд╕рд╛рд░реНрд╡рдЬрдирд┐рдХ рдкрд░рд┐рд╡рд╣рди рдЙрдкрд▓рдмреНрдз рд╣реИ?',
  'context': 'рд╣рд╛рдБ, рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪрдиреЗ рдХреЗ рд▓рд┐рдП рдмрд╕, рдСрдЯреЛ-рд░рд┐рдХреНрд╢рд╛ рдФрд░ рдЯреИрдХреНрд╕реА рдХреА рд╕реБрд╡рд┐рдзрд╛ рдЙрдкрд▓рдмреНрдз рд╣реИред рд╡рд╛рд░рд╛рдгрд╕реА рд╢рд╣рд░ рдХреЗ рдкреНрд░рдореБрдЦ рд╕реНрдерд╛рдиреЛрдВ рд╕реЗ рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪрдиреЗ рдХреЗ рд▓рд┐рдП рд╡рд┐рднрд┐рдиреНрди рд╕рд╛рд░реНрд╡рдЬрдирд┐рдХ рдкрд░рд┐рд╡рд╣рди рд╡рд┐рдХрд▓реНрдк рдЙрдкрд▓рдмреНрдз рд╣реИрдВред рд╢реНрд░рджреНрдзрд╛рд▓реБ рдФрд░ рдкрд░реНрдпрдЯрдХ рдЗрди рд╡рд╛рд╣рдиреЛрдВ рдХрд╛ рдЙрдкрдпреЛрдЧ рдХрд░рдХреЗ рдЖрд░рд╛рдо рд╕реЗ рдХреНрд░реАрдВ рдХреБрдВрдб рддрдХ рдкрд╣реБрдБрдЪ рд╕рдХрддреЗ рд╣реИрдВред рдСрдЯреЛ-рд░рд┐рдХреНрд╢рд╛ рдФрд░ рдЯреИрдХреНрд╕реА рд╡рд┐рд╢реЗрд╖ рд░реВрдк рд╕реЗ

In [None]:
def evaluate_only(samples, embeddings, dim=300):
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from rouge_score import rouge_scorer

    def calculate_metrics(true_answer, predicted_answer):
        smooth = SmoothingFunction().method1
        bleu = sentence_bleu([true_answer.split()], predicted_answer.split(), smoothing_function=smooth)
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        rouge_l = scorer.score(true_answer, predicted_answer)['rougeL'].fmeasure
        true_words = set(true_answer.split())
        pred_words = set(predicted_answer.split())
        common = true_words.intersection(pred_words)
        if not true_words or not pred_words:
            return {"BLEU": 0.0, "ROUGE-L": 0.0, "F1": 0.0}
        precision = len(common) / len(pred_words)
        recall = len(common) / len(true_words)
        f1 = 2 * precision * recall / (precision + recall + 1e-8)
        return {"BLEU": bleu, "ROUGE-L": rouge_l, "F1": f1}

    bleu_scores, rouge_scores, f1_scores = [], [], []

    for i, sample in enumerate(samples):
        context = sample["context"]
        question = sample["question"]
        true_answer = sample["answer_text"]
        pred_answer = get_best_answer(context, question, embeddings, dim)
        scores = calculate_metrics(true_answer, pred_answer)
        bleu_scores.append(scores["BLEU"])
        rouge_scores.append(scores["ROUGE-L"])
        f1_scores.append(scores["F1"])

    # Final averages
    print("ЁЯУК Average Scores:")
    print(f"BLEU:     {np.mean(bleu_scores):.4f}")
    print(f"ROUGE-L:  {np.mean(rouge_scores):.4f}")
    print(f"F1 Score: {np.mean(f1_scores):.4f}")


In [None]:
# Load your validation data and FastText embeddings
val_data = load_and_flatten("Dataset/validation.json")
embeddings = load_fasttext_vectors("Embeddings/cc.hi.300.vec")

# Just print evaluation scores
evaluate_only(val_data, embeddings)

Loaded 500000 word vectors.
ЁЯУК Average Scores:
BLEU:     0.8402
ROUGE-L:  0.4017
F1 Score: 0.8874
