In [None]:
!pip install gensim nltk rouge-score scikit-learn



In [None]:
import json
import re
import nltk
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the Drive root so the relative "dataset/..." paths used by the
# loading cells resolve against /content/drive/MyDrive.
os.chdir("/content/drive/MyDrive")

In [None]:
# Fetch the NLTK 'punkt' tokenizer data; word_tokenize is called by the
# Word2Vec training and sentence-vector helpers defined below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# --- Split Hindi context using danda (।) ---
def split_hindi_sentences(text):
    """Break `text` into sentences at the Hindi danda (।).

    Runs of consecutive dandas count as a single separator. Each piece is
    whitespace-trimmed and blank pieces are dropped, so the returned
    sentences never contain the danda itself.
    """
    sentences = []
    for chunk in re.split(r'।+', text):
        trimmed = chunk.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

# --- Load Hindi QA Dataset and convert to DataFrame ---
def load_custom_format(filepath):
    """Load the nested Hindi QA JSON file into a flat DataFrame.

    The file is expected to look like
    ``{"domains": [{"contexts": [{"context": str, "qas": [{"id", "question", "answer"}]}]}]}``.

    Parameters
    ----------
    filepath : str or path-like
        Path to a UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        One row per QA pair with columns ``id``, ``context``, ``question``,
        ``answer_text`` and ``answer_start`` (character offset of the answer
        inside the context). The columns are present even when no row
        qualifies, so downstream ``df['context']`` lookups do not fail.

    Notes
    -----
    * QA pairs whose answer text does not occur verbatim in the context are
      skipped, so the frame can have fewer rows than the file has questions.
    * A missing or null ``answer`` (as in an unlabeled test file) is treated
      as the empty string; such rows are kept with ``answer_start == 0``,
      the same as an explicitly empty answer.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    columns = ['id', 'context', 'question', 'answer_text', 'answer_start']
    examples = []
    for domain in data['domains']:
        for context_obj in domain['contexts']:
            context = context_obj['context']
            for qa in context_obj['qas']:
                # Unlabeled files may omit the answer or store null; treat both as "".
                answer_text = qa.get('answer') or ''
                answer_start = context.find(answer_text)

                if answer_start == -1:
                    continue  # skip samples where answer not found in context

                examples.append({
                    'id': qa['id'],
                    'context': context,
                    'question': qa['question'],
                    'answer_text': answer_text,
                    'answer_start': answer_start
                })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(examples, columns=columns)

In [None]:
# --- Train Word2Vec model on all context sentences ---
def train_word2vec(df, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a Word2Vec model on every sentence found in ``df['context']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``context`` column of strings. Every row contributes
        its context, so a context that appears in several rows is fed to the
        trainer several times.
    vector_size, window, min_count, workers, seed
        Forwarded unchanged to ``gensim.models.Word2Vec``. The defaults
        reproduce the previously hard-coded configuration (``seed=1`` is
        gensim's own default).
        NOTE(review): gensim documents that fully reproducible training also
        requires ``workers=1`` -- confirm before relying on exact scores.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # Sentence-split each context on the danda, then word-tokenize each sentence.
    tokenized_sentences = [
        word_tokenize(sent)
        for context in df['context']
        for sent in split_hindi_sentences(context)
    ]
    return Word2Vec(
        sentences=tokenized_sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

# --- Generate sentence vector by averaging word vectors ---
def sentence_vector(sentence, model):
    """Embed `sentence` as the mean of its in-vocabulary word vectors.

    Tokens absent from ``model.wv`` are ignored. If no token is known, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in word_tokenize(sentence):
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# --- Cosine similarity between two dense vectors ---
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between `vec1` and `vec2`.

    A zero-length vector on either side yields 0 instead of dividing by zero.
    """
    norm_first = np.linalg.norm(vec1)
    if norm_first == 0:
        return 0

    norm_second = np.linalg.norm(vec2)
    if norm_second == 0:
        return 0

    return np.dot(vec1, vec2) / (norm_first * norm_second)

In [None]:
import nltk
# Also fetch the 'punkt_tab' resource.
# NOTE(review): presumably the installed NLTK version looks for 'punkt_tab'
# (rather than 'punkt') inside word_tokenize -- confirm against the NLTK release in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# --- Get best matching sentence from context for a question ---
def get_best_answer(context, question, model):
    """Pick the context sentence whose embedding is closest to the question.

    The context is split on the danda, every sentence and the question are
    embedded with `sentence_vector`, and the sentence with the highest cosine
    similarity is returned (the earliest one wins ties). An empty string is
    returned when no sentence scores above -1, e.g. for an empty context.
    """
    candidates = split_hindi_sentences(context)
    question_vec = sentence_vector(question, model)

    top_score = -1
    top_sentence = ""
    for candidate in candidates:
        score = cosine_similarity(question_vec, sentence_vector(candidate, model))
        if score > top_score:
            top_score, top_sentence = score, candidate
    return top_sentence

# --- Evaluate with F1, BLEU, ROUGE-L (fuzzy F1 included) ---
class _WhitespaceTokenizer:
    """Whitespace tokenizer handed to rouge_score.

    rouge_score's default tokenizer keeps only the characters [a-z0-9], which
    deletes every Devanagari token before scoring; splitting on whitespace
    keeps Hindi words intact.
    """

    def tokenize(self, text):
        # Lower-casing mirrors the default tokenizer for any Latin-script tokens.
        return text.lower().split()


def evaluate(df, predictions):
    """Print and return corpus-level averages of four answer-quality metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``answer_text`` column with the gold answers.
    predictions : iterable of str
        Predicted answers, positionally aligned with ``df['answer_text']``.

    Returns
    -------
    dict
        ``{'f1', 'bleu', 'rouge_l', 'exact_match'}`` mapped to their mean
        values, or an empty dict when there is nothing to evaluate.

    Metrics
    -------
    * Fuzzy F1: overlap of the *sets* of whitespace tokens.
    * BLEU: sentence-level BLEU with NLTK smoothing method 1.
    * ROUGE-L: F-measure over whitespace tokens.
    * Exact Match: 1 when the gold answer is a substring of the prediction.
      Predictions produced by splitting on the danda never contain it, so a
      gold answer that ends with "।" cannot match.
    """
    # The English stemmer is not applicable to Hindi, so only the tokenizer is configured.
    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_WhitespaceTokenizer())
    smooth_fn = SmoothingFunction().method1

    f1s, bleus, rouges, exacts = [], [], [], []

    for gold, pred in zip(df['answer_text'], predictions):
        gold_words = gold.split()
        pred_words = pred.split()

        # Fuzzy F1
        gold_tokens = set(gold_words)
        pred_tokens = set(pred_words)
        common = gold_tokens & pred_tokens
        if gold_tokens and pred_tokens:
            f1 = (2 * len(common)) / (len(gold_tokens) + len(pred_tokens))
        else:
            f1 = 0
        f1s.append(f1)

        # BLEU
        bleus.append(sentence_bleu([gold_words], pred_words, smoothing_function=smooth_fn))

        # ROUGE-L
        rouges.append(scorer.score(gold, pred)['rougeL'].fmeasure)

        # Exact Match (substring)
        exacts.append(int(gold in pred))

    print("\n--- Evaluation ---")
    if not f1s:
        # Avoid a ZeroDivisionError when the frame or the predictions are empty.
        print("No examples to evaluate.")
        return {}

    total = len(f1s)
    metrics = {
        'f1': sum(f1s) / total,
        'bleu': sum(bleus) / total,
        'rouge_l': sum(rouges) / total,
        'exact_match': sum(exacts) / total,
    }
    print(f"Fuzzy F1 Score:     {metrics['f1']:.4f}")
    print(f"BLEU Score:         {metrics['bleu']:.4f}")
    print(f"ROUGE-L Score:      {metrics['rouge_l']:.4f}")
    print(f"Exact Match Score:  {metrics['exact_match']:.4f}")
    return metrics

In [None]:
# --- Main execution: training split ---
if __name__ == "__main__":
    # Load the training split
    train_df = load_custom_format("dataset/train.json")

    # Train Word2Vec on the contexts of this same split
    model = train_word2vec(train_df)

    # Predict one answer sentence per (context, question) row
    predictions = [
        get_best_answer(context, question, model)
        for context, question in zip(train_df['context'], train_df['question'])
    ]

    # Show up to three predictions (fewer when the split is smaller)
    for i in range(min(3, len(predictions))):
        print(f"\nQ: {train_df.iloc[i]['question']}")
        print(f"A (True): {train_df.iloc[i]['answer_text']}")
        print(f"A (Pred): {predictions[i]}")

    # Evaluate predictions against the gold answers
    evaluate(train_df, predictions)


Q: भागीरथ कुंड पं. दीन दयाल उपाध्याय रेलवे स्टेशन से कितना किलोमीटर दूर है?
A (True): भागीरथ कुंड पं. दीन दयाल उपाध्याय रेलवे स्टेशन से 14.1 किलोमीटर दूर है।
A (Pred): भागीरथ कुंड पं. दीन दयाल उपाध्याय रेलवे स्टेशन से 14.1 किलोमीटर दूर है

Q: क्या मणिकर्णिका कुंड में शुद्ध पेय जल की सुविधा उपलब्ध है?
A (True): नहीं, मणिकर्णिका कुंड में शुद्ध पेय जल की सुविधा उपलब्ध नहीं है।
A (Pred): नहीं, मणिकर्णिका कुंड में शुद्ध पेय जल की सुविधा उपलब्ध नहीं है

Q: दुर्गा कुंड वाराणसी रेलवे स्टेशन (कैंट) से कितनी दूरी है?
A (True): दुर्गा कुंड वाराणसी रेलवे स्टेशन (कैंट) से लगभग 5.9 किलोमीटर दूर स्थित है।
A (Pred): दुर्गा कुंड वाराणसी रेलवे स्टेशन (कैंट) से लगभग 5.9 किलोमीटर दूर स्थित है

--- Evaluation ---
Fuzzy F1 Score:     0.8393
BLEU Score:         0.7923
ROUGE-L Score:      0.4123
Exact Match Score:  0.0017


In [None]:
# --- Main execution: validation split ---
if __name__ == "__main__":
    # Load the validation split
    val_df = load_custom_format("dataset/validation.json")

    # Train a fresh Word2Vec model on the validation contexts themselves
    model = train_word2vec(val_df)

    # Predict one answer sentence per (context, question) row
    predictions = [
        get_best_answer(context, question, model)
        for context, question in zip(val_df['context'], val_df['question'])
    ]

    # Show up to three predictions (fewer when the split is smaller)
    for i in range(min(3, len(predictions))):
        print(f"\nQ: {val_df.iloc[i]['question']}")
        print(f"A (True): {val_df.iloc[i]['answer_text']}")
        print(f"A (Pred): {predictions[i]}")

    # Evaluate predictions against the gold answers
    evaluate(val_df, predictions)

In [None]:
# --- Main execution: test-A split (with gold answers) ---
if __name__ == "__main__":
    # Load the labeled test-A split
    test_df = load_custom_format("dataset/test-A-gold.json")

    # Train a fresh Word2Vec model on the test-A contexts themselves
    model = train_word2vec(test_df)

    # Predict one answer sentence per (context, question) row
    predictions = [
        get_best_answer(context, question, model)
        for context, question in zip(test_df['context'], test_df['question'])
    ]

    # Show up to three predictions (fewer when the split is smaller)
    for i in range(min(3, len(predictions))):
        print(f"\nQ: {test_df.iloc[i]['question']}")
        print(f"A (True): {test_df.iloc[i]['answer_text']}")
        print(f"A (Pred): {predictions[i]}")

    # Evaluate predictions against the gold answers
    evaluate(test_df, predictions)

In [None]:
# --- Main execution: test-B split (predictions only, no evaluation) ---
if __name__ == "__main__":
    # Load the test-B split; note this rebinds test_df
    test_df = load_custom_format("dataset/test-B.json")

    # Train a fresh Word2Vec model on the test-B contexts themselves
    model = train_word2vec(test_df)

    # Predict one answer sentence per (context, question) row.
    # These predictions are what the later cells write back into the JSON file.
    predictions = [
        get_best_answer(context, question, model)
        for context, question in zip(test_df['context'], test_df['question'])
    ]

    # Show up to three predictions (fewer when the split is smaller);
    # gold answers are not printed for this split.
    for i in range(min(3, len(predictions))):
        print(f"\nQ: {test_df.iloc[i]['question']}")
        print(f"A (Pred): {predictions[i]}")


Q: आदिकेशव कुंड किन नदियों के संगम पर स्थित है और इसका धार्मिक महत्व क्या है?
A (Pred): आदिकेशव कुंड, जो वाराणसी के प्राचीन राजघाट के निकट स्थित है, वरुणा एवं गंगा नदियों के पावन संगम स्थल पर स्थित एक अत्यंत धार्मिक और सांस्कृतिक दृष्टि से महत्त्वपूर्ण स्थान है

Q: आदिकेशव कुंड का संबंध किस देवता से है और श्रद्धालु यहाँ क्यों आते हैं?
A (Pred): श्रद्धालु इस कुंड में स्नान कर आत्मशुद्धि की अभिलाषा रखते हैं तथा इसे वाराणसी की समृद्ध ऐतिहासिक एवं धार्मिक विरासत का अनिवार्य अंग मानते हैं

Q: आदिकेशव कुंड को वाराणसी की धार्मिक विरासत में कैसे देखा जाता है?
A (Pred): यह स्थल वाराणसी की ऐतिहासिक और धार्मिक धरोहर का हिस्सा है


In [None]:
# Placeholder for the raw test-B JSON document; it is assigned when the file is read.
json_data = None

In [None]:
# Read the raw test-B document again, keeping its nested structure so the
# predicted answers can be written back into it.
test_b_path = '/content/drive/MyDrive/dataset/test-B.json'
with open(test_b_path, 'r', encoding='utf-8') as test_b_file:
    json_data = json.load(test_b_file)

In [None]:
# Collect every QA dict in file order (domains -> contexts -> qas), the same
# order in which load_custom_format walks the file.
domains = json_data['domains']
all_qas = [
    qa
    for domain in domains
    for context in domain['contexts']
    for qa in context['qas']
]

# predictions is matched to the questions purely by position, so the two must
# line up one-to-one. load_custom_format drops QAs whose answer is not found in
# the context; if that happened, positional assignment would attach answers to
# the wrong questions. Fail before mutating anything instead.
if len(all_qas) != len(predictions):
    raise ValueError(
        f"Cannot align predictions with the JSON file: {len(all_qas)} QA entries "
        f"but {len(predictions)} predictions."
    )

for qa, predicted_answer in zip(all_qas, predictions):
    qa['answer'] = predicted_answer

# Number of answers written (kept for parity with the previous counter variable).
count = len(all_qas)

In [None]:
# Serialize the answered document to a JSON string.
# NOTE(review): modified_json is not read by any cell visible here (the file
# written later is dumped from json_data directly) -- confirm whether it is still needed.
modified_json = json.dumps(json_data)

In [None]:
import os
# Switch to the output folder for this model's results; create it first so
# chdir does not raise FileNotFoundError on a Drive where it does not exist yet.
os.makedirs("/content/drive/MyDrive/Word2Vec", exist_ok=True)
os.chdir("/content/drive/MyDrive/Word2Vec")

In [None]:
# Write the answered test-B document into the current working directory.
# ensure_ascii=False keeps the Hindi text readable instead of \u-escaped.
with open("Word2Vec-Test-B-answered.json", "w", encoding="utf-8") as jf:
    json.dump(json_data, jf, indent=2, ensure_ascii=False)