In [2]:
!pip install gensim nltk rouge-score

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m


In [4]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/M Tech/Sem3")

Mounted at /content/drive


In [1]:
import json
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
w2v_path = "Embeddings/cc.hi.300.vec"
w2v_model = KeyedVectors.load_word2vec_format(w2v_path)

In [7]:
def sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    word_vecs = [model[word] for word in words if word in model]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(model.vector_size)

In [8]:
def load_dataset(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    samples = []
    for domain in raw["domains"]:
        for item in domain["contexts"]:
            context = item["context"]
            for qa in item["qas"]:
                samples.append({
                    "id": qa["id"],
                    "context": context,
                    "question": qa["question"],
                    "answer": qa["answer"]
                })
    return pd.DataFrame(samples)


In [10]:
def predict_answer(context, question):
    sentences = sent_tokenize(context)
    sentence_vecs = [sentence_vector(sent, w2v_model) for sent in sentences]
    q_vec = sentence_vector(question, w2v_model)
    similarities = cosine_similarity([q_vec], sentence_vecs)[0]

    return sentences[np.argmax(similarities)]

def calculate_metrics(samples):

    smoothie = SmoothingFunction().method4
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    f1s, bleus, rougels = [], [], []

    for _, row in samples.iterrows():
        pred = row['predicted']
        true = row['answer']

        # F1
        pred_tokens = word_tokenize(pred)
        true_tokens = word_tokenize(true)
        common = set(pred_tokens) & set(true_tokens)
        if len(pred_tokens) == 0 or len(true_tokens) == 0:
            f1 = 0
        else:
            precision = len(common) / len(pred_tokens)
            recall = len(common) / len(true_tokens)
            f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1s.append(f1)

        # BLEU
        bleus.append(sentence_bleu([true_tokens], pred_tokens, smoothing_function=smoothie))

        # ROUGE-L
        rouge = scorer.score(true, pred)
        rougels.append(rouge['rougeL'].fmeasure)

    return {
        "F1": np.mean(f1s),
        "BLEU": np.mean(bleus),
        "ROUGE-L": np.mean(rougels)
    }


In [13]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
df = load_dataset("Dataset/train.json")
df["predicted"] = df.apply(lambda row: predict_answer(row["context"], row["question"]), axis=1)

df.to_csv("Embeddings/Word2Vec_results_train.csv", index=False, encoding='utf-8')
df[["id", "question", "answer", "predicted"]].to_json("results.json", force_ascii=False, indent=2)

In [19]:
# Print Evaluation
metrics = calculate_metrics(df)
print("Evaluation Metrics:")

for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Evaluation Metrics:
F1: 0.2672
BLEU: 0.1141
ROUGE-L: 0.2272


In [20]:
df = load_dataset("Dataset/validation.json")
df["predicted"] = df.apply(lambda row: predict_answer(row["context"], row["question"]), axis=1)

df.to_csv("Embeddings/Word2Vec_results_val.csv", index=False, encoding='utf-8')
df[["id", "question", "answer", "predicted"]].to_json("results.json", force_ascii=False, indent=2)

In [None]:
# Print Evaluation
metrics = calculate_metrics(df)
print("Evaluation Metrics:")

for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Evaluation Metrics:
F1: 0.2668
BLEU: 0.1144
ROUGE-L: 0.2190


In [None]:
df = load_dataset("Dataset/test-A-gold.json")
df["predicted"] = df.apply(lambda row: predict_answer(row["context"], row["question"]), axis=1)

df.to_csv("Embeddings/Word2Vec_results_test.csv", index=False, encoding='utf-8')
df[["id", "question", "answer", "predicted"]].to_json("results.json", force_ascii=False, indent=2)

In [None]:
# Print Evaluation
metrics = calculate_metrics(df)
print("Evaluation Metrics:")

for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Evaluation Metrics:
F1: 0.2668
BLEU: 0.1144
ROUGE-L: 0.2190
