In [None]:
!pip install numpy pandas scikit-learn tqdm nltk



In [None]:
import json
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the "punkt" resource, NLTK 3.9+ from "punkt_tab"; nltk is
# installed unpinned above, so fetch both to work with either.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Colab-only: mount Google Drive at /content/drive. The next cell changes into
# a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the project folder the working directory; the relative paths used later
# ("Dataset/...", "Embeddings/...") resolve against it.
# NOTE(review): this absolute path is specific to one Drive layout.
os.chdir("/content/drive/MyDrive/M Tech/Sem3")

In [None]:
def load_and_flatten(filepath):
    """Read a nested QA JSON file and return one flat record per question.

    The file is expected to hold ``{"domains": [{"contexts": [{"context": str,
    "qas": [{"id", "question", "answer"}, ...]}, ...]}, ...]}``.

    Args:
        filepath: Path to the UTF-8 encoded JSON file.

    Returns:
        A list of dicts with keys ``id``, ``context``, ``question``,
        ``answer_text`` and ``answer_start``. ``answer_start`` is the character
        offset of the first verbatim occurrence of the answer in the context
        (``str.find``). Questions whose answer does not occur verbatim in the
        context are left out.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    flat = []
    for domain in payload["domains"]:
        for entry in domain["contexts"]:
            passage = entry["context"]
            for pair in entry["qas"]:
                query, answer = pair["question"], pair["answer"]
                offset = passage.find(answer)
                # Keep only answers that can be located verbatim in the passage.
                if offset != -1:
                    flat.append({
                        "id": pair["id"],
                        "context": passage,
                        "question": query,
                        "answer_text": answer,
                        "answer_start": offset,
                    })
    return flat

# Flatten both splits into per-question sample dicts (paths are relative to the
# working directory set earlier).
train_data = load_and_flatten("Dataset/train.json")
val_data = load_and_flatten("Dataset/validation.json")

In [None]:
def load_fasttext_embeddings(path):
    """Load word vectors from a whitespace-separated text embedding file.

    Every data line is expected to look like ``<word> <v1> <v2> ... <vN>``.
    A fastText ``.vec`` header (a first line made of exactly two integers,
    ``<vocab_size> <dim>``) is recognised and skipped instead of being stored
    as a one-dimensional "word" vector; its ``<dim>`` is then used as the
    expected vector width. Without a header the width of the first valid row
    is used.

    Rows are skipped (not fatal) when they are blank, contain non-numeric
    vector components (e.g. a token that itself contains whitespace), or do
    not have the expected width, so every stored vector has the same length.

    Args:
        path: Path to the UTF-8 encoded embedding file.

    Returns:
        Dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embedding_index = {}
    expected_dim = None
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            values = line.split()
            if not values:
                continue  # blank line
            # fastText .vec header: "<vocab_size> <dim>" on the very first line.
            if (line_no == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                expected_dim = int(values[1])
                continue
            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: malformed row, skip it.
                continue
            if vector.size == 0:
                continue  # a word with no components
            if expected_dim is None:
                expected_dim = vector.size
            if vector.size != expected_dim:
                continue  # wrong width would break averaging downstream
            embedding_index[values[0]] = vector
    return embedding_index

# Load the word vectors once; the recorded run took several minutes.
embedding_path = "Embeddings/cc.hi.300.vec"
embedding_index = load_fasttext_embeddings(embedding_path)
# The dimension is read from the first vector stored in the dict, so this
# assumes that first entry is a genuine word vector of the common width.
embedding_dim = len(next(iter(embedding_index.values())))

1876654it [03:48, 8203.98it/s] 


In [None]:
def text_to_vector(text, embeddings, dim):
    """Represent a text as the average of its known word vectors.

    Args:
        text: String to embed; it is split with NLTK's ``word_tokenize``.
        embeddings: Mapping from token to its vector.
        dim: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``embeddings`` (tokens are looked up exactly as tokenized), or
        ``np.zeros(dim)`` (numpy's default float dtype) if none are present.
    """
    known = []
    for token in word_tokenize(text):
        if token in embeddings:
            known.append(embeddings[token])

    # Nothing to average: fall back to an all-zero vector.
    if not known:
        return np.zeros(dim)
    return np.mean(known, axis=0)

In [None]:
def prepare_dataset(samples, embeddings, dim):
    """Turn flattened QA samples into a feature matrix and a target vector.

    Args:
        samples: Iterable of dicts with ``context``, ``question`` and
            ``answer_start`` keys.
        embeddings: Token-to-vector mapping passed on to ``text_to_vector``.
        dim: Embedding dimension passed on to ``text_to_vector``.

    Returns:
        ``(X, y)`` as numpy arrays. Each row of ``X`` is the context vector
        followed by the question vector; ``y`` holds the matching
        ``answer_start`` values.
    """
    def encode(item):
        # Context first, then question, joined into one feature row.
        context_vec = text_to_vector(item["context"], embeddings, dim)
        question_vec = text_to_vector(item["question"], embeddings, dim)
        return np.concatenate((context_vec, question_vec)), item["answer_start"]

    pairs = [encode(item) for item in tqdm(samples)]
    features = [row for row, _ in pairs]
    targets = [start for _, start in pairs]
    return np.array(features), np.array(targets)

# Build the feature matrix and answer-start targets for each split.
X_train, y_train = prepare_dataset(train_data, embedding_index, embedding_dim)
X_val, y_val = prepare_dataset(val_data, embedding_index, embedding_dim)

100%|██████████| 1773/1773 [00:02<00:00, 606.21it/s]
100%|██████████| 395/395 [00:00<00:00, 1211.12it/s]


In [None]:
# Fit a ridge regression (default hyperparameters) that predicts the answer's
# start offset from the concatenated context/question vectors.
model = Ridge()
model.fit(X_train, y_train)

# Predicted start offsets are real-valued; later cells round them to integers.
y_pred = model.predict(X_val)
print("MSE:", mean_squared_error(y_val, y_pred))

MSE: 341.1457214355469


In [None]:
def evaluate_exact(samples, y_pred):
    """Fraction of samples whose predicted span equals the gold answer.

    The predicted span starts at the rounded predicted offset and has the same
    length as the gold answer, so only the start offset is being evaluated.

    The rounded offset is clamped to ``[0, len(context)]``. Without clamping a
    negative prediction would be interpreted by Python slicing as an offset
    from the end of the context and could be scored as a match by accident.

    Args:
        samples: Sequence of dicts with ``context`` and ``answer_text`` keys.
        y_pred: Indexable of predicted start offsets, aligned with ``samples``.

    Returns:
        Exact-match accuracy in ``[0, 1]``; ``0.0`` when ``samples`` is empty.
    """
    if len(samples) == 0:
        return 0.0

    correct = 0
    for i, sample in enumerate(samples):
        context = sample["context"]
        true_ans = sample["answer_text"]
        # Clamp so that out-of-range predictions never wrap around.
        pred_start = min(max(int(round(y_pred[i])), 0), len(context))
        pred_text = context[pred_start:pred_start + len(true_ans)]

        if pred_text == true_ans:
            correct += 1
    return correct / len(samples)

# Exact-match accuracy of the predicted spans on the validation split.
exact_match = evaluate_exact(val_data, y_pred)
print("Exact Match Accuracy:", exact_match)

Exact Match Accuracy: 0.06329113924050633


In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=0772fe3bd32dd564cb3c9281bfc85564f1317d1d08b65cdb6d2f5a830769df66
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

class _WordTokenizer:
    """Adapter exposing NLTK's ``word_tokenize`` via the ``tokenize()`` method RougeScorer calls."""

    def tokenize(self, text):
        return word_tokenize(text)


def calculate_metrics(samples, y_pred):
    """Compute mean BLEU, ROUGE-L and token-level F1 of the predicted spans.

    For each sample the predicted span starts at the rounded predicted offset
    (clamped to ``[0, len(context)]`` so negative values cannot wrap around to
    the end of the context) and has the same length as the gold answer.

    All three metrics use the same ``word_tokenize`` tokens. ROUGE is given an
    explicit tokenizer because the scorer's default one lower-cases and keeps
    only ``[a-z0-9]`` characters, which discards non-Latin text entirely; as a
    consequence no stemming is applied. F1 counts overlapping tokens as a
    multiset, so repeated tokens are matched as often as they occur in both
    strings.

    Args:
        samples: Sequence of dicts with ``context`` and ``answer_text`` keys.
        y_pred: Indexable of predicted start offsets, aligned with ``samples``.

    Returns:
        Dict with the mean ``"BLEU"``, ``"ROUGE-L"`` and ``"F1"`` over all
        samples; all ``0.0`` when ``samples`` is empty.
    """
    if len(samples) == 0:
        return {"BLEU": 0.0, "ROUGE-L": 0.0, "F1": 0.0}

    bleu_scores, rouge_l_scores, f1_scores = [], [], []

    scorer = rouge_scorer.RougeScorer(['rougeL'], tokenizer=_WordTokenizer())
    smooth_fn = SmoothingFunction().method4

    for i, sample in enumerate(samples):
        context = sample['context']
        gold = sample['answer_text']
        pred_start = min(max(int(round(y_pred[i])), 0), len(context))
        pred = context[pred_start:pred_start + len(gold)]

        # Tokenize
        gold_tokens = word_tokenize(gold)
        pred_tokens = word_tokenize(pred)

        # BLEU (smoothed, since answers are often shorter than 4 tokens)
        bleu = sentence_bleu([gold_tokens], pred_tokens, smoothing_function=smooth_fn)
        bleu_scores.append(bleu)

        # ROUGE-L
        rouge = scorer.score(gold, pred)['rougeL'].fmeasure
        rouge_l_scores.append(rouge)

        # F1 (token-level): multiset overlap, so duplicates count correctly
        overlap = sum((Counter(gold_tokens) & Counter(pred_tokens)).values())
        if overlap == 0:
            f1 = 0.0
        else:
            precision = overlap / len(pred_tokens)
            recall = overlap / len(gold_tokens)
            f1 = 2 * precision * recall / (precision + recall)
        f1_scores.append(f1)

    return {
        "BLEU": np.mean(bleu_scores),
        "ROUGE-L": np.mean(rouge_l_scores),
        "F1": np.mean(f1_scores)
    }

# Compute BLEU, ROUGE-L and token-level F1 for the validation predictions and
# print each mean score.
metrics = calculate_metrics(val_data, y_pred)
print("BLEU:", metrics["BLEU"])
print("ROUGE-L:", metrics["ROUGE-L"])
print("F1 Score:", metrics["F1"])


BLEU: 0.65237867784357
ROUGE-L: 0.3383966244725738
F1 Score: 0.677714580353205


In [None]:
import pandas as pd

def save_predictions_to_csv(samples, y_pred, path="predictions.csv"):
    """Write one row per sample with the gold and predicted answers to a CSV.

    The predicted answer is the slice of the context that starts at the
    rounded predicted offset and has the gold answer's length. The offset is
    clamped to ``[0, len(context)]`` so that a negative prediction is not
    interpreted by Python slicing as an offset from the end of the context;
    ``predicted_start`` stores the clamped offset actually used for the slice.

    Args:
        samples: Sequence of dicts with ``id``, ``context``, ``question`` and
            ``answer_text`` keys.
        y_pred: Indexable of predicted start offsets, aligned with ``samples``.
        path: Destination file (UTF-8, no index column).
    """
    columns = ["id", "question", "context", "gold_answer",
               "predicted_answer", "predicted_start"]
    data = []
    for i, sample in enumerate(samples):
        context = sample["context"]
        gold = sample["answer_text"]
        pred_start = min(max(int(round(y_pred[i])), 0), len(context))

        data.append({
            "id": sample["id"],
            "question": sample["question"],
            "context": context,
            "gold_answer": gold,
            "predicted_answer": context[pred_start:pred_start + len(gold)],
            "predicted_start": pred_start
        })

    # Explicit columns keep the header row even when there are no samples.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(path, index=False, encoding="utf-8")
    print(f"Saved predictions to {path}")

# Export the validation predictions as CSV.
# NOTE(review): the file name says "GLOVE" although the vectors loaded above
# come from a fastText .vec file — consider renaming.
save_predictions_to_csv(val_data, y_pred, path="Embeddings/GLOVE_val_predictions.csv")

Saved predictions to GLOVE_val_predictions.csv


In [None]:
import json

def save_predictions_to_json(samples, y_pred, path="val_predictions.json"):
    """Write one record per sample with the gold and predicted answers to JSON.

    The predicted answer is the slice of the context that starts at the
    rounded predicted offset and has the gold answer's length. The offset is
    clamped to ``[0, len(context)]`` so that a negative prediction is not
    interpreted by Python slicing as an offset from the end of the context;
    ``predicted_start`` stores the clamped offset actually used for the slice.

    Args:
        samples: Sequence of dicts with ``id``, ``context``, ``question`` and
            ``answer_text`` keys.
        y_pred: Indexable of predicted start offsets, aligned with ``samples``.
        path: Destination file (UTF-8; non-ASCII text is written unescaped).
    """
    output = []

    for i, sample in enumerate(samples):
        context = sample["context"]
        gold = sample["answer_text"]
        pred_start = min(max(int(round(y_pred[i])), 0), len(context))

        output.append({
            "id": sample["id"],
            "question": sample["question"],
            "context": context,
            "gold_answer": gold,
            "predicted_answer": context[pred_start:pred_start + len(gold)],
            "predicted_start": pred_start
        })

    with open(path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"Saved predictions to {path}")

# Export the same validation predictions as JSON.
# NOTE(review): the file name says "GLOVE" although the vectors loaded above
# come from a fastText .vec file — consider renaming.
save_predictions_to_json(val_data, y_pred, path="Embeddings/GLOVE_val_predictions.json")

Saved predictions to Glove\GLOVE_val_predictions.json
