<a href="https://colab.research.google.com/github/kfirs127/Ambiguity-Classifier-in-Open-Domain-Questions/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# !pip install transformers torch scikit-learn pandas tqdm

import json
import glob
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import make_scorer, hamming_loss, f1_score, jaccard_score

In [29]:
BASE = '/content/drive/Shared drives/Ambiguity Classifier in Open-Domain Questions'

In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
label_cols = [
    "Event references",
    "Properties",
    "Entity references",
    "Answer types",
    "Time dependency",
    "Multiple sub-questions"
]

In [32]:
split_dirs = [os.path.join(BASE, d) for d in os.listdir(BASE) if d.lower().startswith('split_part')]
sheet_ids = {
    'split_part_1 - roi': '1KfLv3mogWEQtc-ucVkqWsQGtrklS1Wd-Rk2pS4avigc',
    'split_part_2 - kfir': '10sGw8TXTytUgpdKqi9Myoyiof1PiU8wIsFcEnQU0SZc',
    'split_part_3 - bar': '1suxKtlo4rEzVAPHHJBvpj7IlKTu35JkYyUNJ8Y3ha6k',
    'split_part_4 - amit': '1DOSNgWgEVybI1DRyCu0Tv0a6C12ihJFiUit3wlCdLbs',
}
sheet_name = 'Labeling'


In [33]:
def id_truncate(x):
    x = str(x).strip()
    if x.startswith('-'):
        return x[1:]
    return x


all_data = []

for split_dir in split_dirs:
    folder_name = os.path.basename(split_dir)
    SHEET_ID = sheet_ids[folder_name.lower()]
    LABEL_URL = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    df_labels = pd.read_csv(LABEL_URL, dtype=str)
    # Use plain string for IDs, remove scientific, strip
    df_labels['Id'] = df_labels['Id'].astype(str).str.strip()

    json_files = glob.glob(os.path.join(split_dir, '*.json'))
    assert len(json_files) == 1, f"Expected exactly 1 JSON in {split_dir}, found {len(json_files)}"
    with open(json_files[0], "r", encoding="utf-8") as f:
        questions = json.load(f)
    df_questions = pd.DataFrame([
        {"Id": str(q["id"]).strip(), "Question": q["question"]}
        for q in questions
    ])
    df_questions['Id'] = df_questions['Id'].astype(str).str.strip()

    # Create truncated key columns for merging
    df_labels['Id_trunc'] = df_labels['Id'].apply(lambda x: id_truncate(x))
    df_questions['Id_trunc'] = df_questions['Id'].apply(lambda x: id_truncate(x))

    # Merge on truncated IDs
    df = df_labels.merge(df_questions, on="Id_trunc", suffixes=('_label', '_question'))
    # Use the "full" ID from the questions side (to keep one good column)
    df['Id'] = df['Id_question']
    # Also keep only unique merged records by "Id"
    df = df.drop_duplicates('Id')

    # Keep only relevant columns
    final_cols = ['Id', 'Question'] + [
        "Event references",
        "Properties",
        "Entity references",
        "Answer types",
        "Time dependency",
        "Multiple sub-questions"
    ]
    # If your Google Sheet has extra columns (e.g., 'Unnamed'), select only existing ones
    final_cols_existing = [col for col in final_cols if col in df.columns]
    df = df[final_cols_existing]

    print(f"[{folder_name}] After unsafe merge: {df.shape}")

    all_data.append(df)

df_final = pd.concat(all_data, ignore_index=True)
print("Total combined shape:", df_final.shape)

[split_part_4 - Amit] After unsafe merge: (99, 8)
[split_part_3 - Bar] After unsafe merge: (200, 8)
[split_part_2 - Kfir] After unsafe merge: (194, 8)
[split_part_1 - Roi] After unsafe merge: (199, 8)
Total combined shape: (692, 8)


In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased").to(device)
bert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [44]:
@torch.no_grad()
def get_bert_embeddings(texts, batch_size=1024):
    all_embeds = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)
        outputs = bert(input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
        all_embeds.append(cls_embeddings)
    return torch.cat(all_embeds, dim=0).numpy()

X = get_bert_embeddings(df_final["Question"].tolist())
y = df_final[label_cols].values

100%|██████████| 1/1 [01:05<00:00, 65.43s/it]


In [45]:
print(X)
print(y)

[[-0.21865879  0.22218935 -0.1446351  ... -0.4684483   0.5020686
   1.0531839 ]
 [-0.59331685 -0.09805765 -0.28061807 ... -0.47908232  0.7137002
   0.40427676]
 [-0.02363765  0.03771932  0.46016505 ... -0.5326069   0.45115116
   0.51329595]
 ...
 [-0.40732935  0.09684791 -0.26155922 ... -0.53190845  0.60872567
   0.67011017]
 [-0.15433094 -0.47385085  0.38733056 ... -0.57622606  0.31879175
   0.17274837]
 [-0.37659708  0.04265192  0.11136115 ... -0.5242056   0.52409494
   0.5302221 ]]
[['0' '1' '0' '1' '0' '0']
 ['0' '0' '0' '1' '1' '0']
 ['0' '0' '0' '1' '1' '0']
 ...
 ['0' '0' '0' '0' '1' '0']
 ['1' '0' '0' '0' '0' '0']
 ['0' '0' '0' '1' '1' '0']]


In [49]:
def hamming_score(y_true, y_pred):
    return 1 - hamming_loss(y_true, y_pred)


In [54]:
clf = MultiOutputClassifier(LogisticRegression(class_weight='balanced', max_iter=500))
kf = KFold(n_splits=5, shuffle=True, random_state=42)

y = y.astype(int)

# Example with Jaccard similarity (partial credit for overlaps)
jaccard = make_scorer(jaccard_score, average='samples')
scores = cross_val_score(clf, X, y, cv=kf, scoring=jaccard)
print("5-fold Jaccard scores:", scores)
print("Mean Jaccard score:", np.mean(scores))

# Example with Hamming score
hamming = make_scorer(hamming_score)
scores = cross_val_score(clf, X, y, cv=kf, scoring=hamming)
print("5-fold Hamming scores:", scores)
print("Mean Hamming score:", np.mean(scores))

# Example with Micro F1 (best for imbalanced multilabel)
micro_f1 = make_scorer(f1_score, average='micro')
scores = cross_val_score(clf, X, y, cv=kf, scoring=micro_f1)
print("5-fold Micro F1 scores:", scores)
print("Mean Micro F1:", np.mean(scores))

5-fold Jaccard scores: [0.35851319 0.36211031 0.35326087 0.36413043 0.31702899]
Mean Jaccard score: 0.3510087582108226
5-fold Hamming scores: [0.74460432 0.7529976  0.74758454 0.75483092 0.73550725]
Mean Hamming score: 0.7471049247558474
5-fold Micro F1 scores: [0.42895442 0.44021739 0.43665768 0.42816901 0.40326975]
Mean Micro F1: 0.42745365313808825
