<a href="https://colab.research.google.com/github/kfirs127/Ambiguity-Classifier-in-Open-Domain-Questions/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch scikit-learn pandas tqdm

import json
import glob
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.multioutput import MultiOutputClassifier

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
BASE = '/content/drive/Shared drives/Ambiguity Classifier in Open-Domain Questions'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
label_cols = [
    "Event references",
    "Properties",
    "Entity references",
    "Answer types",
    "Time dependency",
    "Multiple sub-questions"
]

In [5]:
split_dirs = [os.path.join(BASE, d) for d in os.listdir(BASE) if d.lower().startswith('split_part')]
sheet_ids = {
    'split_part_1 - roi': '1KfLv3mogWEQtc-ucVkqWsQGtrklS1Wd-Rk2pS4avigc',
    'split_part_2 - kfir': '10sGw8TXTytUgpdKqi9Myoyiof1PiU8wIsFcEnQU0SZc',
    'split_part_3 - bar': '1suxKtlo4rEzVAPHHJBvpj7IlKTu35JkYyUNJ8Y3ha6k',
    'split_part_4 - amit': '1DOSNgWgEVybI1DRyCu0Tv0a6C12ihJFiUit3wlCdLbs',
}
sheet_name = 'Labeling'


In [13]:
def id_truncate(x, n=6):
    x = str(x).strip()
    if x.startswith('-'):
        x = x.replace('.', '')
        return '-' + x[1:n+1]
    return x[:n]


all_data = []

for split_dir in split_dirs:
    folder_name = os.path.basename(split_dir)
    SHEET_ID = sheet_ids[folder_name.lower()]
    LABEL_URL = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    df_labels = pd.read_csv(LABEL_URL, dtype=str)
    # Use plain string for IDs, remove scientific, strip
    df_labels['Id'] = df_labels['Id'].astype(str).str.strip()

    json_files = glob.glob(os.path.join(split_dir, '*.json'))
    assert len(json_files) == 1, f"Expected exactly 1 JSON in {split_dir}, found {len(json_files)}"
    with open(json_files[0], "r", encoding="utf-8") as f:
        questions = json.load(f)
    df_questions = pd.DataFrame([
        {"Id": str(q["id"]).strip(), "Question": q["question"]}
        for q in questions
    ])
    df_questions['Id'] = df_questions['Id'].astype(str).str.strip()

    # Create truncated key columns for merging
    df_labels['Id_trunc'] = df_labels['Id'].apply(lambda x: id_truncate(x, 6))
    df_questions['Id_trunc'] = df_questions['Id'].apply(lambda x: id_truncate(x, 6))

    print(df_labels.head())
    print(df_questions.head())

    # Merge on truncated IDs
    df = df_labels.merge(df_questions, on="Id_trunc", suffixes=('_label', '_question'))
    # Use the "full" ID from the questions side (to keep one good column)
    df['Id'] = df['Id_question']
    # Also keep only unique merged records by "Id"
    df = df.drop_duplicates('Id')

    # Keep only relevant columns
    final_cols = ['Id', 'Question'] + [
        "Event references",
        "Properties",
        "Entity references",
        "Answer types",
        "Time dependency",
        "Multiple sub-questions"
    ]
    # If your Google Sheet has extra columns (e.g., 'Unnamed'), select only existing ones
    final_cols_existing = [col for col in final_cols if col in df.columns]
    df = df[final_cols_existing]

    print(f"[{folder_name}] After unsafe merge: {df.shape}")

    all_data.append(df)

df_final = pd.concat(all_data, ignore_index=True)
print("Total combined shape:", df_final.shape)

                    Id Event references Properties Entity references  \
0  3379982049252536210                0          1                 0   
1         -4.11058E+18                0          0                 0   
2  1590194633735686902                0          0                 0   
3         -1.80572E+18                0          1                 0   
4  3223605858827437025                1          0                 1   

  Answer types Time dependency Multiple sub-questions Id_trunc  
0            1               0                      0   337998  
1            1               1                      0  -411058  
2            1               1                      0   159019  
3            1               0                      0  -180572  
4            1               1                      0   322360  
                     Id                                           Question  \
0   3379982049252536210  If there are 5 fluorines and 1 phosphorous ato...   
1  -41105842558305525

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased").to(device)
bert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
@torch.no_grad()
def get_bert_embeddings(texts, batch_size=16):
    all_embeds = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=64)
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)
        outputs = bert(input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
        all_embeds.append(cls_embeddings)
    return torch.cat(all_embeds, dim=0).numpy()

X = get_bert_embeddings(df_final["Question"].tolist())
y = df_final[label_cols].values

100%|██████████| 13/13 [00:14<00:00,  1.08s/it]


In [None]:
clf = MultiOutputClassifier(LogisticRegression(class_weight='balanced', max_iter=500))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py", line 1324, in f1_score
    return fbeta_score(
           ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 189, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_

In [None]:
print("5-fold exact match (multi-label accuracy) scores:", acc_scores)
print("Mean multi-label accuracy:", np.mean(acc_scores))

5-fold macro F1 scores: [nan nan nan nan nan]
Mean macro F1: nan
