# Configs

In [None]:
CONFIG = {
    # --- Develop
    "is_test": True,
    "test_sample_per_label": 2,
    
    # --- Training data
    "train_data": "../data/train-20k.csv",
    "text_col": "abstract_text",
    "label_col": "target",
    "label_map": {
        "BACKGROUND": "background",
        "OBJECTIVE": "objective",
        "METHODS": "methods",
        "RESULTS": "results",
        "CONCLUSIONS": "conclusions"
    },

    # --- NLI training ---
    "model_name": "bert-base-uncased",  # "bert-base-uncased", "roberta-base", "gpt2", "xlnet-base-cased"
    "batch_size": 16,
    "lr": 2e-5,
    "epochs": 10,
    "max_len": 128,
    "path_model_nli": "../model/nli_model_bert"
}

# Concepts Generation

In [2]:
import pandas as pd

In [3]:
train_df = pd.read_csv(CONFIG["train_data"])
train_df = train_df[[CONFIG["text_col"], CONFIG["label_col"]]]
train_df = train_df.rename(columns={CONFIG["text_col"]: "text", CONFIG["label_col"]: "label"})

In [4]:
train_df["label"] = train_df["label"].map(CONFIG["label_map"])

In [5]:
train_df.sample(5)

Unnamed: 0,text,label
8402,The intervention group had overall crisis reso...,results
6871,"In addition , the mean number of times that pa...",results
17976,The patients were divided into four groups and...,methods
1518,"Gait retraining , comprising biofeedback and/o...",objective
1733,"In the CF group , three ulcers had descemetoce...",methods


In [6]:
keyword_concepts = {
    "background": ['disease', 'therapy', 'aim', 'associate', 'investigate'],
    "objective": ['result', 'associate', 'suggest', 'therapy', 'rate'],
    "methods": ['randomize', 'group', 'receive', 'week', 'measure'],
    "results": ['investigate', 'evaluate', 'determine', 'therapy', 'safety'],
    "conclusions": ['p', 'group', 'difference', 'score', 'ci']
}

abstract_concepts = {
    "background": {
        "Disease Characteristics and Prevalence": ['disease', 'associate'], 
        "Intervention Efficacy Landscape": ['therapy']
    },
    "objective": {
        "Identified Associationss": ['result'], 
        "Effect Direction and Magnitude": ['suggest', 'therapy']
    },
    "methods": {
        "Randomized Group Allocation": ['randomize', 'group'], 
        "Intervention Administration": ['receive', 'group']
    },
    "results": {
        "Study Purpose Articulation": ['investigate', 'determine'], 
        "Safety Profile Assessment": ['safety', 'determine']
    },
    "conclusions": {
        "Inferential Statistics Presentation": ['p', 'ci'], 
        "Observed Group Disparities": ['difference', 'group']
    }
}

# Concepts Scoring

In [7]:
if CONFIG["is_test"] == True:
    train_df = (
        train_df.groupby("label", group_keys=False)
        .apply(lambda x: x.sample(min(len(x), 1), random_state=42))
        .reset_index(drop=True)
)
    
train_df

  .apply(lambda x: x.sample(min(len(x), 1), random_state=42))


Unnamed: 0,text,label
0,"In addition , previous reports have suggested ...",background
1,The comprehensive therapy of EA at qi streets ...,conclusions
2,Patients in the Tiapride group took Tiapride T...,methods
3,We undertook a prospective randomised trial co...,objective
4,Higher echo intensity values were observed 2 d...,results


In [8]:
import numpy as np
import spacy
from nltk.corpus import wordnet as wn
from tqdm import tqdm
from collections import defaultdict

In [9]:
!python -m spacy download en_core_web_sm

import nltk
nltk.download('wordnet')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 393.8 kB/s eta 0:00:33
     --------------------------------------- 0.1/12.8 MB 655.4 kB/s eta 0:00:20
      -------------------------------------- 0.2/12.8 MB 871.5 kB/s eta 0:00:15
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.5/12.8 MB 1.8 MB/s eta 0:00:07
     --- ------------------------------------ 1.0/12.8 MB 3.2 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 4.4 MB/s eta 0:00:03
     ------- -------------------------------- 2.5/12.8 MB 6.1 MB/s eta 0:00:02
     ------------ ---------------------------


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
nlp = spacy.load("en_core_web_sm")

In [11]:
train_df["text_doc"] = train_df["text"].apply(nlp)

In [12]:
label_names = list(CONFIG["label_map"].values())

### Weak Label Layer 1

In [13]:
def get_synonyms(word):
    """
    Lấy danh sách synonym của 1 từ (lowercase, bỏ '_')
    """
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower().replace('_', ' '))
    return list(synonyms)

In [14]:
def build_lemmatized_synonym_dict(keyword_list):
    """
    Tạo từ điển {keyword: [lemma + synonyms]} cho 1 nhãn
    """
    synonym_dict = defaultdict(set)
    for kw in keyword_list:
        kw_lemma = kw.lower()
        synonym_dict[kw_lemma].add(kw_lemma)
        for syn in get_synonyms(kw):
            doc = nlp(syn)
            if len(doc) > 0:
                lemma = doc[0].lemma_.lower()
                synonym_dict[kw_lemma].add(lemma)
    return {k: list(v) for k, v in synonym_dict.items()}

lemmatized_synonyms_per_label = {
    label: build_lemmatized_synonym_dict(kws)
    for label, kws in keyword_concepts.items()
}

In [15]:
def contains_lemmatized_keywords(text_doc, keywords_to_check):
    """
    Kiểm tra xem trong văn bản có xuất hiện từ/cụm lemma trong danh sách không
    """
    text_lemmas = {t.lemma_.lower() for t in text_doc if t.is_alpha and not t.is_stop}
    text_str = " ".join(text_lemmas)
    for kw in keywords_to_check:
        if " " in kw:
            if kw in text_str:
                return True
        elif kw in text_lemmas:
            return True
    return False

In [16]:
# Khởi tạo cột L1
for label in label_names:
    for kw in keyword_concepts[label]:
        train_df[f"concept_score_L1_{label}_{kw}"] = -1.0

# Gán weak label theo rule-based
for i in tqdm(range(len(train_df)), desc="Gán Weak Label Layer 1"):
    text_doc = train_df.iloc[i]["text_doc"]
    true_label = train_df.iloc[i]["label"]
    
    for label in label_names:
        for kw in keyword_concepts[label]:
            syns = lemmatized_synonyms_per_label[label].get(kw, [kw])
            is_present = contains_lemmatized_keywords(text_doc, syns)
            # Nếu keyword xuất hiện và đúng nhãn thật → 1
            # Nếu không thuộc nhãn → 0
            score = 1 if (is_present and label == true_label) else 0
            train_df.at[i, f"concept_score_L1_{label}_{kw}"] = score

Gán Weak Label Layer 1: 100%|██████████| 5/5 [00:00<00:00, 782.78it/s]




### Weak Label Layer 2

In [17]:
# Khởi tạo cột L2
for label in label_names:
    for abs_concept in abstract_concepts[label]:
        train_df[f"concept_score_L2_{label}_{abs_concept}"] = -1.0

def weak_label_layer2(row, label, abs_concept_name):
    """
    Lấy label cho abstract concept: 1 nếu có ít nhất 1 keyword L1=1 trong nhóm supportive
    """
    supportive_keywords = abstract_concepts[label][abs_concept_name]
    for kw in supportive_keywords:
        col = f"concept_score_L1_{label}_{kw}"
        if row[col] == 1:
            return 1
    return 0

for i in tqdm(range(len(train_df)), desc="Gán Weak Label Layer 2"):
    true_label = train_df.iloc[i]["label"]
    row = train_df.iloc[i]
    
    for label in label_names:
        for abs_concept in abstract_concepts[label]:
            col = f"concept_score_L2_{label}_{abs_concept}"
            if label == true_label:
                score = weak_label_layer2(row, label, abs_concept)
            else:
                score = 0
            train_df.at[i, col] = score

Gán Weak Label Layer 2: 100%|██████████| 5/5 [00:00<00:00, 1985.56it/s]


### Prepare Data

In [18]:
train_nli_data = []

layer1_cols = [c for c in train_df.columns if c.startswith("concept_score_L1_")]
layer2_cols = [c for c in train_df.columns if c.startswith("concept_score_L2_")]

for i in tqdm(range(train_df.shape[0]), desc="Tạo dữ liệu huấn luyện (text, concept, score)"):
    text = train_df.iloc[i]["text"]
    
    # --- Layer 1 ---
    for col in layer1_cols:
        _, _, _, kw = col.split("_", 3)
        score = train_df.iloc[i][col]
        train_nli_data.append({
            "text": text,
            "concept": kw,
            "score": score
        })
    
    # --- Layer 2 ---
    for col in layer2_cols:
        _, _, _, abs_concept = col.split("_", 3) 
        score = train_df.iloc[i][col]
        train_nli_data.append({
            "text": text,
            "concept": abs_concept,
            "score": score
        })

concept_scorer_train_df = pd.DataFrame(train_nli_data)
concept_scorer_train_df.head()

Tạo dữ liệu huấn luyện (text, concept, score): 100%|██████████| 5/5 [00:00<00:00, 665.87it/s]


Unnamed: 0,text,concept,score
0,"In addition , previous reports have suggested ...",background_disease,0.0
1,"In addition , previous reports have suggested ...",background_therapy,0.0
2,"In addition , previous reports have suggested ...",background_aim,0.0
3,"In addition , previous reports have suggested ...",background_associate,0.0
4,"In addition , previous reports have suggested ...",background_investigate,0.0


In [19]:
concept_scorer_train_df.value_counts('score')

score
0.0    168
1.0      7
Name: count, dtype: int64

In [20]:
from sklearn.utils import shuffle

def balance_dataset(df):
    pos_df = df[df['score'] == 1]
    neg_df = df[df['score'] == 0]
    
    n_pos = len(pos_df)
    n_neg = len(neg_df)
    print(f"Original counts - pos: {n_pos}, neg: {n_neg}")
    
    if n_neg > n_pos:
        neg_df = neg_df.sample(n=n_pos, random_state=42)  # cắt ngẫu nhiên nhãn 0
    balanced_df = pd.concat([pos_df, neg_df]).reset_index(drop=True)
    balanced_df = shuffle(balanced_df, random_state=42)
    print(f"Balanced counts - pos: {len(balanced_df[balanced_df['score']==1])}, neg: {len(balanced_df[balanced_df['score']==0])}")
    return balanced_df

balanced_df  = balance_dataset(concept_scorer_train_df)

Original counts - pos: 7, neg: 168
Balanced counts - pos: 7, neg: 7


In [21]:
from sklearn.model_selection import train_test_split

nli_train_df, nli_val_df = train_test_split(
    balanced_df,
    test_size=0.15,
    random_state=42,
    stratify=balanced_df['score']  # giữ tỉ lệ nhãn
)

print(f"Train size: {len(nli_train_df)}, Val size: {len(nli_val_df)}")
print(f"Train label distribution:\n{nli_train_df['score'].value_counts()}")
print(f"Val label distribution:\n{nli_val_df['score'].value_counts()}")

Train size: 11, Val size: 3
Train label distribution:
score
1.0    6
0.0    5
Name: count, dtype: int64
Val label distribution:
score
0.0    2
1.0    1
Name: count, dtype: int64


### Train BERT NLI

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [24]:
class NLIDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            str(row['text']),
            str(row['concept']),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["score"], dtype=torch.long)
        }

In [25]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import os

class NLIDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            str(row['text']),
            str(row['concept']),
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["score"], dtype=torch.long)
        }

class NLIModel:
    def __init__(self, model_name_or_path="bert-base-uncased", num_labels=2, max_length=256):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_length = max_length
        print(f"Loading NLI model: {model_name_or_path}")

        # Load tokenizer + model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path).to(self.device)

        # Fix pad_token if missing (GPT2/XLNet)
        if self.tokenizer.pad_token is None:
            if hasattr(self.tokenizer, "eos_token") and self.tokenizer.eos_token is not None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                print(f" > pad_token set to eos_token: {self.tokenizer.pad_token}")
            else:
                self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                print(f" > Added new pad_token: [PAD]")
                self.model.resize_token_embeddings(len(self.tokenizer))

        # Hidden size & classifier
        hidden_size = self.model.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels).to(self.device)
        print(f"Model ready on {self.device}, hidden size: {hidden_size}")

    def mean_pooling(self, outputs, attention_mask):
        if not hasattr(outputs, "last_hidden_state"):
            raise ValueError(f"Model {self.model.__class__.__name__} has no last_hidden_state")
        token_embeddings = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1)  # (B, L, 1)
        return (token_embeddings * mask).sum(1) / mask.sum(1)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        emb = self.mean_pooling(outputs, attention_mask)
        logits = self.classifier(emb)
        return logits
    def train_model(self, train_df, val_df=None, batch_size=32, lr=2e-5, num_epochs=3, max_len=128):
        train_dataset = NLIDataset(train_df, self.tokenizer, max_len=max_len)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        if val_df is not None:
            val_dataset = NLIDataset(val_df, self.tokenizer, max_len=max_len)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        else:
            val_loader = None

        optimizer = AdamW(list(self.model.parameters()) + list(self.classifier.parameters()), lr=lr)
        criterion = nn.CrossEntropyLoss()

        for epoch in range(num_epochs):
            self.model.train()
            self.classifier.train()
            loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
            total_loss = 0

            for batch in loop:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                logits = self.forward(input_ids, attention_mask)
                loss = criterion(logits, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                loop.set_postfix(loss=loss.item())

            avg_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")

            if val_loader:
                self.evaluate(val_loader)

    def evaluate(self, val_loader: DataLoader):
        self.model.eval()
        self.classifier.eval()
        preds, labels_list = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                logits = self.forward(input_ids, attention_mask)
                pred = torch.argmax(logits, dim=-1)

                preds.extend(pred.cpu().numpy())
                labels_list.extend(labels.cpu().numpy())

        acc = accuracy_score(labels_list, preds)
        print(f"Validation Accuracy: {acc:.4f}")
        return acc

    def score(self, text: str, concept: str) -> float:
        self.model.eval()
        self.classifier.eval()
        with torch.no_grad():
            inputs = self.tokenizer(text, concept,
                                    return_tensors="pt",
                                    truncation=True,
                                    padding=True,
                                    max_length=self.max_length).to(self.device)
            logits = self.forward(inputs["input_ids"], inputs["attention_mask"])
            prob = torch.softmax(logits, dim=1)[0][1].item()
        return prob

    def batch_score(self, text: str, concepts: list[str]):
        if not concepts: return []
        self.model.eval()
        self.classifier.eval()
        with torch.no_grad():
            inputs = self.tokenizer([text]*len(concepts), concepts,
                                    return_tensors="pt",
                                    truncation=True,
                                    padding=True,
                                    max_length=self.max_length).to(self.device)
            logits = self.forward(inputs["input_ids"], inputs["attention_mask"])
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().tolist()
        return probs

    def save_model(self, output_dir: str):
        """Save tokenizer, model, and classifier"""
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        torch.save(self.classifier.state_dict(), os.path.join(output_dir, "classifier.pt"))
        print(f"NLIModel saved to {output_dir}")

In [26]:
# Khởi tạo
nli_model = NLIModel(model_name_or_path=CONFIG["model_name"])

# Train
nli_model.train_model(
    train_df=nli_train_df,
    val_df=nli_val_df,
    batch_size=CONFIG["batch_size"],
    lr=CONFIG["lr"],
    num_epochs=CONFIG["epochs"],
    max_len=CONFIG["max_len"]
)

# Lưu model
nli_model.save_model(CONFIG["path_model_nli"])

Loading NLI model: microsoft/deberta-base-mnli


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model ready on cpu, hidden size: 768


Epoch 1/10: 100%|██████████| 1/1 [00:03<00:00,  3.61s/it, loss=0.758]


Epoch 1 Avg Loss: 0.7582
Validation Accuracy: 0.0000


Epoch 2/10: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it, loss=0.564]


Epoch 2 Avg Loss: 0.5640
Validation Accuracy: 0.0000


Epoch 3/10: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it, loss=0.474]


Epoch 3 Avg Loss: 0.4737
Validation Accuracy: 0.6667


Epoch 4/10: 100%|██████████| 1/1 [00:07<00:00,  7.09s/it, loss=0.364]


Epoch 4 Avg Loss: 0.3640
Validation Accuracy: 0.6667


Epoch 5/10: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it, loss=0.263]


Epoch 5 Avg Loss: 0.2633
Validation Accuracy: 0.6667


Epoch 6/10: 100%|██████████| 1/1 [00:06<00:00,  6.90s/it, loss=0.197]


Epoch 6 Avg Loss: 0.1973
Validation Accuracy: 0.6667


Epoch 7/10: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it, loss=0.169]


Epoch 7 Avg Loss: 0.1691
Validation Accuracy: 0.6667


Epoch 8/10: 100%|██████████| 1/1 [00:05<00:00,  5.18s/it, loss=0.141]


Epoch 8 Avg Loss: 0.1413
Validation Accuracy: 0.6667


Epoch 9/10: 100%|██████████| 1/1 [00:04<00:00,  4.10s/it, loss=0.0783]


Epoch 9 Avg Loss: 0.0783
Validation Accuracy: 0.6667


Epoch 10/10: 100%|██████████| 1/1 [00:06<00:00,  6.93s/it, loss=0.0482]


Epoch 10 Avg Loss: 0.0482
Validation Accuracy: 0.6667
NLIModel saved to ../model/nli_model_bert


In [30]:
# Inference
prob = nli_model.score("This is an example text.", "disease")
print(prob)

0.4307329058647156


In [28]:
import os
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

def load_NLI(path: str) -> "NLIModel":
    # 1. Load tokenizer và base model
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(path)

    # 2. Lấy hidden_size từ config
    hidden_size = model.config.hidden_size

    # 3. Khởi tạo instance NLIModel mà không gọi __init__
    nli_instance = NLIModel.__new__(NLIModel)
    nli_instance.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    nli_instance.max_length = 256
    nli_instance.tokenizer = tokenizer
    nli_instance.model = model.to(nli_instance.device)

    # 4. Khởi tạo classifier và load state dict
    nli_instance.classifier = nn.Linear(hidden_size, 2).to(nli_instance.device)
    classifier_path = os.path.join(path, "classifier.pt")
    if os.path.exists(classifier_path):
        nli_instance.classifier.load_state_dict(torch.load(classifier_path, map_location=nli_instance.device))
    else:
        print(f"Warning: classifier.pt not found in {path}, initialized randomly")

    print(f"NLIModel loaded from {path} on {nli_instance.device}")
    return nli_instance

In [29]:
nli_model2 = load_NLI(CONFIG["path_model_nli"])

prob = nli_model2.score("This is an example text.", "disease")
print(prob)

NLIModel loaded from ../model/nli_model_bert on cpu
0.4307329058647156


# Concepts Interpretable Network