# Configs

In [25]:
# Notebook-wide settings. The cells below read the data location, column names,
# label mapping and concept vocabularies from this single dict.
CONFIG = {
    # --- Develop
    # NOTE(review): neither "is_test" nor "test_sample_per_label" is read by any
    # cell visible here; presumably they enable a small per-label sample for
    # quick runs — TODO confirm where they are consumed.
    "is_test": False,
    "test_sample_per_label": 2,
    
    # --- Training data
    # NOTE(review): absolute, machine-specific path, and the file is named
    # "test-20k.csv" although it is loaded as the training set — confirm this
    # is intended before sharing the notebook.
    "train_data": "/Users/dauduchieu/Desktop/iSE_CBM_CIN/data/test-20k.csv",
    # Source column names; the loading cell renames them to "text" and "label".
    "text_col": "abstract_text",
    "label_col": "target",
    # Raw label -> normalized label. The values are also reused later as the
    # list of label concepts.
    "label_map": {
        "BACKGROUND": "background",
        "OBJECTIVE": "objective",
        "METHODS": "methods",
        "RESULTS": "results",
        "CONCLUSIONS": "conclusions"
    },
    
    # --- Concept generation
    "concepts": {
        # Group key -> keyword list. A later cell flattens every list into one
        # keyword concept list WITHOUT de-duplicating, so the three identical
        # lists below yield each keyword three times.
        # NOTE(review): these look like placeholder values — confirm.
        "keyword_concepts": {
            "0": ["aim", "develop", "cancer", "common"],
            "1": ["aim", "develop", "cancer", "common"],
            "2": ["aim", "develop", "cancer", "common"]
        },
        # Group key -> {abstract concept name -> keyword list}. Only the inner
        # names ("Adverse", "Treatment", ...) are collected by the cells shown
        # here; the keyword lists attached to them are not read yet.
        "abstract_concepts": {
            "a": {
                "Adverse": ["aim", "develop"], 
                "Treatment": ["turn", "cold"], 
                "Intense": ["hate"]
            },
            "b": {
                "Reactions": ["aim", "develop"], 
                "Failure": ["turn", "cold"], 
                "Dislike": ["hate"]
            },
        }
    }
}

In [2]:
import pandas as pd

# Build the training frame in a single chain: read the CSV, keep only the
# configured text/label columns, rename them to the canonical "text"/"label",
# then translate raw labels through CONFIG["label_map"].
train_df = (
    pd.read_csv(CONFIG["train_data"])[[CONFIG["text_col"], CONFIG["label_col"]]]
    .rename(columns={CONFIG["text_col"]: "text", CONFIG["label_col"]: "label"})
    .assign(label=lambda frame: frame["label"].map(CONFIG["label_map"]))
)

In [3]:
# Spot-check five randomly drawn rows after the column rename / label mapping.
train_df.sample(5)

Unnamed: 0,text,label
1730,Gilead Sciences .,background
1611,Tuberculosis regimens that are shorter and sim...,background
2902,To verify the clinical efficacy of shu-stream ...,objective
565,Tunneling significantly reduced average extent...,results
1254,Poor management of chronic medical treatments ...,background


# Concept Generation

# Concept Scoring

In [4]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import os

class NLIModel:
    """Wrapper around a Hugging Face sequence-classification model that scores
    (text, concept) pairs.

    The wrapper owns a tokenizer and a model, moves the model to CUDA when it
    is available (CPU otherwise), and exposes fine-tuning, evaluation, scoring
    and saving helpers.

    Scores returned by ``score`` / ``batch_score`` are the softmax probability
    of class index 1, so the model must have at least two labels.
    NOTE(review): index 1 is presumably the "concept applies" class — confirm
    against the labels used for fine-tuning.
    """

    def __init__(self, model_name_or_path: str = 'bert-base-uncased', num_labels: int = 2):
        """Load tokenizer and model from a hub name or a local directory.

        Args:
            model_name_or_path: Hugging Face model id or path to a saved model.
            num_labels: Size of the classification head.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading model from: {model_name_or_path}")

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)
        except Exception as e:
            # Deliberate best-effort fallback: retry with the BERT-specific
            # classes; a failure here propagates to the caller.
            print(f"Error loading with AutoClasses, trying Bert-specific classes directly. Error: {e}")
            self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
            self.model = BertForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)

        self.model.to(self.device)
        print(f"Model loaded successfully on: {self.device}")

    def train(self, train_loader: DataLoader, val_loader: DataLoader = None, epochs: int = 1, lr: float = 2e-5):
        """Fine-tune the model with AdamW.

        Args:
            train_loader: Yields dict batches with 'input_ids',
                'attention_mask' and 'labels' tensors.
            val_loader: Optional loader with the same batch layout; when given,
                ``evaluate`` runs after every epoch.
            epochs: Number of passes over ``train_loader``.
            lr: AdamW learning rate.
        """
        optimizer = AdamW(self.model.parameters(), lr=lr)
        for epoch in range(epochs):
            self.model.train()
            loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}/{epochs}")
            total_loss = 0.0
            # Count batches while iterating instead of calling len(train_loader):
            # avoids ZeroDivisionError on an empty loader and TypeError on
            # loaders that do not define a length.
            num_batches = 0
            for batch in loop:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                batch_loss = loss.item()
                total_loss += batch_loss
                num_batches += 1
                loss.backward()
                optimizer.step()
                loop.set_postfix(loss=batch_loss)
            if num_batches == 0:
                print(f"Epoch {epoch+1} - no training batches; average loss unavailable.")
            else:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch+1} - Average loss: {avg_loss:.4f}")
            if val_loader is not None:
                self.evaluate(val_loader)

    def evaluate(self, val_loader: DataLoader):
        """Compute and print accuracy over ``val_loader``.

        Args:
            val_loader: Yields dict batches with 'input_ids',
                'attention_mask' and 'labels' tensors.

        Returns:
            The accuracy as computed by ``sklearn.metrics.accuracy_score``.
        """
        self.model.eval()
        preds, labels_list = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = torch.argmax(outputs.logits, dim=-1)
                preds.extend(predictions.cpu().numpy())
                # Labels are only compared on the host, so they never need to
                # be moved to the compute device.
                labels_list.extend(batch['labels'].cpu().numpy())
        acc = accuracy_score(labels_list, preds)
        print(f"Validation Accuracy: {acc:.4f}")
        return acc

    def score(self, text: str, concept: str) -> float:
        """Return the class-1 probability for a single (text, concept) pair.

        Implemented as a one-element ``batch_score`` call so both entry points
        share one tokenization / inference path.
        """
        return self.batch_score(text, [concept])[0]

    def batch_score(self, text: str, concepts: list[str]) -> list[float]:
        """Score one text against several concepts in a single forward pass.

        Args:
            text: The text, paired with every concept in turn.
            concepts: Concept strings used as the second segment of each pair.

        Returns:
            One class-1 softmax probability per concept, in input order; an
            empty list when ``concepts`` is empty.
        """
        self.model.eval()
        num_concepts = len(concepts)
        if num_concepts == 0:
            return []

        # Repeat the text so the tokenizer receives aligned (text, concept) pairs.
        text_inputs = [text] * num_concepts

        with torch.no_grad():
            inputs = self.tokenizer(
                text_inputs,
                concepts,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=256
            ).to(self.device)

            outputs = self.model(**inputs)  # logits: [batch_size, num_labels]
            probs = torch.softmax(outputs.logits, dim=1)

            # Probability of label 1 for each pair -> [batch_size]
            scores = probs[:, 1].cpu().tolist()

        return scores

    def save_model(self, output_dir: str):
        """Save model and tokenizer into ``output_dir``, creating it if needed."""
        # exist_ok=True replaces the exists()/makedirs() pair, which could fail
        # if the directory appeared between the check and the creation.
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"Model saved to {output_dir}")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Instantiate the scorer with its defaults ('bert-base-uncased', num_labels=2).
# NOTE(review): the load warning printed below reports that the classifier
# weights are newly initialized, so this instance is untrained until train()
# is run or a fine-tuned checkpoint path is passed instead.
nli_model = NLIModel()

Loading model from: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully on: cpu


# Concepts Interpretable Network

In [28]:
c_kw_c = CONFIG["concepts"]["keyword_concepts"]
c_ab_c = CONFIG["concepts"]["abstract_concepts"]

# Flatten every keyword group into one list (order kept, duplicates kept).
keyword_concept_list = [
    keyword
    for group_keywords in c_kw_c.values()
    for keyword in group_keywords
]

# Collect only the abstract concept names (inner dict keys) of each group.
abstract_concept_list = [
    concept_name
    for group in c_ab_c.values()
    for concept_name in group
]

# The normalized label names double as label concepts.
label_concept_list = list(CONFIG["label_map"].values())

print(keyword_concept_list)
print(abstract_concept_list)
print(label_concept_list)

['aim', 'develop', 'cancer', 'common', 'aim', 'develop', 'cancer', 'common', 'aim', 'develop', 'cancer', 'common']
['Adverse', 'Treatment', 'Intense', 'Reactions', 'Failure', 'Dislike']
['background', 'objective', 'methods', 'results', 'conclusions']


In [31]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [30]:
class CINDataset(Dataset):
    """Map-style dataset of raw texts with their labels.

    Each item is a dict with the keys ``"text"``, ``"label"`` and ``"idx"``
    (the item's position in the dataset).
    """

    def __init__(self, texts:list[str], labels:list[int]):
        """Store the samples.

        Args:
            texts: Input texts, one per sample.
            labels: Labels aligned with ``texts``.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Materialize both inputs as plain lists so that __getitem__ is always
        # positional, even when the caller passes a label-indexed container
        # (e.g. a pandas Series whose index is not 0..n-1).
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict."""
        return {
            "text": self.texts[idx],
            "label": self.labels[idx],
            "idx": idx
        }

In [32]:
class CIN(nn.Module):
    """Skeleton module for the Concepts Interpretable Network (CIN).

    At this point the class only stores the three concept vocabularies, a
    reference to the NLI scorer and the compute device; ``forward`` is not
    implemented yet.
    """
    def __init__(self, keyword_concepts:list[str], abstract_concepts:list[str], label_concepts:list[str]):
        """Store the concept lists and select the device.

        Args:
            keyword_concepts: Keyword-level concept strings.
            abstract_concepts: Abstract concept names.
            label_concepts: Label names used as concepts.
        """
        super(CIN, self).__init__()
        # NOTE(review): taken from the notebook-level global `nli_model`, not
        # from a constructor argument, so the cell creating it must run first.
        self.nli_model = nli_model
        self.keyword_concepts = keyword_concepts
        self.abstract_concepts = abstract_concepts
        self.label_concepts = label_concepts
        # CUDA when available, CPU otherwise.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def forward(self, texts:list[str]) -> torch.Tensor:
        """Placeholder: the body is ``pass``, so this currently returns None
        despite the ``torch.Tensor`` annotation."""
        pass