## Install dependencies

In [21]:
!pip install torch
!pip install pandas
!pip install simpletransformers



## Import libraries

In [43]:
import os
import pandas as pd
import torch
from simpletransformers.ner import NERArgs
from simpletransformers.ner import NERModel

## Configure model

In [26]:
# Short alias -> full model name, as returned by get_model_name().
# Most values look like Hugging Face Hub repository IDs (assumption — confirm).
# "lava" is a relative filesystem path instead, so it only resolves when
# ./models/lava exists relative to the notebook's working directory.
MODEL_REGISTRY = {
    "bert": "google-bert/bert-base-uncased",
    "biobert": "dmis-lab/biobert-base-cased-v1.2",
    "bluebert": "bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16",
    "clinical-bert": "emilyalsentzer/Bio_ClinicalBERT",
    "biomed_roberta": "allenai/biomed_roberta_base",
    "pubmedbert": "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    "lava": "./models/lava"
}

def get_model_name(short_name):
    """Resolve a short alias to the full model name stored in ``MODEL_REGISTRY``.

    Args:
        short_name: Alias to look up (a key of ``MODEL_REGISTRY``).

    Returns:
        The registry value for ``short_name``.

    Raises:
        ValueError: If ``short_name`` is not a key of ``MODEL_REGISTRY``.
    """
    if short_name in MODEL_REGISTRY:
        return MODEL_REGISTRY[short_name]

    # Unknown alias: report every alias that would have been accepted.
    raise ValueError(
        f"Model '{short_name}' not found in registry. Available models: {list(MODEL_REGISTRY.keys())}"
    )

def get_default_model_args(output_dir, labels):
    """Build the ``NERArgs`` configuration used for NER training in this notebook.

    Args:
        output_dir: Assigned to ``output_dir``; the best-model directory is
            set to ``"<output_dir>/best_model/"``.
        labels: Assigned to ``labels_list``.

    Returns:
        A fresh ``NERArgs`` instance with the overrides below applied.
    """
    overrides = {
        "labels_list": labels,
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "sliding_window": True,
        "num_train_epochs": 3,  # alternative noted by the author: 10
        "train_batch_size": 4,  # alternative noted by the author: 16
        "fp16": False,  # alternative noted by the author: True
        "output_dir": output_dir,
        "best_model_dir": f"{output_dir}/best_model/",
        "evaluate_during_training": True,
        "show_running_loss": True,
        "use_early_stopping": True,
    }
    # Options the author left disabled:
    #   wandb_project = "huggingface"
    #   use_multiprocessing = False
    #   use_cuda = False

    ner_args = NERArgs()
    for option, value in overrides.items():
        setattr(ner_args, option, value)
    return ner_args

## Load Data

In [29]:
# Registry of preprocessed CSV splits, indexed as DATASET_PATHS[task][dataset][split].
# Every path follows the pattern "datasets/preprocessed_<TASK>/<dataset>/<split>.csv",
# so the mapping is generated from the list of datasets available per task.
DATASET_PATHS = {
    task: {
        dataset: {
            split: f"datasets/preprocessed_{task.upper()}/{dataset}/{split}.csv"
            for split in ("train", "test")
        }
        for dataset in datasets
    }
    for task, datasets in (
        ("ner", ("bc5cdr", "biored", "chemprot", "ncbi-disease")),
        ("re", ("biored",)),
    )
}

def load_dataset(dataset_name, task='ner'):
    """Load the train and test splits of one dataset as string-typed DataFrames.

    Args:
        dataset_name: Key under ``DATASET_PATHS[task]`` (e.g. ``"bc5cdr"``).
        task: Top-level key of ``DATASET_PATHS``; defaults to ``'ner'``.

    Returns:
        A ``(df_train, df_test)`` tuple in which every value is a string.

    Raises:
        ValueError: If ``task`` or ``dataset_name`` is not registered.
        FileNotFoundError: If either split's CSV file does not exist.
    """
    if task not in DATASET_PATHS:
        raise ValueError(f"Task '{task}' not found. Available: {list(DATASET_PATHS.keys())}")
    if dataset_name not in DATASET_PATHS[task]:
        raise ValueError(f"Dataset '{dataset_name}' not found. Available: {list(DATASET_PATHS[task].keys())}")

    split_paths = DATASET_PATHS[task][dataset_name]
    train_path = split_paths['train']
    test_path = split_paths['test']

    # Check both files up front (train first) so nothing is parsed when a split is missing.
    for path in (train_path, test_path):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Dataset file '{path}' not found. Make sure datasets are placed in the 'datasets' folder.")

    # Read every cell verbatim as text. With pandas' default NA handling, tokens
    # such as "NA", "null" or "N/A" are parsed as missing values and would then
    # be turned into the literal string "nan" by the str cast, corrupting the
    # corpus. dtype=str also stops numeric-looking tokens from being reformatted.
    read_options = {"dtype": str, "keep_default_na": False}
    df_train = pd.read_csv(train_path, **read_options)
    df_test = pd.read_csv(test_path, **read_options)

    return df_train.astype(str), df_test.astype(str)

def load_multiple_datasets(dataset_names, task='ner'):
    """Load several datasets and concatenate their train and test splits.

    Sentence ids are prefixed with a per-dataset letter ('a' for the first
    dataset, 'b' for the second, ...) so that ids coming from different
    datasets cannot collide after concatenation.

    Args:
        dataset_names: Dataset name or iterable of dataset names, each a key
            of ``DATASET_PATHS[task]``.
        task: Task key forwarded to ``load_dataset``; defaults to ``'ner'``.

    Returns:
        A ``(train_df, test_df)`` tuple of string-typed DataFrames with a
        fresh 0..n-1 index.

    Raises:
        ValueError: If ``dataset_names`` is empty, or (from ``load_dataset``)
            if a task or dataset name is unknown.
        FileNotFoundError: From ``load_dataset`` if a split file is missing.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]
    dataset_names = list(dataset_names)
    if not dataset_names:
        raise ValueError("dataset_names must contain at least one dataset name.")

    train_dfs = []
    test_dfs = []

    for position, dataset_name in enumerate(dataset_names):
        # load_dataset returns both splits at once; its second argument is the
        # task, not the split name.
        df_train, df_test = load_dataset(dataset_name, task)

        prefix = chr(ord('a') + position)
        df_train['sentence_id'] = prefix + df_train['sentence_id'].astype(str)
        df_test['sentence_id'] = prefix + df_test['sentence_id'].astype(str)

        train_dfs.append(df_train)
        test_dfs.append(df_test)

    # ignore_index avoids duplicated row labels in the combined frames.
    train_df = pd.concat(train_dfs, ignore_index=True)
    test_df = pd.concat(test_dfs, ignore_index=True)

    print(f"Loaded {len(dataset_names)} dataset(s): {dataset_names}")
    print(f"Training samples: {len(train_df)} | Test samples: {len(test_df)}")

    return train_df.astype(str), test_df.astype(str)

## Create and load NER model

In [None]:
def create_ner_model(model_type, model_name, labels, output_dir):
    """Build a ``NERModel`` from a local checkpoint or from a model name.

    Args:
        model_type: Model type string forwarded to ``NERModel``.
        model_name: Local path or model name forwarded to ``NERModel``. When
            it exists on disk it is treated as a fine-tuned checkpoint.
        labels: Label list for the dataset.
        output_dir: Output directory used for the model arguments.

    Returns:
        The constructed ``NERModel``.
    """
    model_args = get_default_model_args(output_dir, labels)

    is_local_checkpoint = os.path.exists(model_name)
    if is_local_checkpoint:
        print(f"Loading fine-tuned NER model from {model_name}...")
    else:
        print(f"Initializing new NER model: {model_name}...")

    # The constructor call is the same for both cases; CUDA is used only when available.
    model = NERModel(model_type, model_name, labels=labels, args=model_args, use_cuda=torch.cuda.is_available())

    if is_local_checkpoint:
        # Check if the classifier layer needs to be replaced.
        # NOTE(review): labels are already passed to NERModel above, so this
        # mismatch may never be observed — confirm against the installed
        # simpletransformers version.
        model_config = model.model.config
        if len(labels) != model_config.num_labels:
            print(f"Label mismatch detected: Model has {model_config.num_labels} labels, but dataset has {len(labels)} labels.")
            print("Reinitializing classifier layer to match new dataset labels.")

            # Create the new head on the same device/dtype as the existing
            # weights so the model never mixes devices.
            reference_param = next(model.model.parameters())
            new_classifier = torch.nn.Linear(model_config.hidden_size, len(labels))
            model.model.classifier = new_classifier.to(device=reference_param.device, dtype=reference_param.dtype)

            # Keep the label count in sync on both the config and the module.
            # Presumably the token-classification head reads its own
            # ``num_labels`` attribute when computing the loss — TODO confirm.
            model.model.config.num_labels = len(labels)
            model.model.num_labels = len(labels)

    return model

def load_ner_model(model_dir, model_type="bert"):
    """Load a trained NER model from ``model_dir``.

    Args:
        model_dir: Directory containing the saved model.
        model_type: Model type string forwarded to ``NERModel``; defaults to
            ``"bert"``, the value that was previously hard-coded.

    Returns:
        The loaded ``NERModel``.

    Raises:
        FileNotFoundError: If ``model_dir`` does not exist.
    """
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Model directory '{model_dir}' not found.")

    # Pass use_cuda explicitly, mirroring create_ner_model, so loading also
    # works on machines without a GPU instead of relying on the library default.
    return NERModel(model_type, model_dir, use_cuda=torch.cuda.is_available())

## Extract entities and relations

In [35]:
def extract_entities(model_dir, df_text):
    """Predict named entities for every entry of ``df_text["text"]``.

    Args:
        model_dir: Directory of the trained NER model, loaded with ``load_ner_model``.
        df_text: Object whose ``"text"`` column provides the inputs.

    Returns:
        The predictions returned by the model; its raw outputs are discarded.
    """
    ner_model = load_ner_model(model_dir)
    texts = df_text["text"].tolist()
    predictions, _ = ner_model.predict(texts)
    return predictions

def extract_relations(model_dir, entity_pairs):
    """Predict a relation for each item of ``entity_pairs``.

    Args:
        model_dir: Directory of the trained relation-extraction model.
        entity_pairs: Inputs handed unchanged to the model's ``predict``.

    Returns:
        The predictions returned by the model; its raw outputs are discarded.
    """
    # NOTE(review): load_re_model is not defined in the visible part of this
    # notebook — confirm it exists before calling this function.
    re_model = load_re_model(model_dir)
    predictions, _ = re_model.predict(entity_pairs)
    return predictions

def full_pipeline(ner_model_dir, re_model_dir, df_text):
    """Run entity extraction followed by relation extraction and print the result.

    Args:
        ner_model_dir: Directory of the trained NER model.
        re_model_dir: Directory of the trained relation-extraction model.
        df_text: Input passed to ``extract_entities``.

    Returns:
        None. One ``e1 -[relation]-> e2`` line is printed per candidate pair.
    """
    print("\n=== Extracting Entities ===")
    entities = extract_entities(ner_model_dir, df_text)

    # Every ordered pair of unequal items taken from the same prediction list.
    entity_pairs = []
    for ent_list in entities:
        for e1 in ent_list:
            for e2 in ent_list:
                if e1 != e2:
                    entity_pairs.append((e1, e2))

    print("\n=== Extracting Relations ===")
    relations = extract_relations(re_model_dir, entity_pairs)

    print("\n=== Final Knowledge Extraction Output ===")
    for pair, relation in zip(entity_pairs, relations):
        e1, e2 = pair
        print(f"{e1} -[{relation}]-> {e2}")

## Train

In [38]:
def train_ner(model_type, model_name, df_train, df_dev, output_dir):
    """Train the NER model on ``df_train``, evaluating on ``df_dev``.

    Args:
        model_type: Model type string forwarded to ``create_ner_model``.
        model_name: Model name or local path forwarded to ``create_ner_model``.
        df_train: Training frame with a ``"labels"`` column.
        df_dev: Evaluation frame passed as ``eval_data``; may be ``None``.
        output_dir: Directory the model arguments are configured to write to.
    """
    # Start from the training labels in order of first appearance, then append
    # labels that occur only in the evaluation split. Otherwise such a label
    # would be missing from the model's label list when the evaluation data
    # is processed.
    labels = list(df_train["labels"].unique())
    if df_dev is not None:
        known_labels = set(labels)
        for label in df_dev["labels"].unique():
            if label not in known_labels:
                known_labels.add(label)
                labels.append(label)

    model = create_ner_model(model_type, model_name, labels, output_dir)

    model.train_model(df_train, eval_data=df_dev)
    print(f"NER training complete. Model saved to {output_dir}")

def train_re(model_type, model_name, df_train, df_dev, output_dir):
    """Train the relation-extraction model on ``df_train``, evaluating on ``df_dev``.

    Args:
        model_type: Model type string forwarded to ``create_re_model``.
        model_name: Model name or local path forwarded to ``create_re_model``.
        df_train: Training frame with a ``"relation"`` column.
        df_dev: Evaluation frame passed as ``eval_data``.
        output_dir: Output directory forwarded to ``create_re_model``.
    """
    # Distinct relation values of the training split, in order of first appearance.
    relation_labels = list(df_train["relation"].unique())

    # NOTE(review): create_re_model is not defined in the visible part of this
    # notebook — confirm it exists before calling this function.
    re_model = create_re_model(model_type, model_name, relation_labels, output_dir)
    re_model.train_model(df_train, eval_data=df_dev)

    print(f"RE training complete. Model saved to {output_dir}")

## Run model

In [47]:
# Directory names used by the (currently commented-out) training, evaluation
# and pipeline calls below.
NER_MODEL_DIR = "ner_model"
RE_MODEL_DIR = "re_model"

# Load the train/test splits of the bc5cdr NER dataset and preview the first rows.
df_train, df_test = load_dataset('bc5cdr', 'ner')
print(df_train.head())

# Only the banner is printed at the moment: the training calls are disabled.
print("\n=== Training Models ===")
# train_ner("bert","bert-base-uncased", df_train, df_test, NER_MODEL_DIR)
# train_re("bert", "bert-base-uncased", df_train, df_test, RE_MODEL_DIR)

# NOTE(review): the disabled calls below reference `df`, `evaluate_ner` and
# `evaluate_re`, none of which are defined in the visible notebook — confirm
# before re-enabling them.
# print("\n=== Evaluating Models ===")
# evaluate_ner(NER_MODEL_DIR, df)
# evaluate_re(RE_MODEL_DIR, df)

# print("\n=== Running Full Pipeline ===")
# full_pipeline(NER_MODEL_DIR, RE_MODEL_DIR, df)

              words sentence_id     labels
0  Antihypertensive           0          O
1             drugs           0          O
2               and           0          O
3        depression           0  B-Disease
4                 :           0          O

=== Training Models ===
