Deep learning approaches:

Fine‑tune Bio_ClinicalBERT (classification head):
For long dialogues (>512 tokens): split into chunks, take the [CLS] embedding of each chunk, mean/max-pool across chunks, then classify.

ELMo + BiLSTM + Attention classifier.
Fast baseline transformer: DistilBERT (clinical domain variant if available) fine‑tune.

CNN text classifier (Kim CNN) with static + fine‑tunable embeddings (FastText init).
Hierarchical model (utterance encoder → dialogue encoder) if time permits.

## FIRST APPROACH: Fine‑tune Bio_ClinicalBERT (classification head)

In [None]:
import pandas as pd

In [None]:
# Load the MTS-Dialog training split; the relative path is resolved against the current working directory.
df = pd.read_csv("../../dataset/MTS-Dialog-TrainingSet.csv")

# FINE-TUNING

In [None]:
import re
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Load data (same training CSV as the earlier cell; `pd` is not imported in
# this cell, so it relies on the earlier `import pandas as pd` having run)
df = pd.read_csv("../../dataset/MTS-Dialog-TrainingSet.csv")

# Minimal preprocessing for BERT (keep case, minimal cleaning)
def normalize_for_bert(s):
    """Lightly clean one dialogue string before BERT tokenization.

    Missing values (anything ``pd.isna`` flags) become an empty string.
    Otherwise the input is cast to ``str``, NFKC-normalized, stripped of
    speaker tags, and has every whitespace run collapsed to a single space.
    Case, punctuation and contractions are left untouched.
    """
    if pd.isna(s):
        return ""

    # Speaker tags such as "Doctor:", "Patient-" or "Guest_family_1:",
    # matched case-insensitively together with any spaces that follow.
    speaker_tag = re.compile(
        r'\b(Doctor|Doctor_2|Patient|Guest_family(_\d)?|Guest_clinician)[:\-]\s*',
        re.I,
    )
    whitespace_run = re.compile(r'\s+')

    text = unicodedata.normalize("NFKC", str(s))
    # Only the tag is dropped; the utterance that follows it is kept.
    text = speaker_tag.sub('', text)
    return whitespace_run.sub(' ', text).strip()

# Add a cleaned copy of every dialogue; the raw 'dialogue' column is left as-is.
df['text_for_bert'] = df['dialogue'].apply(normalize_for_bert)

# Prepare data: cleaned dialogue text is the input, the section header is the target class.
X = df['text_for_bert']
y = df['section_header']

# Encode labels as integer ids; `le` is pickled later so the ids can be mapped
# back to section-header strings at inference time.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split 80/20, stratified on the encoded label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Load tokenizer and model; the classification head is sized to the number of
# distinct section headers seen by the label encoder.
# NOTE(review): no id2label/label2id mapping is passed here, so predictions are
# presumably reported under generic label names that must be decoded through
# `le` - confirm if readable names are wanted in the saved config.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_),
    problem_type="single_label_classification"
)

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of ``examples`` to fixed-length inputs.

    Sequences are padded to 512 tokens and longer ones are truncated to 512.
    Relies on the module-level ``tokenizer``.
    """
    tokenizer_kwargs = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['text'], **tokenizer_kwargs)

# Create datasets: wrap the pandas/NumPy splits as plain lists in `datasets.Dataset` objects.
train_dataset = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()})
test_dataset = Dataset.from_dict({'text': X_test.tolist(), 'label': y_test.tolist()})

# Tokenize both splits with the same function so train and eval inputs share one format.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: only the listed columns are requested as torch tensors.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the argmax over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average='macro')
    return {'accuracy': accuracy, 'f1_macro': macro_f1}

# Training arguments
# - Evaluation and checkpointing both happen once per epoch, and the checkpoint
#   with the best 'f1_macro' (the key returned by compute_metrics) is reloaded
#   when training ends.
# - Batch size 4 with 2 gradient-accumulation steps gives 8 examples per
#   optimizer step on each device.
# NOTE(review): fp16=True presumably requires a CUDA-capable GPU - confirm
# before running this cell on a CPU-only machine.
training_args = TrainingArguments(
    output_dir='./results_clinicalbert',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    logging_dir='./logs',
    logging_steps=10,
    seed=42,
    fp16=True,
    gradient_accumulation_steps=2,
)

# Trainer
# NOTE(review): the held-out 20% split is passed as eval_dataset, so the same
# data both selects the best checkpoint and produces the reported scores; the
# numbers are therefore not from an untouched test set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune
trainer.train()

# Evaluate on the trainer's eval_dataset and print the resulting metrics dict
results = trainer.evaluate()
print(results)

# Save model and tokenizer into one directory so the inference cell can load both from it
model.save_pretrained('./finetuned_clinicalbert')
tokenizer.save_pretrained('./finetuned_clinicalbert')

# Save label encoder next to the model; inference needs it to turn predicted
# class ids back into section-header strings.
import pickle
with open('./finetuned_clinicalbert/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 960/960 [00:00<00:00, 3864.05 examples/s]
Map: 100%|██████████| 241/241 [00:00<00:00, 3535.02 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.9317,1.64391,0.634855,0.262969
2,1.2774,1.200764,0.759336,0.339809
3,1.1068,1.117554,0.771784,0.345453


{'eval_loss': 1.1175535917282104, 'eval_accuracy': 0.7717842323651453, 'eval_f1_macro': 0.34545303347045025, 'eval_runtime': 1.8796, 'eval_samples_per_second': 128.218, 'eval_steps_per_second': 16.493, 'epoch': 3.0}


INFERENCE TEST

Device set to use cuda:0


Predicted: FAM/SOCHX, Confidence: 0.138


In [8]:
import pandas as pd
from transformers import pipeline
import pickle
import re, unicodedata

# Minimal preprocessing (same as training)
def normalize_for_bert(s):
    """Clean one dialogue string the same way the training cell does.

    Args:
        s: Raw dialogue text. Non-string values are cast with ``str``.

    Returns:
        The NFKC-normalized text with speaker tags (e.g. ``"Doctor:"``,
        ``"Patient-"``, ``"Guest_family_1:"``) removed and whitespace runs
        collapsed to single spaces, or ``""`` when ``s`` is missing.
    """
    # The training-time version maps missing dialogues to ""; without this
    # guard NaN/None would be stringified to "nan"/"None" and classified as
    # if it were real dialogue text.
    if pd.isna(s):
        return ""
    s = unicodedata.normalize("NFKC", str(s))
    # Drop the speaker tag only; the utterance that follows it is kept.
    s = re.sub(r'\b(Doctor|Doctor_2|Patient|Guest_family(_\d)?|Guest_clinician)[:\-]\s*', '', s, flags=re.I)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# Load label encoder and model
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# label_encoder.pkl that was written by the training cell of this notebook.
with open('./finetuned_clinicalbert/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

# Build a text-classification pipeline from the directory that holds both the
# fine-tuned weights and the saved tokenizer.
clf = pipeline(
    "text-classification",
    model="./finetuned_clinicalbert",
    tokenizer="./finetuned_clinicalbert",
    device=0  # use -1 for CPU
)

# Load a validation sample and clean it with the same normalization used for training
df_val = pd.read_csv("../../dataset/MTS-Dialog-ValidationSet.csv")
text = normalize_for_bert(df_val.loc[0, "dialogue"])  # any row from validation set

# Predict
# Training truncated every example to 512 tokens; pass the same limits here so
# a long dialogue is cut to the model's maximum length instead of overflowing it.
out = clf(text, truncation=True, max_length=512)[0]
# The pipeline label ends in "_<class id>"; take that id and map it back to
# the original section-header string through the label encoder.
pred_label = le.inverse_transform([int(out['label'].split('_')[-1])])[0]
print(f"Predicted: {pred_label} | Confidence: {out['score']:.3f}")

Device set to use cuda:0


Predicted: GENHX | Confidence: 0.924


To classify free text that we write ourselves, we can use the following script.

The text below is only an example. It needs to be written in the form the model expects (like the dialogues it was trained on); otherwise the prediction will not be reliable.

In [None]:
# Classify a hand-written snippet instead of a dataset row.
# NOTE(review): `classifier` is not defined in the cells visible here;
# presumably it is a text-classification pipeline like `clf` - confirm which
# cell creates it. The text is also passed without normalize_for_bert.
sample_text = "Patient reports chest pain for 3 days..."
result = classifier(sample_text)
# The label ends in "_<class id>"; decode that id with the label encoder.
predicted_label = le.inverse_transform([int(result[0]['label'].split('_')[-1])])[0]
print(f"Predicted: {predicted_label}, Confidence: {result[0]['score']:.3f}")

Predicted: FAM/SOCHX, Confidence: 0.138


## SECOND APPROACH: ELMo + BiLSTM

    ELMo + BiLSTM outline
We already extract ELMo sentence vectors; for sequence modeling we’d instead keep the token-level embeddings and feed them to a BiLSTM + attention layer before a dense softmax classifier.