In [21]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings("ignore")

In [22]:
def check_gpu():
    """Print the name and total memory of CUDA device 0, or a warning if CUDA is unavailable."""
    # Guard clause: nothing to describe when no CUDA device is visible.
    if not torch.cuda.is_available():
        print("WARNING: No GPU available")
        return

    device_name = torch.cuda.get_device_name(0)
    print(f"GPU: {device_name}")

    # total_memory is divided by 1024**3 before being shown with a "GB" suffix.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    vram = total_bytes / 1024**3
    print(f"VRAM: {vram:.1f} GB")

def load_model_and_tokenizer():
    """Load the PubMedBERT checkpoint as a tokenizer plus a sequence classifier.

    The classifier is requested with 4 labels and the
    "multi_label_classification" problem type.

    Returns:
        tuple: (tokenizer, model)
    """
    checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
    head_options = {
        "num_labels": 4,
        "problem_type": "multi_label_classification",
    }

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **head_options)

    parameter_count = model.num_parameters()
    print(f"Model loaded: {parameter_count:,} parameters")
    return tokenizer, model

In [23]:
def prepare_medical_dataset(df):
    """Build the model-ready frame (combined text + multi-hot labels) from raw rows.

    Args:
        df: DataFrame with 'title', 'abstract' and 'group' columns. 'group' holds
            '|'-separated category names; matching is case-insensitive and
            ignores surrounding whitespace. Names not in the mapping are skipped.
            The input frame is NOT modified.

    Returns:
        A new DataFrame (same index as ``df``) with two columns:
        'text'   - "<title> [SEP] <abstract>" (missing parts become empty strings),
        'labels' - list of four 0/1 ints ordered as
                   [neurological, cardiovascular, hepatorenal, oncological].

    Side effects:
        Prints the per-category label distribution.
    """
    category_mapping = {
        'neurological': 0,
        'cardiovascular': 1,
        'hepatorenal': 2,
        'oncological': 3
    }

    def parse_medical_labels(group_str):
        labels = [0] * len(category_mapping)
        if pd.isna(group_str):
            return labels

        for cat in str(group_str).split('|'):
            position = category_mapping.get(cat.strip().lower())
            if position is not None:
                labels[position] = 1
        return labels

    def as_text(column):
        # astype(str) alone would turn missing values into the literal "nan",
        # which would then be tokenized as if it were real content.
        return column.astype(str).where(column.notna(), "")

    # Assemble the result in a fresh frame so the caller's DataFrame is left untouched.
    prepared = pd.DataFrame(index=df.index)
    prepared['text'] = as_text(df['title']) + " [SEP] " + as_text(df['abstract'])
    prepared['labels'] = df['group'].apply(parse_medical_labels)

    # Print distribution
    total = len(prepared)
    print("\nLabel distribution:")
    for cat, i in category_mapping.items():
        count = sum(1 for labels in prepared['labels'] if labels[i] == 1)
        # Guard the percentage so an empty frame reports 0.0% instead of raising.
        percentage = (count / total) * 100 if total else 0.0
        print(f"  {cat:15}: {count:4d} samples ({percentage:5.1f}%)")

    return prepared

def analyze_text_lengths(df, tokenizer):
    """Summarize tokenized lengths of df['text'] and suggest a max_length.

    Args:
        df: DataFrame with a 'text' column.
        tokenizer: object exposing ``encode(str)`` whose result supports ``len``.

    Returns:
        int: the 95th-percentile token count, capped at 512.
    """
    def token_count(raw_text):
        return len(tokenizer.encode(str(raw_text)))

    token_counts = df['text'].apply(token_count)
    percentile_95 = token_counts.quantile(0.95)

    print("\nText length analysis:")
    print(f"  Mean: {token_counts.mean():.0f} tokens")
    print(f"  95th percentile: {percentile_95:.0f} tokens")
    print(f"  Max: {token_counts.max():.0f} tokens")

    # Truncate the percentile to an int, then clamp to the 512 ceiling.
    recommended = int(percentile_95)
    if recommended > 512:
        recommended = 512
    print(f"  Recommended max_length: {recommended}")

    return recommended

In [24]:
class MedicalPapersDataset(Dataset):
    """Torch dataset that tokenizes one text per lookup and returns float32 label vectors.

    ``texts`` and ``labels`` must support ``reset_index`` and ``.iloc`` (e.g. pandas
    Series); both are re-indexed from zero so positional access lines up.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Drop whatever index the inputs carried (for instance after a train/test split).
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        target = self.labels.iloc[idx]
        encoded = self.tokenizer(
            str(self.texts.iloc[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a 1-D tensor per item.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.float32)
        return item

In [25]:
def compute_multilabel_metrics(eval_pred):
    """Turn raw model scores into multi-label metrics.

    Args:
        eval_pred: pair ``(predictions, labels)``. Predictions are passed through a
            sigmoid and thresholded at 0.5; labels are indexed as a 2-D array with
            one column per category.

    Returns:
        dict with averaged F1 scores (macro/micro/weighted), subset accuracy and
        Hamming loss, followed by F1/precision/recall for each category.
    """
    raw_scores, labels = eval_pred

    # Sigmoid, then a 0.5 cut-off, gives the 0/1 prediction matrix.
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    predictions = (probabilities > 0.5).int().numpy()

    # Global metrics
    metrics = {}
    for averaging in ('macro', 'micro', 'weighted'):
        metrics[f'f1_{averaging}'] = f1_score(
            labels, predictions, average=averaging, zero_division=0
        )
    metrics['subset_accuracy'] = accuracy_score(labels, predictions)
    metrics['hamming_loss'] = hamming_loss(labels, predictions)

    # Per-category metrics, one column of the label matrix at a time
    per_category_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    category_names = ('neurological', 'cardiovascular', 'hepatorenal', 'oncological')
    for column, cat in enumerate(category_names):
        true_column = labels[:, column]
        predicted_column = predictions[:, column]
        for prefix, scorer in per_category_scorers:
            metrics[f'{prefix}_{cat}'] = scorer(true_column, predicted_column, zero_division=0)

    return metrics

In [26]:
def get_training_args():
    """Build the TrainingArguments used for fine-tuning (tuned for a T4 GPU).

    Returns:
        TrainingArguments writing checkpoints to './pubmedbert-medical-results',
        evaluating every 50 steps, saving every 150 steps and restoring the
        checkpoint with the best 'f1_macro' at the end of training.
    """
    return TrainingArguments(
        output_dir='./pubmedbert-medical-results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=2,
        # 8 samples x 3 accumulation steps = 24 samples per optimizer step (per device).
        gradient_accumulation_steps=3,
        eval_strategy="steps",
        eval_steps=50,
        logging_steps=25,
        # fp16 mixed precision needs a CUDA device; enabling it unconditionally makes
        # TrainingArguments fail on CPU-only machines, so follow the hardware.
        fp16=torch.cuda.is_available(),
        learning_rate=2e-5,
        warmup_steps=100,
        weight_decay=0.05,
        save_strategy="steps",
        # Kept a multiple of eval_steps so a saved checkpoint always has matching
        # evaluation metrics for load_best_model_at_end.
        save_steps=150,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        seed=42,
        # The string "none" disables every logging integration (wandb included).
        # Passing None does not: Transformers 4.x treats it as "all".
        report_to="none"
    )


In [27]:
def _split_train_val(df_prepared, test_size=0.2, seed=42):
    """Split the prepared frame into train/validation parts, stratified by label combination.

    The stratification key is the string form of each row's label vector.
    Combinations that occur only once are pooled into one "rare" stratum first,
    because scikit-learn rejects stratification on a class with a single member.
    If stratification is still rejected, a plain random split is used instead.
    """
    strata = df_prepared['labels'].apply(str)
    combo_counts = strata.value_counts()
    singletons = combo_counts[combo_counts < 2].index
    if len(singletons) > 0:
        print(f"WARNING: pooling {len(singletons)} label combination(s) seen only once for stratification")
        strata = strata.where(~strata.isin(singletons), "rare")

    try:
        return train_test_split(
            df_prepared,
            test_size=test_size,
            stratify=strata,
            random_state=seed
        )
    except ValueError as err:
        # If the data itself is unusable (e.g. empty), the retry below raises the same error.
        print(f"WARNING: stratified split failed ({err}); using a random split instead")
        return train_test_split(df_prepared, test_size=test_size, random_state=seed)


def train_medical_classifier(csv_path, sep=";", quotechar='"'):
    """Run the complete fine-tuning pipeline and save the resulting model.

    Steps: report GPU status, read the CSV, load PubMedBERT, build the text/labels
    frame, pick a max sequence length from the token-length analysis, split 80/20,
    train with ``Trainer``, evaluate, print key metrics and save model + tokenizer
    to './pubmedbert-medical-final'.

    Args:
        csv_path: path of the CSV file holding 'title', 'abstract' and 'group' columns.
        sep: field separator passed to ``pd.read_csv``.
        quotechar: quote character passed to ``pd.read_csv``.

    Returns:
        tuple: (trainer, final_metrics) where ``final_metrics`` is the dict
        returned by ``trainer.evaluate()``.
    """

    # Setup
    print("=== Medical Paper Classification Training ===")
    check_gpu()

    # Load data
    print(f"\nLoading data from {csv_path}")
    df = pd.read_csv(csv_path, sep=sep, quotechar=quotechar, quoting=1)
    print(f"Loaded {len(df):,} samples")

    # Load model
    print("\nLoading PubMedBERT model...")
    tokenizer, model = load_model_and_tokenizer()

    # Prepare data
    print("\nPreparing dataset...")
    df_prepared = prepare_medical_dataset(df)

    # Analyze text lengths; texts longer than the returned length get truncated by the dataset
    optimal_max_length = analyze_text_lengths(df_prepared, tokenizer)

    # Train/validation split (the stratification key lives in a separate Series,
    # so no helper column is added to the frame)
    print("\nSplitting data...")
    train_df, val_df = _split_train_val(df_prepared)
    print(f"Train: {len(train_df):,}, Validation: {len(val_df):,}")

    # Create datasets
    print("\nCreating datasets...")
    train_dataset = MedicalPapersDataset(
        train_df['text'], train_df['labels'], tokenizer, optimal_max_length
    )
    val_dataset = MedicalPapersDataset(
        val_df['text'], val_df['labels'], tokenizer, optimal_max_length
    )

    # Verify data types: the training setup here expects float32 label vectors
    sample = train_dataset[0]
    assert sample['labels'].dtype == torch.float32, "Labels must be float32"
    print("Data types verified successfully")

    # Setup training
    training_args = get_training_args()

    # NOTE(review): newer transformers releases rename `tokenizer=` to
    # `processing_class=` - confirm against the installed version before changing.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_multilabel_metrics,
    )

    # Train
    print("\nStarting training...")
    print("Training parameters:")
    print(f"  - Epochs: {training_args.num_train_epochs}")
    print(f"  - Batch size (effective): {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
    print(f"  - Learning rate: {training_args.learning_rate}")
    print(f"  - Max sequence length: {optimal_max_length}")

    trainer.train()

    # Final evaluation
    print("\nFinal evaluation...")
    final_metrics = trainer.evaluate()

    # Print key results
    print("\nTraining completed! Key metrics:")
    key_metrics = ['eval_f1_macro', 'eval_f1_micro', 'eval_subset_accuracy', 'eval_hamming_loss']
    for metric in key_metrics:
        if metric in final_metrics:
            print(f"  {metric.replace('eval_', '')}: {final_metrics[metric]:.4f}")

    print("\nPer-category F1 scores:")
    categories = ['neurological', 'cardiovascular', 'hepatorenal', 'oncological']
    for cat in categories:
        f1_key = f'eval_f1_{cat}'
        if f1_key in final_metrics:
            print(f"  {cat}: {final_metrics[f1_key]:.4f}")

    # Save model and tokenizer side by side so the directory can be reloaded on its own
    model_path = "./pubmedbert-medical-final"
    trainer.save_model(model_path)
    tokenizer.save_pretrained(model_path)
    print(f"\nModel saved to: {model_path}")

    return trainer, final_metrics

In [28]:
def predict_medical_categories(text, model_path="./pubmedbert-medical-final", threshold=0.5, max_length=512):
    """Predict medical categories for new text.

    Args:
        text: the text to classify. Only the first row of the model output is
            used, so pass a single string.
        model_path: directory holding the saved model and tokenizer.
        threshold: a category is flagged when its probability is strictly greater
            than this value.
        max_length: truncation length handed to the tokenizer. Exposed so callers
            can match the sequence length used during training.

    Returns:
        list of dicts, one per category in the fixed order
        [neurological, cardiovascular, hepatorenal, oncological], each with keys
        'category' (str), 'probability' (float) and 'predicted' (bool).
    """
    categories = ['neurological', 'cardiovascular', 'hepatorenal', 'oncological']

    # Load model (reloaded on every call)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length
    )

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.sigmoid(outputs.logits)[0]

    # Format results; tolist() yields plain Python floats, one per category
    return [
        {
            'category': category,
            'probability': prob,
            'predicted': prob > threshold
        }
        for category, prob in zip(categories, probabilities.tolist())
    ]

In [29]:
if __name__ == "__main__":
    # Fine-tune on the challenge CSV (point csv_file at your own copy of the data)
    csv_file = "/content/challenge_data-18-ago.csv"
    trainer, metrics = train_medical_classifier(csv_file)

    # Smoke-test the saved model on a short neurology-flavoured abstract
    sample_text = """
    Alzheimer disease treatment shows significant improvement in cognitive function.
    This study demonstrates the effectiveness of new therapeutic approaches
    for neurodegenerative conditions affecting memory and cognition.
    """

    predictions = predict_medical_categories(sample_text)

    # Show only the categories that cleared the decision threshold
    print("\nExample prediction:")
    for pred in predictions:
        if not pred['predicted']:
            continue
        print(f"  {pred['category']}: {pred['probability']:.3f}")

=== Medical Paper Classification Training ===
GPU: Tesla T4
VRAM: 14.7 GB

Loading data from /content/challenge_data-18-ago.csv
Loaded 3,565 samples

Loading PubMedBERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: 109,485,316 parameters

Preparing dataset...

Label distribution:
  neurological   : 1785 samples ( 50.1%)
  cardiovascular : 1268 samples ( 35.6%)
  hepatorenal    : 1091 samples ( 30.6%)
  oncological    :  601 samples ( 16.9%)

Text length analysis:
  Mean: 145 tokens
  95th percentile: 397 tokens
  Max: 723 tokens
  Recommended max_length: 397

Splitting data...
Train: 2,852, Validation: 713

Creating datasets...
Data types verified successfully

Starting training...
Training parameters:
  - Epochs: 3
  - Batch size (effective): 24
  - Learning rate: 2e-05
  - Max sequence length: 397


Step,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Weighted,Subset Accuracy,Hamming Loss,F1 Neurological,Precision Neurological,Recall Neurological,F1 Cardiovascular,Precision Cardiovascular,Recall Cardiovascular,F1 Hepatorenal,Precision Hepatorenal,Recall Hepatorenal,F1 Oncological,Precision Oncological,Recall Oncological,Runtime,Samples Per Second,Steps Per Second
50,0.6065,0.592264,0.166826,0.426829,0.251733,0.28892,0.329593,0.667302,0.506512,0.977654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3161,112.887,56.522
100,0.5328,0.410063,0.612242,0.759509,0.715909,0.575035,0.137447,0.859135,0.857939,0.860335,0.811189,0.994286,0.685039,0.745856,0.931034,0.62212,0.032787,1.0,0.016667,9.1902,77.583,38.846
150,0.2432,0.209004,0.904164,0.901235,0.901166,0.775596,0.061711,0.88024,0.948387,0.821229,0.918919,0.973568,0.870079,0.913151,0.989247,0.847926,0.904348,0.945455,0.866667,6.3234,112.756,56.457
200,0.1974,0.185121,0.910882,0.910186,0.910331,0.789621,0.057504,0.904965,0.919308,0.891061,0.910603,0.964758,0.862205,0.920398,1.0,0.852535,0.907563,0.915254,0.9,9.3672,76.117,38.112
250,0.1595,0.160017,0.929904,0.930131,0.929958,0.837307,0.044881,0.911208,0.951368,0.874302,0.943775,0.963115,0.925197,0.957143,0.990148,0.926267,0.907489,0.962617,0.858333,6.1934,115.123,57.642
300,0.1593,0.153291,0.934353,0.932176,0.931755,0.845722,0.043829,0.907216,0.959502,0.860335,0.950298,0.959839,0.940945,0.953488,0.962441,0.9447,0.926407,0.963964,0.891667,6.7697,105.322,52.735
350,0.1324,0.148037,0.934763,0.933261,0.932999,0.84432,0.043128,0.913616,0.96,0.871508,0.948207,0.959677,0.937008,0.95082,0.966667,0.935484,0.926407,0.963964,0.891667,6.8421,104.207,52.177



Final evaluation...



Training completed! Key metrics:
  f1_macro: 0.9344
  f1_micro: 0.9322
  subset_accuracy: 0.8457
  hamming_loss: 0.0438

Per-category F1 scores:
  neurological: 0.9072
  cardiovascular: 0.9503
  hepatorenal: 0.9535
  oncological: 0.9264

Model saved to: ./pubmedbert-medical-final

Example prediction:
  neurological: 0.978


In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sanity check: confirm the saved directory can be loaded back as tokenizer + model.
model_path = "./pubmedbert-medical-final"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
except Exception as e:
    # Report the failure instead of raising so the notebook keeps running.
    print(f"Error loading model: {e}")
else:
    print("Model loaded successfully!")

Model loaded successfully!


In [45]:
def predict_medical_categories(text, model_path="./pubmedbert-medical-final", threshold=0.5, max_length=397):
    """Predict medical categories for new text.

    Args:
        text: the text to classify. Only the first row of the model output is
            used, so pass a single string.
        model_path: directory holding the saved model and tokenizer.
        threshold: a category is flagged when its probability is strictly greater
            than this value.
        max_length: truncation length handed to the tokenizer. The default of 397
            is the max sequence length reported by the training run in this
            notebook.

    Returns:
        list of dicts, one per category in the fixed order
        [neurological, cardiovascular, hepatorenal, oncological], each with keys
        'category' (str), 'probability' (float) and 'predicted' (bool).
    """
    categories = ['neurological', 'cardiovascular', 'hepatorenal', 'oncological']

    # Model and tokenizer are reloaded on every call.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length
    )

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.sigmoid(outputs.logits)[0]

    # tolist() yields plain Python floats, one per category
    return [
        {
            'category': category,
            'probability': prob,
            'predicted': prob > threshold
        }
        for category, prob in zip(categories, probabilities.tolist())
    ]

# Try the classifier on a synthetic abstract that mixes terms from several categories
sample_text = "endoscopy reveals ventricular tachycardia secrets Research question: How does metformin affect cancer through pituitary adenoma mechanisms? Methods: randomized controlled study with 53 elderly patients, assessing encephalitis and aphasia. Results: significant improvement in primary endpoints. Implications: therapeutic innovation."

predictions = predict_medical_categories(sample_text)

# Print only the categories flagged above the threshold
for pred in predictions:
    if not pred['predicted']:
        continue
    print(f"{pred['category']}: {pred['probability']:.3f}")

neurological: 0.985


In [37]:
# Reload the raw challenge CSV for inspection: ';'-separated, '"'-quoted
# (quoting=1 is the numeric value of csv.QUOTE_ALL).
df = pd.read_csv("/content/challenge_data-18-ago.csv", sep=';', quotechar='"', quoting=1)

In [38]:
# Build the text/labels frame from the raw rows; the call also prints the label distribution.
df_prepared = prepare_medical_dataset(df)


Label distribution:
  neurological   : 1785 samples ( 50.1%)
  cardiovascular : 1268 samples ( 35.6%)
  hepatorenal    : 1091 samples ( 30.6%)
  oncological    :  601 samples ( 16.9%)


In [43]:
# Spot-check the prepared row with index label 3525 (bare last expression, so the notebook displays it).
df_prepared.loc[3525]

Unnamed: 0,3525
text,Carvedilol protects against doxorubicin-induce...
labels,"[1, 1, 1, 1]"
