<a href="https://colab.research.google.com/github/langeandreas/nlp-project_grp44/blob/main/NLP_Project_Group_44_AL_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install evaluate seqeval



In [2]:
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset
from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate

import os



In [3]:
os.environ["WANDB_DISABLED"] = "true"

metric = evaluate.load("seqeval")

MODEL_TYPE = "bert-base-uncased"
batch_size = 32



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
def compute_metrics(p, label_list) -> dict:
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }



Download data if not yet in filesystem

In [9]:
data_dir = "data"
reports_filename = "TCGA_Reports.csv"
labels_filename = "tcga_patient_to_cancer_type.csv"

reports_path = os.path.join(data_dir, reports_filename)
labels_path = os.path.join(data_dir, labels_filename)

if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"Created directory: {data_dir}")

# Check if reports file exists, if not, download it
if not os.path.exists(reports_path):
    print(f"Downloading {reports_filename}...")
    !wget https://raw.githubusercontent.com/langeandreas/nlp-project_grp44/main/data/TCGA_Reports.csv -P data/
if not os.path.exists(labels_path):
    print(f"Downloading {labels_filename}...")
    !wget https://raw.githubusercontent.com/langeandreas/nlp-project_grp44/main/data/tcga_patient_to_cancer_type.csv -P data/

In [10]:
reports = pd.read_csv("data/TCGA_Reports.csv")
labels = pd.read_csv("data/tcga_patient_to_cancer_type.csv")

# Extract patient_id from filename
reports["patient_id"] = reports["patient_filename"].apply(lambda x: x.split('.')[0])

# merge labels into dataset
df = reports.merge(labels, on="patient_id")
df = df[["text", "cancer_type"]]

# encoode
le = LabelEncoder()
df["label"] = le.fit_transform(df["cancer_type"])



In [11]:
# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])




In [12]:
# Load ClinicalBERT
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModelForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT", num_labels=len(le.classes_)
)
# dataset conversion
train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
test_dataset = Dataset.from_pandas(test_df[["text", "label"]])

# tokenize
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)




tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7618 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1905 [00:00<?, ? examples/s]

In [None]:
# Training args
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()