In [1]:
pip install transformers datasets torch accelerate


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
pip install peft



In [3]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from transformers import AdamW
from peft import LoraConfig, get_peft_model

In [4]:
df = pd.read_excel('/content/final_clinical_mutation_data.xlsx')

In [5]:
df = df.head(10000)

In [6]:
# Preprocess the dataset
df['text'] = df[['Hugo_Symbol', 'Chromosome', 'Consequence', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2']].astype(str).agg(' '.join, axis=1)
df = df[['text', 'Variant_Classification']]


In [7]:
# Encode labels
label_mapping = {label: idx for idx, label in enumerate(df['Variant_Classification'].unique())}
df['label'] = df['Variant_Classification'].map(label_mapping)

In [8]:
# Split into train and test
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [9]:
# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [10]:
# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_mapping))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
 #Tokenize data
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

In [12]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True
)




In [14]:
# Fully Fine-tuning
def train_model(tokenized_datasets, model, training_args):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        tokenizer=tokenizer,
        compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).astype(float).mean()}
    )
    trainer.train()
    return trainer.evaluate()

In [15]:
# LoRa Fine-tuning
def train_model_lora(tokenized_datasets, model, training_args):
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "key"],
        bias="none",
        task_type="SEQ_CLS"
    )
    lora_model = get_peft_model(model, lora_config)
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        tokenizer=tokenizer,
        compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).astype(float).mean()}
    )
    trainer.train()
    return trainer.evaluate()

In [20]:
# Layer Freezing Fine-tuning
def train_model_layer_freezing(tokenized_datasets, model, training_args):
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = True
    for layer in model.roberta.encoder.layer[:-6]:
        for param in layer.parameters():
            param.requires_grad = False

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        tokenizer=tokenizer,
        compute_metrics=lambda p: {"accuracy": (p.predictions.argmax(-1) == p.label_ids).astype(float).mean()}
    )
    trainer.train()
    return trainer.evaluate()


In [17]:

results_fully_finetuned = train_model(tokenized_datasets, model, training_args)


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0601,0.03781,0.989
2,0.0271,0.021287,0.991
3,0.0056,0.01124,0.9975


In [18]:
results_lora = train_model_lora(tokenized_datasets, model, training_args)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0088,0.012082,0.9965
2,0.0035,0.01051,0.9975
3,0.0029,0.010169,0.998


In [21]:
results_layer_freezing = train_model_layer_freezing(tokenized_datasets, model, training_args)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0065,0.012275,0.997
2,0.0016,0.010314,0.9985
3,0.0017,0.010023,0.9985


In [22]:

print("Fully Fine-tuned Model Results:", results_fully_finetuned)
print("LoRa Fine-tuned Model Results:", results_lora)
print("Layer Freezing Fine-tuned Model Results:", results_layer_freezing)

Fully Fine-tuned Model Results: {'eval_loss': 0.01124021876603365, 'eval_accuracy': 0.9975, 'eval_runtime': 4.5929, 'eval_samples_per_second': 435.453, 'eval_steps_per_second': 54.432, 'epoch': 3.0}
LoRa Fine-tuned Model Results: {'eval_loss': 0.010169061832129955, 'eval_accuracy': 0.998, 'eval_runtime': 5.0043, 'eval_samples_per_second': 399.659, 'eval_steps_per_second': 49.957, 'epoch': 3.0}
Layer Freezing Fine-tuned Model Results: {'eval_loss': 0.010023231618106365, 'eval_accuracy': 0.9985, 'eval_runtime': 4.8339, 'eval_samples_per_second': 413.745, 'eval_steps_per_second': 51.718, 'epoch': 3.0}
