In [1]:
import torch
print(torch.cuda.is_available())  # should be True
print(torch.cuda.get_device_name(0))  # GPU name

True
NVIDIA GeForce RTX 3090


In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model



2025-10-05 11:15:41.852097: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-05 11:15:41.852118: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-05 11:15:41.852898: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-05 11:15:41.856727: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv("hf://datasets/SandeepKumarRudhravaram/Lung_Cancer_QA/Regenerated_Lung_Cancer_QA_Dataset.csv")
df

Unnamed: 0,input,output
0,What is the survival rate for lung cancer pati...,Numerous clinical trials are exploring targete...
1,How is lung cancer diagnosed?,"Air pollution, particularly fine particulate m..."
2,Is lung cancer hereditary?,"Air pollution, particularly fine particulate m..."
3,What are the stages of lung cancer?,Targeted therapies focus on specific genetic m...
4,Are there clinical trials for lung cancer trea...,Numerous clinical trials are exploring targete...
...,...,...
2995,What treatments are available for Stage 1 lung...,Numerous clinical trials are exploring targete...
2996,Are there alternative therapies for lung cancer?,The survival rate depends on the stage at diag...
2997,Are there clinical trials for lung cancer trea...,Numerous clinical trials are exploring targete...
2998,How is lung cancer diagnosed?,Some alternative therapies like acupuncture ma...


In [4]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Shape of training set: {train_df.shape}")
print(f"Shape of validation set: {validation_df.shape}")
print(f"Shape of test set: {test_df.shape}")

Shape of training set: (2400, 2)
Shape of validation set: (300, 2)
Shape of test set: (300, 2)


In [5]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(validation_df)
test_dataset  = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 2400
    })
    validation: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 300
    })
    test: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 300
    })
})


In [6]:
def format_example(example):
    prompt = f"### Question:\n{example['input']}\n\n### Answer:\n"
    return {"text": prompt + example['output']}

dataset = dataset.map(format_example)


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoTokenizer

BASE_MODEL = "Qwen/Qwen2-0.5B"

# Load tokenizer first
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Add a pad token if missing (Qwen models often don’t have one)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

"""
# Define tokenize function (example if you have separate question/answer columns)
def tokenize(batch):
    enc = tokenizer(
        f"Question: {batch['input']}\nAnswer:",
        truncation=True,
        padding="max_length",
        max_length=512
    )

    answer_enc = tokenizer(
        batch["output"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

    labels = [
        tok if tok != tokenizer.pad_token_id else -100
        for tok in answer_enc["input_ids"]
    ]

    enc["labels"] = labels
    return enc
"""
def tokenize(batch):
    prompt = f"Question: {batch['input']}\nAnswer:"
    answer = batch["output"]

    prompt_enc = tokenizer(prompt, truncation=True, max_length=256)
    answer_enc = tokenizer(answer, truncation=True, max_length=256)

    input_ids = prompt_enc['input_ids'] + answer_enc['input_ids']
    labels = [-100] * len(prompt_enc['input_ids']) + answer_enc['input_ids']

    # Pad sequences to 512 tokens
    pad_length = 512 - len(input_ids)
    if pad_length > 0:
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_length
        labels = labels + [-100] * pad_length
    else:
        input_ids = input_ids[:512]
        labels = labels[:512]

    attention_mask = [1 if id != tokenizer.pad_token_id else 0 for id in input_ids]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

# Apply to dataset
tokenized_dataset = dataset.map(
    tokenize,
    batched=False,
    remove_columns=dataset["train"].column_names
)


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,  # <-- force FP16
).cuda()



`torch_dtype` is deprecated! Use `dtype` instead!


In [9]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM
)

training_args = TrainingArguments(
    output_dir='./qwen-sft-final',
    per_device_train_batch_size=1,  # small batch for stability
    gradient_accumulation_steps=4,  # effective batch size = 4
    learning_rate=1e-5,             # start small
    max_steps=500,                   # short debug run
    fp16=True,
    save_steps=100,
    save_total_limit=3,
    logging_steps=10,
    report_to=None,                  # no wandb/other reporting
    remove_unused_columns=False,
    gradient_checkpointing=True,     # save memory
    max_grad_norm=1.0,               # gradient clipping
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(
[codecarbon INFO @ 11:15:50] [setup] RAM Tracking...
[codecarbon INFO @ 11:15:50] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 11:15:51] CPU Model on constant consumption mode: AMD Ryzen 7 5800X 8-Core Processor
[codecarbon INFO @ 11:15:51] [setup] GPU Tracking...
[codecarbon INFO @ 11:15:51] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 11:15:51] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 11:15:51] >>> Tracker's metadata:
[codecarbon INFO @ 11:15:51]   Platform system: Linux-5.15.0-153-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 11:15:51]   Python version: 3.10.18
[codecarbon INFO @ 11:15:51]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 11:15:51]   Available RAM

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,116.5292
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


[codecarbon INFO @ 11:16:12] Energy consumed for RAM : 0.000086 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 11:16:12] Delta energy consumed for CPU with cpu_load : 0.000045 kWh, power : 10.5240393384 W
[codecarbon INFO @ 11:16:12] Energy consumed for All CPU : 0.000045 kWh
[codecarbon INFO @ 11:16:12] Energy consumed for all GPUs : 0.001364 kWh. Total GPU Power : 306.6665698549011 W
[codecarbon INFO @ 11:16:12] 0.001495 kWh of electricity used since the beginning.
[codecarbon INFO @ 11:16:27] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 11:16:27] Delta energy consumed for CPU with cpu_load : 0.000042 kWh, power : 10.52331469153125 W
[codecarbon INFO @ 11:16:27] Energy consumed for All CPU : 0.000088 kWh
[codecarbon INFO @ 11:16:27] Energy consumed for all GPUs : 0.002705 kWh. Total GPU Power : 321.9903391827555 W
[codecarbon INFO @ 11:16:27] 0.002959 kWh of electricity used since the beginning.
[codecarbon INFO @ 11:16:42] Energy consumed for RAM : 0.000

TrainOutput(global_step=500, training_loss=2.330583740234375, metrics={'train_runtime': 548.4807, 'train_samples_per_second': 7.293, 'train_steps_per_second': 0.912, 'total_flos': 4397852000256000.0, 'train_loss': 2.330583740234375, 'epoch': 1.6666666666666665})

In [10]:
from transformers import Trainer, TrainingArguments
print(Trainer)
print(TrainingArguments)


<class 'transformers.trainer.Trainer'>
<class 'transformers.training_args.TrainingArguments'>


In [11]:
trainer.save_model("./qwen-sft-final")
tokenizer.save_pretrained("./qwen-sft-final")


('./qwen-sft-final/tokenizer_config.json',
 './qwen-sft-final/special_tokens_map.json',
 './qwen-sft-final/chat_template.jinja',
 './qwen-sft-final/vocab.json',
 './qwen-sft-final/merges.txt',
 './qwen-sft-final/added_tokens.json',
 './qwen-sft-final/tokenizer.json')

In [12]:
print(tokenized_dataset["train"][0]["labels"][:50])
print(max(tokenized_dataset["train"][0]["labels"]))
print(tokenizer.vocab_size)


[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 25140, 7822, 5244, 389, 16052, 15712, 323, 10601, 51212, 369, 20622, 9387, 13, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
51212
151643


In [13]:
def check_labels(dataset, tokenizer):
    bad_batches = []
    for i, batch in enumerate(dataset):
        labels = batch["labels"]
        for l in labels:
            if (l != -100 and (l < 0 or l >= tokenizer.vocab_size)):
                bad_batches.append((i, l, tokenizer.vocab_size))
    return bad_batches

bad = check_labels(tokenized_dataset["train"], tokenizer)
print("Bad labels:", bad[:20])  # show first 20 errors if any


Bad labels: []


In [15]:
import evaluate

bleu = evaluate.load("bleu")  # replaces load_metric("sacrebleu")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels with pad token ID before decoding
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # BLEU expects list of references per prediction
    decoded_labels = [[label] for label in decoded_labels]
    
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return


In [16]:
trainer.evaluate(tokenized_dataset["validation"])




{'eval_loss': nan,
 'eval_runtime': 8.0001,
 'eval_samples_per_second': 37.499,
 'eval_steps_per_second': 2.375,
 'epoch': 1.6666666666666665}