In [1]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

# =============================================
# 1. SYSTEM OPTIMIZATION
# =============================================
# Hide every CUDA device from this process so the run stays on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Turn off the tokenizers library's own parallelism (avoids its fork warning).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to one thread instead of handing None to torch.set_num_threads.
num_cores = os.cpu_count() or 1
torch.set_num_threads(num_cores)
print(f"Using CPU with {num_cores} cores")

# =============================================
# 2. INITIALIZE TOKENIZER AND MODEL
# =============================================
print("\nInitializing model components...")
model_path = "./fine_tuned_distilgpt2"  # not read in this cell; kept for later use
base_model = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(base_model)
# The distilgpt2 tokenizer defines no pad token by default; reuse EOS so that
# fixed-length padding during tokenization works.
tokenizer.pad_token = tokenizer.eos_token

# Load the base checkpoint exactly once. `num_labels` / `problem_type` are
# sequence-classification settings and have no meaning for a causal-LM head,
# so they are not passed here.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)

# =============================================
# 3. PEFT CONFIGURATION WITH LABEL HANDLING
# =============================================
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
    # Matched by module name; in GPT-2 "c_proj" presumably covers both the
    # attention and the MLP output projections -- verify if only one is wanted.
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D, whose weights are stored
    # as (fan_in, fan_out); declare it explicitly instead of relying on PEFT
    # auto-correcting the flag with a warning.
    fan_in_fan_out=True,
    # NOTE(review): this makes the whole lm_head trainable in addition to the
    # LoRA adapters. The recorded run reports ~39M trainable parameters (32%),
    # most of which presumably come from this entry -- confirm it is intended.
    modules_to_save=["lm_head"],
)

peft_model = get_peft_model(model, peft_config)
# Only "labels" is a label column; "input_ids" and "attention_mask" are model
# inputs and must not be listed as labels.
peft_model.config.label_names = ["labels"]
peft_model.print_trainable_parameters()

# =============================================
# 4. DATA PROCESSING
# =============================================
def preprocess_data(example):
    """Flatten one chat-style record into a prompt string and an integer label.

    Args:
        example: Mapping with a "messages" list. The "content" of the entry at
            index 1 is used as the EEG payload and the "content" of the entry
            at index 2 as the label (presumably the user and assistant turns --
            verify against the JSONL schema).

    Returns:
        A dict with "text" (the prompt, ending in the label) and "label"
        (the label content converted with ``int``).

    Raises:
        ValueError: If the label content cannot be parsed as an integer.
    """
    messages = example["messages"]
    eeg_content = messages[1]["content"]
    label_content = messages[2]["content"]

    prompt = "EEG Data:\n{}\nLabel: {}".format(eeg_content, label_content)
    return {"text": prompt, "label": int(label_content)}

print("\nLoading dataset...")
cache_dir = "./dataset_cache"
os.makedirs(cache_dir, exist_ok=True)

# Read the JSONL training file through the generic "json" loader.
raw_dataset = load_dataset(
    "json",
    data_files="jsonl/train.jsonl",
    split="train",
    cache_dir=cache_dir,
)

# Turn each record into {"text", "label"} and drop the raw "messages" column.
dataset = raw_dataset.map(
    preprocess_data,
    remove_columns=["messages"],
    # True means an existing cached result is reused, not recreated.
    load_from_cache_file=True,
    num_proc=1,
    desc="Preprocessing data",
)
print(f"Loaded {len(dataset)} examples")

# =============================================
# 5. TOKENIZATION WITH LABELS
# =============================================
def tokenize_function(examples):
    """Tokenize a batch of prompts and attach causal-LM labels.

    Args:
        examples: Batch mapping whose "text" entry holds the prompt strings.

    Returns:
        The tokenizer output (NumPy arrays, every sequence padded or truncated
        to 128 tokens) with an added "labels" entry that is a copy of
        "input_ids".
    """
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )
    # Labels start as a copy of the inputs, padding positions included.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

print("\nTokenizing dataset...")
# No explicit cache_file_name: with a fixed file name the cached Arrow file is
# reused whenever it exists, so later edits to tokenize_function (or to the
# data) would be silently ignored. Leaving it unset lets `datasets` key the
# cache on its fingerprint of the function and input dataset instead.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,
    remove_columns=["text"],
    num_proc=1,
)

# =============================================
# 6. TRAINING CONFIGURATION
# =============================================
training_args = TrainingArguments(
    output_dir="./eeg_results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # 2 x 2 = 4 examples per optimizer step
    num_train_epochs=10,
    warmup_ratio=0.1,
    learning_rate=5e-5,
    weight_decay=0.1,
    optim="adamw_torch",
    max_grad_norm=1.0,
    # NOTE(review): newer transformers releases deprecate `no_cuda` in favour
    # of `use_cpu`; switch once the minimum supported version is confirmed.
    no_cuda=True,
    remove_unused_columns=True,
    report_to="none",
    save_strategy="no",
    logging_steps=10,
    # Only "labels" is a label; "input_ids" / "attention_mask" are inputs and
    # listing them here would make the Trainer treat them as labels.
    label_names=["labels"],
    # `eval_steps` is intentionally not set: no evaluation strategy or eval
    # dataset is configured, so the value was never used.
)

# mlm=False selects the causal (next-token) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    pad_to_multiple_of=8,
    tokenizer=tokenizer,
)

# Training only: no evaluation dataset or metrics are wired in.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=peft_model,
)

# =============================================
# 7. TRAINING EXECUTION
# =============================================
print("\nStarting training...")
trainer.train()

# =============================================
# 8. MODEL SAVING
# =============================================
print("\nSaving model...")
output_dir = "./fine_tuned_distilgpt2"
os.makedirs(output_dir, exist_ok=True)
# Write the PEFT model first, then the tokenizer, into the same directory.
for artifact in (peft_model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

  from .autonotebook import tqdm as notebook_tqdm


Using CPU with 4 cores

Initializing model components...




trainable params: 39,002,880 || all params: 120,915,456 || trainable%: 32.2563

Loading dataset...
Loaded 210 examples

Tokenizing dataset...

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.6452
20,2.5505
30,2.5134
40,2.352
50,2.2451
60,2.1247
70,2.0223
80,1.9302
90,1.8603
100,1.7829



Saving model...
Model saved to ./fine_tuned_distilgpt2
