In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

# =============================================
# 2. CPU OPTIMIZATION
# =============================================
os.environ["CUDA_VISIBLE_DEVICES"] = ""
torch.set_num_threads(os.cpu_count())
print(f"\nUsing CPU with {os.cpu_count()} cores")

# =============================================
# 3. DATASET LOADING
# =============================================
print("\nLoading dataset...")
dataset = load_dataset("json", data_files="train.jsonl", split="train")

def format_example(example):
    # Create input text
    input_text = f"EEG Data:\n{example['messages'][1]['content']}\nLabel:"
    # Get the target label
    target_label = example['messages'][2]['content']
    # Combine them with the target label at the end for language modeling
    full_text = f"{input_text} {target_label}"
    return {
        "input_text": input_text,
        "target_label": target_label,
        "text": full_text  # For language modeling
    }

dataset = dataset.map(format_example)
print(f"Successfully loaded {len(dataset)} examples")

# =============================================
# 4. MODEL SETUP
# =============================================
print("\nInitializing model...")
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# =============================================
# 5. TRAINING CONFIGURATION
# =============================================
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
peft_config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Tokenize function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=192,
        return_tensors="pt"
    )
    # Create labels by shifting the inputs to the right
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text", "input_text", "target_label"])

# =============================================
# 6. START TRAINING
# =============================================
print("\nStarting training...")
training_args = TrainingArguments(
    output_dir="../eeg_results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=1e-3,
    no_cuda=True,
    remove_unused_columns=True,
    report_to="none",
    label_names=["input_ids", "attention_mask", "labels"]  # Explicitly set label names
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
print("\nTraining complete!")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

# =============================================
# 2. CPU OPTIMIZATION
# =============================================
os.environ["CUDA_VISIBLE_DEVICES"] = ""
torch.set_num_threads(os.cpu_count())
print(f"\nUsing CPU with {os.cpu_count()} cores")

# =============================================
# normalize_features
# =============================================
def normalize_features(text):
    # Implement feature normalization logic here.
    # For example, you might scale numeric values,
    # filter out noise, standardize timestamps, etc.
    # Right now it’s a placeholder that returns the input.
    normalized_text = text
    return normalized_text

# =============================================
# 3. DATASET LOADING & FORMATTING
# =============================================
print("\nLoading dataset...")
dataset = load_dataset("json", data_files="train.jsonl", split="train")

def format_example(example):
    # apply normalization to the EEG data content
    raw = example['messages'][1]['content']
    norm = normalize_features(raw)
    input_text = f"EEG Data:\n{norm}\nLabel:"
    target_label = example['messages'][2]['content']
    full_text = f"{input_text} {target_label}"
    return {"text": full_text}

dataset = dataset.map(format_example, remove_columns=dataset.column_names)
print(f"Successfully loaded {len(dataset)} examples")

# =============================================
# 4. MODEL & TOKENIZER SETUP
# =============================================
print("\nInitializing model...")
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# =============================================
# 5. TOKENIZATION WITH LOSS MASKING
# =============================================
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=192,
        return_tensors="pt"
    )
    labels = tokenized["input_ids"].clone()
    for i in range(labels.shape[0]):
        non_pad = (labels[i] != tokenizer.pad_token_id).nonzero()
        if non_pad.nelement() == 0:
            labels[i] = -100
        else:
            last_idx = non_pad[-1].item()
            labels[i, :-1] = -100
            labels[i, -1] = tokenized["input_ids"][i, last_idx]
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# =============================================
# 6. PEFT MODEL CONFIGURATION (LoRA)
# =============================================
model = AutoModelForCausalLM.from_pretrained(model_name)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],      # correct for GPT‑2 / DistilGPT2 :contentReference[oaicite:1]{index=1}
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# =============================================
# 7. TRAINING CONFIGURATION
# =============================================
training_args = TrainingArguments(
    output_dir="./eeg_results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    optim="adamw_torch",
    logging_steps=10,
    save_strategy="no",
    no_cuda=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# =============================================
# 8. START TRAINING
# =============================================
print("\nStarting training...")
trainer.train()
print("\nTraining complete!")

# Save only LoRA weights
model.save_pretrained("./fine_tuned_distilgpt2")
tokenizer.save_pretrained("./fine_tuned_distilgpt2")



Using CPU with 12 cores

Loading dataset...


Map: 100%|██████████| 312/312 [00:00<00:00, 10123.49 examples/s]


Successfully loaded 312 examples

Initializing model...


Map: 100%|██████████| 312/312 [00:00<00:00, 3096.83 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 405,504 || all params: 82,318,080 || trainable%: 0.4926

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.5403
20,2.4965
30,2.4794
40,2.4825
50,2.4599
60,2.4373
70,2.4396
80,2.3964
90,2.3606
100,2.3274



Training complete!


('./fine_tuned_distilgpt2\\tokenizer_config.json',
 './fine_tuned_distilgpt2\\special_tokens_map.json',
 './fine_tuned_distilgpt2\\vocab.json',
 './fine_tuned_distilgpt2\\merges.txt',
 './fine_tuned_distilgpt2\\added_tokens.json',
 './fine_tuned_distilgpt2\\tokenizer.json')