In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

# =============================================
# 1. SYSTEM OPTIMIZATION
# =============================================
# Hide every CUDA device from this process so the run stays on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Turn off parallelism inside the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.cpu_count() returns None when the core count cannot be determined, and
# torch.set_num_threads() needs a positive integer, so fall back to one thread.
cpu_cores = os.cpu_count() or 1
torch.set_num_threads(cpu_cores)
print(f"Using CPU with {cpu_cores} cores")

# =============================================
# 2. INITIALIZE TOKENIZER AND MODEL
# =============================================
print("\nInitializing model components...")
model_path = "./fine_tuned_distilgpt2"
base_model = "distilgpt2"

# The pad token is set to the EOS token so that the fixed-length padding
# requested during tokenization has a token to pad with.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

# Load the base checkpoint exactly once. This is a causal language model
# (next-token prediction), so sequence-classification options such as
# num_labels / problem_type are intentionally not passed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,   # full precision for CPU training
    low_cpu_mem_usage=True
)

# =============================================
# 3. PEFT CONFIGURATION WITH LABEL HANDLING
# =============================================
# LoRA adapter settings: rank-8 adapters on the attention/projection layers
# named below, with lm_head additionally kept trainable and saved.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
    target_modules=["c_attn", "c_proj"],  # Added projections
    modules_to_save=["lm_head"]
)

peft_model = get_peft_model(model, peft_config)
# Only "labels" is a label field; input_ids and attention_mask are model
# inputs and must not be listed as labels.
# NOTE(review): Trainer appears to read label names from
# TrainingArguments.label_names rather than from this config attribute —
# confirm whether this assignment is needed at all.
peft_model.config.label_names = ["labels"]
peft_model.print_trainable_parameters()

# =============================================
# 4. DATA PROCESSING
# =============================================
def preprocess_data(example):
    """Convert one chat-style record into a prompt string and an integer label.

    The content of ``example['messages'][1]`` is used as the EEG payload and
    the content of ``example['messages'][2]`` as the label text.

    Returns:
        dict with ``"text"`` (the formatted prompt, label included) and
        ``"label"`` (the label text converted with ``int``).
    """
    messages = example['messages']
    eeg_payload = messages[1]['content']
    label_text = messages[2]['content']
    prompt = f"EEG Data:\n{eeg_payload}\nLabel: {label_text}"
    return {"text": prompt, "label": int(label_text)}

print("\nLoading dataset...")
cache_dir = "./dataset_cache"
os.makedirs(cache_dir, exist_ok=True)

# Read the JSONL training file; the datasets cache lives under cache_dir.
dataset = load_dataset(
    "json",
    data_files="train.jsonl",
    split="train",
    cache_dir=cache_dir,
)

# Replace the raw "messages" column with the prompt text and integer label.
dataset = dataset.map(
    preprocess_data,
    remove_columns=["messages"],
    load_from_cache_file=True,  # reuse a cached result when one is available
    num_proc=1,
    desc="Preprocessing data",
)
print(f"Loaded {len(dataset)} examples")

# =============================================
# 5. TOKENIZATION WITH LABELS
# =============================================
def tokenize_function(examples):
    """Tokenize a batch of prompts and attach a ``labels`` field.

    Every entry of ``examples["text"]`` is truncated/padded to exactly 128
    tokens by the module-level ``tokenizer`` and returned as NumPy arrays.
    ``labels`` is an independent copy of ``input_ids``.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="np",
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

print("\nTokenizing dataset...")
# Drop the raw prompt AND the integer "label" column. The causal LM is trained
# on the token-level "labels" produced by tokenize_function; the integer class
# is already embedded in the prompt text. Trainer's unused-column filter keeps
# a column named "label", so leaving it in place would forward an unexpected
# `label` argument to the model.
# NOTE(review): because cache_file_name is fixed, an existing tokenized.arrow
# may be reused as-is — delete it after changing preprocessing; confirm.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=8,
    remove_columns=["text", "label"],
    num_proc=1,
    cache_file_name=os.path.join(cache_dir, "tokenized.arrow")
)

# =============================================
# 6. TRAINING CONFIGURATION
# =============================================
# Hyper-parameters for a CPU-only LoRA fine-tuning run.
training_args = TrainingArguments(
    eval_steps=50,                    # NOTE(review): no eval dataset/strategy is configured here, so this looks inert — confirm
    output_dir="./eeg_results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,    # gradients accumulated over 2 batches of 2 sequences
    num_train_epochs=10,
    warmup_ratio=0.1,
    learning_rate=5e-5,
    weight_decay=0.1,
    optim="adamw_torch",
    no_cuda=True,
    remove_unused_columns=True,
    report_to="none",
    save_strategy="no",               # no intermediate checkpoints; the model is saved after training
    logging_steps=10,
    # Only "labels" holds targets. input_ids / attention_mask are model inputs
    # and would be treated as label tensors if listed here.
    label_names=["labels"],
    max_grad_norm=1.0
)

# Causal-LM collator (mlm=False) that pads batches to a multiple of 8 tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

# Bundle the LoRA-wrapped model, the arguments, the tokenized data and the
# collator into a Trainer.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=peft_model,
)

# =============================================
# 7. TRAINING EXECUTION
# =============================================
print("\nStarting training...")
trainer.train()

# =============================================
# 8. MODEL SAVING
# =============================================
print("\nSaving model...")
output_dir = "./fine_tuned_distilgpt2"
os.makedirs(output_dir, exist_ok=True)
# Write the PEFT model first, then the tokenizer, into the same directory.
for artifact in (peft_model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.optim import AdamW


# 1. Load and Prepare Data
# Read the concatenated EEG recordings from the working directory.
df = pd.read_csv(filepath_or_buffer='concat_files.csv')

# Convert numerical features to text sequences
def create_text_sequence(row):
    """Render one row's nine EEG channel readings as a single text line.

    Each channel becomes ``NAME=value`` with four decimal places; the tokens
    are joined by single spaces in the fixed order FC3 … CP4.
    """
    channel_names = ("FC3", "FCz", "FC4", "C3", "Cz", "C4", "CP3", "CPz", "CP4")
    return " ".join(f"{name}={row[name]:.4f}" for name in channel_names)

# One text line per row, built from the channel columns.
df['text'] = df.apply(create_text_sequence, axis=1)
labels = df['label'].astype(int) - 1  # Labels 0-4 for 5 classes

# 2. Split Data
# 80/20 split, stratified on the class so both parts keep the class balance.
split_parts = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# 3. Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class EEGDataset(Dataset):
    """Torch dataset pairing tokenized EEG text lines with class labels.

    All texts are tokenized once in the constructor (truncated to ``max_len``
    and padded by the tokenizer); ``labels`` is stored as given and is read
    positionally through ``.iloc``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        text_list = list(texts)
        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_len,
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per tokenizer output field, followed by the label.
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = EEGDataset(X_train, y_train, tokenizer)
test_dataset = EEGDataset(X_test, y_test, tokenizer)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# 4. Model Setup
# Use the GPU when one is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=5,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# 5. Training Loop
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Move exactly the three fields the model consumes onto the device.
        inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }
        # Passing labels makes the model return the loss alongside the logits.
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reports the loss summed over all batches of the epoch (not the mean).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# 6. Evaluation
model.eval()
all_preds = []
all_labels = []

# Inference only: no gradients are tracked while scoring the test split.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# 7. Report
print("\nClassification Report:")
# Pass the full label set explicitly: without it, classification_report
# derives the classes from the data and raises when fewer than five classes
# occur, because the five target_names no longer line up.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(5)),
    target_names=['1', '2', '3', '4', '5'],
))


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.optim import AdamW

# 1. Load & prepare
# Read the concatenated EEG recordings from the working directory.
df = pd.read_csv(filepath_or_buffer='concat_files.csv')
def create_text_sequence(row):
    """Format one row's nine EEG channels as space-separated ``NAME=value`` tokens.

    Values are printed with four decimals, in the fixed order FC3 … CP4.
    """
    tokens = []
    for channel in ("FC3", "FCz", "FC4", "C3", "Cz", "C4", "CP3", "CPz", "CP4"):
        tokens.append(f"{channel}={row[channel]:.4f}")
    return " ".join(tokens)
# One text line per row, built from the channel columns.
df['text'] = df.apply(create_text_sequence, axis=1)
# Shift the integer labels down by one and keep them as a NumPy array.
zero_based = df['label'].astype(int) - 1
labels = zero_based.to_numpy()

# 2. Split
# 80/20 split, stratified on the class.
split_parts = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# 3. Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 4. Dataset (pre‑tensorize everything for speed)
class EEGDataset(Dataset):
    """Dataset that tokenizes every text once and stores everything as tensors.

    ``texts`` must offer ``.tolist()``. Each text is truncated/padded to
    exactly ``max_len`` tokens. Token ids, attention masks and labels are all
    held as ``torch.long`` tensors, so item lookup is plain indexing.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        encoded = tokenizer(
            texts.tolist(),
            max_length=max_len,
            padding='max_length',
            truncation=True,
        )
        to_long = lambda values: torch.tensor(values, dtype=torch.long)
        self.input_ids = to_long(encoded['input_ids'])
        self.attention_mask = to_long(encoded['attention_mask'])
        self.labels = to_long(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        fields = ('input_ids', 'attention_mask', 'labels')
        return {name: getattr(self, name)[idx] for name in fields}

train_ds = EEGDataset(X_train, y_train, tokenizer)
test_ds = EEGDataset(X_test, y_test, tokenizer)

# 5. DataLoaders — moderate batch size, four worker processes each
# NOTE(review): worker processes with a Dataset class defined inside a
# notebook may fail on platforms that spawn rather than fork — confirm.
loader_options = dict(batch_size=16, num_workers=4)
train_loader = DataLoader(train_ds, shuffle=True, **loader_options)
test_loader = DataLoader(test_ds, shuffle=False, **loader_options)

# 6. Model + optimizer
device = torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=5,
)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)

# 7. Training loop (on CPU)
model.train()
for epoch in range(3):               # fewer epochs to save time
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move exactly the three fields the model consumes onto the device.
        inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }

        # Passing labels makes the model return the loss with the logits.
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss per batch for this epoch.
    avg = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} — loss: {avg:.4f}")

# 8. Evaluation
model.eval()
all_preds, all_labels = [], []
# Inference only: no gradients are tracked while scoring the test split.
with torch.no_grad():
    for batch in test_loader:
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

print("\nClassification Report:")
# Pass the full label set explicitly: without it, classification_report
# derives the classes from the data and raises when fewer than five classes
# occur, because the five target_names no longer line up.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(5)),
    target_names=['1','2','3','4','5'],
))


In [None]:
# %% [markdown]
# # EEG Classification Training Notebook
# Lightweight model for EEG label prediction (CPU-only version)

# %%
# Install required packages (run once)
!pip install pandas scikit-learn torch matplotlib seaborn

# %%
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# %%
# Configuration
# Reproducibility and training hyper-parameters.
RANDOM_SEED = 42
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 1e-3

# Problem dimensions.
INPUT_FEATURES = 9  # FC3, FCz,..., CP4
NUM_CLASSES = 5      # Labels 1-5

# Seed PyTorch first, then NumPy's global generator.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# %%
# Dataset Class
class EEGDataset(Dataset):
    """Index-aligned feature/label store that yields ``(features, label)`` tensors.

    ``features`` and ``labels`` are kept as passed in. Each lookup converts
    the selected feature row to ``float32`` and the label to ``long``.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature_row = torch.tensor(self.features[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return (feature_row, target)

# %%
# Model Architecture
class EEGClassifier(nn.Module):
    """Two-hidden-layer MLP mapping an EEG feature vector to class logits.

    Architecture: Linear(in, 64) → ReLU → Dropout(0.2) → Linear(64, 32) →
    ReLU → Dropout(0.2) → Linear(32, classes).

    Args:
        input_features: Width of the input vector. ``None`` (default) uses
            the module-level ``INPUT_FEATURES``.
        num_classes: Number of output logits. ``None`` (default) uses the
            module-level ``NUM_CLASSES``.

    Calling ``EEGClassifier()`` with no arguments builds the same network as
    before; sub-module names are unchanged, so previously saved state dicts
    still load.
    """

    def __init__(self, input_features=None, num_classes=None):
        super().__init__()
        # Resolve defaults at construction time so the class can also be
        # instantiated for data with a different width or class count.
        if input_features is None:
            input_features = INPUT_FEATURES
        if num_classes is None:
            num_classes = NUM_CLASSES

        self.layer1 = nn.Sequential(
            nn.Linear(input_features, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.output = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return raw (un-normalized) class logits for a batch ``x``."""
        x = self.layer1(x)
        x = self.layer2(x)
        return self.output(x)

# %%
# Data Preparation
def load_and_preprocess_data(csv_path):
    """Read the EEG CSV and return standardized train/test arrays.

    Steps: read ``csv_path``; shift the integer ``label`` column down by one;
    use every other column as a feature; split 80/20 with ``RANDOM_SEED``;
    fit a ``StandardScaler`` on the training features only and apply it to
    both parts.

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler)`` — both feature arrays
        are already scaled; ``scaler`` is the fitted transformer.
    """
    frame = pd.read_csv(csv_path)

    # Zero-based targets and the feature matrix (all non-label columns).
    y = (frame['label'].astype(int) - 1).values
    X = frame.drop('label', axis=1).values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=RANDOM_SEED,
    )

    # Statistics come from the training part only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler

# %%
# Training Function
def train_model(model, train_loader, test_loader, epochs=EPOCHS):
    """Train ``model`` with Adam + cross-entropy and checkpoint the best epoch.

    After every epoch the model is scored on ``test_loader``. Whenever the
    accuracy improves, the weights are written to ``best_model.pth`` in the
    current working directory.

    Args:
        model: Network to optimize in place.
        train_loader: Yields ``(inputs, labels)`` training batches.
        test_loader: Yields ``(inputs, labels)`` batches used for the
            per-epoch accuracy.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, test_accuracies)`` — one mean batch loss and one
        accuracy value per epoch.

    NOTE(review): the checkpoint is selected on ``test_loader`` accuracy, so
    that accuracy is an optimistic estimate; a separate validation split
    would avoid this.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0.0 with a strict ">" would write nothing if
    # accuracy stayed at 0, leaving a later load of best_model.pth to fail or
    # pick up a stale file.
    best_accuracy = -1.0
    train_losses = []
    test_accuracies = []

    for epoch in range(epochs):
        # Training
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Evaluation (dropout disabled, no gradient tracking)
        model.eval()
        test_preds = []
        test_labels = []
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                preds = torch.argmax(outputs, dim=1)
                test_preds.extend(preds.numpy())
                test_labels.extend(labels.numpy())

        accuracy = accuracy_score(test_labels, test_preds)
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)
        test_accuracies.append(accuracy)

        print(f'Epoch {epoch+1}/{epochs}')
        print(f'Train Loss: {epoch_loss:.4f} | Test Accuracy: {accuracy:.4f}')

        # Save best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_model.pth')

    return train_losses, test_accuracies

# %%
# Evaluation Metrics
def plot_metrics(train_losses, test_accuracies):
    """Show training loss and test accuracy per epoch as two side-by-side plots."""
    plt.figure(figsize=(12, 5))

    # (series, line label, panel title, y-axis label) for each panel.
    panels = (
        (train_losses, 'Train Loss', 'Training Loss', 'Loss'),
        (test_accuracies, 'Test Accuracy', 'Test Accuracy', 'Accuracy'),
    )
    for position, (series, line_label, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(series, label=line_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_true, y_pred, num_classes=5):
    """Draw the confusion matrix of zero-based labels with 1-based tick labels.

    Args:
        y_true: True class indices (0 … num_classes-1).
        y_pred: Predicted class indices (0 … num_classes-1).
        num_classes: Number of classes to show; defaults to 5.

    The label set is passed to ``confusion_matrix`` explicitly so the matrix
    is always ``num_classes × num_classes``. Otherwise a class absent from
    both inputs would shrink the matrix and misalign the tick labels.
    """
    class_ids = list(range(num_classes))
    tick_labels = [class_id + 1 for class_id in class_ids]

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# %%
# Main Execution
if __name__ == "__main__":
    # Load data (features come back already standardized)
    X_train, X_test, y_train, y_test, scaler = load_and_preprocess_data("eeg_data.csv")

    # Create datasets
    train_dataset = EEGDataset(X_train, y_train)
    test_dataset = EEGDataset(X_test, y_test)

    # Create dataloaders (only training batches are shuffled)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Initialize model
    model = EEGClassifier()

    # Train
    train_losses, test_accuracies = train_model(model, train_loader, test_loader)

    # Plot metrics
    plot_metrics(train_losses, test_accuracies)

    # Final evaluation: restore the best checkpoint written during training
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()

    test_preds = []
    test_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.numpy())
            test_labels.extend(labels.numpy())

    print("\nFinal Evaluation:")
    # Pass the label set explicitly: otherwise classification_report infers
    # the classes from the data and raises when a class is missing, because
    # the NUM_CLASSES target names no longer line up.
    print(classification_report(
        test_labels,
        test_preds,
        labels=list(range(NUM_CLASSES)),
        target_names=[str(i+1) for i in range(NUM_CLASSES)]
    ))
    plot_confusion_matrix(test_labels, test_preds)

    # Save artifacts: best weights plus the scaler statistics
    torch.save(model.state_dict(), 'eeg_classifier.pth')
    np.save('scaler_mean.npy', scaler.mean_)
    np.save('scaler_scale.npy', scaler.scale_)

# %%
# Sample Prediction
def predict_sample(model, scaler, sample):
    """Predict the 1-based class label for a single raw feature vector.

    ``sample`` is wrapped into a one-row batch, passed through
    ``scaler.transform``, and fed to ``model`` in eval mode without gradient
    tracking. The argmax index is shifted by one before being returned.
    """
    model.eval()
    scaled_batch = scaler.transform([sample])
    batch_tensor = torch.tensor(scaled_batch, dtype=torch.float32)
    with torch.no_grad():
        logits = model(batch_tensor)
    predicted_index = torch.argmax(logits).item()
    return predicted_index + 1  # Convert back to 1-5 labels

# Example usage with test data
# X_test was already standardized by load_and_preprocess_data, and
# predict_sample applies scaler.transform itself. Undo the scaling first so
# the sample is standardized exactly once.
sample = scaler.inverse_transform(X_test[:1])[0]
print(f"\nSample Prediction: {predict_sample(model, scaler, sample)}")
print(f"True Label: {y_test[0]+1}")