In [1]:
!pip install datasets pandas torch transformers peft optimum auto-gptq bitsandbytes huggingface_hub



In [2]:
!pip install pandas



In [None]:
from datasets import Dataset
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, GPTQConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import os
from auto_gptq import exllama_set_max_input_length

# Initialize project settings
project_name = "Phind-CodeLlama-34B-v2-JUCE-GPTQ"
model_id = "TheBloke/Phind-CodeLlama-34B-v2-GPTQ"
data_path = "data/trainer.csv"
model_max_length = 2048
repo_id = "Phind-CodeLlama-34B-v2-JUCE-GPTQ"

# Load dataset
print("Loading Dataset")
train_df = pd.read_csv(data_path)
dataset = Dataset.from_pandas(train_df)

# Check if the dataset is empty
print("Number of samples in dataset: ", len(dataset))
if len(dataset) == 0:
    print("The dataset is empty. Exiting.")
    exit()

# Check the first sample in the dataset to see what keys it contains
print("First sample keys: ", list(dataset[0].keys()))

print("Loading Tokenizer")
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Remove the quantization_config if the model is already pre-quantized
# and you don't want to overwrite its settings
print("Loading model")

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
print("Model device: ", next(model.parameters()).device)

# Ensure the model is loaded onto the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the desired device

model.config.use_cache = False
model.config.pretraining_tp = 1

model.gradient_checkpointing_enable()
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model = exllama_set_max_input_length(model, 4096)

print("Model device3: ", next(model.parameters()).device)

# Set up LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Attach LoRA adapter to the model
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir=f"{project_name}_outputs",
    per_device_train_batch_size=2,  # Already at minimum
    gradient_accumulation_steps=4,  # Increased from 4
    warmup_steps=2,
    max_steps=300,  # Consider reducing for less GPU time, but could affect model quality
    learning_rate=2e-4,
    fp16=True,  # Already set for reduced memory usage
    logging_steps=1,
    save_strategy="epoch",
    optim="adamw_hf",
    push_to_hub=True,
    hub_model_id="ohshutit/Phind-CodeLlama-34B-v2-JUCE-GPTQ",
    hub_token="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",
    hub_private_repo=True,
    remove_unused_columns=False
)

# Function to tokenize the text and create labels
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=model_max_length, return_tensors="pt")
    tokenized["labels"] = tokenized["input_ids"].detach().clone()
    return tokenized

# Tokenize the dataset and create labels
print("Tokenizing dataset and creating labels")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Display the keys in the tokenized dataset
print("First sample keys in tokenized dataset: ", list(tokenized_dataset[0].keys()))

# Initialize Trainer with the tokenized dataset
print("Initializing Trainer")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Note the change here
)

print("Starting training")
trainer.train()

Loading Dataset
Number of samples in dataset:  865
First sample keys:  ['text']
Loading Tokenizer
Loading model
Model device:  cuda:0
Model device3:  cuda:0
Tokenizing dataset and creating labels


Map:   0%|          | 0/865 [00:00<?, ? examples/s]

First sample keys in tokenized dataset:  ['text', 'input_ids', 'attention_mask', 'labels']
Initializing Trainer
Starting training




Step,Training Loss
1,7.0661
