In [None]:
import json

# Load the train and test datasets
def load_data(file_path):
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]

def restructure_dataset(train_data, test_data, num_train=30, num_test=10):
    dataset = []
    # Number of individuals
    num_individuals = len(train_data) // num_train
    
    for i in range(num_individuals):
        start_train = i * num_train
        start_test = i * num_test
        
        individual_data = {
            "train": train_data[start_train:start_train + num_train],
            "test": test_data[start_test:start_test + num_test]
        }
        dataset.append(individual_data)
    return dataset

def save_dataset(dataset, output_file):
    with open(output_file, 'w') as file:
        json.dump(dataset, file, indent=2)

# Paths to input and output files
train_file = "./reversal_curse/data/reverse_experiments/june_version_7921032488/d2p_prompts_train.jsonl"
test_file = "./reversal_curse/data/reverse_experiments/june_version_7921032488/d2p_prompts_test.jsonl"
output_file = "d2p_each_dataset.json"

# Process the data
train_data = load_data(train_file)
test_data = load_data(test_file)

dataset = restructure_dataset(train_data, test_data)
# Save the restructured dataset
save_dataset(dataset, output_file)

print(f"Restructured dataset saved to {output_file}")

In [None]:
import json

# Load the train and test datasets
def load_data(file_path):
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]

def restructure_dataset(train_data, test_data, num_train=900, num_test=300):
    dataset = []
    # Number of individuals
    num_individuals = len(train_data) // num_train
    
    for i in range(num_individuals):
        start_train = i * num_train
        start_test = i * num_test
        
        individual_data = {
            "train": train_data[start_train:start_train + num_train],
            "test": test_data[start_test:start_test + num_test]
        }
        dataset.append(individual_data)
    return dataset

def save_dataset(dataset, output_file):
    with open(output_file, 'w') as file:
        json.dump(dataset, file, indent=2)

# Paths to input and output files
train_file = "./reversal_curse/data/reverse_experiments/june_version_7921032488/d2p_prompts_train.jsonl"
test_file = "./reversal_curse/data/reverse_experiments/june_version_7921032488/d2p_prompts_test.jsonl"
output_file = "d2p_whole_dataset.json"

# Process the data
train_data = load_data(train_file)
test_data = load_data(test_file)

dataset = restructure_dataset(train_data, test_data)
# Save the restructured dataset
save_dataset(dataset, output_file)

print(f"Restructured dataset saved to {output_file}")

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from tqdm import tqdm
import json
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json


def evaluate(model, tokenizer, test_data):
    model.eval()
    test_prompts = [item['prompt'] for item in test_data]
    test_completions = [item['completion'] for item in test_data]

    # Tokenize the test prompts
    inputs = tokenizer(test_prompts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)

    # Generate completions
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs['input_ids'], max_length=64, num_return_sequences=1)
    
    generated_completions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Check if the generated completions contain the corresponding completion
    correct = 0
    total = len(test_completions)
    results = []  # To store the generated completions along with their accuracy status

    for generated, target_completion in zip(generated_completions, test_completions):
        # Check if the target completion is a substring of the generated completion (case-sensitive check)
        contains_completion = target_completion in generated
        if contains_completion:
            correct += 1

        # Store the result for this example
        results.append({
            'generated': generated,
            'target_completion': target_completion,
            'contains_completion': contains_completion
        })

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0.0
    
    # Return the results along with the accuracy
    return results, accuracy



    
def tokenize_data(examples, tokenizer, max_length=128):
    prompts = examples['prompt']
    completions = examples['completion']
    
    # Concatenate prompts and completions into a single string for autoregressive training
    inputs = [prompt + completion for prompt, completion in zip(prompts, completions)]

    # Tokenize inputs
    encodings = tokenizer(inputs, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

    # Prepare labels: Shift the input_ids so that loss is only computed for the completion part
    labels = encodings['input_ids'].clone()
    encodings['labels'] = labels

    # Return the attention_mask along with the tokenized inputs
    return encodings

########################### main code 

with open("d2p_whole_dataset.json", 'r') as f:
    dataset = json.load(f)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model using Auto classes
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token


# Modify your training loop to include attention_mask
for idx, individual_data in enumerate(dataset):
    print(f"Training on dataset {idx + 1}")
    
    # Prepare the train and test data
    train_data = individual_data['train']
    test_data = individual_data['test']

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_dict({
        'prompt': [item['prompt'] for item in train_data],
        'completion': [item['completion'] for item in train_data],
    })

    # Tokenize the training data using `map`
    train_dataset = train_dataset.map(lambda x: tokenize_data(x, tokenizer), batched=True)

    # Create a DataLoader for batching
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=default_data_collator)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=1e-5)  # Learning rate can be adjusted

    # Training loop
    model.train()
    for epoch in range(10):  # You can adjust the number of epochs
        epoch_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}", unit="batch"):
            optimizer.zero_grad()

            # Move batch to device (GPU/CPU)
            batch = {k: v.to(model.device) for k, v in batch.items()}

            # Forward pass (including attention_mask)
            outputs = model(input_ids=batch['input_ids'], labels=batch['labels'], attention_mask=batch['attention_mask'])
            loss = outputs.loss  # Use the model's built-in loss calculation

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1} - Loss: {epoch_loss / len(train_loader)}")

    # After training, evaluate and print results
    results, accuracy = evaluate(model, tokenizer, test_data)

    # Print the individual results
    for result in results:
        print(f"Generated: {result['generated']}")
        print(f"Target Completion: {result['target_completion']}")
        print(f"Contains Completion: {result['contains_completion']}")
        print("-" * 50)

    # Print the accuracy
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
    # Optionally, save the model after each dataset
    # model.save_pretrained(f"model_after_dataset_{idx + 1}")

    # Clear GPU memory after each dataset
    torch.cuda.empty_cache()


In [None]:
### sft trainer version

In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json

def exact_match_accuracy(generated_completions, target_completions, prompts):
    correct_count = 0 
    for gen, target, prompt in zip(generated_completions, target_completions, prompts):
        # Remove the prompt from the generated completion
        generated_part = gen[len(prompt):]  # Take everything after the prompt

        # Check if the generated part starts with the target completion (case-insensitive)
        if generated_part.strip().lower().startswith(target.strip().lower()):
            correct_count += 1
    
    # Calculate accuracy as the percentage of correct matches
    accuracy = correct_count / len(generated_completions) if len(generated_completions) > 0 else 0.0
    return accuracy


def evaluate(model, tokenizer, test_data):
    model.eval()
    test_prompts = [item['prompt'] for item in test_data]
    test_completions = [item['completion'] for item in test_data]

    generated_completions = []
    
    for prompt in test_prompts:
        # Tokenize each prompt separately without padding
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

        # Generate completion one by one (no padding)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_length=64,
            )

        # Decode the generated completion
        generated_completions.append(tokenizer.decode(output[0], skip_special_tokens=True))

    # Evaluate the accuracy based on whether the generated part starts with the target completion
    accuracy = exact_match_accuracy(generated_completions, test_completions, test_prompts)

    return generated_completions, accuracy
########################### main code 

with open("d2p_whole_dataset.json", 'r') as f:
    dataset = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model using Auto classes
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token

# Modify your training loop to include attention_mask
for idx, individual_data in enumerate(dataset):
    print(f"Training on dataset {idx + 1}")
    
    # Prepare the train and test data
    train_data = individual_data['train']
    test_data = individual_data['test']

    # Convert to Hugging Face Dataset (directly use 'text' field)
    train_dataset = Dataset.from_dict({
        'text': [item['prompt'] + item['completion'] for item in train_data]
    })
    print(train_dataset)
    print(train_dataset[0])
    training_args = SFTConfig(
        output_dir="/tmp",   
        num_train_epochs=10,                      # Number of epochs
        per_device_train_batch_size=16,           # Batch size per device
        learning_rate=5e-3,                       # Learning rate
        max_seq_length=128,
        logging_steps=100,
    )

    # Initialize SFTTrainer with the model, dataset, and config
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        args=training_args,  # Pass the training config
    )

    # Train the model using the SFTTrainer
    trainer.train()
    # After training, evaluate and print results
    generated_completions, accuracy = evaluate(model, tokenizer, test_data)
    
    # Evaluate and print the results
    for example, generated in zip(test_data, generated_completions):
        print(f"Prompt: {example['prompt']}")
        print(f"Generated Completion: {generated}")
        print("-" * 50)
    
    print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")
    
    # Optionally, save the model after each dataset
    # model.save_pretrained(f"model_after_dataset_{idx + 1}")

    # Clear GPU memory after each dataset
    torch.cuda.empty_cache()


In [None]:
import torch
import wandb
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json
from itertools import product

# Initialize WandB
wandb.init(project="engram")

def exact_match_accuracy(generated_completions, target_completions, prompts):
    correct_count = 0 
    for gen, target, prompt in zip(generated_completions, target_completions, prompts):
        # Remove the prompt from the generated completion
        generated_part = gen[len(prompt):]  # Take everything after the prompt

        # Check if the generated part starts with the target completion (case-insensitive)
        if generated_part.strip().lower().startswith(target.strip().lower()):
            correct_count += 1
    
    # Calculate accuracy as the percentage of correct matches
    accuracy = correct_count / len(generated_completions) if len(generated_completions) > 0 else 0.0
    return accuracy


def evaluate(model, tokenizer, test_data):
    model.eval()
    test_prompts = [item['prompt'] for item in test_data]
    test_completions = [item['completion'] for item in test_data]

    generated_completions = []
    
    for prompt in test_prompts:
        # Tokenize each prompt separately without padding
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

        # Generate completion one by one (no padding)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_length=64,
            )

        # Decode the generated completion
        generated_completions.append(tokenizer.decode(output[0], skip_special_tokens=True))

    # Evaluate the accuracy based on whether the generated part starts with the target completion
    accuracy = exact_match_accuracy(generated_completions, test_completions, test_prompts)

    return generated_completions, accuracy

########################### Hyperparameter Search ##########################

# Define hyperparameter grid
learning_rates = [0.001, 0.005, 0.0001, 0.0005, 0.00001]
batch_sizes = [1, 2, 4, 8, 16]

# Load the dataset
with open("d2p_whole_dataset.json", 'r') as f:
    dataset = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token

# Loop over all combinations of learning rates and batch sizes
best_accuracy = 0
best_hyperparameters = {}

for lr, batch_size in product(learning_rates, batch_sizes):
    print(f"Training with learning rate: {lr}, batch size: {batch_size}")

    # Log hyperparameters to WandB
    wandb.config.update({"learning_rate": lr, "batch_size": batch_size})

    for idx, individual_data in enumerate(dataset):
        print(f"\nTraining on dataset {idx + 1}")

        # Prepare the train and test data
        train_data = individual_data['train']
        test_data = individual_data['test']

        # Convert to Hugging Face Dataset (directly use 'text' field)
        train_dataset = Dataset.from_dict({
            'text': [item['prompt'] + item['completion'] for item in train_data]
        })

        # Define training arguments with the current hyperparameters
        training_args = SFTConfig(
            output_dir="/tmp",   
            num_train_epochs=10,                     # Number of epochs
            per_device_train_batch_size=batch_size,  # Batch size per device
            learning_rate=lr,                        # Learning rate
            max_seq_length=128,                      # Maximum sequence length
            lr_scheduler_type='cosine'
        )

        # Initialize SFTTrainer with the model, dataset, and config
        trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
            args=training_args,  # Pass the training config
        )

        # Train the model using the SFTTrainer
        trainer.train()

        # After training, evaluate and print results
        generated_completions, accuracy = evaluate(model, tokenizer, test_data)

        # Log accuracy to WandB
        wandb.log({"exact_match_accuracy": accuracy * 100})

        print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")

        # Save the best performing model based on accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparameters = {
                'learning_rate': lr,
                'batch_size': batch_size,
                'epoch_accuracy': accuracy
            }

        # Clear GPU memory after each dataset
        torch.cuda.empty_cache()

# After hyperparameter search, print the best results
print("\nBest Hyperparameters found:")
print(f"Learning Rate: {best_hyperparameters['learning_rate']}")
print(f"Batch Size: {best_hyperparameters['batch_size']}")
print(f"Best Accuracy: {best_accuracy * 100:.2f}%")

# Log the best results to WandB
wandb.log({
    "best_learning_rate": best_hyperparameters['learning_rate'],
    "best_batch_size": best_hyperparameters['batch_size'],
    "best_accuracy": best_accuracy * 100
})

# Finish the WandB run
wandb.finish()



0,1
exact_match_accuracy,█▇▁▇█
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███▁▂▃▄▅▆▆▇██▂▄▆▇█▄▇█▇█▁
train/global_step,▁▁▂▂▃▃▃▄▅▅▆▆▆▇▇███▁▁▂▂▃▃▄▄▄▄▁▁▂▂▂▁▁▂▂▁▁▁
train/grad_norm,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁
train/learning_rate,▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▂▂▂▂▂▁▁▁▁▂▂▁▁▂▁▁█
train/loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁█

0,1
exact_match_accuracy,86.0
total_flos,1874769877352448.0
train/epoch,0.55556
train/global_step,500.0
train/grad_norm,8.00626
train/learning_rate,0.00472
train/loss,5.0064
train_loss,0.37465
train_runtime,280.6248
train_samples_per_second,32.071


Using device: cuda
Training with learning rate: 0.001, batch size: 1

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 23596.35 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,5.0778
1000,3.0267
1500,2.2007
2000,2.0764
2500,1.8396
3000,1.7996
3500,1.5951
4000,1.3733
4500,1.2574
5000,1.0743


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 82.00%
Training with learning rate: 0.001, batch size: 2

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 25954.14 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,1.1263
1000,1.2014
1500,1.0801
2000,0.9105
2500,0.7486
3000,0.6132
3500,0.5006
4000,0.4387
4500,0.4149


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 80.67%
Training with learning rate: 0.001, batch size: 4

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 26258.53 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.7697
1000,0.7539
1500,0.5925
2000,0.4514


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 80.67%
Training with learning rate: 0.001, batch size: 8

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 23913.40 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.5806
1000,0.4257


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 81.00%
Training with learning rate: 0.001, batch size: 16

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 25140.85 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.4261


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 84.33%
Training with learning rate: 0.005, batch size: 1

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 27348.41 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,4.6606
1000,4.0029
1500,3.2167
2000,2.8212
2500,2.3098
3000,2.1719
3500,1.9564
4000,1.5655
4500,1.3914
5000,1.1996


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 61.00%
Training with learning rate: 0.005, batch size: 2

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 24122.29 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,1.9096
1000,1.8711
1500,1.5606
2000,1.3192
2500,1.0887
3000,0.851
3500,0.7119
4000,0.6142
4500,0.5819


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 72.00%
Training with learning rate: 0.005, batch size: 4

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 24643.38 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,1.5008
1000,1.31
1500,0.9651
2000,0.6927


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 66.33%
Training with learning rate: 0.005, batch size: 8

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 23756.58 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,1.1691
1000,0.7648


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 54.00%
Training with learning rate: 0.005, batch size: 16

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 24820.81 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.8205


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 50.67%
Training with learning rate: 0.0001, batch size: 1

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 25890.59 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.6137
1000,0.6015
1500,0.6118
2000,0.5779
2500,0.5802
3000,0.5649
3500,0.5745
4000,0.5504
4500,0.5552
5000,0.5472


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Exact Match Accuracy: 62.67%
Training with learning rate: 0.0001, batch size: 2

Training on dataset 1


Map: 100%|██████████| 900/900 [00:00<00:00, 25596.53 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.5331
1000,0.5355
1500,0.5229


In [None]:
!pip install wandb

In [1]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from tqdm import tqdm
import json
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json





device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model using Auto classes
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer.pad_token = tokenizer.eos_token



  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):