In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json
from torch.optim import SGD

def exact_match_accuracy(generated_completions, target_completions, prompts):
    """Return the fraction of generations whose continuation begins with the target.

    Each generated string has its first ``len(prompt)`` characters sliced off;
    what remains counts as a hit when, after stripping surrounding whitespace
    and lower-casing both sides, it starts with the target completion.

    Args:
        generated_completions: Generated strings (prompt followed by continuation).
        target_completions: Expected completions, aligned with the generations.
        prompts: Prompts, aligned with the generations.

    Returns:
        Hits divided by ``len(generated_completions)``, or ``0.0`` when that
        list is empty.
    """
    triples = zip(generated_completions, target_completions, prompts)
    hits = sum(
        1
        for text, expected, prefix in triples
        # Slice by prompt length (not by matching the prompt text) to isolate the continuation.
        if text[len(prefix):].strip().lower().startswith(expected.strip().lower())
    )

    total = len(generated_completions)
    if total > 0:
        return hits / total
    return 0.0


def evaluate(model, tokenizer, test_data, max_new_tokens=64):
    """Generate a completion for every test prompt and score the results.

    The model is switched to eval mode (and left there). Prompts are tokenized
    and passed to ``model.generate`` one at a time, so no padding is involved.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        test_data: Iterable of dicts with ``'prompt'`` and ``'completion'`` keys.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        A tuple ``(generated_completions, accuracy)``: the decoded output
        sequences (special tokens skipped) in input order, and the value
        returned by ``exact_match_accuracy`` for them.
    """
    model.eval()

    # Materialize once so a one-shot iterable is not consumed by the first pass.
    examples = list(test_data)
    test_prompts = [item['prompt'] for item in examples]
    test_completions = [item['completion'] for item in examples]

    generated_completions = []

    for prompt in test_prompts:
        # Tokenize each prompt separately without padding
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                # Bound only the generated part: `max_length` also counts the
                # prompt tokens, which leaves long prompts with no room for a
                # completion.
                max_new_tokens=max_new_tokens,
            )

        # output[0] is the single returned sequence for this one-prompt batch.
        generated_completions.append(tokenizer.decode(output[0], skip_special_tokens=True))

    # Score by whether the text after each prompt starts with the target completion.
    accuracy = exact_match_accuracy(generated_completions, test_completions, test_prompts)

    return generated_completions, accuracy

########################### main code 

# Load the collection of datasets; the training loop below reads the 'train'
# and 'test' entries of each element. JSON is read as UTF-8 explicitly so the
# result does not depend on the platform's default encoding.
with open("d2p_each_dataset.json", 'r', encoding="utf-8") as f:
    dataset = json.load(f)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained tokenizer and causal LM, and move the model to the device.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed because this tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

# Append results to a JSON Lines file so that repeated runs accumulate in one log.
with open("evaluation_results.jsonl", 'a') as jsonl_file:

    # Test splits of every dataset trained on so far, keyed by dataset index.
    test_datasets = {}

    # The same `model` object is fine-tuned on each dataset in turn (it is never
    # reloaded), and after each round it is evaluated on all test splits seen so far.
    for idx, individual_data in enumerate(dataset):
        print(f"Training on dataset {idx + 1}")

        # Prepare the train and test data
        train_data = individual_data['train']
        test_data = individual_data['test']
        test_datasets[idx] = test_data

        # Build a Hugging Face Dataset with a single 'text' column holding
        # prompt and completion concatenated.
        train_dataset = Dataset.from_dict({
            'text': [item['prompt'] + item['completion'] for item in train_data]
        })
        print(train_dataset)
        print(train_dataset[0])

        training_args = SFTConfig(
            output_dir="/tmp",
            num_train_epochs=3,                      # Number of epochs
            per_device_train_batch_size=30,           # Batch size per device
            max_seq_length=128,
            logging_steps=100,
        )

        # A fresh plain-SGD optimizer per dataset, learning rate 0.05.
        optimizer = SGD(model.parameters(), lr=5e-2)

        # Only the optimizer is supplied; the scheduler slot is None.
        # NOTE(review): the Trainer may build its default LR scheduler when
        # none is given - confirm if a constant learning rate is intended.
        trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
            args=training_args,  # Pass the training config
            optimizers=(optimizer, None)
        )

        # Train the model using the SFTTrainer
        trainer.train()

        #### Evaluate performance on all test datasets seen so far after training on this one
        # The loop variable has its own name so it does not shadow `test_data` above.
        for test_data_idx, seen_test_data in test_datasets.items():
            generated_completions, accuracy = evaluate(model, tokenizer, seen_test_data)

            # Prepare the result to be written as a JSON object
            result = {
                'train_data_idx': idx,
                'test_data_idx': test_data_idx,
                'accuracy': accuracy
            }

            # Write the result as a new line and flush immediately, so a crash
            # during a later training round does not lose buffered results.
            jsonl_file.write(json.dumps(result) + '\n')
            jsonl_file.flush()

            print(f"Evaluating on Test Dataset {test_data_idx + 1} after training on Dataset {idx + 1}")
            print(f"Exact Match Accuracy: {accuracy * 100:.2f}%")

        # Release cached GPU memory after each dataset
        torch.cuda.empty_cache()



In [None]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from torch.utils.data import DataLoader
from transformers import default_data_collator
import json
from torch.optim import SGD
# Select the compute device: CUDA when PyTorch reports it available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained tokenizer and model by name.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Bare expression as the last line of a notebook cell: presumably here so the
# notebook displays the loaded model in the cell output (it has no effect when
# run as a plain script).
model