In [None]:
import json
import os
import random

import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer, SFTConfig


# Pull both IMDB splits in one pass: the first result is the training split,
# the second is the held-out test split.
training_dataset, testing_dataset = (
    load_dataset('stanfordnlp/imdb', split=split_name)
    for split_name in ('train', 'test')
)

In [4]:
# Supervised fine-tuning (SFT) of GPT-2 Large on the IMDB training split.
# Checkpoints are written to TRAINING_OUTPUT_DIR every 500 steps and the final
# model is saved to `local_save_path`.

# NOTE(review): this is a plain Python variable, not the WANDB_NOTEBOOK_NAME
# environment variable, and it is forwarded below as `project=`. Confirm which
# of the two was intended.
WANDB_NOTEBOOK_NAME = 'hw5/Direct-Preference-Optimization'
TRAINING_OUTPUT_DIR = "./my_training_output"

model = AutoModelForCausalLM.from_pretrained('openai-community/gpt2-large')


training_args = SFTConfig(
    output_dir=TRAINING_OUTPUT_DIR,
    save_strategy='steps',
    save_steps=500,
    report_to='wandb',
    project=WANDB_NOTEBOOK_NAME,
    num_train_epochs=1.0,
)

# Default Learning Rate of 2e-05

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
)

# Resume only when a checkpoint actually exists. Passing
# `resume_from_checkpoint=True` unconditionally fails on a fresh run because
# there is nothing in the output directory to resume from.
last_checkpoint = (
    get_last_checkpoint(TRAINING_OUTPUT_DIR)
    if os.path.isdir(TRAINING_OUTPUT_DIR)
    else None
)
if last_checkpoint is None:
    print("No checkpoint found; training from scratch.")
else:
    print(f"Resuming from checkpoint: {last_checkpoint}")

print("Starting training...")
trainer.train(resume_from_checkpoint=last_checkpoint)
print("Training complete.")

local_save_path = './fine-tuned-gpt2-large'
print(f"Saving model to {local_save_path}...")
trainer.save_model(local_save_path)
print("Model saved.")

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training...


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
3010,3.1431
3020,3.1015
3030,3.1535
3040,3.1113
3050,3.1932
3060,3.2107
3070,3.1273
3080,3.194
3090,3.1146
3100,3.0461


Training complete.
Saving model to ./fine-tuned-gpt2-large...
Model saved.


In [6]:
# Fine-tuned policy: the model and its tokenizer are both read from the
# directory the training cell saved to.
our_model, our_tokenizer = (
    auto_class.from_pretrained('./fine-tuned-gpt2-large')
    for auto_class in (AutoModelForCausalLM, AutoTokenizer)
)


# Sentiment classifier and its matching tokenizer, both loaded from the same
# hub checkpoint.
sentiment_tokenizer, sentiment_model = (
    auto_class.from_pretrained("siebert/sentiment-roberta-large-english")
    for auto_class in (AutoTokenizer, AutoModelForSequenceClassification)
)

print("Models loaded.")

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Models loaded.


In [None]:
# Build short prompts from IMDB review prefixes, sample several completions per
# prompt from the fine-tuned model, and write the results to OUTPUT_FILE as a
# JSON list of {"prompt": str, "generations": [str, ...]} records.
NUM_PROMPTS = 1000
NUM_SAMPLES_PER_PROMPT = 4
OUTPUT_FILE = "generations.json"
SEED = 42

generate_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "temperature": 0.2,
}
# ---------------------

# Seed both RNGs used below (prefix length choice and token sampling) so a
# re-run of this cell draws the same prompts and samples.
random.seed(SEED)
torch.manual_seed(SEED)

print("Loading dataset...")
prefix_dataset = load_dataset('stanfordnlp/imdb', split='train')

# The IMDB train split is stored grouped by label, so taking the first
# NUM_PROMPTS rows would draw every prompt from a single sentiment class.
# Shuffle (seeded) before selecting to get a mix of both.
num_prompts = min(NUM_PROMPTS, len(prefix_dataset))
prompt_source = prefix_dataset.shuffle(seed=SEED).select(range(num_prompts))

prompt_data = []
print("Tokenizing prompts...")
for data in prompt_source:
    prefix_length = random.randint(2, 8)
    prompt_text = " ".join(data['text'].split()[:prefix_length])

    # An empty prompt would tokenize to zero input ids, which generate()
    # cannot continue from.
    if not prompt_text:
        continue

    tokens = our_tokenizer(prompt_text, return_tensors="pt")

    prompt_data.append({
        "prompt_text": prompt_text,
        "tokenized_inputs": tokens,
        "original_label": data['label']
    })

generation_params = generate_kwargs.copy()
generation_params['num_return_sequences'] = NUM_SAMPLES_PER_PROMPT

if "pad_token_id" not in generation_params:
    generation_params['pad_token_id'] = our_tokenizer.eos_token_id

# GPT-2 uses learned absolute position embeddings, so prompt length plus the
# number of generated tokens must stay within the model's context window.
max_positions = getattr(our_model.config, "max_position_embeddings", None)

json_output_data = []

try:
    print("Generating samples...")
    for data in tqdm(prompt_data):
        prompt_text = data["prompt_text"]

        try:
            # Keep inputs on the same device as the model (a no-op on CPU).
            inputs = data["tokenized_inputs"].to(our_model.device)
            input_length = inputs["input_ids"].shape[1]

            call_params = dict(generation_params)
            if max_positions is not None:
                token_budget = max_positions - input_length
                if token_budget <= 0:
                    print(f"Skipping prompt that fills the context window: '{prompt_text}'")
                    continue
                call_params["max_new_tokens"] = min(
                    call_params["max_new_tokens"], token_budget
                )

            generated_sequences = our_model.generate(
                **inputs,
                **call_params
            )

            # Drop the prompt tokens so only the newly generated text is kept.
            continuations = generated_sequences[:, input_length:]
            decoded_samples = [
                text.strip()
                for text in our_tokenizer.batch_decode(
                    continuations,
                    skip_special_tokens=True
                )
            ]

            json_output_data.append({
                "prompt": prompt_text,
                "generations": decoded_samples,
            })

        except Exception as e:
            # Best-effort: report the failing prompt and keep going.
            print(f"Error generating for prompt: '{prompt_text}'. Error: {e}")

except KeyboardInterrupt:
    print("\n--- Generation interrupted by user ---")

finally:
    # Always persist whatever was produced, even after an interrupt.
    print("\n--- Generation Complete or Interrupted ---")
    print(f"Total prompts processed: {len(json_output_data)}")

    if not json_output_data:
        print("No results to save.")
    else:
        print(f"Saving {len(json_output_data)} results to {OUTPUT_FILE}...")
        try:
            with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
                json.dump(json_output_data, f, indent=4)
            print("Successfully saved to JSON.")

            print("\nExample of first item saved:")
            print(json.dumps(json_output_data[0], indent=2))

        except Exception as e:
            print(f"Error saving to JSON file: {e}")



Loading dataset...
Tokenizing prompts...


In [None]:
# Score every sampled completion with the sentiment classifier.
# `scores[i]` is the classifier's probability of the positive class for
# `generation_data_for_sentiment[i]`.

# Upper bound on classifier input length; longer completions are truncated.
# NOTE(review): 512 is assumed to be the RoBERTa checkpoint's input limit;
# confirm against the tokenizer/model config.
SENTIMENT_MAX_TOKENS = 512

# Read the completions back from the file written by the generation cell, so
# this cell does not rely on a variable left over in the kernel.
with open(OUTPUT_FILE, encoding='utf-8') as f:
    saved_generations = json.load(f)

generation_data_for_sentiment = [
    completion
    for record in saved_generations
    for completion in record["generations"]
]

# Look up the positive-class index from the model config instead of
# hard-coding it. The fallback of 1 is an assumption about this checkpoint's
# label order; verify if the lookup misses.
positive_index = sentiment_model.config.label2id.get("POSITIVE", 1)

scores = []

for completion in tqdm(generation_data_for_sentiment):
    inputs = sentiment_tokenizer(
        completion,
        return_tensors="pt",
        truncation=True,
        max_length=SENTIMENT_MAX_TOKENS,
    ).to(sentiment_model.device)

    # A sequence-classification model is scored with a forward pass, not
    # generate(); no gradients are needed for inference.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    scores.append(probabilities[0, positive_index].item())
