In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, pipeline, AutoModelForQuestionAnswering
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Finetune Llama 3.2 with 1B params

load in the base model

In [2]:
quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
)

In [3]:
checkpoint = "meta-llama/Llama-3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"

left_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

right_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

left_model = get_peft_model(left_model, peft_config)
right_model = get_peft_model(right_model, peft_config)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

load in the combined dataset and convert it to text

In [4]:
left_dataset = pd.read_csv("data/combined_left.csv")
right_dataset = pd.read_csv("data/combined_right.csv")

left_dataset = left_dataset[["text", "topic"]]
right_dataset = right_dataset[["text", "topic"]]

# Write the left dataset to left.txt
with open("data/left.txt", "w") as f:
    for _, row in left_dataset.iterrows():
        out = f"Here is an opinion on {row['topic']}: {row['text']}{tokenizer.eos_token}\n"
        f.write(out)

# Write the right dataset to right.txt
with open("data/right.txt", "w") as f:
    for _, row in right_dataset.iterrows():
        out = f"Here is an opinion on {row['topic']}: {row['text']}{tokenizer.eos_token}\n"
        f.write(out)

In [5]:
left_dataset = load_dataset("text", data_files="data/left.txt")
right_dataset = load_dataset("text", data_files="data/right.txt")

left_dataset = left_dataset["train"]
right_dataset = right_dataset["train"]

left_dataset = left_dataset.train_test_split(test_size=0.05)
right_dataset = right_dataset.train_test_split(test_size=0.05)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Fine-tune the model on the left and right context

In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


left_dataset = left_dataset.map(tokenize, batched=True, batch_size=4)
right_dataset = right_dataset.map(tokenize, batched=True, batch_size=4)

Map:   0%|          | 0/1357 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/1356 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

In [7]:
LR = 5e-5
EPOCHS = 3
BATCH_SIZE = 2
WEIGHT_DECAY = 0.01


left_args = TrainingArguments(
    output_dir="models/Llama-3.2-1B-left",
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
)

right_args = TrainingArguments(
    output_dir="models/Llama-3.2-1B-right",
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
)

In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

left_trainer = Trainer(
    model=left_model,
    args=left_args,
    data_collator=data_collator,
    train_dataset=left_dataset["train"],
    eval_dataset=left_dataset["test"]
)

right_trainer = Trainer(
    model=right_model,
    args=right_args,
    data_collator=data_collator,
    train_dataset=right_dataset["train"],
    eval_dataset=right_dataset["test"]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:
left_trainer.train()
left_model.save_pretrained("models/Llama-3.2-1B-left")
tokenizer.save_pretrained("models/Llama-3.2-1B-left")

  0%|          | 0/2037 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 2.8361, 'grad_norm': 4.8060994148254395, 'learning_rate': 3.777614138438881e-05, 'epoch': 0.74}


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.4067, 'eval_samples_per_second': 51.183, 'eval_steps_per_second': 25.591, 'epoch': 1.0}
{'loss': 2.6364, 'grad_norm': 4.6162028312683105, 'learning_rate': 2.5503190967108493e-05, 'epoch': 1.47}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.4466, 'eval_samples_per_second': 49.773, 'eval_steps_per_second': 24.886, 'epoch': 2.0}
{'loss': 2.5694, 'grad_norm': 7.056900501251221, 'learning_rate': 1.323024054982818e-05, 'epoch': 2.21}
{'loss': 2.5247, 'grad_norm': 9.473525047302246, 'learning_rate': 9.572901325478646e-07, 'epoch': 2.95}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.5022, 'eval_samples_per_second': 47.929, 'eval_steps_per_second': 23.965, 'epoch': 3.0}
{'train_runtime': 178.671, 'train_samples_per_second': 22.785, 'train_steps_per_second': 11.401, 'train_loss': 2.637256024922, 'epoch': 3.0}


('models/Llama-3.2-1B-left\\tokenizer_config.json',
 'models/Llama-3.2-1B-left\\special_tokens_map.json',
 'models/Llama-3.2-1B-left\\tokenizer.json')

In [10]:
right_trainer.train()
right_model.save_pretrained("models/Llama-3.2-1B-right")
tokenizer.save_pretrained("models/Llama-3.2-1B-right")

  0%|          | 0/2034 [00:00<?, ?it/s]

{'loss': 2.8902, 'grad_norm': 6.436295986175537, 'learning_rate': 3.775811209439528e-05, 'epoch': 0.74}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.5481, 'eval_samples_per_second': 46.508, 'eval_steps_per_second': 23.254, 'epoch': 1.0}
{'loss': 2.6813, 'grad_norm': 5.033986568450928, 'learning_rate': 2.5467059980334317e-05, 'epoch': 1.47}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.4857, 'eval_samples_per_second': 48.461, 'eval_steps_per_second': 24.231, 'epoch': 2.0}
{'loss': 2.6224, 'grad_norm': 6.604016304016113, 'learning_rate': 1.3176007866273355e-05, 'epoch': 2.21}
{'loss': 2.5552, 'grad_norm': 7.292864799499512, 'learning_rate': 8.849557522123894e-07, 'epoch': 2.95}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_runtime': 1.4118, 'eval_samples_per_second': 51.0, 'eval_steps_per_second': 25.5, 'epoch': 3.0}
{'train_runtime': 182.2083, 'train_samples_per_second': 22.326, 'train_steps_per_second': 11.163, 'train_loss': 2.6833551457497924, 'epoch': 3.0}


('models/Llama-3.2-1B-right\\tokenizer_config.json',
 'models/Llama-3.2-1B-right\\special_tokens_map.json',
 'models/Llama-3.2-1B-right\\tokenizer.json')

In [11]:
left_pipeline = pipeline("text-generation", model="models/Llama-3.2-1B-left-text", tokenizer="models/Llama-3.2-1B-left-text", device=device)
right_pipeline = pipeline("text-generation", model="models/Llama-3.2-1B-right-text", tokenizer="models/Llama-3.2-1B-right-text", device=device)

In [17]:
topic = "gun control"
messages = ["System: You are a politician at a political debate trying to convince the audience that your stance on issues is the best one.", f"User: What is your stance on {topic}?"]

right_answer = right_pipeline(messages, temperature=0.7, max_length=512, truncation=True)
left_answer = left_pipeline(messages, temperature=0.7, max_length=512, truncation=True)

print(right_answer[0]["generated_text"])

print()

print(left_answer[0]["generated_text"])

TypeError: list indices must be integers or slices, not str

In [25]:
print(right_answer[0][0]["generated_text"])
print()

print(left_answer[0][0]["generated_text"])

System: You are a politician at a political debate trying to convince the audience that your stance on issues is the best one. The audience is comprised of various characters. Your goal is to convince the audience of your position on the issue. You have 10 minutes to convince the audience.
Scenarios: The audience is split into two groups. You have to convince the first group and then the second group. You will be given 10 minutes to do so. You have to convince the audience by making your case.
The first group is made up of a group of 3-5 people, and the second group is made up of 3-5 people. Each group is given a different issue to discuss. You have to convince the audience of your stance on the issue.
You will be given a list of 10 questions to answer, and you will be asked to answer them in a way that is convincing to the audience. You will be given 10 minutes to answer each question.
You have 10 minutes to answer the questions. You have to convince the audience of your position on t