In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, pipeline, AutoModelForQuestionAnswering
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Finetune Llama 3.2 with 1B params

load in the base model

In [2]:
quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="text-generation"
)

In [3]:
checkpoint = "meta-llama/Llama-3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"

left_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

right_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

left_model = get_peft_model(left_model, peft_config)
right_model = get_peft_model(right_model, peft_config)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

load in the combined dataset and convert it to text

In [4]:
left_dataset = pd.read_csv("data/combined_left.csv")
right_dataset = pd.read_csv("data/combined_right.csv")

left_dataset = left_dataset[["text", "topic"]]
right_dataset = right_dataset[["text", "topic"]]

# Write the left dataset to left.txt
with open("data/left.txt", "w") as f:
    for _, row in left_dataset.iterrows():
        out = f"Topic: {row['topic']}\nOpinion:{row['text']}{tokenizer.eos_token}\n"
        f.write(out)

# Write the right dataset to right.txt
with open("data/right.txt", "w") as f:
    for _, row in right_dataset.iterrows():
        out = f"Topic: {row['topic']}\nOpinion:{row['text']}{tokenizer.eos_token}\n"
        f.write(out)

In [5]:
left_dataset = load_dataset("text", data_files="data/left.txt")
right_dataset = load_dataset("text", data_files="data/right.txt")

left_dataset = left_dataset["train"]
right_dataset = right_dataset["train"]

left_dataset = left_dataset.train_test_split(test_size=0.05)
right_dataset = right_dataset.train_test_split(test_size=0.05)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Fine-tune the model on the left and right context

In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)


left_dataset = left_dataset.map(tokenize, batched=True, batch_size=4)
right_dataset = right_dataset.map(tokenize, batched=True, batch_size=4)

Map:   0%|          | 0/2712 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/2712 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

In [7]:
LR = 5e-5
EPOCHS = 3
BATCH_SIZE = 2
WEIGHT_DECAY = 0.01


left_args = TrainingArguments(
    output_dir="models/Llama-3.2-1B-left",
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
)

right_args = TrainingArguments(
    output_dir="models/Llama-3.2-1B-right",
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    fp16=True,
)

In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

left_trainer = Trainer(
    model=left_model,
    args=left_args,
    data_collator=data_collator,
    train_dataset=left_dataset["train"],
    eval_dataset=left_dataset["test"]
)

right_trainer = Trainer(
    model=right_model,
    args=right_args,
    data_collator=data_collator,
    train_dataset=right_dataset["train"],
    eval_dataset=right_dataset["test"]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:
left_trainer.train()
left_model.save_pretrained("models/Llama-3.2-1B-left")
tokenizer.save_pretrained("models/Llama-3.2-1B-left")

  0%|          | 0/4068 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 3.3331, 'grad_norm': 3.108712673187256, 'learning_rate': 4.390363815142576e-05, 'epoch': 0.37}
{'loss': 3.0623, 'grad_norm': 6.408111095428467, 'learning_rate': 3.775811209439528e-05, 'epoch': 0.74}


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.656, 'eval_samples_per_second': 53.841, 'eval_steps_per_second': 27.109, 'epoch': 1.0}
{'loss': 2.9751, 'grad_norm': 5.940159797668457, 'learning_rate': 3.16125860373648e-05, 'epoch': 1.11}
{'loss': 2.9572, 'grad_norm': 4.8212480545043945, 'learning_rate': 2.5467059980334317e-05, 'epoch': 1.47}
{'loss': 2.8934, 'grad_norm': 9.45226764678955, 'learning_rate': 1.9321533923303837e-05, 'epoch': 1.84}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.9673, 'eval_samples_per_second': 48.193, 'eval_steps_per_second': 24.265, 'epoch': 2.0}
{'loss': 2.8219, 'grad_norm': 12.48244857788086, 'learning_rate': 1.3176007866273355e-05, 'epoch': 2.21}
{'loss': 2.8164, 'grad_norm': 17.162700653076172, 'learning_rate': 7.030481809242871e-06, 'epoch': 2.58}
{'loss': 2.8396, 'grad_norm': 8.87635326385498, 'learning_rate': 8.849557522123894e-07, 'epoch': 2.95}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.7293, 'eval_samples_per_second': 52.394, 'eval_steps_per_second': 26.38, 'epoch': 3.0}
{'train_runtime': 356.4631, 'train_samples_per_second': 22.824, 'train_steps_per_second': 11.412, 'train_loss': 2.9590589697733742, 'epoch': 3.0}


('models/Llama-3.2-1B-left\\tokenizer_config.json',
 'models/Llama-3.2-1B-left\\special_tokens_map.json',
 'models/Llama-3.2-1B-left\\tokenizer.json')

In [10]:
right_trainer.train()
right_model.save_pretrained("models/Llama-3.2-1B-right")
tokenizer.save_pretrained("models/Llama-3.2-1B-right")

  0%|          | 0/4068 [00:00<?, ?it/s]

{'loss': 3.3746, 'grad_norm': 3.878298759460449, 'learning_rate': 4.390363815142576e-05, 'epoch': 0.37}
{'loss': 3.1284, 'grad_norm': 7.1884589195251465, 'learning_rate': 3.775811209439528e-05, 'epoch': 0.74}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.8709, 'eval_samples_per_second': 49.81, 'eval_steps_per_second': 25.079, 'epoch': 1.0}
{'loss': 3.0637, 'grad_norm': 5.235858917236328, 'learning_rate': 3.16125860373648e-05, 'epoch': 1.11}
{'loss': 3.0015, 'grad_norm': 9.326213836669922, 'learning_rate': 2.5467059980334317e-05, 'epoch': 1.47}
{'loss': 2.936, 'grad_norm': 9.428650856018066, 'learning_rate': 1.9321533923303837e-05, 'epoch': 1.84}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.8776, 'eval_samples_per_second': 49.693, 'eval_steps_per_second': 25.02, 'epoch': 2.0}
{'loss': 2.9068, 'grad_norm': 21.05608558654785, 'learning_rate': 1.3176007866273355e-05, 'epoch': 2.21}
{'loss': 2.8433, 'grad_norm': 12.307822227478027, 'learning_rate': 7.030481809242871e-06, 'epoch': 2.58}
{'loss': 2.8718, 'grad_norm': 19.221954345703125, 'learning_rate': 8.849557522123894e-07, 'epoch': 2.95}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_runtime': 2.7879, 'eval_samples_per_second': 51.292, 'eval_steps_per_second': 25.825, 'epoch': 3.0}
{'train_runtime': 355.4977, 'train_samples_per_second': 22.886, 'train_steps_per_second': 11.443, 'train_loss': 3.0113454910623405, 'epoch': 3.0}


('models/Llama-3.2-1B-right\\tokenizer_config.json',
 'models/Llama-3.2-1B-right\\special_tokens_map.json',
 'models/Llama-3.2-1B-right\\tokenizer.json')

In [11]:
left_model.push_to_hub("isaacberlin/Llama-3.2-Leftleaning")

adapter_model.safetensors:   0%|          | 0.00/6.82M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/isaacberlin/Llama-3.2-Leftleaning/commit/b0b005e3d75ac37d659e9c10ae09606534432fc1', commit_message='Upload model', commit_description='', oid='b0b005e3d75ac37d659e9c10ae09606534432fc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/isaacberlin/Llama-3.2-Leftleaning', endpoint='https://huggingface.co', repo_type='model', repo_id='isaacberlin/Llama-3.2-Leftleaning'), pr_revision=None, pr_num=None)

In [12]:
right_model.push_to_hub("isaacberlin/Llama-3.2-Rightleaning")

README.md:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.82M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/isaacberlin/Llama-3.2-Rightleaning/commit/6b77fedd3ab22eb6f07231c3a979bca7a453d9af', commit_message='Upload model', commit_description='', oid='6b77fedd3ab22eb6f07231c3a979bca7a453d9af', pr_url=None, repo_url=RepoUrl('https://huggingface.co/isaacberlin/Llama-3.2-Rightleaning', endpoint='https://huggingface.co', repo_type='model', repo_id='isaacberlin/Llama-3.2-Rightleaning'), pr_revision=None, pr_num=None)