In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling, BitsAndBytesConfig
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os

### Config

In [None]:
MODEL_NAME = "microsoft/phi-1_5"
OUTPUT_DIR_LEFT = "/app/models/phi-1.5-left"
OUTPUT_DIR_RIGHT = "/app/models/phi-1.5-right"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

### Training Hyperparams

In [12]:
LR = 5e-5
EPOCHS = 3
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
MAX_SEQ_LENGTH = 512
WEIGHT_DECAY = 0.01

In [13]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_use_double_quant_nf4=True
)

Unused kwargs: ['bnb_4bit_use_double_quant_nf4']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [14]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "Wqkv",
        "out_proj",
        "fc1",
        "fc2"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
def load_and_prepare_model():
    """Load and prepare model for training"""
    print(f"Loading {MODEL_NAME}...")
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        quantization_config=quantization_config,
        trust_remote_code=True
    )
    
    model = prepare_model_for_kbit_training(model)
    
    model = get_peft_model(model, peft_config)
    
    model.train()
    
    model.enable_input_require_grads()
    
    return model

In [None]:
def prepare_datasets(tokenizer):
    """Prepare and tokenize the datasets"""
    print("Preparing datasets...")

    left_path = "/app/data/combined_left.csv"
    right_path = "/app/data/combined_right.csv"

    left_text_path = "/app/data/left.txt"
    right_text_path = "/app/data/right.txt"
    
    left_dataset = pd.read_csv(left_path)
    right_dataset = pd.read_csv(right_path)
    
    for dataset, filename in [(left_dataset, left_text_path), (right_dataset, right_text_path)]:
        with open(filename, "w") as f:
            for _, row in dataset[["text", "topic"]].iterrows():
                out = f"Topic: {row['topic']}\nOpinion: {row['text']}{tokenizer.eos_token}\n"
                f.write(out)
    
    left_dataset = load_dataset("text", data_files=left_text_path)["train"]
    right_dataset = load_dataset("text", data_files=right_text_path)["train"]
    
    left_dataset = left_dataset.train_test_split(test_size=0.05)
    right_dataset = right_dataset.train_test_split(test_size=0.05)
    
    def tokenize(batch):
        outputs = tokenizer(
            batch["text"],
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            return_tensors=None 
        )
        return outputs
    
    left_dataset = left_dataset.map(
        tokenize,
        batched=True,
        batch_size=1000,
        num_proc=os.cpu_count(),
        remove_columns=left_dataset["train"].column_names
    )
    
    right_dataset = right_dataset.map(
        tokenize,
        batched=True,
        batch_size=1000,
        num_proc=os.cpu_count(),
        remove_columns=right_dataset["train"].column_names
    )
    
    return left_dataset, right_dataset

In [None]:
def setup_trainer(model, dataset, tokenizer, output_dir):
    """Setup trainer with optimized configuration"""
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_steps=100,
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=WEIGHT_DECAY,
        fp16=True,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=True,
        save_total_limit=2,
        logging_steps=50,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        ddp_find_unused_parameters=False,
        group_by_length=True, 
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    return Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"]
    )

### Load Dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    model_max_length=MAX_SEQ_LENGTH
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

left_dataset, right_dataset = prepare_datasets(tokenizer)

Preparing datasets...


Generating train split: 2855 examples [00:00, 310943.31 examples/s]
Generating train split: 2855 examples [00:00, 1360456.48 examples/s]
Map (num_proc=32): 100%|██████████| 2712/2712 [00:00<00:00, 3646.18 examples/s]
Map (num_proc=32): 100%|██████████| 143/143 [00:00<00:00, 195.02 examples/s]
Map (num_proc=32): 100%|██████████| 2712/2712 [00:00<00:00, 3739.18 examples/s]
Map (num_proc=32): 100%|██████████| 143/143 [00:00<00:00, 197.55 examples/s]


### Training

#### Train Left Model

In [None]:
print("\nTraining left-leaning model...")
left_model = load_and_prepare_model()
left_trainer = setup_trainer(left_model, left_dataset, tokenizer, OUTPUT_DIR_LEFT)
left_trainer.train()


Training left-leaning model...
Loading microsoft/phi-1_5...




Step,Training Loss,Validation Loss
100,2.84,2.741143
200,2.6601,2.614721
300,2.5762,2.529516
400,2.4883,2.558425
500,2.5152,2.569698


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=507, training_loss=2.6919451230141305, metrics={'train_runtime': 403.0512, 'train_samples_per_second': 20.186, 'train_steps_per_second': 1.258, 'total_flos': 6062675908853760.0, 'train_loss': 2.6919451230141305, 'epoch': 2.991150442477876})

#### Save Left Model

In [29]:
left_model.save_pretrained(OUTPUT_DIR_LEFT)
tokenizer.save_pretrained(OUTPUT_DIR_LEFT)

('/app/models/phi-1.5-left/tokenizer_config.json',
 '/app/models/phi-1.5-left/special_tokens_map.json',
 '/app/models/phi-1.5-left/vocab.json',
 '/app/models/phi-1.5-left/merges.txt',
 '/app/models/phi-1.5-left/added_tokens.json',
 '/app/models/phi-1.5-left/tokenizer.json')

#### Clear Cache

In [20]:
del left_model
if torch.cuda.is_available():
        torch.cuda.empty_cache()

#### Train Right 

In [None]:
print("\nTraining right-leaning model...")
right_model = load_and_prepare_model()
right_trainer = setup_trainer(right_model, right_dataset, tokenizer, OUTPUT_DIR_RIGHT)
right_trainer.train()



Training right-leaning model...
Loading microsoft/phi-1_5...




Step,Training Loss,Validation Loss
100,2.9494,2.755931
200,2.7313,2.607172
300,2.6199,2.573784
400,2.5475,2.557201
500,2.5958,2.517114


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('models/phi-1.5-right/tokenizer_config.json',
 'models/phi-1.5-right/special_tokens_map.json',
 'models/phi-1.5-right/vocab.json',
 'models/phi-1.5-right/merges.txt',
 'models/phi-1.5-right/added_tokens.json',
 'models/phi-1.5-right/tokenizer.json')

#### Save Right Model

In [27]:
right_model.save_pretrained(OUTPUT_DIR_RIGHT)
tokenizer.save_pretrained(OUTPUT_DIR_RIGHT)

('/app/models/phi-1.5-right/tokenizer_config.json',
 '/app/models/phi-1.5-right/special_tokens_map.json',
 '/app/models/phi-1.5-right/vocab.json',
 '/app/models/phi-1.5-right/merges.txt',
 '/app/models/phi-1.5-right/added_tokens.json',
 '/app/models/phi-1.5-right/tokenizer.json')