In [None]:
import json
import time
import torch
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig)
from trl import SFTTrainer
from datasets import Dataset
from huggingface_hub import login
from accelerate import Accelerator
from peft import LoraConfig
from torch.utils.data import DataLoader

In [None]:
# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


def load_json_data(file_path):
    """Read a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` (for example a list of
        dicts when the file holds a JSON array of objects).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 on disk; decode explicitly instead of relying on the
    # platform's locale encoding, which can mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def prepare_dataset(file_path, sample_fraction=0.2):
    """Build a labelled dataset from a JSON file and return a seeded sample.

    Each record read from ``file_path`` must provide the keys
    ``"instruction"``, ``"input"`` and ``"output"``. The output text is
    mapped to a binary label: 1 when it equals "yes" after stripping
    whitespace and lower-casing, 0 otherwise.

    Args:
        file_path: Path of the JSON file, passed to ``load_json_data``.
        sample_fraction: Passed as ``test_size`` to ``train_test_split``;
            the resulting 'test' part is what this function returns.

    Returns:
        The 'test' part of a ``train_test_split`` (seed 42) over the full
        dataset, with columns ``instruction``, ``input`` and ``labels``.
    """
    records = load_json_data(file_path)

    instructions = [record["instruction"] for record in records]
    inputs = [record["input"] for record in records]
    labels = [
        int(record["output"].strip().lower() == "yes") for record in records
    ]

    full_dataset = Dataset.from_dict(
        {"instruction": instructions, "input": inputs, "labels": labels}
    )

    # The fixed seed makes the sample reproducible; only the 'test' side of
    # the split is kept as the sample.
    split = full_dataset.train_test_split(test_size=sample_fraction, seed=42)
    return split['test']


# Load the JSON data and keep a sample of it (see prepare_dataset).
dataset_path = 'output.json'
dataset = prepare_dataset(dataset_path, sample_fraction=0.2)

# Split the sampled data 80/20 into training and evaluation parts with a fixed
# seed. The result is bound to `split_datasets` rather than `train_test_split`
# so the function of that name imported from sklearn is not shadowed.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Persist both parts to disk before they are tokenized.
train_dataset.save_to_disk('train_dataset')
eval_dataset.save_to_disk('eval_dataset')

print(f"Sampled dataset size: {len(dataset)}")
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")


# Checkpoint used for both the tokenizer and the classification model.
model_name = "ahxt/LiteLlama-460M-1T"

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Two-class, single-label classification model, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
)
model = model.to(device)

# Keep the model's padding id and embedding size in line with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# Fixed token length that every example is padded or truncated to.
max_seq_length = 512


def tokenize_and_pad(examples):
    """Tokenize the 'input' column of a batch to a fixed length.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only its
            ``'input'`` entry is read.

    Returns:
        The tokenizer output for the batch, padded to ``max_seq_length`` and
        truncated when longer.
    """
    # NOTE(review): the 'instruction' column is not part of the model input
    # here -- confirm that is intended.
    # NOTE(review): padding to a fixed length may be redundant if the data
    # collator used for training already pads each batch -- confirm.
    texts = examples['input']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=max_seq_length,
        padding='max_length',
    )
    return encoded


# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_and_pad, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_pad, batched=True)


# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Training configuration. Checkpoints and evaluation both run every 50 steps;
# with a per-device batch of 8 and 2 accumulation steps, each optimizer step
# covers 16 examples per device.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="Results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    learning_rate=5e-5,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    save_steps=50,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    # fp16 mixed precision needs a GPU. The script falls back to the CPU when
    # CUDA is unavailable, so only enable it when CUDA is actually present
    # instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)


# Trainer wiring the model, data splits, tokenizer and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


# Take wall-clock timestamps immediately around the training call.
start_time = time.time()
trainer.train()
end_time = time.time()

# Write the model, the tokenizer and the model config into the same directory.
for save_artifact in (
    trainer.save_model,
    tokenizer.save_pretrained,
    model.config.save_pretrained,
):
    save_artifact("trained_model")

# Report the elapsed training time in minutes.
print(f"Total training time: {(end_time - start_time) / 60:.2f} minutes")