In [None]:
!pip install bitsandbytes peft trl accelerate transformers

In [None]:
pip install -U bitsandbytes

In [None]:
import torch

In [None]:
torch.manual_seed(1337)

In [None]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig



model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Hub access token. Nothing in this notebook defines HF_TOKEN, so a fresh
# kernel would hit a NameError below. Keep a value set by an earlier cell if
# there is one, otherwise read it from the environment. When neither exists
# the value is None and transformers falls back to its own cached login
# (see the `token` argument of `from_pretrained`).
HF_TOKEN = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN")

# 4-bit NF4 weights with double quantisation enabled. The compute dtype is
# left at the library default (the bfloat16 override stays disabled).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer and quantised model come from the same checkpoint;
# device_map="auto" delegates device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    token=HF_TOKEN,
)

In [None]:
!pip install tqdm

In [None]:

from tqdm import tqdm
from datasets import Dataset
import pandas as pd
# Load the annotated comments from the Excel workbook (not a CSV)
df = pd.read_excel("/kaggle/input/maradonaaa/results_Maradona_Hand_Of_God_with_Llama_3.1_8b_instruct_full_final.xlsx")

# Keep only the text and label columns of the first 2000 rows, then drop any
# row where either value is missing: a NaN would otherwise be rendered into
# the prompt (and used as the target) as the literal text "nan".
# reset_index restores a plain RangeIndex so no index column is carried into
# the Hugging Face dataset.
df = (
    df[['Comments', 'Label']]
    .head(2000)
    .dropna(subset=['Comments', 'Label'])
    .reset_index(drop=True)
)

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [None]:
df

In [None]:
# max_len = 0
# for i in range(len(dataset)):
#   crr_len = len(tokenizer.encode(prompt.format(dataset[i]['Comments'])))
#   if crr_len > max_len:
#     max_len = crr_len


In [None]:
# Use the end-of-sequence token as the padding token (string and id).
# NOTE(review): presumably the checkpoint ships without a dedicated pad token - confirm.
# Because pad and EOS share an id, padding can only be told apart from a real
# EOS through the attention mask, not through the token id.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# Pad at the end of each sequence.
tokenizer.padding_side='right'

In [None]:



# Llama 3.1 chat layout. Every turn is rendered as
#     <|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>
# i.e. the blank line sits directly after the role header, not after the
# content, and nothing follows the final <|eot_id|>.
# No literal <|begin_of_text|> is written here: the tokenizer calls in this
# notebook run with special tokens enabled and are expected to prepend it, so
# spelling it out in the template as well would duplicate the BOS token.
# The three {} slots are, in order: system prompt, user comment, stance label.
formatted_prompt = ("<|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>"
                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
                    "<|start_header_id|>assistant<|end_header_id|>\n\n{}<|eot_id|>")

# Fixed system instruction shared by every example.
system = ("You are provided with an input. You are required to perform stance detection "
          "on the input with output as one of the following labels - Favor, Against, "
          "Irrelevant, Neutral. The labels are self-explanatory. Only output the stance detected label.")


# Define the formatting function
def formatting_function(item):
    """Render a batch of rows into chat-formatted training strings.

    `item` is a batch mapping with parallel sequences under "Comments" and
    "Label". Each pair is substituted, together with the module-level `system`
    prompt, into `formatted_prompt`.

    Returns a dict with one key, "text", holding the rendered strings in the
    same order as the input rows.
    """
    rendered = [
        formatted_prompt.format(system, comment, label)
        for comment, label in zip(item["Comments"], item["Label"])
    ]
    return {"text": rendered}

# Apply the formatting function to the dataset
# sampled_dataset = tokenized_dataset.select(2000)
formatted_dataset = dataset.map(formatting_function, batched=True)


# Token count of the longest fully rendered example (0 when the dataset is empty).
max_len = max(
    (
        len(tokenizer.encode(formatted_prompt.format(system, row["Comments"], row["Label"])))
        for row in dataset
    ),
    default=0,
)


def _response_start(ids, marker):
    """Return the index just past the last occurrence of `marker` in `ids`, or 0 if absent."""
    width = len(marker)
    if width == 0:
        return 0
    for start in range(len(ids) - width, -1, -1):
        if ids[start:start + width] == marker:
            return start + width
    return 0


def tokenize_function(examples, tok=None, max_length=1024):
    """Tokenize a batch of rendered chats and build aligned causal-LM labels.

    A causal LM computes its loss position by position against `input_ids`, so
    `labels` must have exactly the same length and alignment as `input_ids`.
    Labels are therefore a copy of `input_ids` in which every position that
    must not contribute to the loss is set to -100:

    * padding, identified through the attention mask rather than the token id,
      because the pad token may share its id with a real end-of-turn token;
    * the prompt, i.e. everything up to and including the last
      `<|start_header_id|>assistant<|end_header_id|>` header.

    If that header is not found in a row (for example it was truncated away),
    only padding is masked and the row is trained as plain language modelling.

    Args:
        examples: batch mapping with the rendered strings under "text".
        tok: tokenizer to use; defaults to the notebook-level `tokenizer`.
        max_length: length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry (one list per row).
    """
    tok = tokenizer if tok is None else tok

    tokenized = tok(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Ids of the assistant header on its own; special tokens are disabled so
    # no BOS is mixed into the pattern that is searched for.
    marker = list(
        tok(
            "<|start_header_id|>assistant<|end_header_id|>",
            add_special_tokens=False,
        )["input_ids"]
    )

    labels = []
    for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        ids = list(ids)
        answer_from = _response_start(ids, marker)
        labels.append([
            token_id if (is_real and pos >= answer_from) else -100
            for pos, (token_id, is_real) in enumerate(zip(ids, mask))
        ])

    tokenized["labels"] = labels
    return tokenized
# Apply the tokenization batch-wise over the formatted dataset.
# remove_columns drops the raw "Comments", "Label" and rendered "text" columns
# from the result, leaving only what tokenize_function returns.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Comments", "Label", "text"]
)

In [None]:
tokenized_dataset

In [None]:
formatted_dataset['text'][0]

In [None]:
from peft import prepare_model_for_kbit_training

# Run PEFT's preparation helper on the quantised model before adapters are attached.
# NOTE(review): see the PEFT docs for exactly what this helper changes on the model.
model = prepare_model_for_kbit_training(model)

# Explicitly switch on gradient checkpointing on the model.
# NOTE(review): the helper above may already enable this by default - confirm; if so this call is redundant.
model.gradient_checkpointing_enable()

In [None]:
from peft import LoraConfig, get_peft_model

# Configure the LoRA adapter used for parameter-efficient fine-tuning.
# (This adds trainable adapter weights; it is not model pruning.)
lora_config = LoraConfig(
    r=32,  # adapter rank
    lora_alpha=16,
    lora_dropout = 0.05,
    # Projection layers that receive an adapter (attention q/k/v/o and the MLP gate/up/down)
    target_modules=['down_proj', 'gate_proj', 'o_proj', 'v_proj', 'up_proj', 'q_proj', 'k_proj'],
    task_type="CAUSAL_LM",
    use_dora=True  # use the DoRA variant of the adapter
)
# Wrap the model so the adapter defined above is applied to it
model = get_peft_model(model, lora_config)

In [None]:
def formatting_func(example):
    """Render one instruction/input/output record as a single chat string.

    Reads the "instruction", "input" and "output" keys of `example` and wraps
    them as system, user and assistant turns delimited by
    <|im_start|> / <|im_end|> markers (a ChatML-style layout), with the turns
    separated by single spaces.

    Returns a one-element list containing the rendered text.
    """
    system_turn = f"<|im_start|>system\n{example['instruction']}<|im_end|>"
    user_turn = f"<|im_start|>user\n{example['input']}<|im_end|>"
    assistant_turn = f"<|im_start|>assistant\n{example['output']}<|im_end|>"

    # Single-element list, matching the original return shape
    return [" ".join((system_turn, user_turn, assistant_turn))]

In [None]:
# # len(dataset)
# dataset = dataset.map(lambda example: {"text": example["text"]}, remove_columns=["Comments", "Label"])
# dataset


In [None]:
# tokenized_dataset

In [None]:
from datasets import Dataset

# Assuming `tokenized_dataset` is already created.
# Three disjoint splits with a fixed seed for reproducibility:
#   80% train, 10% validation, 10% test.
# The validation and test sets are both carved out of the 20% hold-out, never
# out of the training portion, so evaluation does not run on rows the model
# was trained on.
first_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = first_split["train"]

holdout_split = first_split["test"].train_test_split(test_size=0.5, seed=42)
val_dataset = holdout_split["train"]
test_dataset = holdout_split["test"]

# The earlier (misspelled) name is kept as an alias for the test split.
text_dataset = test_dataset


In [None]:
train_dataset

In [None]:
import transformers
from transformers import Trainer

# dataset_size = len(dataset)
# effective_batch_size = 8 * 4
# steps_per_epoch = dataset_size // effective_batch_size

# log_every_n_epochs = 0.1
# logging_steps = int(steps_per_epoch * log_every_n_epochs)




# Initialize the Trainer. This is the plain transformers Trainer, not TRL's
# SFTTrainer: the datasets passed in are already tokenized.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    args=transformers.TrainingArguments(
        remove_unused_columns=False,
        output_dir='/kaggle/working/outputs',
        overwrite_output_dir=True,  # real boolean; the string 'True' only worked by being truthy
        # Optimisation: batch of 1 accumulated over 4 steps
        warmup_ratio=0.03,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        max_grad_norm=0.3,
        gradient_checkpointing=True,
        fp16=True,
        group_by_length=True,
        # Logging and evaluation every 25 steps
        logging_steps=25,
        logging_dir="./logs",
        eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
        eval_steps=25,
        do_eval=True,
        report_to="none",
    ),
)

In [None]:
trainer.train()