### **Model Training**

### Installation and Libraries

In [None]:
!pip install bitsandbytes --progress-bar off
!pip install torch==2.0.1 --progress-bar off
!pip install -U huggingface_hub==0.16.4 --progress-bar off
!pip install -U transformers --progress-bar off
!pip install -U accelerate --progress-bar off
!pip install -U git+https://github.com/huggingface/peft.git --progress-bar off
!pip install datasets==2.12.0 --progress-bar off
!pip install loralib==0.1.1 --progress-bar off
!pip install einops==0.6.1 --progress-bar off
!pip install evaluate --progress-bar off
!pip install scipy --progress-bar off
!pip install gdown --progress-bar off

In [None]:
import json
import os
import csv
from pprint import pprint
import gdown

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import (
    notebook_login,
    HfApi
)
import evaluate
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from torch import (
    cuda,
    bfloat16
)

### Log In to HuggingFace

In [None]:
notebook_login()

### Download Data Files from Google Drive

In [None]:
prefix = "https://drive.google.com/uc?export=download&id="

In [None]:
print("Before typing in the URL, please ensure that the sharing access to the data file is changed to 'Anyone with the link'.")
train_url = input("Please type in the URL of the training data which is saved on Google Drive: ")

In [None]:
# Assumes a share link shaped like ".../file/d/<file-id>/view?...", in which case the
# file ID is the second-to-last "/"-separated segment — TODO confirm for other link formats.
train_id = train_url.split("/")[-2]

In [None]:
train_file = gdown.download(prefix + train_id)

In [None]:
# Only applicable if there is a separate dataset for model testing
test_url = input("Please type in the URL of the test data which is saved on Google Drive: ")

In [None]:
# Only applicable if there is a separate dataset for model testing
test_id = test_url.split("/")[-2]

In [None]:
# Only applicable if there is a separate dataset for model testing
test_file = gdown.download(prefix + test_id)

### Data Preparation

In [None]:
# Run this code only if the input file is in CSV format
# Modify the code whenever necessary depending on how the data from CSV file needs to be cleaned
def convert_csv_json(csv_file_path, json_file_path, max_rows=300):
  """Convert a Context/Question/Answer CSV file into a JSON list of records.

  Each CSV row becomes a dict with the lower-case keys "context", "question"
  and "answer"; the resulting list is written to ``json_file_path``.

  Args:
    csv_file_path: Path of the CSV file to read. Its header must contain the
      columns "Context", "Question" and "Answer".
    json_file_path: Path of the JSON file to write (overwritten if present).
    max_rows: Maximum number of records written. Defaults to 300, the limit
      that used to be hard-coded; pass None to write every row.

  Raises:
    KeyError: If one of the required columns is missing from the CSV header.
  """
  # "utf-8-sig" drops a leading byte-order mark when there is one, so the first
  # header is always "Context" whether or not the file was saved with a BOM.
  # (Looking up the literal "\ufeffContext" key failed on BOM-less files.)
  with open(csv_file_path, encoding="utf-8-sig") as csvf:
    csv_reader = csv.DictReader(csvf)

    # Change the attributes here according to the attributes found in CSV file
    qa_data = [
      {
        "context": row["Context"],
        "question": row["Question"],
        "answer": row["Answer"],
      }
      for row in csv_reader
    ]

  with open(json_file_path, "w") as jsonf:
    json.dump(qa_data[:max_rows], jsonf)

In [None]:
print("Please type the name to save the output file of train data.")
train_json = input() + ".json"

In [None]:
# Only applicable if there is a separate dataset for model testing
print("Please type a name to save the output file for test data.")
test_json = input() + ".json"

In [None]:
# Train data
convert_csv_json(train_file, train_json)

In [None]:
# Only applicable if there is a separate dataset for model testing
# Test data
convert_csv_json(test_file, test_json)

In [None]:
# Preview the first few converted records. The file holds a single JSON array, so it
# is parsed instead of being read line by line (which printed the raw text as one
# huge string).
with open(train_json, "r") as f:
  pprint(json.load(f)[:3])

### Load Model and Tokenizer

In [None]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [None]:
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [None]:
# Quantization settings, later handed to from_pretrained through quantization_config.
# NOTE(review): the compute dtype here is bfloat16 while the TrainingArguments further
# down set fp16=True — confirm this mix is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=bfloat16
)

In [None]:
# Base checkpoint to fine-tune.
model_name = "meta-llama/Llama-2-7b-hf"

# Load the causal-LM with the quantization settings from bnb_config.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    trust_remote_code=True,
    quantization_config=bnb_config
)

# NOTE(review): eval mode is set here although the model is fine-tuned later in the
# notebook — presumably the Trainer switches modes itself; verify.
model.eval()
# `device` is computed separately from device_map='auto', so this message reports the
# current CUDA device (or 'cpu') rather than the placement chosen by the loader.
print(f"Model loaded on {device}")

In [None]:
print(model)

In [None]:
# Tokenizer for the same checkpoint as the model; the EOS token is reused as the
# padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` and prints the trainable element count,
    the total element count and the trainable share as a percentage.
    """
    # (element count, requires_grad) for every parameter, gathered in one pass.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

In [None]:
def find_all_linear_names(model):
    """
    Collect the leaf names of every ``bnb.nn.Linear4bit`` module in the model.

    The leaf name is the last dot-separated segment of the module's qualified
    name. Duplicates are collapsed and the result is returned as a list in no
    particular order.
    """
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, bnb.nn.Linear4bit)
    }
    return list(leaf_names)

In [None]:
target_modules = find_all_linear_names(model)

In [None]:
# Also adapt the output head. The membership check keeps the cell idempotent, so
# re-running it does not add "lm_head" a second time.
if "lm_head" not in target_modules:
    target_modules.append("lm_head")

In [None]:
print(target_modules)

In [None]:
# Turn on gradient checkpointing, then rebind `model` to the k-bit-prepared model.
# NOTE: because `model` is reassigned from itself, run this cell once per freshly
# loaded model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings (r=8, lora_alpha=32, lora_dropout=0.05, bias="none"),
# applied to the module names collected in target_modules.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Rebind `model` to the PEFT-wrapped model and print its trainable-parameter counts.
model = get_peft_model(model, config)
print_trainable_parameters(model)

### Further Preparation of Dataset

In [None]:
def generate_prompt(data_point):
    """
    Render one record as the full prompt text, answer included.

    ``data_point`` must provide the keys "context", "question" and "answer".
    The filled-in template is stripped of leading and trailing whitespace; the
    indentation of the inner template lines is part of the returned text.
    """
    context = data_point["context"]
    question = data_point["question"]
    answer = data_point["answer"]
    prompt = f"""
    You are an AI assistant and you are responsible to answer questions asked by users.

    You will have to carry out proper reasonings based on the context given by users and provide a final answer to a question asked by users.

    However, if you are not sure about the answer to the question, please do not make up an answer and state "I do not know the answer".

    User:
    Context - {context}
    Based on this given context, please answer my question below.
    Question - {question}

    Assistant:
    {answer}
    """
    # Strip only after interpolation so whitespace at the end of the answer goes too.
    return prompt.strip()

def generate_and_tokenize_prompt(data_point):
    """
    Build the prompt for one record and return the tokenizer's output for it.

    NOTE(review): truncation=True is passed without a max_length, so the limit is
    whatever the tokenizer resolves by default — confirm that is the intended cap.
    """
    return tokenizer(generate_prompt(data_point), padding=True, truncation=True)

### Further Preparation of Train Dataset

In [None]:
train_dataset = load_dataset("json", data_files=train_json)

In [None]:
train_data = train_dataset["train"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
train_data

- Splitting the data is only necessary if there is no separate test dataset

In [None]:
split_data = train_dataset["train"].train_test_split(test_size=0.2)

In [None]:
split_data["train"]

In [None]:
split_data["test"]

In [None]:
train_data = split_data["train"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
test_data = split_data["test"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
train_data

In [None]:
test_data

### Load new test dataset
- Run this section only if there is a separate test dataset

In [None]:
test_attribute = {"test": test_json}
test_dataset = load_dataset("json", data_files=test_attribute)

In [None]:
test_dataset

In [None]:
test_data = test_dataset["test"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
test_data

### Model Training

In [None]:
output_dir = "llama-2-7b-hf"

In [None]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    evaluation_strategy="steps",
    # When eval_steps is unset it falls back to logging_steps (500 by default), which
    # is beyond max_steps=400, so no evaluation would ever run. Evaluate and log
    # every 50 steps instead.
    eval_steps=50,
    logging_steps=50,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # The run length is bounded by max_steps rather than by an epoch count.
    max_steps=400,
    # warmup_steps takes precedence over warmup_ratio, so the warmup_ratio=0.05 that
    # used to be passed alongside it had no effect and has been dropped.
    warmup_steps=2,
    learning_rate=2e-5,
    fp16=True,
    output_dir=output_dir,
    optim="paged_adamw_8bit"
)

# mlm=False selects the causal-LM collator behaviour (no masked-token objective).
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# The generation cache is not used for training and conflicts with gradient
# checkpointing, so it is switched off before trainer.train() is called.
model.config.use_cache = False

In [None]:
trainer.train()

### Save Fine-tuned Model

In [None]:
model.config.to_json_file("config.json")

In [None]:
# Upload the exported ./config.json to the Hub repository; create_pr is truthy, so the
# change is submitted as a pull request rather than committed directly.
# NOTE(review): the repo id is hard-coded here and repeated in the push_to_hub cells
# that follow — keep them in sync.
huggingface_api = HfApi()
huggingface_api.upload_file(
    path_or_fileobj="./config.json",
    path_in_repo="config.json",
    repo_id="aiknight87/llama-2-7b-hf-300d",
    create_pr=1
)

In [None]:
model.push_to_hub(
    "aiknight87/llama-2-7b-hf-300d",
    use_auth_token=True,
    create_pr=1
)

In [None]:
tokenizer.push_to_hub(
    "aiknight87/llama-2-7b-hf-300d",
    use_auth_token=True,
    create_pr=1
)