In [None]:
import os
import getpass
# Set the locale-related environment variables to a UTF-8 English locale
# (presumably so the shell commands in later cells run under UTF-8 — confirm).
os.environ.update(dict.fromkeys(("LC_ALL", "LANG", "LC_CTYPE"), "en_US.UTF-8"))

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably required so the training runs further down can use the
# Hub username / repo names passed via --hf_username and --hf_repo_name — confirm
# against train_model.py.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install -qq git+https://github.com/huggingface/peft.git
!pip install -qq accelerate
!pip install -qq datasets
!pip install -qq trl
!pip install -qq transformers
!pip install -qq bitsandbytes
!pip install -qq safetensors
# note: flash attention installation can take a long time
!pip install -qq flash-attn --no-build-isolation

In [None]:
!git clone https://github.com/harpreetsahota204/fine-tuning-with-sft.git

In [None]:
# Switch the working directory to the cloned repo's helpers folder before importing.
# NOTE(review): presumably the import below picks up helpers.py from the current
# directory — confirm against the repo layout.
# The working directory stays here until a later `%cd`, so any relative path used
# in between is resolved against this folder.
%cd /content/fine-tuning-with-sft/src/helpers

from helpers import create_directory, select_random_rows

In [None]:
class config:
    """Settings shared by the notebook cells: dataset locations, model ids,
    Hub repo names, output directories and the training step budget."""

    # Dataset id passed to `load_dataset`
    FINE_TUNING_DATASET_LOCATION = "ccdv/govreport-summarization"
    # Where the formatted train/test split is saved and later read via --dataset
    DATASET_LOCATION = "/content/fine_tuning_dataset"

    HF_USERNAME = "harpreetsahota"

    DECI_BASE_MODEL = "Deci/DeciLM-7b-redteam4-lm_eval"
    DECI_TUNED_REPO_NAME = "decilm-v3-summarization-tuned-50-steps-sos-tokens"
    # Absolute on purpose: the notebook changes its working directory between
    # creating the output folders (cwd = .../src/helpers) and launching training
    # (cwd = .../src), so a relative "../../../" path would point at two
    # different locations. "/content" is what the relative path resolved to
    # when the folders were created.
    DECI_OUTPUT_DIR = "/content/decilm-fine-tuned"

    COMPETITOR_OUTPUT_DIR = "/content/competitor-fine-tuned"
    COMPETITOR_BASE_MODEL = "mistralai/Mistral-7B-v0.1"
    COMPETITOR_TUNED_REPO_NAME = "mistral-summarization-tuned-50-steps-sos-tokens"

    # Passed to the training script as --max_steps for both runs
    MAX_STEPS = 50

In [None]:
# Prepare the two output directories that are later passed to the training
# script as --output_dir (one for the DeciLM run, one for the competitor run).
# NOTE(review): `create_directory` comes from the helpers module and is not shown
# here; presumably it creates the directory if it is missing — confirm.
create_directory(dir_name = config.DECI_OUTPUT_DIR)

create_directory(dir_name = config.COMPETITOR_OUTPUT_DIR)

In [None]:
from datasets import load_dataset

# Load the train and test splits of the dataset configured in `config`.
_train = load_dataset(config.FINE_TUNING_DATASET_LOCATION, split='train')

# NOTE(review): `_test` is not referenced by any cell visible here.
_test = load_dataset(config.FINE_TUNING_DATASET_LOCATION, split='test')

# Shuffle with a fixed seed, then hand the result to `select_random_rows` with 5000
# (presumably the number of rows to keep — the helper is defined in helpers and
# is not shown here; confirm).
_train = select_random_rows(_train.shuffle(seed=42), 5000)

In [None]:
from typing import Dict, Optional

def format_row(row: Dict[str, str]) -> str:
    """
    Render one dataset row as a single training string.

    The result is an instruction line, then a "### Full text:" section holding
    the row's report, then a "### Summary:" section holding the row's summary,
    all wrapped in literal "<s>" / "</s>" markers.

    Args:
    :param row: A dictionary representing a row in the dataset with keys 'report' and 'summary'.
    :return: The formatted string, with leading/trailing whitespace stripped.
    :raises KeyError: If 'report' or 'summary' is missing from the row.
    """
    segments = [
        "<s> Below is Full text paired with it's Summary. You task is to summarize the full text. \n\n",
        f"### Full text: {row['report']} \n",
        f" ### Summary: {row['summary']} \n </s>",
    ]
    return "".join(segments).strip()

def add_text_column(example):
    """
    Attach the formatted training string to a row under the "text" key.

    The row is modified in place and also returned.

    :param example: A mapping for one dataset row; it is handed to `format_row` as-is.
    :return: The same `example` object with its "text" entry set.
    """
    rendered = format_row(example)
    example["text"] = rendered
    return example

In [None]:
# Add the "text" column to the sampled training rows via `add_text_column`.
formatted_train = _train.map(add_text_column)

# Hold out 20% of the formatted rows as a test split; the seed is fixed at 42.
_split = formatted_train.train_test_split(test_size=0.2, seed=42)

# Write the split to config.DATASET_LOCATION; the training cells pass this same
# location to the training script via --dataset.
_split.save_to_disk(config.DATASET_LOCATION)

In [None]:
# Fine-tune the DeciLM base model by running the repo's training script.
# The script is launched from the repo's src directory, so it runs with that
# directory as its working directory. Each `{config.X}` is filled in from the
# `config` class by IPython before the command reaches the shell.
# NOTE(review): the meaning of each flag is defined by train_model.py, which is
# not shown here.
%cd /content/fine-tuning-with-sft/src
!python train_model.py \
    --model_id {config.DECI_BASE_MODEL} \
    --dataset  {config.DATASET_LOCATION} \
    --output_dir {config.DECI_OUTPUT_DIR} \
    --hf_username {config.HF_USERNAME} \
    --hf_repo_name {config.DECI_TUNED_REPO_NAME} \
    --max_steps {config.MAX_STEPS}

In [None]:
# Fine-tune the competitor (Mistral) base model with the same script, dataset and
# step budget as the DeciLM run; only the model id, output directory and Hub repo
# name differ.
# The script is launched from the repo's src directory, so it runs with that
# directory as its working directory. Each `{config.X}` is filled in from the
# `config` class by IPython before the command reaches the shell.
%cd /content/fine-tuning-with-sft/src
!python train_model.py \
    --model_id {config.COMPETITOR_BASE_MODEL} \
    --dataset {config.DATASET_LOCATION} \
    --output_dir {config.COMPETITOR_OUTPUT_DIR} \
    --hf_username {config.HF_USERNAME} \
    --hf_repo_name {config.COMPETITOR_TUNED_REPO_NAME} \
    --max_steps {config.MAX_STEPS}