## Installing Necessary Libraries

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
!pip install torch



In [2]:
import os, sys
import torch
import datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model



In [3]:
import pandas as pd

# Read the two CNN/DailyMail CSV files from the Kaggle input folder, train first.
# NOTE(review): `test` is not referenced again in the visible notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv",
        '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv',
    )
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of `train`; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    train,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Preview the first 10 rows of the training split.
train_df.head(10)

Unnamed: 0,id,article,highlights
23333,42488473183697474d4d2b6baf5ee986656c74de,Bayern Munich's first team stars have stepped ...,Pep Guardiola put his players through their pa...
152315,50db11d86b4ae854adfa40119a54e6a0cc795648,(CNN) -- It's one of the best insomnia cures a...,"Can't sleep? A night on a ""star bed"" might hel..."
187109,7e51f944f4f589604762236521f7bf772042c90f,By . Lizzie Edmonds . and Daily Mail Reporter ...,Mr Sharon has been in a coma for eight years f...
165141,6188fa96edece2ccd3f7d38667024736a5795a62,By . Lizzie Edmonds . The family of a two-year...,Trinity Liliana Coward killed when a fireplace...
5309,0f04a4604075976a07cf2a9bc23cc98b6edda5c8,"By . Simon Tomlinson . PUBLISHED: . 12:38 EST,...",Kelly Schaecher was first told to 'suckle baby...
179095,73e64c83f3c980baa45efd6cf37cbd0f2ea4d98e,By . Daily Mail Reporter . A famed New York de...,Dr Jonathan Zizmor's youthful face has been pl...
15140,2b054dbd3746af30fc181c939628909f7e6c6dbd,(CNN) -- A 22-year-old man from Pakistan hopes...,College teammates want to swim English Channel...
135609,3b710c31e4d1a230745c269e36443db37f823ab1,"By . Damien Gayle . UPDATED: . 03:41 EST, 9 Ja...",Secretive state releases series of he-man imag...
105093,13928e2431c586448e31ce49ebae82d852bab868,The Michelin Guide's latest roster of top New ...,"Michael Ellis, director of the Michelin guides..."
196213,89edd826a8ca18ae86b78db3bfd69184962986a5,President Barack Obama warned Republicans in C...,President is angry at House Republicans for re...


In [6]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

In [7]:
model_id = "NousResearch/Llama-2-7b-hf"

tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Request 8-bit quantization through a BitsAndBytesConfig: passing `load_in_8bit`
# straight to from_pretrained is deprecated in transformers.
model = LlamaForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    torch_dtype=torch.float16,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [8]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split as a Dataset and key it by split name ("train" first, then "test").
train_dataset_dict = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in (("train", train_df), ("test", test_df))
    }
)

In [9]:
# Training-example template; {dialog}, {summary} and {eos_token} are filled in later
# with str.format. A plain string is used because nothing is interpolated here
# (the original was an f-string whose doubled braces produced this same text).
prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n{summary}{eos_token}"

In [10]:
# Unbind the full training frame (presumably to release memory); train_df and test_df from the split are still bound.
del train

In [11]:
from itertools import chain


class Concatenator(object):
    """Pack tokenized samples into fixed-length chunks.

    Instances are callable on a batch mapping that holds "input_ids" and
    "attention_mask", each a list of per-sample token lists. All samples of a
    batch are joined end to end (after any tokens left over from the previous
    call) and cut into chunks of exactly ``chunk_size`` tokens. Tokens that do
    not fill a whole chunk are kept in ``self.residual`` and prepended on the
    next call, so the object is stateful across calls.

    Args:
        chunk_size: Number of tokens per emitted chunk. Must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """

    def __init__(self, chunk_size=2048):
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        self.chunk_size = chunk_size
        self.residual = {"input_ids": [], "attention_mask": []}

    def __call__(self, batch):
        """Return the full chunks available after adding ``batch``.

        Returns:
            A dict with "input_ids", "attention_mask" and "labels", each a list
            of chunks of length ``chunk_size``; "labels" holds copies of the
            "input_ids" chunks. When fewer than ``chunk_size`` tokens have
            accumulated, every list is empty and the tokens stay in the
            residual instead of being emitted as a malformed flat row.
        """
        # Flatten this batch and put it after whatever was left over last time.
        concatenated_samples = {
            key: leftover + list(chain.from_iterable(batch[key]))
            for key, leftover in self.residual.items()
        }

        total_length = len(concatenated_samples["input_ids"])
        chunk_num = total_length // self.chunk_size
        cutoff = chunk_num * self.chunk_size  # tokens consumed by full chunks

        result = {
            key: [
                tokens[start : start + self.chunk_size]
                for start in range(0, cutoff, self.chunk_size)
            ]
            for key, tokens in concatenated_samples.items()
        }
        # Everything past the last full chunk waits for the next call.
        self.residual = {
            key: tokens[cutoff:] for key, tokens in concatenated_samples.items()
        }

        # Copy each chunk so labels do not alias the input_ids lists.
        result["labels"] = [chunk.copy() for chunk in result["input_ids"]]

        return result

In [12]:
import datasets
from itertools import chain


# Select the "train" split of the DatasetDict built earlier.
dataset = train_dataset_dict["train"]

# Training-example template; the placeholders are filled by str.format below.
# A plain string is used because nothing is interpolated at definition time
# (the original f-string with doubled braces evaluated to this same text).
prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n{summary}{eos_token}"


def apply_prompt_template(sample):
    """Render one dataset row as a single training string.

    Args:
        sample: A row providing the "article" and "highlights" fields.

    Returns:
        A dict with one key, "text": the ``prompt`` template filled with the
        article, the highlights, and the tokenizer's EOS token at the end.
    """
    filled = prompt.format(
        dialog=sample["article"],
        summary=sample["highlights"],
        eos_token=tokenizer.eos_token,
    )
    return {"text": filled}

# Render every row through the template; all original columns are dropped,
# leaving only the "text" column that apply_prompt_template returns.
dataset = dataset.map(
    apply_prompt_template,
    remove_columns=list(dataset.features),
)

  0%|          | 0/229690 [00:00<?, ?ex/s]

In [13]:
def tokenize_text_batch(sample):
    """Run the tokenizer over the "text" column of a batch."""
    return tokenizer(sample["text"])


# Tokenize in batches and drop the existing columns so only tokenizer outputs remain.
dataset1 = dataset.map(
    tokenize_text_batch,
    remove_columns=list(dataset.features),
    batched=True,
)

  0%|          | 0/230 [00:00<?, ?ba/s]

In [14]:
# The text-only dataset is no longer needed once dataset1 (its tokenized form) exists.
del dataset

In [15]:
# Pack the tokenized samples into chunks with Concatenator (default chunk_size=2048),
# which also adds a "labels" column copied from "input_ids".
dataset2=dataset1.map(Concatenator(), batched=True)

  0%|          | 0/230 [00:00<?, ?ba/s]

In [16]:
# The unpacked tokenized dataset is no longer needed once dataset2 exists.
del dataset1

In [17]:
# Put the model in training mode before the LoRA adapters are attached.
model.train()

def create_peft_config(model):
    """Attach LoRA adapters to a quantized model for causal-LM fine-tuning.

    Args:
        model: The base model to adapt (loaded in 8-bit earlier in the notebook).

    Returns:
        A ``(model, peft_config)`` tuple: the PEFT-wrapped model and the
        ``LoraConfig`` it was built from.
    """
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )

    # LoRA of rank 8 on the attention query and value projections only.
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
    )

    # prepare_model_for_int8_training has been removed from PEFT;
    # prepare_model_for_kbit_training is its replacement for quantized models.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# Wrap the model with LoRA and keep the config. `model` is rebound to the wrapped
# model, so running this cell a second time would wrap the already-wrapped model.
model, lora_config = create_peft_config(model)



trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [18]:
from transformers import TrainerCallback
from contextlib import nullcontext
# Switch for the optional torch profiler, and the folder that receives logs and the saved model.
enable_profiler = False
output_dir = "./llama-output"

# NOTE(review): in the visible notebook this dict is never passed to
# TrainingArguments, whose cell hard-codes different values - confirm which is intended.
config = dict(
    lora_config=lora_config,
    learning_rate=1e-4,
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=2,
    gradient_checkpointing=False,
    max_steps=100,
)

# Set up profiler
if not enable_profiler:
    # Profiling is off: a no-op context manager stands in for the profiler.
    profiler = nullcontext()
else:
    wait, warmup, active, repeat = 1, 1, 2, 1
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule = torch.profiler.schedule(
        wait=wait,
        warmup=warmup,
        active=active,
        repeat=repeat,
    )
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{output_dir}/logs/tensorboard"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )

    class ProfilerCallback(TrainerCallback):
        """Trainer callback that steps the wrapped profiler at the end of each training step."""

        def __init__(self, profiler):
            self.profiler = profiler

        def on_step_end(self, *args, **kwargs):
            self.profiler.step()

    profiler_callback = ProfilerCallback(profiler)

In [19]:
from transformers import default_data_collator, Trainer, TrainingArguments



# Define training args
training_args = TrainingArguments(
    # output and logging
    output_dir=output_dir,
    overwrite_output_dir=True,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    # optimizer and precision
    optim="paged_adamw_32bit",
    lr_scheduler_type="constant",
    fp16=True,
    # batching and run length
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    group_by_length=False,
    max_steps=250,
)

with profiler:
    # The profiler callback is attached only when profiling is enabled.
    trainer_callbacks = [profiler_callback] if enable_profiler else []

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset2,
        data_collator=default_data_collator,
        callbacks=trainer_callbacks,
    )

    # Start training
    trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,1.8747
20,1.8535
30,1.8649
40,1.8323
50,1.8113
60,1.8299
70,1.761
80,1.7789
90,1.7622
100,1.7286


In [20]:
# Save the trained model under output_dir.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes only the adapter weights - confirm.
model.save_pretrained(output_dir)

In [23]:
# Prompt for a qualitative check: it follows the training template's layout
# ("Summarize this dialog:" ... "---" / "Summary:") and stops after "Summary:" so the
# model writes the summary. Unlike the template, the string starts with a newline.
eval_prompt = """
Summarize this dialog:
Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.Â 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. 
British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31.
---
Summary:
"""

In [24]:
# Tokenize the evaluation prompt into PyTorch tensors, then move them to the GPU.
encoded_prompt = tokenizer(eval_prompt, return_tensors="pt")
model_input = encoded_prompt.to("cuda")

In [25]:
# Switch to inference mode and generate without tracking gradients.
model.eval()
with torch.no_grad():
    # Up to 100 new tokens; only the first returned sequence is decoded and printed.
    generated_ids = model.generate(**model_input, max_new_tokens=100)
    decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(decoded_text)




Summarize this dialog:
Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.Â 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues

In [27]:
# Reference text passed as the second argument to rouge.get_scores in the scoring cell.
# NOTE(review): presumably the dataset's highlights for the evaluation article - confirm.
testing_highlight = """Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer ."""

In [26]:
# Candidate summary passed as the first argument to rouge.get_scores in the scoring cell.
# NOTE(review): this is a hard-coded string, not assigned from the generate cell's
# output in the visible notebook - confirm where it came from.
testing_summary="""The FAA tests how quickly passengers can leave a plane .
But these tests are conducted using planes with 31 inches between each row of seats .
Some airlines offer as little as 28 inches .
The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch ."""

In [32]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [33]:
from rouge import Rouge

rouge = Rouge()

# Score the candidate summary (first argument) against the reference highlights (second).
hypothesis, reference = testing_summary, testing_highlight
rouge.get_scores(hypothesis, reference)

[{'rouge-1': {'r': 0.2727272727272727, 'p': 0.2, 'f': 0.23076922588757406},
  'rouge-2': {'r': 0.06060606060606061,
   'p': 0.03571428571428571,
   'f': 0.044943815558642075},
  'rouge-l': {'r': 0.2727272727272727, 'p': 0.2, 'f': 0.23076922588757406}}]