**Installing necessary packages**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q rouge_score
!pip install -q simpletransformers
!pip install -q datasets
!pip install -q evaluate
!pip install -q torch
!pip install -q accelerate
!pip install -q tqdm
!pip install -q nltk

Mounted at /content/drive
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB

**Importing necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
import warnings
warnings.filterwarnings("ignore")

# Load the WikiHow CSV from the Google Drive folder mounted in the first cell.
df = pd.read_csv('/content/drive/MyDrive/wikihow.csv')
df.head(5)

Unnamed: 0,headline,title,text
0,"\nMuddle the mint leaves, brown sugar and lime...",How to Make a Mojito Diablo,"Use a muddler, a pestle or the back of a spoo..."
1,"\nBuy resurrecting wings from the shop.,\nUse ...",How to Resurrect in Temple Run,You'll need 500 coins collected from your run...
2,"\nRinse your hands in vinegar.,\nMake a paste ...",How to Get a Bad Smell off Your Hands6,Vinegar is good for removing smells such as f...
3,\nApply a small amount of cleaning or metal po...,How to Remove a Scratch on Glass Cooktops2,";\n,\n\n\nThis procedure will test your cleani..."
4,"\nFind your birth animal.,\nRead about your zo...",How to Read Your Chinese Horoscope,Consult the chart below to find the year of y...


**Checking the shape of dataset**

In [None]:
df.shape

(5000, 3)

**Checking for missing values**

In [None]:
df.isnull().sum()

headline    18
title        0
text        22
dtype: int64

**Removing missing values**

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

**Resetting Index**

In [None]:
# Renumber the rows after the drop; drop=True discards the old index instead of
# keeping it as an extra column.
df = df.reset_index(drop=True)
df.head(3)

Unnamed: 0,headline,title,text
0,"\nMuddle the mint leaves, brown sugar and lime...",How to Make a Mojito Diablo,"Use a muddler, a pestle or the back of a spoo..."
1,"\nBuy resurrecting wings from the shop.,\nUse ...",How to Resurrect in Temple Run,You'll need 500 coins collected from your run...
2,"\nRinse your hands in vinegar.,\nMake a paste ...",How to Get a Bad Smell off Your Hands6,Vinegar is good for removing smells such as f...


**Checking for and removing duplicates**

In [None]:
# Print the shape before and after de-duplication to show how many rows were removed.
print (df.shape)
df = df.drop_duplicates()
print (df.shape)

(4978, 3)
(4978, 3)


**Selecting a reasonable datasize for model training**

In [None]:
# Keep the first 3000 rows. .copy() makes this an independent frame, so the column
# assignments in later cells do not write into a slice of the previous frame
# (which pandas reports as SettingWithCopyWarning).
df = df.iloc[:3000].copy()
df.head(3)

Unnamed: 0,headline,title,text
0,"\nMuddle the mint leaves, brown sugar and lime...",How to Make a Mojito Diablo,"Use a muddler, a pestle or the back of a spoo..."
1,"\nBuy resurrecting wings from the shop.,\nUse ...",How to Resurrect in Temple Run,You'll need 500 coins collected from your run...
2,"\nRinse your hands in vinegar.,\nMake a paste ...",How to Get a Bad Smell off Your Hands6,Vinegar is good for removing smells such as f...


In [None]:
df.shape

(3000, 3)

**Cleaning the title column**

In [None]:
# Remove every digit from the 'title' column. The pattern is a raw string so that
# \d reaches the regex engine instead of being parsed as an invalid string escape.
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)
# Append ':' to each title (vectorised string concatenation).
df['title'] = df['title'] + ':'

In [None]:
df.head(2)

Unnamed: 0,headline,title,text
0,"\nMuddle the mint leaves, brown sugar and lime...",How to Make a Mojito Diablo:,"Use a muddler, a pestle or the back of a spoo..."
1,"\nBuy resurrecting wings from the shop.,\nUse ...",How to Resurrect in Temple Run:,You'll need 500 coins collected from your run...


**Cleaning text and headline columns**

In [None]:
# Keep only ASCII letters, whitespace, apostrophes and full stops; every other
# character (digits, commas, other punctuation and symbols) is deleted.
df['headline'] = df['headline'].str.replace(r'[^a-zA-Z\s\'.]', '', regex=True)
df['text'] = df['text'].str.replace(r'[^a-zA-Z\s\'.]', '', regex=True)
df.head(2)

Unnamed: 0,headline,title,text
0,\nMuddle the mint leaves brown sugar and lime ...,How to Make a Mojito Diablo:,Use a muddler a pestle or the back of a spoon...
1,\nBuy resurrecting wings from the shop.\nUse t...,How to Resurrect in Temple Run:,You'll need coins collected from your runs.\...


**Making a somewhat larger summary by concatenating title and headline columns**



In [None]:
# Target text = cleaned title (which now ends in ':') + a space + cleaned headline.
df['summary'] = df['title'] + ' ' + df['headline']
df.head(3)

Unnamed: 0,headline,title,text,summary
0,\nMuddle the mint leaves brown sugar and lime ...,How to Make a Mojito Diablo:,Use a muddler a pestle or the back of a spoon...,How to Make a Mojito Diablo: \nMuddle the mint...
1,\nBuy resurrecting wings from the shop.\nUse t...,How to Resurrect in Temple Run:,You'll need coins collected from your runs.\...,How to Resurrect in Temple Run: \nBuy resurrec...
2,\nRinse your hands in vinegar.\nMake a paste o...,How to Get a Bad Smell off Your Hands:,Vinegar is good for removing smells such as f...,How to Get a Bad Smell off Your Hands: \nRinse...


**Dropping remaining columns**

In [None]:
# Keep only the model input ('text') and the target ('summary'). Selecting by name
# rather than by position stays correct if the column order ever changes.
df = df[['text', 'summary']]
df.head(2)

Unnamed: 0,text,summary
0,Use a muddler a pestle or the back of a spoon...,How to Make a Mojito Diablo: \nMuddle the mint...
1,You'll need coins collected from your runs.\...,How to Resurrect in Temple Run: \nBuy resurrec...


**Making train, test and validation splits**

In [None]:
from sklearn.model_selection import train_test_split
# Two-stage split with a fixed seed: 20% of the rows are held out for test, then
# 25% of the remaining 80% becomes validation, i.e. 60/20/20 overall.
train_old, test = train_test_split(df, test_size = 0.2, random_state = 1)
train, val = train_test_split(train_old, test_size = 0.25, random_state = 1)

In [None]:
print (train.shape, test.shape, val.shape)

(1800, 2) (600, 2) (600, 2)


In [None]:
train.head(1)

Unnamed: 0,text,summary
2239,Traits are the parts of you that dont usually...,How to Define Your Personality: \nMake a list ...


**Converting dataset to arrow format for faster training**

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, rebinding the same three names.
train, test, val = [Dataset.from_pandas(frame) for frame in (train, test, val)]

In [None]:
# Bundle the three splits under the keys used by the rest of the notebook.
dataset = DatasetDict({"train": train, "test": test, "val": val})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 600
    })
    val: Dataset({
        features: ['text', 'summary', '__index_level_0__'],
        num_rows: 600
    })
})

**Removing newly made index column**

In [None]:
# The conversion from pandas carried the DataFrame index over as the
# '__index_level_0__' column; it is not a feature, so drop it from every split.
dataset = dataset.remove_columns(["__index_level_0__"])

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['text', 'summary'],
        num_rows: 600
    })
    val: Dataset({
        features: ['text', 'summary'],
        num_rows: 600
    })
})

**Data preprocessing for summarization**

In [None]:
def show_samples(dataset, num_samples=1, seed=1):
    """Print the summary and text of a few shuffled training examples.

    Args:
        dataset: Mapping with a "train" split that supports ``shuffle(seed=...)``
            and ``select(indices)``.
        num_samples: How many examples to print.
        seed: Seed passed to ``shuffle``.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    chosen = shuffled_train.select(range(num_samples))
    for row in chosen:
        print(f"\n'>> Summary: {row['summary']}'")
        print(f"'>> Text: {row['text']}'")

show_samples(dataset)


'>> Summary: How to Make Hair Treatments: 
Raw egg yolks nourish and moisturize dry hair.
Coat damp hair before shampooing with  ml or  cup of egg yolk and leave it on for  minutes.


Rinse off with cool water and shampoo as usual.'
'>> Text: 
 This can be used once a month.

'


In [None]:
# Keep only examples whose summary has more than two whitespace-separated words.
# NOTE(review): the split sizes reported after this cell are unchanged
# (1800/600/600), so the filter removes nothing on this data — confirm whether it
# was meant to screen near-empty 'text' fields instead.
dataset = dataset.filter(lambda x: len(x["summary"].split()) > 2)

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/600 [00:00<?, ? examples/s]

Filter:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
dataset.shape

{'train': (1800, 2), 'test': (600, 2), 'val': (600, 2)}

**Initializing tokenizer and model**

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model in later cells.
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**Defining tokenization function**

In [None]:
# Token limits applied (with truncation) to the article text and to the summary.
max_input_length = 1024
max_target_length = 100


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: Batch with "text" (inputs) and "summary" (targets) entries.

    Returns:
        The tokenizer output for the texts, with the summaries' token ids
        stored under the "labels" key.
    """
    encoded_inputs = tokenizer(
        examples["text"], truncation=True, max_length=max_input_length
    )
    encoded_targets = tokenizer(
        examples["summary"], truncation=True, max_length=max_target_length
    )
    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs

**Tokenizing the dataset**

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

**Defining evaluation metric and making a dictionary of various scores**

In [None]:
import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [None]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary])
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [None]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line."""
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


print(three_sentence_summary(dataset["train"][1]["text"]))

 These will have a pleated fiberglass mat or synthetic filter media not paper.
The cleaning methods described here will cause cheap media to break down leaving the filter useless.
When it becomes dirty remove it from the pumpfilter assembly.


In [None]:
def evaluate_baseline(dataset, metric):
    """Score the first-three-sentences baseline against the reference summaries.

    Args:
        dataset: Split exposing "text" and "summary" columns.
        metric: Object with a ``compute(predictions=..., references=...)`` method.

    Returns:
        Whatever ``metric.compute`` returns for the baseline predictions.
    """
    baseline_predictions = list(map(three_sentence_summary, dataset["text"]))
    return metric.compute(
        predictions=baseline_predictions, references=dataset["summary"]
    )

In [None]:
# Score the baseline on the validation split and report each ROUGE variant as a
# percentage rounded to two decimals.
score = evaluate_baseline(dataset["val"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {name: round(score[name] * 100, 2) for name in rouge_names}
rouge_dict

{'rouge1': 23.49, 'rouge2': 4.83, 'rougeL': 14.96, 'rougeLsum': 21.88}

**Initializing the model**

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

**Initializing data collator**

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

**Removing the raw text columns from the tokenized dataset**

In [None]:
# Drop the original string columns ('text', 'summary'); only the columns added by
# preprocess_function (tokenizer outputs and 'labels') remain.
tokenized_datasets = tokenized_datasets.remove_columns(dataset["train"].column_names)

**Extracting features from tokenized dataset**

In [None]:
features = [tokenized_datasets["train"][i] for i in range(2)]
# data_collator(features)

**Defining data post processing function**

In [None]:
def postprocess_text(preds, labels):
    """Strip each decoded string and put one sentence per line.

    Args:
        preds: Decoded model predictions (list of strings).
        labels: Decoded reference summaries (list of strings).

    Returns:
        Tuple ``(preds, labels)`` of the reformatted lists.
    """

    def _one_sentence_per_line(texts):
        # ROUGE expects a newline after each sentence
        return ["\n".join(nltk.sent_tokenize(item.strip())) for item in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

**Converting tokenized dataset to torch format**

In [None]:
tokenized_datasets.set_format("torch")

In [None]:
# Load a fresh copy of the pretrained checkpoint to be fine-tuned.
# NOTE(review): `data_collator` was constructed earlier with the previous `model`
# instance — confirm this second load is intentional.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

**Preparing train and evaluation data loaders**

In [None]:
from torch.utils.data import DataLoader

batch_size = 2

# Training batches are reshuffled each epoch; validation batches keep dataset order.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["val"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

**Initializing the AdamW optimizer**

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

**Initializing accelerator**

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

**Defining the number of epochs and the learning-rate scheduler**

In [None]:
from transformers import get_scheduler

num_train_epochs = 8
# The training loop performs one optimizer step per batch, so the number of
# updates per epoch equals the number of batches in the prepared dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, spanning every training step.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# NOTE(review): this repeats the postprocess_text definition from an earlier cell
# verbatim; being the later one, it is the definition in effect for the training loop.
def postprocess_text(preds, labels):
    """Strip each decoded string and put one sentence per line for ROUGE."""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

**Defining the name and output directory for trained model**

In [None]:
model_name = "text_summarization_accelerate_own"
output_dir = "/content/drive/MyDrive/text summarization model/"

**Model training**

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training: one optimizer step and one scheduler step per batch.
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation: generate summaries for the validation set and accumulate ROUGE.
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            # Pad predictions and labels along the sequence dimension so they can
            # be gathered across processes.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute the ROUGE scores accumulated over the whole validation pass and
    # report them as percentages rounded to four decimals.
    result = rouge_score.compute()
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save the model and tokenizer to output_dir.
    # NOTE(review): every epoch writes to the same directory, so only the final
    # epoch's weights remain on disk, whether or not an earlier epoch scored higher.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

  0%|          | 0/7200 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: {'rouge1': 40.632, 'rouge2': 15.8284, 'rougeL': 30.6617, 'rougeLsum': 39.5475}
Epoch 1: {'rouge1': 40.9352, 'rouge2': 15.801, 'rougeL': 30.616, 'rougeLsum': 39.7476}
Epoch 2: {'rouge1': 40.6826, 'rouge2': 15.6723, 'rougeL': 30.7377, 'rougeLsum': 39.5057}
Epoch 3: {'rouge1': 41.0823, 'rouge2': 15.7934, 'rougeL': 30.8835, 'rougeLsum': 39.8965}
Epoch 4: {'rouge1': 40.5059, 'rouge2': 15.067, 'rougeL': 29.9361, 'rougeLsum': 39.1592}
Epoch 5: {'rouge1': 40.3915, 'rouge2': 15.0178, 'rougeL': 29.7409, 'rougeLsum': 39.1627}
Epoch 6: {'rouge1': 40.4205, 'rouge2': 14.9754, 'rougeL': 29.6928, 'rougeLsum': 39.201}
Epoch 7: {'rouge1': 40.3347, 'rouge2': 14.7952, 'rougeL': 29.657, 'rougeLsum': 39.0005}


In [None]:
dataset["test"][1]["text"]

" Eating the same things at the same times can get dull for both children and adults. In order to keep your child's interest in eating healthy items it can be a good idea to switch things up from time to time. Consider trying some of the following tips to keep snacks exciting and interestingAvoid offering the same snacks everyday.\nTry combining snacks. For example adding yogurt to cereal can make a new snack option.\n\n Giving your child some control and influence over snacks is a great way to make healthy eating engaging. Whenever you go shopping with your child try letting them pick out some healthy items such as fruits or vegetables that they find appealing. This can get them excited about healthy foods and cause them to look forward to the next snack.While shopping for vegetables or fruits ask your child which ones look appealing.\nWhen shopping for any healthy snacks let your child choose one that they would enjoy.\n\n During snack time your child may or may not be hungry. Forcin

In [None]:
dataset["test"][1]["summary"]

"How to Encourage Your Child to Eat Healthy Snacks: \nSwitch things up.\nInvolve your child.\nWork with your child's appetite."

**Taking input from user and generating summary from the trained model**

In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer from the directory the training loop saved to.
summarizer = pipeline("summarization", model="/content/drive/MyDrive/text summarization model/")

user_input = input("Please enter the text: ")

# truncation=True clips inputs longer than the model's maximum input length
# instead of letting over-long text fail inside the model.
output = summarizer(user_input, truncation=True)
summary_text = output[0]['summary_text']

print(summary_text)

Please enter the text:  Eating the same things at the same times can get dull for both children and adults. In order to keep your child's interest in eating healthy items it can be a good idea to switch things up from time to time. Consider trying some of the following tips to keep snacks exciting and interestingAvoid offering the same snacks everyday. Try combining snacks. For example adding yogurt to cereal can make a new snack option.   Giving your child some control and influence over snacks is a great way to make healthy eating engaging. Whenever you go shopping with your child try letting them pick out some healthy items such as fruits or vegetables that they find appealing. This can get them excited about healthy foods and cause them to look forward to the next snack.While shopping for vegetables or fruits ask your child which ones look appealing. When shopping for any healthy snacks let your child choose one that they would enjoy.   During snack time your child may or may not b