In [15]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [5]:
!nvidia-smi

Fri Apr 14 15:23:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [6]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# Pretrained Pegasus Model

In [10]:
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [11]:
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

# Load Dataset

In [8]:
df=pd.read_csv("/kaggle/input/chat-gpt-dataset/dataset.csv")

In [9]:
df

Unnamed: 0.1,Unnamed: 0,x,y
0,0,"Ahead of AyodhyaVerdict, appeal to everyone to...",welcome Hon Supreme Court's AyodhyaVerdict on...
1,1,"WATCH Maulana Asghar Ali Salafi, President, Ma...",is a landmark decision by the Supreme Court o...
2,2,For all those of the western media that were g...,\n\nThis group of tweets is discussing the rea...
3,3,Reporter's DiaryMet two kids who sell diyas on...,".\n\nThese tweets discuss the Ayodhya verdict,..."
4,4,Just bcoz BJP got 303 seats in LS everything l...,: This tweet is suggesting that the BJP's succ...
...,...,...,...
1036,1036,CNN nbcsnl joerogan WHO WoodyHarrelson COVID19...,\n\nThese tweets discuss the COVID-19 pandemic...
1037,1037,Maddow Bitcoin CNN BernieSanders ElonMusk Eliz...,\n\nThese tweets discuss the ongoing COVID-19 ...
1038,1038,:red_circle: FBI Director COVID19 Pandemic 'l...,\n\nThese tweets are discussing the origin of ...
1039,1039,GOP Sen. Tom Cotton RAR slammed liberal media ...,\nSen. Tom Cotton has criticized the liberal m...


# Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.1, random_state=1)

In [15]:
test['x'][148]

'Contact for Learning Risk Management, Options Selling amp How to catch Market trends.:pushpin:Nifty StockMarketNifty50 NiftyBankAdaniGroup AdaniAdaniEnterprisesstockmarketcrash Telegram channel joining link.:red_triangle_pointed_down: Morning Minutes✦Group Moves Reliance, Adani amp Jindal group amp HDFC twins trade in green, while Tata group trade mixed✦Stocks in focus PowerGrid, ZydusLifesciences, TataPower, VodafoneIdea, Seamec, AdaniEntStockMarket nseindia BSEHindenburgRes BSDK Tool kit of georgesoros and INCIndia trying to demolish Indian Democracy by attacking Adani . We know everything who is behind this. Wait n watch for reaction."Coincidentally, Adani Powers sister concern Adani Enterprises is Indias biggest coal trader and owns mines in Indonesia and Australia – and the coal for the Godda plant will most likely be supplied by that vendor." StopAdani MSCIInc  TRANSMISSION Rs 27.23 crores NSE Block Trade for  4,04,617 shares, at Rs 673blockdeals nseindiaWho is buying Adani shar

In [16]:
test['y'][148]

"\n\nThis tweet thread is about the controversies surrounding Adani Enterprises, a coal trader and mine owner in India. It discusses how the Modi government has allegedly given special favors to Adani, such as allowing them to mine from a block with a large amount of coal. It also mentions how the Adani and Vedanta debacles have caused a decline in stock prices and investor caution. Finally, it talks about the LPG price hike to compensate for Adani's losses."

# Evaluating Pegasus pretrained model on my dataset

In [31]:
pipe = pipeline('summarization', model = model_ckpt)

In [32]:
pipe_out = pipe(test['x'][148])
print(pipe_out)

[{'summary_text': 'Reliance, Adani amp Jindal group amp twins trade in green, while Tata group trade mixedStocks in focus PowerGrid, ZydusLifesciences, TataPower, VodafoneIdea, Seamec, AdaniEntStockMarket nse India .'}]


# Calculating Rouge Score

In [10]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

In [11]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text].tolist(), batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary].tolist(), batch_size))
    print(type(article_batches))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        print("Enter")

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        print("3")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [12]:
pip install rouge_score

[0mNote: you may need to restart the kernel to use updated packages.


In [36]:
rouge_metric = load_metric('rouge')
score = calculate_metric_on_test_ds(test,rouge_metric,model_pegasus,tokenizer,4,column_text="x",column_summary='y')

<class 'list'>


  0%|          | 0/27 [00:00<?, ?it/s]

Enter
3


  4%|▎         | 1/27 [00:07<03:02,  7.02s/it]

Enter
3


  7%|▋         | 2/27 [00:14<02:55,  7.00s/it]

Enter
3


 11%|█         | 3/27 [00:20<02:42,  6.76s/it]

Enter
3


 15%|█▍        | 4/27 [00:27<02:38,  6.90s/it]

Enter
3


 19%|█▊        | 5/27 [00:32<02:16,  6.22s/it]

Enter
3


 22%|██▏       | 6/27 [00:37<02:03,  5.86s/it]

Enter
3


 26%|██▌       | 7/27 [00:44<02:05,  6.26s/it]

Enter
3


 30%|██▉       | 8/27 [00:51<02:03,  6.50s/it]

Enter
3


 33%|███▎      | 9/27 [00:58<02:00,  6.68s/it]

Enter
3


 37%|███▋      | 10/27 [01:05<01:55,  6.78s/it]

Enter
3


 41%|████      | 11/27 [01:12<01:49,  6.82s/it]

Enter
3


 44%|████▍     | 12/27 [01:19<01:43,  6.88s/it]

Enter
3


 48%|████▊     | 13/27 [01:25<01:31,  6.56s/it]

Enter
3


 52%|█████▏    | 14/27 [01:32<01:26,  6.68s/it]

Enter
3


 56%|█████▌    | 15/27 [01:38<01:17,  6.48s/it]

Enter
3


 59%|█████▉    | 16/27 [01:45<01:12,  6.62s/it]

Enter
3


 63%|██████▎   | 17/27 [01:52<01:07,  6.76s/it]

Enter
3


 67%|██████▋   | 18/27 [01:59<01:01,  6.88s/it]

Enter
3


 70%|███████   | 19/27 [02:06<00:55,  6.91s/it]

Enter
3


 74%|███████▍  | 20/27 [02:11<00:43,  6.26s/it]

Enter
3


 78%|███████▊  | 21/27 [02:18<00:38,  6.47s/it]

Enter
3


 81%|████████▏ | 22/27 [02:25<00:33,  6.65s/it]

Enter
3


 85%|████████▌ | 23/27 [02:32<00:27,  6.79s/it]

Enter
3


 89%|████████▉ | 24/27 [02:39<00:20,  6.85s/it]

Enter
3


 93%|█████████▎| 25/27 [02:46<00:13,  6.90s/it]

Enter
3


 96%|█████████▋| 26/27 [02:53<00:06,  6.93s/it]

Enter
3


100%|██████████| 27/27 [02:57<00:00,  6.57s/it]


# Calculating the Rogue Score before training the model on custom dataset

In [37]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
pd.DataFrame(rouge_dict, index = ['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018328,7.1e-05,0.017952,0.017915


In [20]:
from datasets import Dataset
train_ds=Dataset.from_pandas(train)
test_ds=Dataset.from_pandas(test)

# Fine Tuning the Model

In [21]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['x'] , max_length = 1024, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['y'], max_length = 150, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    
train_pt = train_ds.map(convert_examples_to_features, batched = True)
test_pt=test_ds.map(convert_examples_to_features, batched = True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [23]:
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-twitter', num_train_epochs=3, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

In [24]:
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=train_pt, 
                  eval_dataset=test_pt)

In [25]:
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=87, training_loss=3.1307084094518904, metrics={'train_runtime': 1383.1968, 'train_samples_per_second': 2.03, 'train_steps_per_second': 0.063, 'total_flos': 3492792484773888.0, 'train_loss': 3.1307084094518904, 'epoch': 2.97})

# Saving the Trained model

In [26]:
trainer.save_model("/kaggle/working/")

In [45]:
model_pegasus.save_pretrained("finetuned-model")

In [46]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

# Testing on the trained model

In [11]:
model_ckpt = "/kaggle/input/pegasus-model-3"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [12]:
pipe = pipeline('summarization', model = model_ckpt)

In [13]:
test['x'][148]

'Contact for Learning Risk Management, Options Selling amp How to catch Market trends.:pushpin:Nifty StockMarketNifty50 NiftyBankAdaniGroup AdaniAdaniEnterprisesstockmarketcrash Telegram channel joining link.:red_triangle_pointed_down: Morning Minutes✦Group Moves Reliance, Adani amp Jindal group amp HDFC twins trade in green, while Tata group trade mixed✦Stocks in focus PowerGrid, ZydusLifesciences, TataPower, VodafoneIdea, Seamec, AdaniEntStockMarket nseindia BSEHindenburgRes BSDK Tool kit of georgesoros and INCIndia trying to demolish Indian Democracy by attacking Adani . We know everything who is behind this. Wait n watch for reaction."Coincidentally, Adani Powers sister concern Adani Enterprises is Indias biggest coal trader and owns mines in Indonesia and Australia – and the coal for the Godda plant will most likely be supplied by that vendor." StopAdani MSCIInc  TRANSMISSION Rs 27.23 crores NSE Block Trade for  4,04,617 shares, at Rs 673blockdeals nseindiaWho is buying Adani shar

In [14]:
pipe_out = pipe(test['x'][148])
print(pipe_out)

[{'summary_text': 'Adani and Vedanta debacles have caused a ripple effect in the Indian financial markets, leading to a decline in stock prices and heightened investor caution.<n>The impact of corporate governance issues cannot be ignored. Adani Vedanta CorporateGovernance.'}]
