In [None]:
# Mount Google Drive so that later cells can read and write files under
# 'drive/MyDrive/' (reference texts, the fine-tuned checkpoint, generated outputs).
# NOTE(review): those later paths are relative, so they presumably rely on the
# working directory being /content - confirm when running outside Colab defaults.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive




*   Import all necessary libraries



In [None]:
# Install / upgrade the packages used by this notebook.
# NOTE(review): no versions are pinned here; the recorded run below resolved
# transformers 4.36.2 and datasets 2.16.0, so a fresh run may pick up newer
# releases and behave differently.
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2
Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting mu

In [None]:
import os
import time
import datetime
from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import AutoModelForCausalLM, GenerationConfig, AutoTokenizer

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# so that runs are reproducible.
# SEED_VAL is also passed as `random_state` when the validation rows are sampled
# from the training split further down. Only the corpus's own 'train' subset is
# loaded and then split into train/val; the whole dataset is not re-split at
# random, so that results stay comparable with others using the corpus's splits.
SEED_VAL = 42

random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

from transformers import GPT2LMHeadModel, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration)
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.optimization import Adafactor, AdafactorSchedule

import nltk
# Download NLTK's 'punkt' tokenizer data (used to divide a text into a list of sentences).
# NOTE(review): no cell visible in this notebook calls nltk afterwards - confirm it is still needed.
nltk.download('punkt')

# Maximum sequence length in tokens. It is the default `max_length` of NewsDataset,
# the default `max_seq_len` of train_model, and the total token budget
# (prompt + new tokens) in the generation cells.
# Presumably chosen to match GPT-2's context window - TODO confirm.
MAX_LENGTH = 1024

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Prepare data


*   Download the training split of the 20 Newsgroups corpus using the scikit-learn library in Python.
*   Access the text data and store it in the DataFrame `df` (one message per row, column `news`).

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Fetch only the corpus's official training subset; every raw message
# (headers + body) becomes one row of the single 'news' column.
newsgroups_data_train = fetch_20newsgroups(subset='train')
raw_messages = newsgroups_data_train.data

df = pd.DataFrame({'news': raw_messages})
df.head()

Unnamed: 0,news
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...




* Preprocess the DataFrame `df` to extract the subject and the message body from each message.
* Format each message as a prompt with a fixed structure: the prefix `summarize: `, the subject, a `;` separator, and the message body (column `subject_and_body_prompt`).
* For the held-out examples we also build a subject-only prompt (column `subject_prompt`) that does not include the message body, as the body is what we want the model to learn to generate.
* However, we save the body separately (column `body_output`) so that we can compute evaluation metrics against the reference later.





In [None]:
import re
# tokenizer = T5Tokenizer.from_pretrained('t5-small')

# A header-like line: letters, '-', '_' or '.' followed by ':' at the very start
# of the line (e.g. "From:", "Lines:", "Nntp-Posting-Host:").
HEADER_LINE = re.compile(r"^[A-Za-z_.-]+:")
SUBJECT_TAG = 'Subject:'


def split_subject_and_body(message):
  """Split one raw newsgroup message into (subject, body).

  Args:
    message: the full message text, headers included, lines separated by '\n'.

  Returns:
    A (subject, body) tuple of strings. `subject` is the text of the first line
    that starts with 'Subject:' ('' if there is none). `body` is every remaining
    line that is longer than one character and does not look like a header,
    joined with single spaces.
  """
  subject = None
  kept_lines = []

  for line in message.split('\n'):
    if line.startswith(SUBJECT_TAG):
      # Only a line that *starts* with the tag is the header. The previous
      # substring test also fired on body lines that merely mention
      # "Subject:" and silently overwrote the real subject with garbage.
      # Keep the first occurrence: later ones come from quoted/forwarded text.
      if subject is None:
        subject = line[len(SUBJECT_TAG):].strip()
    elif HEADER_LINE.match(line):
      # ignoring other headers
      # NOTE(review): this also drops body lines of the form "Word: ...";
      # kept as-is so the extracted bodies do not change.
      continue
    elif len(line) > 1:
      # No length limit is applied here; sequences are cut to the model's
      # maximum length later, at tokenization time.
      kept_lines.append(line)

  return subject or '', ' '.join(kept_lines)


# Build the three text columns straight from the raw messages rather than from
# df['news']: this cell replaces `df`, so reading from `df` made it fail with a
# KeyError on 'news' the second time it was run.
records = []
for message in newsgroups_data_train.data:
  subject, body = split_subject_and_body(message)
  records.append({
      # full training text: "summarize: <subject>;<body>"
      'subject_and_body_prompt': 'summarize: ' + subject + ';' + body,
      # prompt only; the model is expected to generate the body as continuation
      'subject_prompt': 'summarize: ' + subject,
      # reference continuation, kept for evaluation
      'body_output': body,
  })

df = pd.DataFrame(
    records,
    columns=['subject_and_body_prompt', 'subject_prompt', 'body_output'])
df.head()
#len(df)

Unnamed: 0,subject_and_body_prompt,subject_prompt,body_output
0,summarize: WHAT car is this!?; I was wondering...,summarize: WHAT car is this!?,I was wondering if anyone out there could enl...
1,summarize: SI Clock Poll - Final Call;A fair n...,summarize: SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...
2,"summarize: PB questions...;well folks, my mac ...",summarize: PB questions...,"well folks, my mac plus finally gave up the gh..."
3,summarize: Re: Weitek P9000 ?;Robert J.C. Kyan...,summarize: Re: Weitek P9000 ?,Robert J.C. Kyanko (rob@rjck.UUCP) wrote: > ab...
4,summarize: Re: Shuttle Launch Question;From ar...,summarize: Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to..."


In [None]:
# Hold out 100 randomly chosen rows (reproducible through SEED_VAL) for
# validation; every other row stays in the training set.
val = df.sample(n=100, random_state=SEED_VAL)
train = df.drop(index=val.index)

# Renumber both frames from 0. The previous row labels are kept in a new
# 'index' column.
val, train = val.reset_index(), train.reset_index()

In [None]:
val.head()
#len(val)

# Store the validation bodies, one per line, as the reference texts for
# evaluating the generated outputs later on.
with open('drive/MyDrive/reference_output.txt', 'w') as outfile:
  outfile.writelines(reference + '\n' for reference in val['body_output'])

In [None]:
# Peek at the first training rows; the extra 'index' column holds each row's
# label from before reset_index() was applied.
train.head()
#len(train)

Unnamed: 0,index,subject_and_body_prompt,subject_prompt,body_output
0,0,summarize: WHAT car is this!?; I was wondering...,summarize: WHAT car is this!?,I was wondering if anyone out there could enl...
1,1,summarize: SI Clock Poll - Final Call;A fair n...,summarize: SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...
2,2,"summarize: PB questions...;well folks, my mac ...",summarize: PB questions...,"well folks, my mac plus finally gave up the gh..."
3,3,summarize: Re: Weitek P9000 ?;Robert J.C. Kyan...,summarize: Re: Weitek P9000 ?,Robert J.C. Kyanko (rob@rjck.UUCP) wrote: > ab...
4,4,summarize: Re: Shuttle Launch Question;From ar...,summarize: Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to..."




*   Setting GPT2 Tokenizer





*   Defining a custom PyTorch dataset, `NewsDataset`, which will be used to feed the model.



In [None]:
# https://github.com/francoisstamant/lyrics-generation-with-GPT2/blob/main/GPT2_final.ipynb
class NewsDataset(Dataset):
  """Pre-tokenized (subject prompt, body) pairs for GPT-2.

  Every row of the input frame is tokenized once in the constructor. Each item
  is a pair of 1-D tensors of token ids, both padded/truncated to the same
  fixed length, so items can be stacked by a DataLoader.

  Adapted from
  https://github.com/francoisstamant/lyrics-generation-with-GPT2/blob/main/GPT2_final.ipynb
  """

  # Upper bound applied to every encoding (the original code hard-coded this
  # as a `[:1024]` slice); GPT-2 'gpt2' has 1024 positions.
  MAX_POSITIONS = 1024

  def __init__(self, dataframe, max_length=MAX_LENGTH, split='train'):
    """Tokenize all rows of `dataframe`.

    Args:
      dataframe: frame with string columns 'subject_prompt' and 'body_output'.
      max_length: requested sequence length in tokens (capped at MAX_POSITIONS).
      split: accepted for API compatibility; it does not affect the encoding.
    """
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # GPT-2 ships without a padding token, and the tokenizer raises a
    # ValueError when asked to pad without one. Reuse EOS as PAD.
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token = self.tokenizer.eos_token

    seq_len = min(max_length, self.MAX_POSITIONS)
    self.subjects = [self._encode(text, seq_len) for text in dataframe['subject_prompt']]
    self.bodies = [self._encode(text, seq_len) for text in dataframe['body_output']]
    self.news_count = len(self.subjects)

  def _encode(self, text, seq_len):
    """Return the token ids of `text` as a 1-D tensor of exactly `seq_len` ids."""
    encoded = self.tokenizer(
        text,
        return_tensors='pt',
        max_length=seq_len,
        padding='max_length',
        # Without truncation the tokenizer only warns and returns the full,
        # over-long sequence, which gives items of different lengths.
        truncation=True,
    )
    return encoded['input_ids'][0]

  def __len__(self):
    return self.news_count

  def __getitem__(self, idx):
    return self.subjects[idx], self.bodies[idx]




Creating the custom dataset




In [None]:
# Tokenize every training row up front: NewsDataset encodes the
# 'subject_prompt' and 'body_output' columns in its constructor.
train_dataset = NewsDataset(train)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Sanity check: number of encoded training examples.
len(train_dataset)

11214

In [None]:
# Same encoding for the 100 held-out rows. The `split` argument is passed for
# labelling only; it has no effect on how the rows are encoded.
val_dataset = NewsDataset(val, split='val')

In [None]:
# Sanity check: should equal the number of rows sampled into `val`.
len(val_dataset)

100

# Training

In [None]:
# from transformers import AutoModelForPreTraining
# Load the pretrained 'gpt2' checkpoint: the tokenizer and the model with its
# language-modelling head. Both are passed to train_model below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# model = AutoModelForPreTraining.from_pretrained('gpt2')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm

def _build_causal_lm_batch(subjects, bodies, pad_id, eos_id, max_seq_len):
    """Merge padded (prompt, continuation) pairs into one causal-LM batch.

    For each pair the padding is removed and the sequence
    `prompt + continuation + EOS` is built (cut to `max_seq_len`). The labels
    are a copy of that sequence with the prompt positions set to -100, so the
    loss is computed on the continuation only.

    Args:
        subjects: (batch, length) tensor of padded prompt token ids.
        bodies: (batch, length) tensor of padded continuation token ids.
        pad_id: token id that was used for padding.
        eos_id: token id appended after the continuation.
        max_seq_len: maximum length of a merged sequence.

    Returns:
        (input_ids, attention_mask, labels), each of shape (batch, width) where
        `width` is the longest merged sequence in the batch. Padding positions
        have attention 0 and label -100.
    """
    sequences = []
    for subject_ids, body_ids in zip(subjects, bodies):
        subject_ids = subject_ids[subject_ids != pad_id]
        body_ids = body_ids[body_ids != pad_id]
        merged = torch.cat([subject_ids, body_ids, body_ids.new_tensor([eos_id])])
        merged = merged[:max_seq_len]
        sequences.append((merged, min(len(subject_ids), len(merged))))

    width = max(len(ids) for ids, _ in sequences)
    input_ids = torch.full((len(sequences), width), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((len(sequences), width), dtype=torch.long)
    labels = torch.full((len(sequences), width), -100, dtype=torch.long)
    for row, (ids, prompt_len) in enumerate(sequences):
        input_ids[row, :len(ids)] = ids
        attention_mask[row, :len(ids)] = 1
        labels[row, prompt_len:len(ids)] = ids[prompt_len:]
    return input_ids, attention_mask, labels


def train_model(
    dataset, model, tokenizer,
    batch_size=1, epochs=20, lr=2e-5,
    max_seq_len=MAX_LENGTH, warmup_steps=200,
    gpt2_type="gpt2", output_dir="drive/MyDrive/", output_prefix="gpt2_fine-tuning",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune a causal language model to continue a prompt with its body.

    Changes compared with the previous version:
      * The model is fed `prompt + body` and the loss is taken on the body
        tokens only. Previously the padded prompt was the input and the padded
        body the labels; the model shifts labels internally, so position t of
        the prompt was asked to predict token t+1 of an unrelated sequence.
      * The scheduler gets the real number of optimizer steps.
        `num_training_steps=-1` made the learning rate 0 once warmup ended.
      * Epoch statistics are accumulated with `+=` (was `=+`, which kept only
        the last batch).
      * Perplexity is exp(mean NLL) of the training loss. The extra forward
        pass per batch (in train mode, with padding counted) is gone.
      * Falls back to CPU when CUDA is unavailable.

    Args:
        dataset: yields (subject_ids, body_ids) pairs of equally long, padded
            1-D token-id tensors.
        model: causal LM accepting `input_ids`, `attention_mask`, `labels`.
        tokenizer: provides `eos_token_id` and, optionally, `pad_token_id`
            (EOS is assumed to be the padding id when it is None).
        batch_size: examples per optimizer step.
        epochs: number of passes over `dataset`.
        lr: peak learning rate.
        max_seq_len: maximum length of a merged prompt+body sequence.
        warmup_steps: linear warmup steps of the scheduler.
        gpt2_type: unused; kept for backward compatibility.
        output_dir, output_prefix: where per-epoch checkpoints are written
            when `save_model_on_epoch` is True.
        test_mode: unused; kept for backward compatibility.
        save_model_on_epoch: save the state dict after every epoch.

    Returns:
        (model, loss_values, ppl_values): the trained model, and per epoch the
        mean negative log-likelihood per body token and its exponential, both
        as Python floats.
    """
    import math  # local import: not provided by the notebook's import cell

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id

    train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # One optimizer step per batch; the scheduler decays linearly to 0 over
    # exactly this many steps.
    total_steps = max(1, epochs * len(train_dataloader))

    optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss_values = []
    ppl_values = []
    for epoch in range(epochs):
        model.train()
        nll_sum = 0.0      # summed token-level negative log-likelihood
        target_count = 0   # number of body tokens that contributed to it

        for subject, body in tqdm(train_dataloader, desc=f"Training epoch {epoch}"):
            input_ids, attention_mask, labels = _build_causal_lm_batch(
                subject, body, pad_id, eos_id, max_seq_len
            )
            # The model predicts token t+1 from position t, so the label in
            # column 0 is never used.
            n_targets = int((labels[:, 1:] != -100).sum())
            if n_targets == 0:
                # Nothing to learn from (the loss would be NaN).
                continue

            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labels.to(device),
            )
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # outputs.loss is the mean over the batch's target tokens.
            nll_sum += loss.item() * n_targets
            target_count += n_targets

        mean_nll = nll_sum / max(1, target_count)
        loss_values.append(mean_nll)
        ppl_values.append(math.exp(mean_nll))
        print('Average loss: ', loss_values[-1])
        print('Average perplexity: ', ppl_values[-1])

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    #plt.plot(loss_values)
    #plt.plot(ppl_values)
    return model, loss_values, ppl_values

In [None]:
#!pip install light-the-torch
#!ltt install torch torchvision

In [None]:
# Fine-tune for 4 epochs; all other settings use train_model's defaults
# (batch_size=1, lr=2e-5, warmup_steps=200).
model, loss_values, ppl_values = train_model(train_dataset, model, tokenizer, epochs=4)

Training epoch 0
Loss:  0


11214it [1:07:35,  2.76it/s]


Training epoch 1
Loss:  tensor(3.3771, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.000301147306669156
Average perplexity:  tensor(0.0018, device='cuda:0')


11214it [1:07:30,  2.77it/s]


Training epoch 2
Loss:  tensor(2.3564, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.0002101278356079624
Average perplexity:  tensor(0.0005, device='cuda:0')


11214it [1:07:28,  2.77it/s]


Training epoch 3
Loss:  tensor(1.6826, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.00015004178260298405
Average perplexity:  tensor(0.0003, device='cuda:0')


4400it [26:23,  2.72it/s]

In [None]:
# Persist only the fine-tuned weights (the state dict) to Drive; the Generation
# section rebuilds the 'gpt2' architecture and loads these weights into it.
torch.save(model.state_dict(), 'drive/MyDrive/fine-tuned-gpt2.pt')

In [None]:
# Per-epoch loss values returned by train_model.
loss_values

In [None]:
# Per-epoch perplexity values returned by train_model.
ppl_values

# Generation

In [None]:
# Rebuild the pretrained 'gpt2' architecture and overwrite its weights with the
# fine-tuned ones saved at the end of training.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location='cpu': the checkpoint was saved from a model living on the GPU,
# so without it torch.load fails on a runtime that has no CUDA device.
# load_state_dict then copies the values into the (CPU) model.
model.load_state_dict(
    torch.load('drive/MyDrive/fine-tuned-gpt2.pt', map_location='cpu'))
# Inference mode: disables dropout.
model.eval()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer.pad_token = tokenizer.eos_token

# add the EOS token as PAD token to avoid warnings
# model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

## Greedy search

In [None]:
from tqdm import tqdm

# Greedy decoding: one continuation per validation prompt. Each entry of
# `greedy_outputs` is the decoded prompt followed by the generated text.
greedy_outputs = []
for index, row in tqdm(val.iterrows(), total=len(val), desc='greedy'):
  # encode context the generation is conditioned on, on the model's device
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt').to(model.device)

  greedy_output = model.generate(
      **model_inputs,
      # total budget (prompt + new tokens) is MAX_LENGTH
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
      # GPT-2 has no pad token; passing EOS explicitly avoids a warning per call
      pad_token_id=tokenizer.eos_token_id)

  # Keep the raw text: the default clean-up rewrites e.g. " ?" to "?", after
  # which the decoded text no longer starts with the prompt verbatim and
  # slicing it off by len(prompt) would cut into the generated text.
  text = tokenizer.decode(
      greedy_output[0],
      skip_special_tokens=True,
      clean_up_tokenization_spaces=False)
  greedy_outputs.append(text)

In [None]:
# Show the first four prompts (rows 0-3) next to their greedy generations.
for index, prompt in val['subject_prompt'].items():
  print(prompt)
  print(greedy_outputs[index])
  if index == 3:
    break

In [None]:
# save output to file
with open('drive/MyDrive/gpt2-greedy_output-2.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(greedy_outputs[index].replace("\n", " ") + '\n')

## Beam search

In [None]:
from tqdm import tqdm

# Beam search (5 beams, no repeated bigrams): one continuation per validation
# prompt. Each entry of `beam_outputs` is the decoded prompt followed by the
# generated text.
beam_outputs = []

for index, row in tqdm(val.iterrows(), total=len(val), desc='beam'):
  # encode context the generation is conditioned on, on the model's device
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt').to(model.device)

  beam_output = model.generate(
    **model_inputs,
    # total budget (prompt + new tokens) is MAX_LENGTH
    max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
    # GPT-2 has no pad token; passing EOS explicitly avoids a warning per call
    pad_token_id=tokenizer.eos_token_id
  )

  # Keep the raw text: the default clean-up rewrites e.g. " ?" to "?", after
  # which slicing the prompt off by len(prompt) cuts into the generated text.
  text = tokenizer.decode(
    beam_output[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False)
  beam_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/gpt2-beam_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(beam_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')

## Top-k sampling

In [None]:
from tqdm import tqdm

# Top-k sampling (k=50): one continuation per validation prompt. Each entry of
# `topk_outputs` is the decoded prompt followed by the generated text.
# Re-seed so the samples do not depend on how much randomness earlier cells used.
torch.manual_seed(SEED_VAL)

topk_outputs = []
for index, row in tqdm(val.iterrows(), total=len(val), desc='top-k'):
  # encode context the generation is conditioned on, on the model's device
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt').to(model.device)

  topk_output = model.generate(
      **model_inputs,
      # total budget (prompt + new tokens) is MAX_LENGTH
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
      do_sample=True,
      top_k=50,
      # GPT-2 has no pad token; passing EOS explicitly avoids a warning per call
      pad_token_id=tokenizer.eos_token_id
  )

  # Keep the raw text: the default clean-up rewrites e.g. " ?" to "?", after
  # which slicing the prompt off by len(prompt) cuts into the generated text.
  text = tokenizer.decode(
      topk_output[0],
      skip_special_tokens=True,
      clean_up_tokenization_spaces=False)
  topk_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/gpt2-topk_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(topk_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')

## Top-p sampling

In [None]:
from tqdm import tqdm

# Top-p (nucleus) sampling with p=0.92 and top-k filtering switched off
# (top_k=0): one continuation per validation prompt. Each entry of
# `topp_outputs` is the decoded prompt followed by the generated text.
# Re-seed so the samples do not depend on how much randomness earlier cells used.
torch.manual_seed(SEED_VAL)

topp_outputs = []
for index, row in tqdm(val.iterrows(), total=len(val), desc='top-p'):
  # encode context the generation is conditioned on, on the model's device
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt').to(model.device)

  topp_output = model.generate(
      **model_inputs,
      # total budget (prompt + new tokens) is MAX_LENGTH
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
      do_sample=True,
      top_p=0.92,
      top_k=0,
      # GPT-2 has no pad token; passing EOS explicitly avoids a warning per call
      pad_token_id=tokenizer.eos_token_id
  )

  # Keep the raw text: the default clean-up rewrites e.g. " ?" to "?", after
  # which slicing the prompt off by len(prompt) cuts into the generated text.
  text = tokenizer.decode(
      topp_output[0],
      skip_special_tokens=True,
      clean_up_tokenization_spaces=False)
  topp_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/gpt2-topp_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(topp_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')