In [1]:
root = '/content/drive/MyDrive/news_summary'

In [2]:
import os
os.chdir(root)
os.getcwd()

'/content/drive/MyDrive/news_summary'

In [3]:
!pip install sentencepiece -q
!pip install transformers -q

[K     |████████████████████████████████| 1.3 MB 33.7 MB/s 
[K     |████████████████████████████████| 4.9 MB 23.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 61.6 MB/s 
[K     |████████████████████████████████| 120 kB 70.7 MB/s 
[?25h

In [4]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
import numpy as np
import pandas as pd
# t5forconditionalgeneration is a langauge model on top of the t5 generation model

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [5]:
!nvidia-smi

Tue Sep 20 01:56:26 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [31]:
class Config:
  TRAIN_BATCH_SIZE = 128
  VAL_BATCH_SIZE = 2
  TRAIN_EPOCHS = 2
  VAL_EPOCHS = 1
  LEARNING_RATE = 0.001
  SEED = 42
  MAX_LEN = 512 # article text max len
  SUMMARY_LEN = 150 # summary text max len

In [8]:
def df_processing(file_path, train_size=0.8, seed=Config.SEED):
  df = pd.read_csv(file_path, encoding='latin1')
  df = df[['text', 'ctext']] # text is the summary, ctext is the detailed news article
  df['ctext'] = 'summarize: ' + df['ctext'] # required for T5model

  train_df = df.sample(frac=train_size, random_state=seed)
  val_df = df.drop(train_df.index).reset_index(drop=True)
  train_df = train_df.reset_index(drop=True)

  return train_df, val_df

In [9]:
a, b = df_processing('/content/drive/MyDrive/news_summary/news_summary.csv')

In [10]:
a.head()

Unnamed: 0,text,ctext
0,"All restaurants, including five-star hotels, i...",summarize: Come April and you won?t have to go...
1,The Chinese government has banned singer Justi...,"summarize: ?I just need one more shot, second ..."
2,Pakistan on Saturday accused India of targetin...,summarize: Pakistan?s top military officer on ...
3,A 40-year-old woman in Hyderabad was burnt ali...,"summarize: In a tragic incident, a woman was b..."
4,The Delhi Police has arrested a man working as...,summarize: A man who allegedly threatened to b...


In [11]:
b.head()

Unnamed: 0,text,ctext
0,Hotels in Maharashtra will train their staff t...,summarize: Hotels in Mumbai and other Indian c...
1,The Congress party has opened a bank called 'S...,"summarize: It sounds like satire, but make no ..."
2,"Tanveer Hussain, a 24-year-old Indian athlete ...",summarize: A 24-year-old Indian athlete has be...
3,"The remains of a German hiker, who disappeared...",summarize: The remains of a German hiker who d...
4,"A UK-based doctor, Manish Shah, has been charg...",summarize: A GP who practised in east London h...


In [12]:
tk = T5Tokenizer.from_pretrained("t5-base")
tk.batch_encode_plus(['hello world aaaa'], max_length=10, pad_to_max_length=True)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [[21820, 296, 3, 9, 9, 9, 9, 1, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}

In [13]:
tk.decode([21820, 296, 3, 9, 9, 9, 9, 1, 0, 0])

'hello world aaaa</s> <pad> <pad>'

In [32]:
class CustomNewsDataset(Dataset):
  def __init__(self, df, tokenizer, article_len, summary_len):
    self.df = df
    self.text = self.df.text
    self.ctext = self.df.ctext
    self.sum_len = summary_len
    self.src_len = article_len
    self.tokenizer = tokenizer # T5TOkenizer

  def __len__(self):
    return len(self.text)

  def __getitem__(self, index):
    ctext = str(self.ctext[index])
    ctext = ' '.join(ctext.split())

    text = str(self.text[index])
    text = ' '.join(text.split())

    src_tokens = self.tokenizer.batch_encode_plus([ctext], max_length=self.src_len, pad_to_max_length=True, return_tensors='pt')
    target_tokens = self.tokenizer.batch_encode_plus([text], max_length=self.sum_len, pad_to_max_length=True, return_tensors='pt')

    src_ids = src_tokens['input_ids'].squeeze().to(dtype=torch.long) # reducing to a 1d vector
    src_mask = src_tokens['attention_mask'].squeeze().to(dtype=torch.long)
    target_ids = target_tokens['input_ids'].squeeze().to(dtype=torch.long)
    target_mask = target_tokens['attention_mask'].squeeze().to(dtype=torch.long)

    return {
        'source_ids': src_ids,
        'source_mask': src_mask,
        'target_ids': target_ids
    }

In [15]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [16]:
train_dataset = CustomNewsDataset(a, tokenizer, Config.MAX_LEN, Config.SUMMARY_LEN)
val_dataset = CustomNewsDataset(b, tokenizer, Config.MAX_LEN, Config.SUMMARY_LEN)

In [17]:
train_loader = DataLoader(
    train_dataset,
    batch_size = Config.TRAIN_BATCH_SIZE,
    shuffle = True,
    num_workers= 0
)

val_loader = DataLoader(
    val_dataset,
    batch_size = Config.VAL_BATCH_SIZE,
    shuffle = True,
    num_workers= 0
)

In [18]:
next(iter(train_loader))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'source_ids': tensor([[21603,    10,    71,  ...,     0,     0,     0],
         [21603,    10,  2106,  ...,     0,     0,     0]]),
 'source_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'target_ids': tensor([[   71,  2131,  3441,    63,   144,    16,  6697,    31,     7, 27864,
           7985,    65,  1380,     3,     9,   388,    12,     3,  5846,    15,
              3,     9,   898,    18,  1201,    18,  1490,  3202,    16,   851,
             13,   160,   384,    38, 19372,   227,   160,  4284,    47, 11970,
             13,     3,  5846,    53,     8,   388,    31,     7,  4806,     6,
           2095,   243,    30,  2875,     5,    37,   337,  1810,   225,    36,
          12171,    28,     8, 11970,    31,     7,  4806,    38,  4831,     6,
              8,  3309,  6098,  7760,     5,  5076, 10195,     8,   819,    13,
              8,  2131,  3441,    63,   144,     5,     1,     0,     0,     0,
              0,     0,     0,     0,     0, 

In [19]:
next(iter(val_loader))

{'source_ids': tensor([[21603,    10,   749,  ...,     0,     0,     0],
         [21603,    10,    37,  ...,     0,     0,     0]]),
 'source_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'target_ids': tensor([[  749,  4377,    53,   581,     8,  4816,   993,    16,  1025,    18,
           8071,   958,  3331,     6,  4968,  1689,  2713,   640,  2315,   708,
             46,    16, 14339,  6585,    45,  1856,     5,    96,   634,  2870,
             21,  4831,    57,  4072,   277,    13,  1547,    56,   916,  6501,
             27, 10255,  9822,  7211,     7,   165,  6384,    12,  1025,  1088,
            958,  3331,  8598,   976,     3,     9,  4072,  7221,    31,     7,
           6028,   243,     5,    37,  6585,    19,   952,    12, 23773,     8,
           4471,    13, 27592,     5,     1,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0, 

<h1>TF MODEL</h1>
<img src="./content/drive/MyDrive/news_summary/tf.png"/>

In [29]:
def train_model(epoch, tokenizer, model, device, data_loader, optimizer):
  model.train(mode=True)
  for _, data in enumerate(data_loader):
    y = data['target_ids'].to(device, dtype=torch.long)
    y_ids = y[:, :-1].contiguous() # skipping the last word for the decoder model, copying the tensors n memory
    labels = y[:, 1:].clone().detach() # actual targets
    labels[y[:,1:] == tokenizer.pad_token_id] = -100 # setting the pad id 0 as -100 so that it skips model training
    ids = data['source_ids'].to(device, dtype=torch.long)
    masks = data['source_mask'].to(device, dtype=torch.long)

    outputs = model(input_ids=ids, attention_mask=masks, decoder_input_ids=y_ids, labels=labels)
    loss = outputs[0]

    if epoch % 100 == 0:
      print(f'Train loss: {loss.item()} at epoch {epoch}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [34]:
def val_model(epoch, tokenize, model, device, data_loader):
  model.train(mode=False)
  predictions, txts = [], []

  with torch.no_grad():
    for _, data in enumerate(data_loader):
      y = data['target_ids'].to(device, dtype=torch.long)
      ids = data['source_ids'].to(device, dtype=torch.long)
      mask = data['source_mask'].to(device, dtype=torch.long)

      # num of beams = 4, prob of 4 words
      pred_ids = model.generate(input_ids=ids, 
                                attention_mask=mask, 
                                max_length=250, 
                                num_beams=4, 
                                repetition_penalty=2.0, 
                                length_penalty=1.0, 
                                early_stopping=True)
      
      preds = [tokenize.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for id in pred_ids]
      target = [tokenize.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for id in y]

      predictions.extend(preds)
      txts.extend(target)

  return predictions, txts


In [22]:
torch.manual_seed(Config.SEED) # pytorch random seed
np.random.seed(Config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

In [23]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [27]:
sum([p.numel() for p in model.parameters() if p.requires_grad])

60506624

In [28]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=Config.LEARNING_RATE)

In [30]:
for epoch in range(Config.TRAIN_EPOCHS):
  train_model(epoch, train_dataset.tokenizer, model, device, train_loader, optimizer)



Train loss: 3.4926369190216064 at epoch 0
Train loss: 2.479726791381836 at epoch 0
Train loss: 3.0086302757263184 at epoch 0
Train loss: 2.7178070545196533 at epoch 0
Train loss: 2.0589842796325684 at epoch 0
Train loss: 1.5570931434631348 at epoch 0
Train loss: 2.665304660797119 at epoch 0
Train loss: 1.8471661806106567 at epoch 0
Train loss: 2.8488986492156982 at epoch 0
Train loss: 2.4013192653656006 at epoch 0
Train loss: 1.6186436414718628 at epoch 0
Train loss: 1.8609271049499512 at epoch 0
Train loss: 2.2892589569091797 at epoch 0
Train loss: 2.8826003074645996 at epoch 0
Train loss: 5.155247211456299 at epoch 0
Train loss: 4.80718994140625 at epoch 0
Train loss: 1.4876031875610352 at epoch 0
Train loss: 2.3890769481658936 at epoch 0
Train loss: 4.227201461791992 at epoch 0
Train loss: 2.0078506469726562 at epoch 0
Train loss: 1.8181449174880981 at epoch 0
Train loss: 2.3403170108795166 at epoch 0
Train loss: 1.8728511333465576 at epoch 0
Train loss: 2.4775326251983643 at epoch 

In [35]:
for epoch in range(Config.VAL_EPOCHS):
  predictions, summary = val_model(epoch, val_dataset.tokenizer, model, device, val_loader)



In [36]:
for i in range(5):
  print('Actual Summary')
  print(f'{summary[i]}')
  print('Predicted Summary')
  print(f'{predictions[i]}')
  print('\n\n')

Actual Summary
A 31-year-old man has been arrested for masturbating mid-air and inappropriately touching a woman passenger on board a Bengaluru-Mumbai flight. The woman co-passenger woke up from her sleep to find the man masturbating and raised an alarm. When the airline crew reached the seat, the man denied the woman's allegation, however, he was fixing the zip of his trousers.
Predicted Summary
The man was arrested on Tuesday morning for masturbating and inappropriately touching a woman passenger on a flight. The man, Sabeen Hamza, started taking advantage of the situation and moved his hand on her. He was taken away after he woken up to the horror of the middle-aged man masturbating while looking at her.



Actual Summary
Actress Shraddha Kapoor, while speaking about reports of her dating Farhan Akhtar, has said that fiction can go to incredible heights. She said, "I choose to ignore them and just focus on my work," and added that she's friends with Farhan. Earlier, reports of Shrad

In [41]:
model_path = os.path.join(os.getcwd(), 't5model.pth')
torch.save(model.state_dict(), model_path)