In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |▌                               | 10kB 21.9MB/s eta 0:00:01[K     |█                               | 20kB 5.8MB/s eta 0:00:01[K     |█▌                              | 30kB 7.5MB/s eta 0:00:01[K     |██                              | 40kB 8.3MB/s eta 0:00:01[K     |██▍                             | 51kB 7.0MB/s eta 0:00:01[K     |███                             | 61kB 7.6MB/s eta 0:00:01[K     |███▍                            | 71kB 8.0MB/s eta 0:00:01[K     |███▉                            | 81kB 8.6MB/s eta 0:00:01[K     |████▍                           | 92kB 8.1MB/s eta 0:00:01[K     |████▉                           | 102kB 8.8MB/s eta 0:00:01[K     |█████▍                          | 112kB 8.8MB/s eta 0:00:01[K     |█████▉                          | 122kB 8.8

In [0]:
from transformers import AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch import optim
from tqdm import tqdm
import torch

In [3]:
# Hub identifier of the pretrained summarization checkpoint to fine-tune.
MODEL_PATH = "huseinzol05/t5-base-bahasa-summarization-cased"

# The model and its tokenizer are loaded from the same identifier so the
# vocabulary matches the checkpoint.
model = AutoModelWithLMHead.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=468.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=793027.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891688818.0, style=ProgressStyle(descri…




In [0]:
class BertDataset(Dataset):
    """Paragraph/summary pairs read from a CSV file and tokenized per item.

    The CSV must contain a ``text`` column (source paragraphs) and a
    ``summary_text`` column (reference summaries).

    Args:
        path: CSV location, forwarded to ``pandas.read_csv``.
        tokenizer: Object exposing ``pad_token``, ``eos_token``,
            ``encode_plus`` and ``encode``.
        max_input_length: ``max_length`` used when encoding the paragraph.
        max_summary_length: ``max_length`` used when encoding the summary.
    """

    def __init__(self, path, tokenizer, max_input_length=512, max_summary_length=150):
        self.df = pd.read_csv(path)
        self.summary = self.df["summary_text"]
        self.paragraphs = self.df["text"]
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_summary_length = max_summary_length

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.paragraphs)

    def __getitem__(self, index):
        """Tokenize the row at ``index``.

        Returns:
            dict with the raw strings (``sentence_text``, ``summary_text``)
            and the flattened tensors ``input_ids``, ``attention_mask`` and
            ``lm_labels``.
        """
        paragraphs = str(self.paragraphs[index])
        summary = str(self.summary[index])

        # Use the tokenizer this dataset was constructed with rather than a
        # notebook-level global, so the class works with any tokenizer
        # instance and outside this notebook.
        pad = self.tokenizer.pad_token
        eos = self.tokenizer.eos_token

        encoding_paragraphs = self.tokenizer.encode_plus(
            "summarize: " + paragraphs + eos,
            return_token_type_ids=False,
            return_attention_mask=True,
            max_length=self.max_input_length,
            pad_to_max_length=True,
            return_tensors='pt',
        )

        # NOTE(review): padding positions in the labels keep the pad id
        # rather than an ignore index, so they presumably contribute to the
        # loss - confirm this is intended.
        encoding_summary = self.tokenizer.encode(
            pad + summary + eos,
            add_special_tokens=False,
            return_token_type_ids=False,
            max_length=self.max_summary_length,
            pad_to_max_length=True,
            return_tensors='pt',
        )

        return {
            'sentence_text': paragraphs,
            'summary_text': summary,
            'input_ids': encoding_paragraphs['input_ids'].flatten(),
            'attention_mask': encoding_paragraphs['attention_mask'].flatten(),
            'lm_labels': encoding_summary.flatten(),
        }

In [0]:
# Datasets: one CSV per split, both tokenized with the shared tokenizer.
train_set = BertDataset("drive/My Drive/model_summarization/train_01.csv", tokenizer)
val_set = BertDataset("drive/My Drive/model_summarization/dev_01.csv", tokenizer)

# Loaders: both splits are shuffled; training uses batches of 4 and
# validation batches of 2.
train_loader = DataLoader(dataset=train_set, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=2, shuffle=True)

In [0]:
# Move the model to the GPU before building the optimizer, as the torch.optim
# documentation recommends, so the optimizer is constructed over the
# parameters in their final location.
model = model.to("cuda")
optimizer = optim.AdamW(model.parameters(), lr=3e-5)

In [0]:
# Fine-tune for up to `epochs` epochs with early stopping on the mean
# validation loss. The best checkpoint (lowest validation loss so far) is
# written to Drive each time the validation loss improves.
best_val_loss = float("inf")
early_stop = 0   # consecutive epochs without a validation improvement
patience = 3     # stop once `early_stop` reaches this many epochs
epochs = 100
for _ in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    # Wrap the loader itself (not enumerate) so tqdm knows the total length.
    for idx, data in enumerate(tqdm(train_loader)):
        input_ids = data["input_ids"].to("cuda")
        attention_mask = data["attention_mask"].to("cuda")
        lm_labels = data["lm_labels"].to("cuda")

        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=attention_mask, lm_labels=lm_labels)
        loss = output[0]  # first element of the output is the loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Periodic progress report: current batch loss and running mean.
        if (idx % 1000) == 0:
            print("loss: ", loss.item(), " train_loss: ", train_loss/(idx+1))

    # ---- validation pass (no gradients, no optimizer interaction) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data in tqdm(val_loader):
            input_ids = data["input_ids"].to("cuda")
            attention_mask = data["attention_mask"].to("cuda")
            lm_labels = data["lm_labels"].to("cuda")
            output = model(input_ids=input_ids, attention_mask=attention_mask, lm_labels=lm_labels)
            val_loss += output[0].item()

    mean_val_loss = val_loss/len(val_loader)
    if mean_val_loss < best_val_loss:
        model.save_pretrained("drive/My Drive/model_summarization/")
        best_val_loss = mean_val_loss
        # An improvement restarts the patience window; without this reset the
        # counter would track total (not consecutive) non-improving epochs.
        early_stop = 0
    else:
        early_stop += 1
    print("train_loss: ", train_loss/len(train_loader))
    print("val_loss: ", mean_val_loss)

    if early_stop >= patience:
        break

In [28]:
# Qualitative check: generate a summary for the first item of one validation
# batch and print it next to the source text and the reference summary.
model.eval()  # do not rely on the mode left behind by the training cell
with torch.no_grad():
    data = next(iter(val_loader))
    sentence_text, summary_text = data["sentence_text"], data["summary_text"]
    input_ids = data["input_ids"].to("cuda")
    attention_mask = data["attention_mask"].to("cuda")
    generated = model.generate(input_ids=input_ids,
                               attention_mask=attention_mask,
                               max_length=150,
                               min_length=40,
                               length_penalty=2.0,
                               num_beams=4,
                               early_stopping=True)

# Decode once and reuse the result.
generated_summary = tokenizer.decode(generated[0])
print("full text")
print(sentence_text[0])
print("summary")
print(summary_text[0])
print("Generated summary")
print(generated_summary)

full text
Rimanews - Dinas Pariwisata Provinsi Nusa Tenggara Timur menargetkan pebalap dari 30 negara mengikuti balap sepeda internasional Tour de Flores di Pulau Flores , 14 - 19 Juli 2017 ." Sekarang sekitar 25 negara mendaftar dan kita targetkan bisa 30 negara akan ikut serta dalam Tour de Flores kali ini , " kata Kepala Dinas Pariwisata Provinsi NTT Marius Ardu Jelamu di Kupang , Kamis ( 20 / 4 / 2017 )Dia menjelaskan semakin banyak negara terlibat dalam balap sepeda internasional di sepanjang Pulau Flores itu , maka dampak promosi pariwisata semakin besar ke berbagai belahan dunia .Selain itu , katanya , TdF juga tidak sekadar meraup kedatangan wisatawan , namun juga untuk membangun diplomasi internasional melalui balap sepeda ." Bahwa kita sebagai bagian dari warga dunia ikut serta menjaga perdamaian dunia melalui balap sepeda dan juga menunjukkan kepada dunia luar bahwa Indonesia sangat aman untuk melaksanakan event - event internasional , " ujarnya .Ia mengatakan keterlibatan p