In [0]:
import torch
from torch.nn import functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, BPTTIterator, TabularDataset
from torch.utils.data import Dataset

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 20.6MB/s eta 0:00:01[K     |█▎                              | 20kB 17.1MB/s eta 0:00:01[K     |██                              | 30kB 14.3MB/s eta 0:00:01[K     |██▋                             | 40kB 13.2MB/s eta 0:00:01[K     |███▎                            | 51kB 11.4MB/s eta 0:00:01[K     |████                            | 61kB 11.6MB/s eta 0:00:01[K     |████▋                           | 71kB 11.7MB/s eta 0:00:01[K     |█████▎                          | 81kB 11.8MB/s eta 0:00:01[K     |██████                          | 92kB 11.5MB/s eta 0:00:01[K     |██████▋                         | 102kB 11.4MB/s eta 0:00:01[K     |███████▏                        | 112kB 11.4MB/s eta 0:00:01[K     |███████▉                        | 

In [0]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
# Load the BERT tokenizer.
print('Loading GPT tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', do_lower_case=True)

Loading GPT tokenizer...


HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




In [0]:
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model.to(device)
print('Model loaded')

HBox(children=(IntProgress(value=0, description='Downloading', max=577, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=3247202234, style=ProgressStyle(description…


Model loaded


In [0]:
text = tokenizer.encode('Today Patriots will play against' )
input = torch.tensor([text]).to(device)
past = None 

for _ in range(100):
  logits, past = model(input,past=past)
  input = torch.multinomial(F.softmax(logits[:,-1]),1)
  text.append(input.item())

  import sys


In [0]:
tokenizer.decode(text)

'today patriots will play against us," his honorary group said at the rally, said former operative for Agriculture Department Tom Vilsack. "We all have to accept the responsibility for that."<|endoftext|>(CNN) Health officials in the United States have endorsed the use of vaccines to prevent the spread of the Zika virus, a declaration that comes as the US Centers for Disease Control and Prevention is downplaying the risk of secondary sexual transmission.\n\nIt was the first time scientists had hosted and encouraged discussion of what holds the risk of Zika'

In [0]:
cd drive/My\ Drive/NewsGen

/content/drive/My Drive/NewsGen


In [0]:
ls

[0m[01;34msample_data[0m/


# Dataloaders

In [0]:
import pandas as pd

In [0]:
BATCH_SIZE = 16

In [0]:
#model.load_state_dict(torch.load('model/gpt2_medium_4.pt'))

In [0]:
#
end_of_text_token = "<|endoftext|>"
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
from itertools import cycle, islice, chain
class IterData(IterableDataset):
  """This class helps to get text in news"""
  def __init__(self, filename,tokenizer):
    self.filename = filename
    self.tokenizer = tokenizer

  
  def parse_file(self, file_path):
    with open(file_path, 'r') as file_obj:
      for line in file_obj:
        # Maybe change how you create batches
        tokens = line.split('.') 

        yield from tokens

  def get_stream(self, file_path):
    # give in seq_len
    
    return cycle(self.parse_file(file_path))

  def __iter__(self):
    """ Returns string of text """
    return self.get_stream(self.filename)

In [0]:
dataset = IterData('train.txt',tokenizer)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE,num_workers=0)

In [0]:
def get_seq(input,seq_len):
  input = ' '.join([sent for sent in input])
  t = input.split(' ')
  t = t[:seq_len]
  news = ' '.join([w for w in t])
  return news


# Training

In [0]:
#hparams
import os

EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 700

In [0]:
model.to(device)
model.train()
optimizer = AdamW(model.parameters(),lr=LEARNING_RATE)
#scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps=WARMUP_STEPS, t_total=-1)
# Create a scheduler and learn what scheduler is
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=WARMUP_STEPS,
                                            num_training_steps=6)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0
models_folder = './model/'

In [0]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
#writer = SummaryWriter('runs/fashion_mnist_experiment_1')

In [0]:
# Find out about the batch
tmp_news_tens = None
MAX_SEQ_LEN = 700
epoch_loss = 0
for epoch in range(EPOCHS):
  
  print(f"EPOCH {epoch} started" + '=' * 30)
  if epoch != 0:
    print(f"Epoch of loss: {epoch_loss} for epoch {epoch}")
    epoch_loss = 0

  for i,news in enumerate(dataloader):

    news = get_seq(news,MAX_SEQ_LEN)

    news_tensor = torch.tensor(tokenizer.encode([news])).unsqueeze(0).to(device)
    if news_tensor.size()[1] > MAX_SEQ_LEN:
      continue
          
    #The first joke sequence in the sequence
    if not torch.is_tensor(tmp_news_tens):
        tmp_news_tens = news_tensor
        continue
    else:
        #The next joke does not fit in so we process the sequence and leave the last joke 
        #as the start for next sequence 
        if tmp_news_tens.size()[1] + news_tensor.size()[1] > MAX_SEQ_LEN:
            work_news_tens = tmp_news_tens
            tmp_news_tens = news_tensor
        else:
            #Add the joke to sequence, continue and try to add more
            tmp_news_tens = torch.cat([tmp_news_tens, news_tensor[:,1:]], dim=1)
            continue
    outputs = model(work_news_tens,labels=work_news_tens)
    loss, logits = outputs[:2]
    loss.backward()
    sum_loss += loss.detach().data
    epoch_loss = sum_loss
    proc_seq_count += 1
    if proc_seq_count == 5:
      proc_seq_count = 0
      batch_count += 1
      optimizer.step()
      optimizer.zero_grad()
    if batch_count == 100:
      print(f"sum loss {sum_loss}")
      batch_count = 0
      sum_loss = 0.0

  #out = torch.topk(outputs[0], 1)
  #print(out)
  if epoch % 3 == 0:
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_large_{epoch}.pt"))




In [0]:
text = tokenizer.encode('Tonights game against Manchester City.')
input = torch.tensor([text]).to(device)
past = None 

for _ in range(120):
  logits, past = model(input,past=past)
  input = torch.multinomial(F.softmax(logits[:,-1]),1)
  text.append(input.item())

  import sys


In [0]:
out = tokenizer.decode(text)
out = out.replace('\n\n', '').replace(end_of_text_token, '.').replace('\ ', '')

In [0]:
out

'tonights game against manchester city.10 days, 10pm.www.ultrahighpitalshay.comTODAY BEGINNING TIME: 7.30pmALSO ON OPP Radio.Michael R. Wyatt Underground at 390 Sharp, Seattle\nThe world is something you learn deep within your bones -- beats you down for feel, keenness and the ability to articulate deep feelings as an expansive/sprawling streetscapes with the angry pulse. The feeling of a world disconnected, something you make in the day and draw out all its insides in one day with the'

# Metrics

In [0]:
from nltk.translate.bleu_score import sentence_bleu
reference = ['kanye', 'west', 'has' 'said' on a public comment ]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)

1.0


In [0]:
# Create a train and eval
# fine tune the model on News Dataset