In [1]:
!pip install transformers



In [2]:
!nvidia-smi

Mon Nov 30 19:16:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [3]:
import re
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from tqdm.notebook import tqdm
from transformers import pipeline
import random
import pandas as pd

## Parameters

In [4]:
train_path = 'train.txt'
test_path = 'test.txt'
model = "gpt2"
model_path = './gpt2'
block_size=128
num_train_epochs=3
per_device_train_batch_size=32 
per_device_eval_batch_size=64  
eval_steps = 20 
save_steps=40
warmup_steps=10

## Preprocess data

Initialize the tokenizer and the test data.

In [5]:
#tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer = GPT2Tokenizer.from_pretrained(model)

#### Preparing data collector

In [6]:
data_collector = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Prepare the data for our model

#### Preparing Train and Test dataset

In [7]:
train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, 
                            block_size=block_size)
test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_path,
                           block_size=block_size) 



## Initialize model

In [8]:
model = AutoModelWithLMHead.from_pretrained(model)



# Training

##### Training Arguments and instance

In [9]:
arguments = TrainingArguments(output_dir=model_path, overwrite_output_dir=True, 
                              num_train_epochs=num_train_epochs, 
                              per_device_train_batch_size=per_device_train_batch_size, 
                              per_device_eval_batch_size=per_device_eval_batch_size,  
                              eval_steps=eval_steps, save_steps=save_steps, 
                              warmup_steps=warmup_steps)


trainer = Trainer(model=model, args=arguments, data_collator=data_collector,
                  train_dataset=train_dataset, eval_dataset=test_dataset)

##### Train

In [10]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=1.7298489888509114)

#### Save model

In [11]:
trainer.save_model()

## Deploy model

Preparing to generate sentences

#### Initialize pipeline

In [12]:
bot = pipeline('text-generation', model=model_path, tokenizer='gpt2', 
               config={'max_length':10000})

In [13]:
bot("In my defense I will say that ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In my defense I will say that 《an experienced and dedicated communist group based in Bantu capital of Kinshasa, and that communists tend to be highly communistic and tend to congregate in townships. I will not deny'}]

#### Generate sentences

In [19]:
initial1 = 'What is the meaning of life '
initial2 = "Look at the tree"
initial3 = "The sky looks clear"
initial4 = "Roham is charging sir "

sentences = []
for initial in tqdm([initial1, initial2, initial3, initial4]):
  for _ in tqdm(range(25)):
    sentences.append(bot(initial)[0]['generated_text'])

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene




HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene




HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene




HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene





#### Shuffle sentences

In [20]:
random.shuffle(sentences)

#### Convert into pandas

In [21]:
person = "Nelson Mandela"
df = pd.DataFrame([[i for i in range(len(sentences))], sentences, 
                   [person for _ in range(len(sentences))]]).T
df.columns = ["Number", "Sentences", "Person"]



In [22]:
df

Unnamed: 0,Number,Sentences,Person
0,0,The sky looks clear and I see a great many peo...,Nelson Mandela
1,1,What is the meaning of life 〉the struggle of t...,Nelson Mandela
2,2,"What is the meaning of life ?""`93 Sperber told...",Nelson Mandela
3,3,"The sky looks clear, and you see the Republic ...",Nelson Mandela
4,4,Look at the tree. I want to eat it. The man wh...,Nelson Mandela
...,...,...,...
95,95,The sky looks clear into the distance as I pas...,Nelson Mandela
96,96,Roham is charging sir utham with inciting hatr...,Nelson Mandela
97,97,Roham is charging sir ____________ with the sa...,Nelson Mandela
98,98,"What is the meaning of life ?"" asked Merton. ""...",Nelson Mandela


#### Save as csv

In [18]:
df.to_csv("Nelson.csv", index=False)