In [26]:
import re 
from tqdm import tqdm
import numpy as np

### Reading data 

In [64]:
news= open('en_US/en_US.news.txt',encoding="utf8").read()
blogs= open('en_US/en_US.blogs.txt',encoding="utf8").read()
twitter= open('en_US/en_US.twitter.txt',encoding="utf8").read()

In [65]:
text_corpus=news+blogs+twitter 
print(len(text_corpus))

576413590


In [29]:
del news , blogs , twitter

### Splitting 1% corpus into sentences 

In [66]:
from nltk import tokenize
text_corpus = tokenize.sent_tokenize(text_corpus[0:int(len(text_corpus)*0.01)])
print(len(text_corpus))

51608


51608 sentences are present in 1% of the  corpus

In [31]:

def extra_space(text):
    new_text= re.sub("\s+"," ",text)
    return new_text
def sp_charac(text):
    new_text=re.sub("[^0-9A-Za-z ]", "" , text)
    return new_text

### Removing extra space and special characters to find the average word count in each sentence

In [67]:
for i in range(len(text_corpus)) : 
    text_corpus[i] = extra_space(text_corpus[i])
    text_corpus[i] = sp_charac(text_corpus[i])

In [68]:
count=[]
for i in tqdm(text_corpus):
    count.append(len(i.split()))
print(np.mean(count))

100%|████████████████████████████████████████████████████████████████████████| 51608/51608 [00:00<00:00, 445174.03it/s]

18.69214075337157





### Dividing the sentences in train and test data and saving it in file

In [69]:
train_sent = text_corpus[0:int(0.8*len(text_corpus))]
test_sent = text_corpus[int(0.8*len(text_corpus)):]

In [71]:
train_sent='.'.join(train_sent)
test_sent='.'.join(test_sent)

In [72]:
import pickle
with open("train_sent.txt", "w") as fp:   #Pickling
    fp.write(train_sent)
with open("test_sent.txt", "w") as fp:   #Pickling
    fp.write(test_sent)

### Modeling using GPT

In [6]:
# Code Reference : https://towardsdatascience.com/fine-tune-a-non-english-gpt-2-model-with-huggingface-9acc2dc7635b
from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling

In [7]:
#Calling GPT-1 tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [8]:
print('vocabulary size: %d, max sequence length: %d' % (tokenizer.vocab_size, tokenizer.model_max_length))

vocabulary size: 40478, max sequence length: 512


In [41]:
# Setting the data collator for language modeling , it will generate batch of train and test data while training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [73]:
# TextDataset function converts tokens to ids for each sentence . Here I have set each sentence length to be 19 as the
# the average word count is 18.69
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train_sent.txt',
    overwrite_cache=True,
    block_size=19)
     
test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test_sent.txt',
    overwrite_cache=True,
    block_size=19)

Creating features from dataset file at 
Saving features into cached file cached_lm_OpenAIGPTTokenizer_19_train_sent.txt [took 0.067 s]
Creating features from dataset file at 
Saving features into cached file cached_lm_OpenAIGPTTokenizer_19_test_sent.txt [took 0.022 s]


In [44]:
#Calling the pretrained GPT-1 Model 
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

loading configuration file https://huggingface.co/openai-gpt/resolve/main/config.json from cache at C:\Users\ADMIN/.cache\huggingface\transformers\bebb46f5735701bc248ef9faa26f12577944fa7fc8e9be1a774b94d4cb8b79b6.ba6f10a5446f364b92311c09e55e49aa27024a4aeefc1ea50fd733b77bcd997d
Model config OpenAIGPTConfig {
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.8.2",
  "vocab_size": 40478
}

l

#### Fine Tuning of GPT-1 Model on custom data 

In [77]:
# Setting the arguments for training
training_args = TrainingArguments(
    output_dir = 'gpt_model', 
    overwrite_output_dir = True, 
    per_device_train_batch_size = 64, 
    per_device_eval_batch_size = 64, 
    learning_rate = 5e-4, 
    num_train_epochs = 3,
)
# Initializing the trainer class object that will do the training
# here the data collator will generate the batch of size 64 of train and test data
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [78]:
# Training the model for 3 epochs
trainer.train()

***** Running training *****
  Num examples = 47958
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2250


Step,Training Loss
500,5.6404
1000,5.1396
1500,4.7816
2000,3.9684


Saving model checkpoint to gpt_model\checkpoint-500
Configuration saved in gpt_model\checkpoint-500\config.json
Model weights saved in gpt_model\checkpoint-500\pytorch_model.bin
Saving model checkpoint to gpt_model\checkpoint-1000
Configuration saved in gpt_model\checkpoint-1000\config.json
Model weights saved in gpt_model\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to gpt_model\checkpoint-1500
Configuration saved in gpt_model\checkpoint-1500\config.json
Model weights saved in gpt_model\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to gpt_model\checkpoint-2000
Configuration saved in gpt_model\checkpoint-2000\config.json
Model weights saved in gpt_model\checkpoint-2000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2250, training_loss=4.778721299913195, metrics={'train_runtime': 43771.0892, 'train_samples_per_second': 3.287, 'train_steps_per_second': 0.051, 'total_flos': 1911361108506624.0, 'train_loss': 4.778721299913195, 'epoch': 3.0})

#### The train loss achieved is 4.778

In [79]:
# Saving the model 
trainer.save_model()

Saving model checkpoint to gpt_model
Configuration saved in gpt_model\config.json
Model weights saved in gpt_model\pytorch_model.bin


In [80]:
# Evaluating on Test data
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 11951
  Batch size = 64


{'eval_loss': 5.522177696228027,
 'eval_runtime': 1369.1668,
 'eval_samples_per_second': 8.729,
 'eval_steps_per_second': 0.137,
 'epoch': 3.0}

#### Test loss is 5.52

### Creating a pipeline object for text generation

In [10]:
generator = pipeline('text-generation', tokenizer='openai-gpt', model='gpt_model')


In [15]:
'''
generating  next word in 3 possible ways
1. Greedy Search : chooses the best possible next word based on highest probability from 1 hypothesis
2. Beam Search : chooses the high probability next word from n hypothesis
3. Random Sampling : chooses random next word from possible hypothesis , however as the temperature is set high , it will
   ignore low probability words.
'''

print(generator('There was a beautiful', max_length=5)[0]['generated_text'])
print(generator('There was a beautiful', max_length=5,num_beams = 5)[0]['generated_text'])
print(generator('There was a beautiful' , max_length=5 , do_sample=True,temperature = 0.7)[0]['generated_text'])


There was a beautiful sunset
There was a beautiful simplicity
There was a beautiful and


### Predicting Next word 

In [3]:
def predict_next():
    from transformers import OpenAIGPTTokenizer,OpenAIGPTLMHeadModel,\
    TextDataset,TrainingArguments,Trainer,pipeline,DataCollatorForLanguageModeling
    import re 
    from nltk.tokenize import word_tokenize
    
    
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    generator = pipeline('text-generation', tokenizer='openai-gpt', model='gpt_model') 
    while(True):
        text = input('Enter the text: ')
        length= len(tokenizer.encode(text, return_tensors='pt')[0])
        
        max_length = length+1
    
        print('Next Word: ')
        print(generator(text , max_length=max_length)[0]['generated_text'].split(' ')[-1])
        print(generator(text , max_length=max_length , num_beams = 5)[0]['generated_text'].split(' ')[-1])
        print(generator(text , max_length=max_length , do_sample=True,temperature = 0.7)[0]['generated_text'].split(' ')[-1])
    
    
    

In [4]:
predict_next()

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter the text: please be aware of the
Next Word: 
situation
situation
situation
Enter the text: Covid-19 is a pandemic which is 
Next Word: 
designed
used
more
Enter the text: Indian culture is 
Next Word: 
based
a
in
Enter the text: u r looking too
Next Word: 
good
good
big
Enter the text: I am a fan of those
Next Word: 
who
buns
buns
Enter the text: How long do you think
Next Word: 
he
this
the
Enter the text: My impression of 
Next Word: 
james
the
myself
Enter the text: He did you a 
Next Word: 
wonderful
lot
lot
Enter the text: can I ask for 
Next Word: 
your
your
a
Enter the text: Is it a valid medical
Next Word: 
request
insurance
insurance


KeyboardInterrupt: Interrupted by user

### Comparison of Results

In [1]:
from prettytable import PrettyTable
results = PrettyTable(["Model Name", "Train Loss", "Test Loss"])
results.add_row(["Markov Model", "NA", "82351 (Perplexity)"])
results.add_row(["LSTM_len2 Model", "6.51", "7.75"])
results.add_row(["LSTM_len4 Model", "5.72", "9.33"])
results.add_row(["LSTM_len7 Model", "5.78", "12.14"])
results.add_row(["LSTM_len2 Attention Model", "6.56", "7.09"])
results.add_row(["LSTM_len4 Attention Model", "3.50", "8.86"])
results.add_row(["LSTM_len7 Attention Model", "2.61", "9.24"])
results.add_row(["ALBert Model", "2.74", "2.88"])
results.add_row(["GPT Model", "4.77", "5.52"])
print(results)

+---------------------------+------------+--------------------+
|         Model Name        | Train Loss |     Test Loss      |
+---------------------------+------------+--------------------+
|        Markov Model       |     NA     | 82351 (Perplexity) |
|      LSTM_len2 Model      |    6.51    |        7.75        |
|      LSTM_len4 Model      |    5.72    |        9.33        |
|      LSTM_len7 Model      |    5.78    |       12.14        |
| LSTM_len2 Attention Model |    6.56    |        7.09        |
| LSTM_len4 Attention Model |    3.50    |        8.86        |
| LSTM_len7 Attention Model |    2.61    |        9.24        |
|        ALBert Model       |    2.74    |        2.88        |
|         GPT Model         |    4.77    |        5.52        |
+---------------------------+------------+--------------------+
