#### Author : Sanjoy Biswas
#### Project : Text Generation With AI System
#### Email : sanjoy.eee32@gmail.com

In [99]:

!pip install transformers



### Import Necessary Libraries

In [100]:
import numpy as np
import pandas as pd
import seaborn as sns
import urllib
import re
import os
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

### Import Tokenizer

In [101]:
## GPT-2 is a transformer model pretrained on a very large corpus of English text.
## It was pretrained on raw text only, through an automatic process with no human labelling,
## and it generates text from a prompt.
## Load the tokenizer that belongs to the "gpt2-large" checkpoint.

tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2-large"
)


### Import Model

In [102]:
## GPT-2 is a stack of transformer decoders, which is what makes it so effective
## at generating meaningful text.
## The tokenizer's end-of-sequence token id is passed as the padding token id.

model = GPT2LMHeadModel.from_pretrained(
    "gpt2-large",
    pad_token_id=tokenizer.eos_token_id,
)

### Set The Topic

In [103]:
## Example prompt (the topic) that the model will be asked to continue

sentence = "Cristiano Ronaldo match"

## Encode the input, i.e. the topic

In [104]:
## Tokenize the prompt for GPT-2, i.e. convert the text into integer token indices.
## return_tensors='pt' asks for the result as a PyTorch tensor.

input_ids = tokenizer.encode(
    sentence,
    return_tensors='pt',
)

In [105]:

## Display the encoded prompt (a tensor of token ids)
input_ids

tensor([[   34,  1585, 10115, 36309,  2872]])

In [106]:
## max_length: upper bound on the total output length in tokens, prompt included
##             (generation stops once the sequence reaches 30 tokens)
## num_beams: number of candidate sequences (beams) tracked during beam search
## no_repeat_ngram_size: n-grams of this size may occur only once in the output,
##                       so with 2 no pair of consecutive tokens is ever repeated
## early_stopping: let beam search stop once enough finished candidates are found

output = model.generate(
    input_ids,
    num_beams=5,
    max_length=30,
    early_stopping=True,
    no_repeat_ngram_size=2,
)

In [107]:
## Generated sequence as a tensor of token ids (it begins with the prompt's own ids)
output

tensor([[   34,  1585, 10115, 36309,  2872,    12, 14463,  3061,   287,   262,
          6662,  4041,  2457,  1028,  1629, 47286, 14708,    13,   198,   198,
         23672, 41476,  7781,   262,  8464,   287,  3131,    12,  2435,   284]])

In [108]:

## `output` holds token indices; the decode method of the same tokenizer
## converts those indices back into readable text.
print(
    tokenizer.decode(
        output[0],
        skip_special_tokens=True,
    )
)

Cristiano Ronaldo match-winning goal in the Champions League final against Atletico Madrid.

Ronaldo scored the winner in extra-time to


### Save the output in a variable

In [109]:
## Decode the generated ids, then keep only the complete sentences:
## whatever follows the last "." is a trailing (possibly unfinished) fragment, so drop it.
## The decoded string gets its own name so this cell never reads `text` before
## defining it and can be re-run safely on a fresh kernel.
decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
text = ".".join(decoded_text.split(".")[:-1]) + "."

In [110]:
text

'Cristiano Ronaldo match-winning goal in the Champions League final against Atletico Madrid.'

### Install & Import Gradio

In [111]:
!pip install -q gradio

In [112]:
import gradio as gr

In [113]:
def generate_text(sentence):
    """Continue a prompt with the GPT-2 model and return its complete sentences.

    Args:
        sentence: Prompt text (the topic) to continue.

    Returns:
        The decoded beam-search output cut off after its last ".".
        If the output contains no "." it is returned unchanged (instead of
        collapsing to a lone "."). A blank prompt returns "" without calling
        the tokenizer or the model.
    """
    # Nothing to continue: skip tokenization and generation for a blank prompt.
    if not sentence or not sentence.strip():
        return ""

    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    beam_output = model.generate(
        input_ids,
        max_length=30,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    output = tokenizer.decode(
        beam_output[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Keep everything up to and including the last period. When there is no
    # period at all, keep the whole text rather than discarding it.
    complete, period, _ = output.rpartition(".")
    return complete + period if period else output

### Creating Gradio Interface

In [114]:
## Output component for the generated text. The `gr.outputs` namespace only exists in
## older Gradio releases, so fall back to the top-level `gr.Textbox` when it is missing.
output_text = gr.outputs.Textbox() if hasattr(gr, "outputs") else gr.Textbox()
## Wire generate_text to a text box input and the text box output, then start the app.
gr.Interface(generate_text, "textbox", output_text, title="Text Generation",
             description="Generate Text and Comment With AI System").launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)
Running on External URL: https://40973.gradio.app
Interface loading below...


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7865/',
 'https://40973.gradio.app')

### Save Output as CSV File

In [115]:
## Write the generated text to disk. The encoding is set explicitly so the file content
## does not depend on the platform default and non-ASCII characters are written safely.
with open('generatefile.csv', 'w', encoding='utf-8') as f:
    f.write(text)

### Generate Pickle File

In [116]:

import pickle

## Serialize the model object to nlp_model.pkl. The with-block guarantees the file
## handle is flushed and closed even if pickling fails.
## NOTE(review): loading a pickle executes arbitrary code; only unpickle trusted files.
with open('nlp_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [117]:
## Serialize the generated text to nlp_text.pkl. The with-block guarantees the file
## handle is flushed and closed even if pickling fails.
with open('nlp_text.pkl', 'wb') as text_file:
    pickle.dump(text, text_file)