#### Author : Sanjoy Biswas
#### Email : sanjoy.eee32@gmail.com
### Portfolio : https://imsanjoykb.github.io/

In [1]:

!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 74.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.6 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

### Import Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import urllib
import re
import os
import tensorflow as tf
from transformers import GPT2LMHeadModel, GPT2Tokenizer

### Import Tokenizer

In [3]:
## GPT-2 is a transformer model pretrained on a very large corpus of English text.
## It was pretrained on the raw text only, through an automatic process with no human labelling,
## and it is used here to generate text from a prompt.

checkpoint = "gpt2-large"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

### Import Model

In [4]:
## GPT-2 is a stack of transformer decoders, which is what makes it so effective at generating meaningful text.
## The tokenizer's end-of-sequence token id is passed as the model's padding token id.

model = GPT2LMHeadModel.from_pretrained(
    "gpt2-large",
    pad_token_id=tokenizer.eos_token_id,
)

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

### Set The Topic

In [5]:
## Example prompt (the topic the generated text should continue from)

sentence = "Cristiano Ronaldo match"

## Encode the input i.e. topic

In [6]:
## Tokenize the prompt for GPT-2, i.e. convert the text into integer token indices.
## return_tensors="pt" asks for the indices as a PyTorch tensor.

input_ids = tokenizer.encode(
    sentence,
    return_tensors="pt",
)

In [7]:

input_ids

tensor([[   34,  1585, 10115, 36309,  2872]])

In [8]:
## max_length - upper bound on the length of the generated sequence, counted in tokens and including the prompt
## num_beams - number of candidate sequences (beams) that beam search keeps alive at each step
## no_repeat_ngram_size - any n-gram of this size may appear only once in the output (2 => no pair of tokens is repeated)
## early_stopping - let beam search stop as soon as enough finished candidates exist (see the `generate` documentation)
## Text is generated until the output length (which includes the prompt length) reaches 30 tokens

beam_search_options = dict(
    max_length=30,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
output = model.generate(input_ids, **beam_search_options)

In [9]:
## Tensor Output
output

tensor([[   34,  1585, 10115, 36309,  2872,    12, 14463,  3061,   287,   262,
          6662,  4041,  2457,  1028,  1629, 47286, 14708,    13,   198,   198,
         23672, 41476,  7781,   262,  8464,   287,  3131,    12,  2435,   284]])

In [10]:

## `output` holds token indices; the decode method of the same tokenizer turns them back into text.
decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_text)

Cristiano Ronaldo match-winning goal in the Champions League final against Atletico Madrid.

Ronaldo scored the winner in extra-time to


### Save the output in a variable

In [14]:
## Decode the first generated sequence back into plain text, skipping special tokens.
text_x = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
)

In [15]:
## Keep only the complete sentences: cut the decoded text right after its last period,
## dropping the unfinished sentence that generation was cut off in.
## rpartition splits on the LAST "."; `period` is empty when the text contains no period,
## in which case the text is kept unchanged instead of collapsing to a lone ".".
head, period, _ = text_x.rpartition(".")
text = head + period if period else text_x

In [16]:
text

'Cristiano Ronaldo match-winning goal in the Champions League final against Atletico Madrid.'

### Install & Import Gradio

In [38]:
!pip install -q gradio

[K     |████████████████████████████████| 3.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 1.9 MB 38.3 MB/s 
[K     |████████████████████████████████| 206 kB 54.0 MB/s 
[K     |████████████████████████████████| 961 kB 71.2 MB/s 
[K     |████████████████████████████████| 3.0 MB 38.0 MB/s 
[K     |████████████████████████████████| 63 kB 2.2 MB/s 
[?25h  Building wheel for ffmpy (setup.py) ... [?25l[?25hdone
  Building wheel for flask-cachebuster (setup.py) ... [?25l[?25hdone


In [39]:
import gradio as gr

In [40]:
def generate_text(sentence, max_length=30, num_beams=5, no_repeat_ngram_size=2):
    """Generate a short continuation of ``sentence`` with the notebook's GPT-2 model.

    The prompt is encoded with the notebook-level ``tokenizer``, continued by
    the notebook-level ``model`` using beam search, decoded back to text and
    trimmed so that it ends at the last period.

    Parameters
    ----------
    sentence : str
        Prompt / topic to continue.
    max_length : int, optional
        Forwarded to ``model.generate`` as ``max_length`` (default 30); it
        bounds the output length in tokens, prompt included.
    num_beams : int, optional
        Forwarded to ``model.generate`` as ``num_beams`` (default 5).
    no_repeat_ngram_size : int, optional
        Forwarded to ``model.generate`` as ``no_repeat_ngram_size`` (default 2).

    Returns
    -------
    str
        The generated text cut after its last ``"."``. If the generated text
        contains no period it is returned unchanged. An empty, blank or
        ``None`` prompt returns ``""`` without calling the model.
    """
    # Nothing to condition on: skip tokenization and generation entirely.
    if not sentence or not sentence.strip():
        return ""

    input_ids = tokenizer.encode(sentence, return_tensors='pt')
    beam_output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=True,
    )
    output = tokenizer.decode(beam_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Drop the trailing unfinished sentence. `period` is empty when the text
    # has no "." at all; return the full text then rather than a bare ".".
    head, period, _ = output.rpartition(".")
    return head + period if period else output

### Creating Gradio Interface

In [41]:
## Wrap generate_text in a web UI with one text box in and one text box out, then launch it.
output_text = gr.outputs.Textbox()
demo = gr.Interface(
    fn=generate_text,
    inputs="textbox",
    outputs=output_text,
    title="Text Generation",
    description="Generate Text and Comment With AI System",
)
demo.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 24 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted (NEW!)
Running on External URL: https://56632.gradio.app
Interface loading below...


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7860/',
 'https://56632.gradio.app')

### Save Output as CSV File

In [42]:
import csv

from google.colab import files  # retained from the original cell; nothing in this cell uses it

## Write the generated text as a single-cell CSV row.
## csv.writer quotes the value when it contains commas, quotes or newlines, so the
## file stays valid CSV for any generated text. newline='' is what the csv module
## expects for files it writes, and utf-8 keeps non-ASCII characters intact.
with open('generatefile.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerow([text])


### Generate Pickle File

In [43]:

import pickle

## Serialize the whole model object to 'nlp_model.pkl'.
## The `with` block closes the file handle once the dump finishes, so the data
## is flushed to disk even if pickling raises part-way through.
## NOTE(review): transformers models are usually persisted with
## model.save_pretrained(<directory>); pickle is kept here so the output file stays the same.
with open('nlp_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [45]:
## Persist the trimmed generated text. Despite the file name, this pickle holds
## the `text` string, not a model. The `with` block closes the file handle afterwards.
with open('projmodel.pkl', 'wb') as text_file:
    pickle.dump(text, text_file)