Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os, string
# Deep Learning packages.
import tensorflow as tf
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.preprocessing.text import Tokenizer
import pickle

In [2]:
np.random.seed(1)
tf.random.set_seed(2)

LoadData

In [3]:
curr_dir = 'archive/'
all_headlines = []
count = 0
for fileName in os.listdir(curr_dir):
    if 'Articles' in fileName:
        df = pd.read_csv(curr_dir + fileName)
        count += len(df.index)
        all_headlines.extend(df.headline.values)
        break
count

886

In [4]:
all_headlines = [lines for lines in all_headlines if lines!="Unkown"]
count = 0
for lines in all_headlines:
    if lines == "Unkown":
        count += 1
print(count)
print(all_headlines)

0


Cleaning text

In [5]:
def clean_text(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt.encode('utf-8').decode('ascii', 'ignore')
    return txt

In [6]:
corpus = [clean_text(x) for x in all_headlines]

In [7]:
print(corpus[:10])

['finding an expansive view  of a forgotten people in niger', 'and now  the dreaded trump curse', 'venezuela’s descent into dictatorship', 'stain permeates basketball blue blood', 'taking things for granted', 'the caged beast awakens', 'an everunfolding story', 'o’reilly thrives as settlements add up', 'mouse infestation', 'divide in gop now threatens trump tax plan']


Tokenization and ngrams creation

In [8]:
for line in corpus:
    print(line)

finding an expansive view  of a forgotten people in niger
and now  the dreaded trump curse
venezuela’s descent into dictatorship
stain permeates basketball blue blood
taking things for granted
the caged beast awakens
an everunfolding story
o’reilly thrives as settlements add up
mouse infestation
divide in gop now threatens trump tax plan
variety puzzle acrostic
they can hit a ball 400 feet but play catch that’s tricky
in trump country shock at trump budget cuts
why is this hate different from all other hate
pick your favorite ethical offender
my son’s growing black pride
jerks and the startups they ruin
trump  needs  a brain
manhood in the age of trump
the value of a black college
initial description
rough estimates
el pasatiempo nacional
cooling off on a hot day at yankee stadium
trump’s staff mixed politics and paydays
a virtuoso rebuilding act requires everyone in tune
‘homeland’ season 6 episode 11 is quinn just a natural killer
‘big little lies’ and the art of empathy
upending a w

In [9]:
tokenizer = Tokenizer()
def get_sequence(corpus):
    print("HI")
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    input_sequence = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        print(token_list)
        for i in range(1, len(token_list)):
            input_sequence.append(token_list[:i+1])
    return input_sequence, total_words


In [10]:
inp_sequence, total_word = get_sequence(corpus)

HI
[169, 21, 653, 359, 4, 2, 654, 170, 5, 655]
[6, 80, 1, 656, 11, 657]
[658, 659, 129, 660]
[661, 662, 663, 664, 665]
[104, 171, 8, 666]
[1, 667, 668, 669]
[21, 670, 225]
[130, 671, 23, 672, 360, 32]
[673, 674]
[226, 5, 227, 80, 675, 11, 56, 52]
[131, 172, 228]
[105, 42, 676, 2, 677, 678, 679, 28, 680, 681, 229, 682]
[5, 11, 683, 361, 13, 11, 362, 363]
[38, 10, 26, 106, 107, 18, 39, 230, 106]
[231, 43, 684, 685, 364]
[44, 686, 687, 132, 688]
[689, 6, 1, 690, 105, 691]
[11, 692, 2, 173]
[693, 5, 1, 365, 4, 11]
[1, 366, 4, 2, 132, 68]
[694, 695]
[696, 697]
[698, 699, 700]
[701, 174, 7, 2, 175, 64, 13, 702, 367]
[15, 703, 704, 232, 6, 368]
[2, 705, 706, 707, 708, 709, 5, 710]
[369, 19, 108, 24, 370, 10, 711, 133, 2, 712, 713]
[714, 233, 715, 6, 1, 134, 4, 716]
[717, 2, 718]
[234, 176, 6, 235, 24, 57, 104, 1, 371]
[135, 19, 45, 24, 372, 719, 10, 33, 373, 40, 69, 29]
[9]
[109, 81, 7, 5, 26, 177, 136, 34, 137]
[9]
[46, 22, 236, 720, 721, 41, 110, 47, 374, 5, 375, 21, 376, 377, 53, 43, 178]


In [11]:
print(inp_sequence[:10])

[[169, 21], [169, 21, 653], [169, 21, 653, 359], [169, 21, 653, 359, 4], [169, 21, 653, 359, 4, 2], [169, 21, 653, 359, 4, 2, 654], [169, 21, 653, 359, 4, 2, 654, 170], [169, 21, 653, 359, 4, 2, 654, 170, 5], [169, 21, 653, 359, 4, 2, 654, 170, 5, 655], [6, 80]]


In [12]:
import keras.utils as ku

In [13]:
def generate_padded_seq(input_seq):
    max_len = max(len(x) for x in input_seq)
    input_seq = np.array(pad_sequences(input_seq, maxlen=max_len, padding='pre'))
    print(input_seq.ndim)
    predictor, label = input_seq[:, :-1], input_seq[:, -1]
    # Return this label and then map.
    print(label)
    label = ku.to_categorical(label, num_classes=total_word)
    print(label)
    return predictor, label, max_len

In [14]:
predictor, label, max_len = generate_padded_seq(inp_sequence)

2
[  21  653  359 ...  357   93 2483]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [15]:
print(label[0])

[0. 0. 0. ... 0. 0. 0.]


In [16]:
print(predictor[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0 169]


In [17]:
print(max_len)

21


In [18]:
def create_model(max_len, total_word):
    input_len = max_len-1
    model = Sequential()
    model.add(Embedding(total_word, 10, input_length=input_len))
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    model.add(Dense(total_word, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [19]:
print(total_word, max_len)

2484 21


In [20]:
model = create_model(max_len, total_word)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 10)            24840     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2484)              250884    
                                                                 
Total params: 320124 (1.22 MB)
Trainable params: 320124 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
model.fit(predictor, label, epochs= 50, verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x27896f38990>

Prediction Fuction.

In [44]:
print(tokenizer.texts_to_sequences(['Iam myself and who are you']))

[[6, 37, 49, 22]]


In [45]:
def generate_text(seed_text, next_word, model, maxseq_len):
    for _ in range(next_word):
        token_list = tokenizer.texts_to_sequences(seed_text)
        # print(token_list)
        token_list = pad_sequences(token_list, maxlen=max_len-1, padding='pre')
        # print(token_list)
        # pred = model.predict(token_list)
        pred = np.argmax(model.predict(token_list, verbose=0))
        # print(pred)
        new_text = seed_text[0]
        for word, index in tokenizer.word_index.items():
            if index == pred:
                new_text += " " + word
                break
        seed_text[0] = new_text
    return new_text 

In [50]:
(generate_text(["The Hawk"], 3, model, max_len))

'The Hawk is sabotage obamacare'

In [51]:
word_index = tokenizer.word_index
index_word = {index: word for word, index in word_index.items()}
target_index = 30  # Replace with the desired index

# Use the index to retrieve the corresponding word
if target_index in index_word:
    word = index_word[target_index]
    print(f"The word corresponding to index {target_index} is: {word}")

The word corresponding to index 30 is: york


In [52]:
print(max(word_index.values()))

2483


In [53]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 10)            24840     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2484)              250884    
                                                                 
Total params: 320124 (1.22 MB)
Trainable params: 320124 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [54]:
model.save('model.h5')

  saving_api.save_model(


In [55]:
from keras.models import load_model
modle = load_model('model.h5')

Hello americans’ season 3


In [56]:
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [60]:
def predict(input_text, predict_text):
    return generate_text([input_text], int(predict_text), modle, max_len)

In [61]:
iface = gr.Interface(
    fn = predict,
    inputs=['text', 'number'],
    outputs=['text']
)

In [63]:
iface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on public URL: https://666ff8646286518126.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\gradio\routes.py", line 516, in predict
    output = await route_utils.call_process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\gradio\route_utils.py", line 219, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\gradio\blocks.py", line 1437, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\gradio\blocks.py", line 1109, in call_function
    prediction = await anyio.to_thread.run_sync(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\anyio\to_thread.py", line 33, in run_sync
    return await get_asynclib(