In [1]:
import string, os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras.utils as ku 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

2023-07-12 22:44:16.551043: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
curr_dir = 'data/'
all_headlines = []

for filename in os.listdir(curr_dir):
    if 'Articles' in filename: 
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break

all_headlines = [v for v in all_headlines if v != 'Unknown']
len(all_headlines)

829

In [16]:
def clean_text(text):
    text = "".join(v for v in text if v not in string.punctuation)
    text = text.encode('utf8').decode('ascii', 'ignore')
    return text

corpus = [clean_text(x) for x in all_headlines]

In [17]:
all_headlines[1]

'Voice. Vice. Veracity.'

In [18]:
tokenizer = Tokenizer()

def get_sequence_of_text(corpus):
    
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    input_sequences = []

    for line in corpus: 
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_text(corpus)
    


In [19]:
inp_sequences

[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664],
 [2, 665],
 [2, 665, 666],
 [2, 665, 666, 345],
 [11, 27],
 [11, 27, 28],
 [11, 27, 28, 2],
 [11, 27, 28, 2, 667],
 [11, 27, 28, 2, 667, 73],
 [11, 27, 28, 2, 667, 73, 153],
 [11, 27, 28, 2, 667, 73, 153, 90],
 [2, 668],
 [2, 668, 669],
 [2, 668, 669, 12],
 [2, 668, 669, 12, 1],
 [2, 668, 669, 12, 1, 670],
 [346, 671],
 [212, 213],
 [19, 672],
 [19, 672, 673],
 [347, 348],
 [347, 348, 674],
 [675, 4],
 [675, 4, 2],
 [675, 4, 2, 349],
 [675, 4, 2, 349, 676],
 [1, 677],
 [1, 677, 350],
 [1, 677, 350, 4],
 [1, 677, 350, 4, 44],
 [1, 677, 350, 4, 44, 8],
 [25, 6],
 [25, 6, 9],
 [25, 6, 9, 678],
 [25, 6, 9, 678, 679],
 [25, 6, 9, 678, 679, 351],
 [25, 6, 9, 678, 679, 351, 2],
 [25, 6, 9, 678, 679, 351, 2, 680],
 [25, 6, 9, 678, 679, 351, 

In [14]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, 
                                             maxlen = max_sequence_len,
                                             padding = 'pre'))
    
    predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes = total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [15]:
def create_model(max_sequence_len, total_words):

    input_len = max_sequence_len - 1

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(total_words, 10, input_length = input_len))
    model.add(tf.keras.layers.LSTM(100))
    model.add(tf.keras.layers.Dropout(0.1))
    model.add(tf.keras.layers.Dense(total_words, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
    return(model)

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________


2023-07-09 18:21:21.110904: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-09 18:21:21.116222: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-09 18:21:21.119618: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 10)            22880     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2288)              231088    
                                                                 
Total params: 298,368
Trainable params: 298,368
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(predictors, label, epochs=100)

Epoch 1/100


2023-07-09 18:22:52.897945: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-09 18:22:52.900644: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-09 18:22:52.903448: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x13638de10>

In [17]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], 
                                   maxlen=max_sequence_len-1, 
                                   padding='pre')
        y_prob = model.predict(token_list, verbose=0) 
        predicted = y_prob.argmax(axis=-1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [18]:
print (generate_text("united states", 5, model, max_sequence_len))

2023-07-09 18:32:24.238730: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-09 18:32:24.242792: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-09 18:32:24.244963: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

United States No Vote To Conspiracy Theories
