In [6]:
import random 
import pickle

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [7]:
# Load the news dataset from a CSV file located in the working directory.
text_df = pd.read_csv("fake_or_real_news.csv")
# Bare last expression so the notebook renders the frame as the cell output.
text_df

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [8]:
# Collect the article bodies as a plain list of Python strings.
# Missing bodies are dropped and every value is coerced to str, so that
# joining the list with " ".join(...) cannot fail on NaN/float entries.
text = text_df["text"].dropna().astype(str).tolist()


In [9]:
# Concatenate every article into one string, separated by single spaces.
joined_text = " ".join(text)

In [10]:
# Keep only the first 10,000 characters of the corpus. The cut is made by
# character position, so the last word of the slice may be truncated.
# NOTE(review): presumably chosen to keep the one-hot tensors and training time small — confirm.
partial_text = joined_text[:10000]

In [11]:
# Tokens are runs of word characters (\w+); punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r"\w+")
# Lower-case before tokenizing so the vocabulary is case-insensitive.
tokens = tokenizer.tokenize(partial_text.lower())

In [12]:
# Vocabulary: the distinct tokens as a sorted array (np.unique sorts its result).
unique_tokens = np.unique(tokens)
print(unique_tokens)

['2016' '2020' '5' '60' 'a' 'abc' 'abcpolitics' 'abedin' 'about'
 'aboutface' 'abuses' 'accused' 'accusing' 'act' 'ad' 'admits' 'ads'
 'afraid' 'after' 'afternoon' 'against' 'age' 'agency' 'agents' 'ago'
 'ahead' 'alive' 'all' 'allegations' 'allies' 'allowed' 'already' 'also'
 'amendment' 'americans' 'an' 'and' 'announced' 'anthony' 'any' 'anywhere'
 'apolitical' 'appearance' 'appeared' 'appearing' 'appeaser' 'approach'
 'are' 'around' 'arrogant' 'article' 'as' 'asked' 'assault' 'assaulting'
 'assaults' 'associates' 'assume' 'at' 'attack' 'attacked' 'attacking'
 'away' 'awkward' 'awkwardly' 'back' 'backed' 'bad' 'badly' 'batch'
 'bathroom' 'be' 'becoming' 'beds' 'been' 'before' 'behavior' 'behind'
 'being' 'belief' 'believes' 'believing' 'better' 'between' 'bigger'
 'bigotry' 'bizarre' 'boldly' 'born' 'boston' 'bragged' 'breathing'
 'breeze' 'breezy' 'bribery' 'bring' 'bureau' 'buried' 'but' 'by' 'cable'
 'calling' 'came' 'campaign' 'can' 'candidate' 'cards' 'career' 'careers'
 'carvil

In [13]:
# Reverse lookup: token -> its position in `unique_tokens` (used as the one-hot index).
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [14]:
# Context window length: how many consecutive tokens are used to predict the next one.
n_words = 10
# Slide a window over the token stream: each sample is `n_words` consecutive
# tokens, and its label is the token that immediately follows the window.
input_words = [tokens[start:start + n_words] for start in range(len(tokens) - n_words)]
next_words = [tokens[start + n_words] for start in range(len(tokens) - n_words)]

In [15]:
# Preview only the first few labels; printing the whole list floods the cell output.
next_words[:10]



In [16]:
# Next: one-hot encode the input windows and their labels so they can be passed to the neural network.

In [17]:
# One-hot feature tensor, indexed as (sample, position in window, vocabulary index).
x = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
# One-hot label matrix, indexed as (sample, vocabulary index).
# The bool dtype keeps these large, mostly-zero arrays at one byte per entry.
y = np.zeros((len(input_words), len(unique_tokens)), dtype=bool)

In [18]:
# Fill the one-hot tensors: mark each window token at its position in `x`,
# and mark the token that follows the window in `y`.
for sample_idx, (window, target) in enumerate(zip(input_words, next_words)):
    for position, token in enumerate(window):
        x[sample_idx, position, unique_token_index[token]] = 1
    y[sample_idx, unique_token_index[target]] = 1

In [19]:
# Stacked-LSTM next-word classifier: the first LSTM returns its full output
# sequence so the second LSTM can consume it; the Dense + softmax head yields
# one probability per vocabulary token.
model = Sequential(
    [
        LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
        LSTM(128),
        Dense(len(unique_tokens)),
        Activation("softmax"),
    ]
)
model.compile(
    loss="categorical_crossentropy",
    optimizer=RMSprop(learning_rate=0.01),
    metrics=["accuracy"],
)
# Train on the one-hot windows; the returned History object is the cell output.
model.fit(x, y, batch_size=128, epochs=10, shuffle=True)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x19abf081a10>

In [20]:
def predict_next_word(input_text, n_best):
    """Return the vocabulary indices of the `n_best` most likely next words.

    Parameters
    ----------
    input_text : str
        Context to condition on. It is lower-cased and split with the same
        `tokenizer` that produced the training tokens. Only the last
        `n_words` tokens are used. Tokens that are missing from
        `unique_token_index` are left as all-zero rows instead of raising
        ``KeyError``.
    n_best : int
        Number of candidates to return; clamped to the vocabulary size.

    Returns
    -------
    numpy.ndarray
        Integer indices into `unique_tokens`, in no particular order
        (``np.argpartition`` does not sort the selected elements). Empty
        when `n_best` is not positive.
    """
    # Tokenize like the training data and keep only the most recent context,
    # so inputs longer than `n_words` cannot index past the encoded tensor.
    context = tokenizer.tokenize(input_text.lower())[-n_words:]

    encoded = np.zeros((1, n_words, len(unique_tokens)))
    for position, word in enumerate(context):
        token_idx = unique_token_index.get(word)
        if token_idx is not None:
            encoded[0, position, token_idx] = 1

    # verbose=0 suppresses the progress bar Keras prints on every predict call.
    predictions = model.predict(encoded, verbose=0)[0]

    n_best = min(n_best, len(predictions))
    if n_best <= 0:
        # A `[-0:]` slice would select the whole array, so return nothing explicitly.
        return np.array([], dtype=np.intp)
    return np.argpartition(predictions, -n_best)[-n_best:]

In [21]:
# Ask for the 5 most likely continuations of a 10-word prompt.
possible = predict_next_word("he will have to look into this thing and he", 5)
possible



array([  4, 588, 402, 587, 605], dtype=int64)

In [22]:
# Map the predicted vocabulary indices back to their tokens.
print([unique_tokens[idx] for idx in possible])

['a', 'the', 'of', 'that', 'to']


In [23]:
# Next: generate text by repeatedly predicting the next word and appending it to the sequence.

In [24]:
def generate_text(input_text, text_length, creativity=3):
    """Extend `input_text` with `text_length` generated words.

    At every step the last `n_words` tokens of the running text are passed to
    `predict_next_word`, and the next word is drawn uniformly at random from
    the candidates it returns.

    Parameters
    ----------
    input_text : str
        Prompt to continue.
    text_length : int
        Number of words to generate.
    creativity : int, default 3
        How many top candidates to sample from at each step; larger values
        give more varied output.

    Returns
    -------
    str
        The whitespace-split words of the prompt followed by the generated
        words, joined by single spaces.
    """
    word_sequence = input_text.split()
    # Tokenize the prompt once and extend the token list as words are
    # generated, instead of re-tokenizing the whole text on every step.
    context_tokens = tokenizer.tokenize(input_text.lower())
    for _ in range(text_length):
        # Always condition on the most recent `n_words` tokens, whatever the
        # length of the prompt.
        sub_sequence = " ".join(context_tokens[-n_words:])
        try:
            candidates = predict_next_word(sub_sequence, creativity)
            # A plain list avoids relying on how random.choice treats NumPy arrays.
            choice = unique_tokens[random.choice(list(candidates))]
        except (KeyError, IndexError, ValueError):
            # Best-effort fallback (e.g. out-of-vocabulary context or no
            # candidates): pick any vocabulary word so generation continues.
            choice = random.choice(list(unique_tokens))
        word_sequence.append(choice)
        context_tokens.append(choice)
    return " ".join(word_sequence)

In [25]:
# Generate 100 words from the 10-word prompt, sampling among the top 5 candidates at each step.
generate_text("he will have to look into this thing and he", 100, 5)



'he will have to look into this thing and he the to of a been war the the that to that to has of the fbi has of the that of fbi of the fbi to that that fbi clinton fbi that that the the fbi a of to has of to that of the of the investigation the that the of the the has fbi has fbi that that the of fbi that to fbi that the s fbi that the that the of of of fbi that to investigation the to that that fbi s that of fbi that s is of of fbi s fbi the of'