In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import string
import re
import os

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import Sequential
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from lxml import html

In [None]:
# Vocabulary size cap passed to the TextVectorization layer as `max_tokens`.
MAX_TOKENS = 10_000
# Sequence length passed to the TextVectorization layer as `output_sequence_length`.
OUTPUT_LEN = 300

In [None]:
# df_true = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")
# df_false = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
# df_true["label"] = 1
# df_false["label"] = 0

# df = pd.concat((df_true, df_false))
# df["text"] = df["title"] + " " + df["text"]
# del df["title"]
# del df["subject"]
# del df["date"]

# df = df[df["text"].str.len() > 50]

In [None]:
# Load the dataset and reduce it to two columns: "text" (headline + body) and "label".
df = pd.read_csv("../input/fake-news-detection/data.csv")
# Fill missing headlines/bodies before concatenating: NaN + str stays NaN and
# would otherwise be stringified into the literal text "nan".
# The builtin `str` replaces the `np.str` alias, which NumPy deprecated and later removed.
df["text"] = (
    df["Headline"].fillna("").astype(str)
    + " "
    + df["Body"].fillna("").astype(str)
)
df["label"] = df["Label"]
# Drop the source columns now that "text" and "label" have been derived from them.
del df["Headline"]
del df["Body"]
del df["URLs"]
del df["Label"]

In [None]:
# Preview the last rows of the prepared frame.
df.tail()

In [None]:
# Fixed seed so the stratified 80/20 split is identical on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],  # keep the label ratio the same in both splits
    random_state=SPLIT_SEED,
)
# Wrap each split as a (text, label) dataset served in batches of 128.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(128)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(128)

In [None]:
def process_text(input_data):
    """Standardize raw text before tokenization.

    Lowercases the input, then deletes (in this order) square-bracketed
    spans, anything starting with ``http`` up to the next whitespace, and
    every ASCII punctuation character.

    Args:
        input_data: String tensor handed over by the TextVectorization layer.

    Returns:
        The cleaned string tensor.
    """
    # Patterns are applied sequentially; each match is replaced by "".
    removal_patterns = (
        r"\[[^]]*\]",
        r"http\S+",
        f"[{re.escape(string.punctuation)}]",
    )
    cleaned = tf.strings.lower(input_data)
    for pattern in removal_patterns:
        cleaned = tf.strings.regex_replace(cleaned, pattern, "")
    return cleaned

# Text-to-integer layer: custom cleaning, capped vocabulary, fixed output length.
vectorizer = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=OUTPUT_LEN,
    standardize=process_text,
)
# Build the vocabulary from the training texts only; the labels are dropped.
train_text = train_ds.map(lambda text, _label: text)
vectorizer.adapt(train_text)

In [None]:
voc = vectorizer.get_vocabulary()
# Map every vocabulary token to the integer id the vectorizer emits for it.
# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens. Depending
# on the TensorFlow release, get_vocabulary() either lists those placeholders
# first (real ids then line up with list positions) or omits them (real ids
# then start at 2). NOTE(review): confirm against the installed TF version.
first_id = 0 if voc and voc[0] in ("", b"") else 2
# Enumerate the whole vocabulary so that no trailing token is left without an id.
word_index = {token: first_id + position for position, token in enumerate(voc)}

In [None]:
# Parse the GloVe text file into {word: float32 vector}.
embeddings_index = {}
# Each line is "<word> <v1> <v2> ...". The encoding is pinned to UTF-8 so that
# non-ASCII tokens parse the same way regardless of the platform default.
with open(
    "../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt",
    encoding="utf-8",
) as f:
    for line in f:
        # Split off the word only; the remainder is the space-separated vector.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Two rows beyond the vocabulary length, leaving room for the reserved padding/OOV ids.
num_tokens = len(voc) + 2
# Must match the width of the GloVe vectors loaded above (the 100d file).
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix; rows that are never assigned stay all-zeros.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in word_index.items():
    # The vocabulary holds bytes on some TensorFlow releases and str on others,
    # while the GloVe index is keyed by str; normalise before the lookup.
    token = word.decode("utf-8") if isinstance(word, bytes) else word
    embedding_vector = embeddings_index.get(token)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# End-to-end classifier: raw strings in, sigmoid score out.
# The layers are appended in exactly this order.
model = Sequential()
layer_stack = (
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
    # Frozen embedding table initialised from the pre-built GloVe matrix.
    Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    # First LSTM emits the full sequence so the second LSTM can consume it.
    LSTM(units=128, return_sequences=True, dropout=0.2),
    LSTM(units=64, dropout=0.2),
    Dense(units=32, activation="relu"),
    Dense(1, activation="sigmoid"),
)
for layer in layer_stack:
    model.add(layer)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and is not accepted by newer Keras optimizer implementations.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs; the held-out split is evaluated after every epoch.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=20,
)

In [None]:
# `history.history` is the dict recorded by `model.fit`; display its keys.
history_dict = history.history
history_dict.keys()

In [None]:
# Sample news article text to score with the model.
# Note: the next cell rebinds `a`, so skip that cell to predict on this article.
a = """An organization started by NBA superstar LeBron James and other Black athletes and entertainers announced Friday that it will help Floridians with prior felony convictions register to vote in the November election.

More Than A Vote said it will donate $100,000 to the Florida Rights Restoration Coalition's fund for Floridians struggling to pay off outstanding fees and fines associated with their felony convictions before they register to vote.
"This is a fight about their constitutional right to vote being denied," James, the NBA champion and former Cleveland Cavaliers and Miami Heat star who now plays for the Los Angeles Lakers, tweeted Friday.
"We believe that your right to vote shouldn't depend upon whether or not you can pay to exercise it," Miami Heat forward Udonis Haslem, who is also a member of More Than A Vote, said in a press release Friday. "Which is why More Than A Vote is proud to partner with the Florida Rights Restoration Coalition to ensure that formerly incarcerated American citizens -- many of them Black and brown -- are able to pay their outstanding fines and fees and register to vote in the 2020 election and beyond."
The Florida Rights Restoration Coalition's executive director said the partnership "will improve lives and strengthen our democracy."""

In [None]:
# Short synthetic input; this rebinds `a`, replacing the article text assigned in the previous cell.
a = "Trump Trump Trump Trump Trump tet"

In [None]:
# Score whatever `a` currently holds; the model's final layer is a single sigmoid unit.
model.predict([a])

In [None]:
# `export_model` is never defined in this notebook, so the call raised NameError
# on a fresh kernel. `model` already contains the vectorizer and is the object
# saved in the next cell, so summarise that instead.
model.summary()

In [None]:
# Write the model to the `model.tf` directory in TensorFlow format, without optimizer state.
model.save('model.tf', save_format='tf', include_optimizer=False)

In [None]:
# Shell command: recursively archive the saved `model.tf/` directory into `model.zip`.
!zip -r model.zip model.tf/