In [None]:
!pip install kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"mcparadip","key":"ee60bb4deb5f1780508fbc0a864ea7dd"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

!mkdir -p /content/input
!cd /content/input
!kaggle datasets download --path /content/input --unzip rtatman/glove-global-vectors-for-word-representation
!cd /content

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import random
import string
import re
import os

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import Sequential
from nltk.corpus import stopwords

In [None]:
# Number of examples per batch in the tf.data pipelines.
BATCH_SIZE = 128
# Sentinel passed to tf.data so it tunes parallelism / buffer sizes itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Label vocabulary: a label's integer id is its position in this list.
LABELS = [
    "rumor", "hate", "unreliable", "conspiracy",
    "clickbait", "satire", "fake", "reliable",
    "bias", "political", "junksci", "unknown",
]

# Coarse classes in class-id order: 0 -> "fake", 1 -> "reliable",
# 2 -> every other label.
_SPLIT_GROUPS = (
    ("fake",),
    # ("political",),
    ("reliable",),
    (
        "rumor", "unknown", "hate", "clickbait", "conspiracy",
        "junksci", "satire", "bias", "unreliable", "political",
    ),
)

# Map each original label id to the id of the coarse class that contains it.
SPLIT = dict(
    (LABELS.index(label_name), class_id)
    for class_id, members in enumerate(_SPLIT_GROUPS)
    for label_name in members
)

# Turn the label-id -> class-id dict into a graph-side lookup table so it can
# be applied to int64 label tensors inside tf.data transformations.
_split_initializer = tf.lookup.KeyValueTensorInitializer(
    keys=list(SPLIT),
    values=list(SPLIT.values()),
    key_dtype=tf.int64,
)
# Keys missing from the table resolve to the default value 0.
# NOTE(review): 0 is also the id of the "fake" class — confirm this fallback
# is intended for unexpected label ids.
SPLIT = tf.lookup.StaticHashTable(_split_initializer, default_value=0)

In [None]:
# Bucket that holds the TFRecord shards.
GCS_PATH = "gs://dbunk"

# Shard split: the first N_TRAIN_SHARDS files are used for training and every
# file from index TEST_SHARD_START onward is used for testing.
N_TRAIN_SHARDS = 20
TEST_SHARD_START = 129

# Sort the listing so the train/test split does not depend on whatever order
# the filesystem happens to return the matches in.
files = sorted(tf.io.gfile.glob(GCS_PATH + "/*.tfrecord"))
files_train = files[:N_TRAIN_SHARDS]
files_test = files[TEST_SHARD_START:]

# Fail here, with a clear message, instead of training on an empty dataset.
if not files_train or not files_test:
    raise FileNotFoundError(
        "Expected more than %d .tfrecord files under %s, found %d"
        % (TEST_SHARD_START, GCS_PATH, len(files))
    )

In [None]:
# Schema of one serialized record: a scalar string "text" and a scalar int64
# "label".
feature_description = dict(
    text=tf.io.FixedLenFeature([], tf.string),
    label=tf.io.FixedLenFeature([], tf.int64),
)

def parse(example):
    """Decode one serialized record into a ``(text, label)`` pair.

    Args:
        example: a serialized example, parsed against ``feature_description``.

    Returns:
        Tuple of the record's "text" and "label" features.
    """
    features = tf.io.parse_single_example(example, feature_description)
    text, label = features["text"], features["label"]
    return text, label

def simplify(x, y):
    """Replace the original label id with its coarse class id.

    Args:
        x: the example text, passed through unchanged.
        y: the original label id, looked up in the ``SPLIT`` table.

    Returns:
        Tuple ``(x, coarse_label)``.
    """
    coarse_label = SPLIT.lookup(y)
    return x, coarse_label

def remove(x, y):
    """Filter predicate: keep an example unless its class id is 2.

    Args:
        x: the example text (unused).
        y: the coarse class id produced by ``simplify``.

    Returns:
        Truthy when the example should be kept.
    """
    # 2 is the position of the catch-all group in the SPLIT definition.
    dropped_class = 2
    return y != dropped_class

def _make_dataset(paths, shuffle_buffer=10000):
    """Build a shuffled, batched ``(text, label)`` dataset from TFRecord files.

    Records are parsed, their labels are collapsed to coarse class ids, and
    examples rejected by ``remove`` are filtered out before shuffling and
    batching.

    Args:
        paths: list of TFRecord file paths to read.
        shuffle_buffer: size of the shuffle buffer, in examples.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` examples.
    """
    records = tf.data.TFRecordDataset(paths, num_parallel_reads=AUTO)
    examples = (
        records
        # Parse and relabel in parallel so the input pipeline keeps up.
        .map(parse, num_parallel_calls=AUTO)
        .map(simplify, num_parallel_calls=AUTO)
        .filter(remove)
    )
    return examples.shuffle(shuffle_buffer).batch(BATCH_SIZE).prefetch(AUTO)


# Train and test share the exact same pipeline; only the input files differ.
ds_train = _make_dataset(files_train)
ds_test = _make_dataset(files_test)

In [None]:
# Dimensionality of the word vectors.
EMBEDDING_DIM = 100
# Every text is padded / truncated to this many token ids.
OUTPUT_LEN = 300
# Upper bound on the vocabulary size of the vectorization layer.
MAX_TOKENS = 5_000

In [None]:
def process_text(input_data):
    """Standardize raw text before tokenization.

    Lowercases the input, then deletes square-bracketed spans, anything
    starting with "http" up to the next whitespace, and every character in
    ``string.punctuation``.

    Args:
        input_data: string tensor of raw text.

    Returns:
        String tensor with the replacements applied.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(input_data)
    without_brackets = tf.strings.regex_replace(lowered, r"\[[^]]*\]", "")
    without_urls = tf.strings.regex_replace(without_brackets, r"http\S+", "")
    return tf.strings.regex_replace(without_urls, punctuation_class, "")

In [None]:
# Text-only view of the training batches; the labels are not needed to build
# the vocabulary.
train_text = ds_train.map(lambda text, _label: text)

# Tokenizer: applies process_text, keeps at most MAX_TOKENS vocabulary entries
# and emits sequences of exactly OUTPUT_LEN token ids.
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS,
    standardize=process_text,
    output_sequence_length=OUTPUT_LEN,
)
# Learn the vocabulary from the training text.
vectorize_layer.adapt(train_text)

In [None]:
voc = vectorize_layer.get_vocabulary()

# Map every vocabulary token to the integer id the vectorization layer emits
# for it. Ids 0 and 1 are reserved (padding and out-of-vocabulary).
# Depending on the TensorFlow version, get_vocabulary() either already lists
# those two reserved entries first (the padding token is the empty string) or
# omits them; in the latter case the real tokens start at id 2.
# The previous `zip(voc, range(2, len(voc)))` stopped two entries early, so the
# last two tokens never received an id.
_reserved_included = bool(voc) and voc[0] in ("", b"")
_first_index = 0 if _reserved_included else 2
word_index = {
    token: index for index, token in enumerate(voc, start=_first_index)
}

In [None]:
# Parse the GloVe text file into {word: vector}. Each line holds a word
# followed by its space-separated coefficients.
embeddings_index = {}
# The file is UTF-8 text; decode it explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): the relative path assumes the working directory is /content,
# where the download cell unpacks the archive — confirm.
with open("input/glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Two extra rows on top of the vocabulary size leave room for the reserved
# padding / out-of-vocabulary ids.
num_tokens = len(voc) + 2
# Reuse the configured dimension instead of repeating the literal.
embedding_dim = EMBEDDING_DIM
hits = 0
misses = 0

# Prepare embedding matrix. Rows start as zeros, so every id that never gets
# a GloVe vector (including ids absent from word_index) keeps an all-zero
# embedding.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in word_index.items():
    # Vocabulary tokens can be bytes or str depending on the TensorFlow
    # version; the GloVe index is keyed by str.
    key = word.decode("utf-8") if isinstance(word, bytes) else word
    embedding_vector = embeddings_index.get(key)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Per-class loss weights passed to model.fit, keyed by coarse class id.
class_weight = {
    0: 2,  # "fake"
    1: 1,  # "reliable"
}

In [None]:
# End-to-end classifier: raw string in, sigmoid score out.
model = Sequential()
# One raw string per example; tokenization happens inside the model.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Frozen embedding layer initialized from the prepared embedding matrix.
model.add(Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
))
# The first LSTM returns the full sequence so the second one can consume it.
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(512))
model.add(Dense(32, activation="relu"))
# Single sigmoid unit for the binary label (0 = "fake", 1 = "reliable").
model.add(Dense(1, activation="sigmoid"))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Lower the learning rate when the monitored metric stops improving
# (callback defaults are used).
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau()

# With steps_per_epoch set, Keras draws steps_per_epoch * epochs batches from
# a single iterator and interrupts training as soon as the dataset is
# exhausted. The training dataset is finite, so repeat it here; the repeat is
# applied only for fit, so other uses of ds_train stay finite.
history = model.fit(
    ds_train.repeat(),
    epochs=20,
    steps_per_epoch=10240,
    validation_data=ds_test,
    class_weight=class_weight,
    callbacks=[reduce_lr]
)

In [None]:
# Display the object returned by model.fit.
# NOTE(review): the per-epoch metrics live on its `.history` attribute —
# confirm whether showing those was the intent here.
history

In [None]:
# Evaluate on the first 100 batches drawn from the test pipeline. ds_test is
# shuffled, so the batches selected can differ from run to run.
model.evaluate(ds_test.take(100))

In [None]:
# Smoke test: score one hand-written string. The model takes raw text, so no
# manual preprocessing is needed; the output is the sigmoid score of the final
# layer (labels were 0 = "fake", 1 = "reliable").
a = """Trump Trump News"""
model.predict([a])