# Project Clickbait Detection

### Team members: Joshua Burris, Caleb Tong

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import re

def text_cleaner(text):
    """Normalise raw text into a single space-separated string of words.

    Steps, in order:
      1. lower-case everything;
      2. drop possessive ``'s`` suffixes;
      3. turn every character that is not an ASCII letter into a space
         (this removes punctuation, digits and newlines alike);
      4. discard words shorter than three letters.

    Returns the surviving words joined by single spaces.
    """
    lowered = text.lower()
    without_possessives = re.sub(r"'s\b", "", lowered)
    letters_only = re.sub("[^a-zA-Z]", " ", without_possessives)
    # split() with no argument collapses the runs of spaces created above
    kept_words = [word for word in letters_only.split() if len(word) >= 3]
    return " ".join(kept_words).strip()

# Read the raw headline corpus. The `with` block guarantees the file handle is
# closed, and the handle gets its own name so `data` only ever refers to the
# cleaned text used by the rest of the notebook.
with open("clickbait_data", "r") as data_file:
    # Blank lines are turned into sentence breaks here; note that text_cleaner
    # replaces every non-letter (including these periods) with a space.
    clickbait_data = data_file.read().replace('\n\n', '.\n\n')
data = text_cleaner(clickbait_data)

In [3]:
def create_seq(text, length=30):
    """Slice ``text`` into overlapping windows of ``length + 1`` items.

    Each window holds ``length`` items of context followed by the item that
    comes next, and consecutive windows are shifted by one position.

    Args:
        text: Any sliceable sequence with a length (a string gives
            character windows).
        length: Number of context items per window. Defaults to 30, the
            value that was previously hard-coded.

    Returns:
        A list of windows. It is empty when ``text`` has ``length`` items or
        fewer, because no full window fits.

    Raises:
        ValueError: If ``length`` is negative.
    """
    if length < 0:
        raise ValueError('length must be non-negative, got %r' % (length,))
    # `end` is the index of the item that follows the `length` context items
    sequences = [text[end - length:end + 1] for end in range(length, len(text))]
    print('Total Sequences: %d' % len(sequences))
    return sequences

# Build 31-character sliding windows over the cleaned text.
# NOTE(review): this character-level `sequences` is replaced further down by
# the word-level list (`sequences = list()`), so it does not appear to feed
# the model -- confirm whether this cell is still needed.
sequences = create_seq(data)
#print(sequences[:1000])

Total Sequences: 807008


In [4]:
# generate a sequence of words with a language model
def generate_seq(model, tokenizer, seed_text, n_words):
    """Extend ``seed_text`` with up to ``n_words`` predicted words.

    The model is queried with one word at a time: the last known word of the
    seed first, then each newly generated word in turn.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns one row of
            per-word scores for each input word id.
        tokenizer: Object exposing ``texts_to_sequences`` and a
            ``word_index`` mapping of word -> integer id.
        seed_text: Text to start from; it is always the prefix of the result.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, space separated.
        Generation stops early when the current text contains no word known
        to the tokenizer, or when the predicted id has no word.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text, result = seed_text, seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # nothing the tokenizer recognises, so there is no input to predict from
            break
        # One word in, one word out: a multi-word seed is reduced to its last
        # known word so that exactly one prediction comes back.
        encoded = array(encoded[-1:])
        # predict a word in the vocabulary; predict_classes is not available in
        # current tf.keras, so take the arg-max of the predicted scores
        yhat = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        if not out_word:
            # predicted id has no word attached; continuing would only append blanks
            break
        # append to input
        in_text, result = out_word, result + ' ' + out_word
    return result

In [5]:
# integer encode text
# The tokenizer is fitted on the whole cleaned corpus, passed as a single
# document, and the same corpus is then mapped to a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# texts_to_sequences returns one list per input text; [0] selects the only one
encoded = tokenizer.texts_to_sequences([data])[0]

In [6]:
# determine the vocabulary size
# NOTE(review): the +1 presumably accounts for Tokenizer word ids starting at 1
# with index 0 left unused -- confirm against the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 11272


In [7]:
# create word -> word sequences
# every adjacent pair [current word id, next word id] of the encoded corpus
sequences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 131330


In [8]:
# split into X and y elements
# column 0 of each pair is the input word id, column 1 is the word that follows
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [9]:
# one hot encode outputs
# Each target word id becomes a vector of length vocab_size, matching the
# softmax output layer and the categorical_crossentropy loss used below.
# NOTE(review): with the sizes printed in this notebook (131330 pairs x 11272
# words) the dense one-hot matrix is very large; integer targets with
# sparse_categorical_crossentropy would avoid materialising it -- consider.
# NOTE(review): this cell overwrites `y`, so running it a second time would
# re-encode the already one-hot array.
y = to_categorical(y, num_classes=vocab_size)

In [10]:
# define model
# One word id in -> 10-dimensional embedding -> 50-unit LSTM -> softmax over
# the vocabulary, i.e. a distribution for the next word.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             112720    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 11272)             574872    
Total params: 699,792
Trainable params: 699,792
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
# 500 epochs is only an upper bound: at roughly a minute per epoch the full run
# takes hours and previously had to be interrupted by hand. Stop automatically
# once the training loss has not improved for 3 consecutive epochs and keep
# the best weights seen (no validation split exists, so training loss is
# the monitored quantity).
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
model.fit(X, y, epochs=500, verbose=2, callbacks=[early_stopping])

Train on 131330 samples
Epoch 1/500
131330/131330 - 69s - loss: 7.2894 - accuracy: 0.0462
Epoch 2/500
131330/131330 - 65s - loss: 6.8169 - accuracy: 0.0668
Epoch 3/500
131330/131330 - 65s - loss: 6.4981 - accuracy: 0.0944
Epoch 4/500
131330/131330 - 65s - loss: 6.2565 - accuracy: 0.1151
Epoch 5/500
131330/131330 - 64s - loss: 6.0791 - accuracy: 0.1306
Epoch 6/500
131330/131330 - 64s - loss: 5.9410 - accuracy: 0.1418
Epoch 7/500
131330/131330 - 64s - loss: 5.8219 - accuracy: 0.1503
Epoch 8/500
131330/131330 - 64s - loss: 5.7171 - accuracy: 0.1580
Epoch 9/500
131330/131330 - 65s - loss: 5.6217 - accuracy: 0.1638
Epoch 10/500
131330/131330 - 64s - loss: 5.5347 - accuracy: 0.1687
Epoch 11/500
131330/131330 - 64s - loss: 5.4558 - accuracy: 0.1737
Epoch 12/500
131330/131330 - 64s - loss: 5.3848 - accuracy: 0.1780
Epoch 13/500
131330/131330 - 64s - loss: 5.3179 - accuracy: 0.1828
Epoch 14/500
131330/131330 - 64s - loss: 5.2571 - accuracy: 0.1868
Epoch 15/500
131330/131330 - 63s - loss: 5.2017

KeyboardInterrupt: 

In [20]:
in_text = 'jack'
# evaluate: print the word the model considers most likely to follow in_text
encoded = tokenizer.texts_to_sequences([in_text])[0]
encoded = array(encoded)
# predict_classes is not available in current tf.keras; the arg-max over the
# predicted scores yields the same class ids
yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)
for word, index in tokenizer.word_index.items():
    if index == yhat:
        print(word)
        # word ids are unique, so there is nothing left to find after a match
        break

lanterns
