In [6]:
# --- 1. Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM, Flatten, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Load dataset
df = pd.read_csv('./clickbait_data.csv')
print(df.head())

# --- Cleaning functions ---
def RemoveSpecialCharacters(sentence):
    return re.sub('[^a-zA-Z]+',' ',sentence)

def ConvertToLowerCase(sentence):
    return sentence.lower()

def CleanText(sentence):
    sentence = str(sentence)
    STOPWORDS = stopwords.words('english') + ['u','ü','ur','4','2','im','dont','doin','ure']
    nopunc = [char for char in sentence if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    sentence = ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])
    sentence = ConvertToLowerCase(RemoveSpecialCharacters(sentence))
    return sentence

# Apply cleaning
df['clean_text'] = df['headline'].apply(CleanText)

# Features and labels
X = df['clean_text']
y = df['clickbait']

print(X)
print(y)


                                            headline  clickbait
0                                 Should I Get Bings          1
1      Which TV Female Friend Group Do You Belong In          1
2  The New "Star Wars: The Force Awakens" Trailer...          1
3  This Vine Of New York On "Celebrity Big Brothe...          1
4  A Couple Did A Stunning Photo Shoot With Their...          1
0                                                get bings
1                            tv female friend group belong
2          new star wars force awakens trailer give chills
3        vine new york celebrity big brother fucking pe...
4        couple stunning photo shoot baby learning inop...
                               ...                        
31995           make female hearts flutter iraq throw shoe
31996    british liberal democrat patsy calton dies cancer
31997    drone smartphone app help heart attack victims...
31998    netanyahu urges pope benedict israel denounce ...
31999    computer makers p

In [9]:
# Vectorize with TF-IDF
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_dtm)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Build ANN
ann = Sequential()
ann.add(Input(shape=(X_train.shape[1],)))
ann.add(Dense(64, activation='relu'))
ann.add(Dense(1, activation='sigmoid'))
ann.add(Dense(1, activation='sigmoid'))

ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
ann_history = ann.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
ann_acc = ann.evaluate(X_test, y_test, verbose=0)[1]
print("ANN Accuracy:", ann_acc)


Epoch 1/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 25ms/step - accuracy: 0.8080 - loss: 0.4926 - val_accuracy: 0.9595 - val_loss: 0.3628
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9796 - loss: 0.2840 - val_accuracy: 0.9634 - val_loss: 0.2680
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.9930 - loss: 0.1920 - val_accuracy: 0.9609 - val_loss: 0.2147
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 29ms/step - accuracy: 0.9971 - loss: 0.1364 - val_accuracy: 0.9605 - val_loss: 0.1832
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 23ms/step - accuracy: 0.9980 - loss: 0.1010 - val_accuracy: 0.9598 - val_loss: 0.1637
ANN Accuracy: 0.9598437547683716


In [11]:
# Tokenize
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Build RNN
rnn = Sequential()
rnn.add(Embedding(input_dim=max_words, output_dim=32))
rnn.add(SimpleRNN(32))
rnn.add(Dense(1, activation='sigmoid'))

rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
rnn_history = rnn.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
rnn_acc = rnn.evaluate(X_test, y_test, verbose=0)[1]
print("RNN Accuracy:", rnn_acc)


Epoch 1/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25ms/step - accuracy: 0.9016 - loss: 0.2302 - val_accuracy: 0.9541 - val_loss: 0.1216
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9699 - loss: 0.0801 - val_accuracy: 0.9541 - val_loss: 0.1214
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.9849 - loss: 0.0427 - val_accuracy: 0.9516 - val_loss: 0.1387
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 22ms/step - accuracy: 0.9921 - loss: 0.0220 - val_accuracy: 0.9497 - val_loss: 0.1694
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 21ms/step - accuracy: 0.9948 - loss: 0.0156 - val_accuracy: 0.9267 - val_loss: 0.2414
RNN Accuracy: 0.9267187714576721


In [7]:
# Build LSTM
lstm = Sequential()
lstm.add(Embedding(input_dim=max_words, output_dim=32))
lstm.add(LSTM(32))
lstm.add(Dense(1, activation='sigmoid'))

lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
lstm_history = lstm.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
lstm_acc = lstm.evaluate(X_test, y_test, verbose=0)[1]
print("LSTM Accuracy:", lstm_acc)


Epoch 1/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 33ms/step - accuracy: 0.9111 - loss: 0.2157 - val_accuracy: 0.9563 - val_loss: 0.1217
Epoch 2/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 34ms/step - accuracy: 0.9669 - loss: 0.0850 - val_accuracy: 0.9548 - val_loss: 0.1229
Epoch 3/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 36ms/step - accuracy: 0.9755 - loss: 0.0634 - val_accuracy: 0.9528 - val_loss: 0.1356
Epoch 4/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 37ms/step - accuracy: 0.9795 - loss: 0.0518 - val_accuracy: 0.9519 - val_loss: 0.1448
Epoch 5/5
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step - accuracy: 0.9823 - loss: 0.0435 - val_accuracy: 0.9475 - val_loss: 0.1636
LSTM Accuracy: 0.9474999904632568


In [12]:
print("Final Comparison:")
print(f"ANN Accuracy:  {ann_acc:.4f}")
print(f"RNN Accuracy:  {rnn_acc:.4f}")
print(f"LSTM Accuracy: {lstm_acc:.4f}")


Final Comparison:
ANN Accuracy:  0.9598
RNN Accuracy:  0.9267
LSTM Accuracy: 0.9475
