In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict
from sklearn.neural_network import MLPClassifier

In [23]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dropout, GlobalMaxPooling1D, Dense, Concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from tensorflow.keras import regularizers
from sklearn.metrics import (classification_report,precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error,
                            roc_curve, auc, confusion_matrix)

In [3]:
import nltk
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import TextVectorization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
# Importing the dataset
df = pd.read_csv('Truth_Seeker_Model_Dataset.csv',header= 0)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer
0,0,D.L. Davis,End of eviction moratorium means millions of A...,True,1,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree
1,1,D.L. Davis,End of eviction moratorium means millions of A...,True,1,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree
2,2,D.L. Davis,End of eviction moratorium means millions of A...,True,1,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree
3,3,D.L. Davis,End of eviction moratorium means millions of A...,True,1,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree
4,4,D.L. Davis,End of eviction moratorium means millions of A...,True,1,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree


In [6]:
#make label column
def conversion(data):
  if (data['target'] == True and data['3_label_majority_answer'] == 'Agree') or (data['target'] == False and data['3_label_majority_answer'] == 'Disagree'):
    return 1
  else:
    return 0

In [7]:
df['label']=df.apply(conversion,axis=1)
df.label.value_counts()

label
1    68985
0    65213
Name: count, dtype: int64

In [9]:

from transformers import AutoTokenizer
from transformers import pipeline
#import preprocessor


In [11]:
def preprocess_tweet(tweet):

  # Lowercase the text
  tweet = tweet.lower()

  # Remove URLs and hashtags
  tweet = re.sub(r"http\S+|#\S+", "", tweet)

  # Remove mentions
  tweet = re.sub(r"@\S+", "@user", tweet)

  # Remove emojis (optional)
  tweet = re.sub(r"[^\w\s]", "", tweet)

  # Remove punctuation
  tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)

  # Remove extra spaces
  tweet = re.sub(r"\s+", " ", tweet).strip()

  # Remove stop words (optional)
  stop_words = set(stopwords.words("english"))
  tweet = ' '.join([word for word in tweet.split() if word not in stop_words])

  # Perform lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tweet.split()]

  preprocessed_tweet = " ".join(tokens)

  return preprocessed_tweet

In [12]:
df['clean_tweet']=df['tweet'].apply(preprocess_tweet)

In [13]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the tweets
tokenized_tweets = df['clean_tweet'].astype(str).apply(lambda x: tokenizer.encode(x, add_special_tokens=True))



In [14]:
max_length = max(len(tokens) for tokens in tokenized_tweets)

print(max_length)

95


In [15]:
X = pad_sequences(tokenized_tweets, maxlen=max_length, padding='post', truncating='post')

In [16]:
labels=df['label'].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [18]:

X_train.shape

(107358, 95)

In [19]:
#Access the vocabulary
vocab = tokenizer.get_vocab()

# Convert the vocabulary to a word index dictionary
word_index = {word: idx for idx, word in enumerate(vocab)}

In [20]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [55]:
def fake_news_detection_model(vocab_size, embedding_dim, lstm_units, num_heads, dropout_rate, max_tweet_length):
    inputs = Input(shape=(max_tweet_length,))
    embeddings = Embedding(vocab_size, embedding_dim)(inputs)

    lstm_out = Bidirectional(LSTM(lstm_units, return_sequences=True))(embeddings)
    drop_out = Dropout(0.2)(lstm_out)

    transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=num_heads, ff_dim=embedding_dim * 4)
    transformer_output = transformer_block(embeddings, training=True)

    concatenated = Concatenate()([drop_out, transformer_output])
    global_pooling = GlobalMaxPooling1D()(concatenated)
    global_pooling = Dropout(0.2)(global_pooling)

    outputs = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01))(global_pooling)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [56]:
vocab_size=len(word_index)+1
embedding_dim= 512
lstm_units=64
num_heads=8
dropout_rate=0.2
max_tweet_length=max_length

In [57]:
model=fake_news_detection_model(vocab_size, embedding_dim, lstm_units, num_heads, dropout_rate, max_tweet_length)

In [58]:
model.summary()

In [59]:
checkpoint=ModelCheckpoint(filepath='fake_news_detection_model.keras', monitor='val_accuracy', save_best_only=True, verbose=1)
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[checkpoint])

2024-09-25 22:26:27.407542: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.
2024-09-25 22:26:28.230540: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268435456 exceeds 10% of free system memory.
2024-09-25 22:26:28.916041: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 536870912 exceeds 10% of free system memory.
2024-09-25 22:26:30.287370: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 536870912 exceeds 10% of free system memory.
2024-09-25 22:26:31.600246: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 536870912 exceeds 10% of free system memory.


[1m 902/1678[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1:50:08[0m 9s/step - accuracy: 0.7985 - loss: 0.5157

In [None]:
# Set Seaborn style
sns.set(style="whitegrid")

# Plotting training and validation accuracy
plt.figure(figsize=(12, 8))

plt.subplot(2, 1, 1)
sns.lineplot(x=range(1, len(history.history['accuracy']) + 1), y=history.history['accuracy'], label='Training Accuracy', marker='o')
sns.lineplot(x=range(1, len(history.history['val_accuracy']) + 1), y=history.history['val_accuracy'], label='Validation Accuracy', marker='o')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plotting training and validation loss
plt.subplot(2, 1, 2)
sns.lineplot(x=range(1, len(history.history['loss']) + 1), y=history.history['loss'], label='Training Loss', marker='o')
sns.lineplot(x=range(1, len(history.history['val_loss']) + 1), y=history.history['val_loss'], label='Validation Loss', marker='o')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()  # Adjust layout for better spacing
plt.show()

In [None]:

# Evaluating the model
val_loss, val_accuracy = model.evaluate(X_test, y_test)
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
# Making Predictions
predictions = model.predict(X_test)
predicted_labels = [np.argmax(pred) for pred in predictions]
classes=np.argmax(predictions)

In [None]:
print("\nClassification Report:\n", classification_report(y_test, predictions.round()))

In [None]:
# Confusion matrix
cm = confusion_matrix(y_test, predictions.round())
plt.figure(figsize=(6,6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()