In [1]:
import pandas as pd

**Name:** Izza Yaqoob (RAI_008)

**Base article:** Fake News Classification using transformer based enhanced LSTM
and BERT

**Link:** https://www.sciencedirect.com/science/article/pii/S2666307422000092

**Repository Cloning**

In [2]:
import os
import subprocess

# URL of the GitHub repository and the local directory it is cloned into
repo_url = "https://github.com/KaiDMML/FakeNewsNet"
clone_dir = "FakeNewsNet"

# This cell ends with a chdir into the clone, so a re-run starts from *inside*
# the repository. Detect that case; otherwise the relative existence check
# below would miss the clone and a nested copy would be cloned.
already_inside_clone = (
    os.path.basename(os.getcwd()) == clone_dir and os.path.isdir(".git")
)

if already_inside_clone:
    print("Repository already exists.")
    print(f"Working directory unchanged: {os.getcwd()}")
else:
    # Check if already cloned
    if not os.path.exists(clone_dir):
        print("Cloning FakeNewsNet repository...")
        # check=True raises CalledProcessError when git fails, so the success
        # message below is only printed after a clone that actually worked.
        subprocess.run(["git", "clone", repo_url, clone_dir], check=True)
        print("Repository cloned successfully.")
    else:
        print("Repository already exists.")

    # Change directory into the repo
    os.chdir(clone_dir)
    print(f"Changed working directory to: {os.getcwd()}")


Cloning FakeNewsNet repository...
Repository cloned successfully.
Changed working directory to: /content/FakeNewsNet


**Reading both fake and real datasets of gossipcop and politifact**

In [3]:
# Load the two GossipCop CSVs and tag every row with its class
# (0 = fake, 1 = real).
gossipcop_fake = pd.read_csv(
    "/content/FakeNewsNet/dataset/gossipcop_fake.csv"
).assign(label=0)
gossipcop_real = pd.read_csv(
    "/content/FakeNewsNet/dataset/gossipcop_real.csv"
).assign(label=1)

# Stack fake rows first and real rows second under a fresh 0..n-1 index,
# keeping only the headline text and its label.
gossipcop_frames = [gossipcop_fake, gossipcop_real]
data1 = pd.concat(gossipcop_frames, ignore_index=True)[['title','label']]


In [4]:
data1.isnull().sum()

Unnamed: 0,0
title,0
label,0


In [5]:
data1.head()

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,0
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,0
2,Celebrities Join Tax March in Protest of Donal...,0
3,Cindy Crawford's daughter Kaia Gerber wears a ...,0
4,Full List of 2018 Oscar Nominations – Variety,0


In [6]:
# Load the two PolitiFact CSVs and tag every row with its class
# (0 = fake, 1 = real).
politifact_fake = pd.read_csv(
    "/content/FakeNewsNet/dataset/politifact_fake.csv"
).assign(label=0)
politifact_real = pd.read_csv(
    "/content/FakeNewsNet/dataset/politifact_real.csv"
).assign(label=1)

# Stack fake rows first and real rows second under a fresh 0..n-1 index,
# keeping only the headline text and its label.
politifact_frames = [politifact_fake, politifact_real]
data2 = pd.concat(politifact_frames, ignore_index=True)[['title','label']]

In [7]:
data2.head()

Unnamed: 0,title,label
0,BREAKING: First NFL Team Declares Bankruptcy O...,0
1,Court Orders Obama To Pay $400 Million In Rest...,0
2,UPDATE: Second Roy Moore Accuser Works For Mic...,0
3,Oscar Pistorius Attempts To Commit Suicide,0
4,Trump Votes For Death Penalty For Being Gay,0


**Shapes of data**

In [8]:
# Report (rows, columns) for each source frame and for the two combined frames.
shape_report = (
    ('Gossipcop Fake: ', gossipcop_fake),
    ('Gossipcop Real: ', gossipcop_real),
    ('Politifact Fake: ', politifact_fake),
    ('Politifact Real: ', politifact_real),
    ('Concatenated Gossipcop: ', data1),
    ('Concatenated Politifact: ', data2),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

Gossipcop Fake:  (5323, 5)
Gossipcop Real:  (16817, 5)
Politifact Fake:  (432, 5)
Politifact Real:  (624, 5)
Concatenated Gossipcop:  (22140, 2)
Concatenated Politifact:  (1056, 2)


**Reducing the number of samples in the Gossipcop data**

In [9]:
# data1 holds the fake rows first and the real rows last, so slicing 1056 rows
# off each end keeps 1056 samples of each class.
first_1056 = data1.iloc[:1056]
last_1056 = data1.iloc[-1056:]
data1 = pd.concat((first_1056, last_1056), ignore_index=True)
data1.shape

(2112, 2)

In [10]:
print(data1['title'].isna().sum())

0


In [11]:
import re #built-in regular expressions module

import string

import nltk  #Natural Language Toolkit, a key Python package for NLP tasks

from nltk.corpus import stopwords  # Common words like "the", "is", "in", etc., usually removed from text

from nltk.tokenize import word_tokenize  # Splits text into individual words (tokens)

# Download the NLTK data used below:
#   punkt / punkt_tab - tokenizer models behind word_tokenize (punkt_tab is a
#                       real resource, not a typo: it is the tokenizer data
#                       that newer NLTK releases look up)
#   stopwords         - predefined stopword lists for multiple languages
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# English stopwords minus "not" and "can": those two words stay in the text so
# that negations such as "can not" survive the stopword filter.
custom_stopwords = set(stopwords.words('english')).difference({"not", "can"})

# Function to expand contractions like "can't" to "can not"

def expand_contractions(text):
    """Rewrite every apostrophe + "t" contraction suffix as " not".

    Both the straight (') and the typographic (’) apostrophe are recognised.
    The suffix must end at a word boundary, so "can't" becomes "can not" and
    "isn’t" becomes "isn not", while "'twas" and "it's" are left unchanged.

    Args:
        text: The string to rewrite.

    Returns:
        A new string with each matched suffix replaced by " not".
    """
    negation_suffix = re.compile(r"['’']t\b")
    return negation_suffix.sub(" not", text)

# Function to clean and tokenize text

def preprocess_text(text):
    """Clean one headline and return its tokens with stopwords removed.

    Pipeline: lowercase -> rewrite "'t" contractions with
    ``expand_contractions`` -> drop @mentions -> drop every character that is
    not a word character, whitespace or "?" -> drop digits and underscores ->
    strip -> tokenize with NLTK -> remove ``custom_stopwords``.

    Args:
        text: Normally a string. ``None`` and float NaN are treated as missing
            and give an empty token list. A list/tuple (e.g. the output of an
            earlier call) is joined with spaces and cleaned again. Any other
            value is converted with ``str()``.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Missing values: str(None) / str(nan) would otherwise be tokenized as the
    # words "none" / "nan" and leak into the features.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if isinstance(text, (list, tuple)):
        # Already-tokenized input: rebuild the sentence instead of tokenizing
        # the Python repr of the list (brackets, quotes and commas).
        text = " ".join(str(token) for token in text)
    elif not isinstance(text, str):
        text = str(text)

    # Lowercase the text
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text)

    # Remove @mentions (\w matches word characters)
    text = re.sub(r"@\w+", "", text)

    # Remove special characters, keeping word characters, whitespace and "?"
    text = re.sub(r"[^\w\s?]", "", text)

    # Remove digits and underscores (optional, depending on context)
    text = re.sub(r"[\d_]", "", text)

    # Remove leading and trailing whitespace
    text = text.strip()

    # Tokenize, explicitly specifying the language
    tokens = word_tokenize(text, language='english')

    # Remove stopwords except "not" and "can"
    filtered_tokens = [word for word in tokens if word not in custom_stopwords]

    return filtered_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Testing Pre-processing steps on sample**

In [12]:
example = "I can’t believe @jack said that! Isn’t it amazing? Honestly, it shouldn't happen."
print(preprocess_text(example))

['can', 'not', 'believe', 'said', 'not', 'amazing', '?', 'honestly', 'not', 'happen']


**Applying the pre-processing steps to the datasets**

In [13]:
# Replace each raw GossipCop title with the token list from preprocess_text.
# NOTE: this overwrites data1['title'] in place, so re-running the cell passes
# token lists (not the original strings) back into preprocess_text.
data1['title'] = data1['title'].apply(preprocess_text)
data1

Unnamed: 0,title,label
0,"[miley, cyrus, liam, hemsworth, secretly, get,...",0
1,"[paris, jackson, cara, delevingne, enjoy, nigh...",0
2,"[celebrities, join, tax, march, protest, donal...",0
3,"[cindy, crawfords, daughter, kaia, gerber, wea...",0
4,"[full, list, oscar, nominations, variety]",0
...,...,...
2107,"[hollywood, film, awards, complete, list, winn...",1
2108,"[jada, pinkett, smith, explains, son, jaden, m...",1
2109,"[tinsley, mortimer, reacts, luann, de, lesseps...",1
2110,"[prince, harry, carries, princess, dianas, leg...",1


In [14]:
# Replace each raw PolitiFact title with the token list from preprocess_text.
# NOTE: this overwrites data2['title'] in place, so re-running the cell passes
# token lists (not the original strings) back into preprocess_text.
data2['title'] = data2['title'].apply(preprocess_text)

In [15]:
data2.tail()

Unnamed: 0,title,label
1051,"[flake, religious, tests, place, senate]",1
1052,"[change, can, believe]",1
1053,"[deputy, director, national, health, statistic...",1
1054,"[romneys, prolife, conversion, myth, reality, ...",1
1055,"[interest, group, ratings]",1


In [16]:
from sklearn.model_selection import train_test_split
# Features are the tokenized titles and targets are the 0/1 labels.
# Suffix 1 = GossipCop (data1), suffix 2 = PolitiFact (data2).
X1, Y1 = data1['title'], data1['label']
X2, Y2 = data2['title'], data2['label']

# 80/20 split per dataset, stratified on the label so both splits keep the
# class ratio; the fixed random_state makes the split repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, Y1, test_size=0.2, stratify=Y1, random_state=42
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, Y2, test_size=0.2, stratify=Y2, random_state=42
)

In [17]:
X_train1 , y_train1

(1959        [dylan, farrow, telling, truth, woody, allen]
 173     [anna, faris, uncomfortable, watching, chris, ...
 594     [royal, life, ruining, meghan, markles, sexy, ...
 1314    [famously, singleseason, anyone, else, watchin...
 1133    [meghan, markles, ex, trevor, engelson, marrie...
                               ...                        
 954     [sofia, vergara, joe, manganiello, defend, mar...
 1962    [xxxtentacions, posthumous, album, arrives, ba...
 1316    [selena, gomez, shopping, justin, biebers, clo...
 1187    [handmaids, tale, season, episode, recap, unwo...
 1222                       [really, costs, go, coachella]
 Name: title, Length: 1689, dtype: object,
 1959    1
 173     0
 594     0
 1314    1
 1133    1
        ..
 954     0
 1962    1
 1316    1
 1187    1
 1222    1
 Name: label, Length: 1689, dtype: int64)

In [18]:
X_train2 , y_train2

(641                 [week, transcript, adm, mike, mullen]
 936                 [kerrymccain, welcome, massachusetts]
 551     [rd, democratic, debate, transcript, annotated...
 580     [ad, says, obama, apologized, showed, weakness...
 609     [average, cable, tv, bill, cited, article, ind...
                               ...                        
 1035     [latest, political, news, headlines, dc, beyond]
 221       [manager, killed, employees, checkers, st, ave]
 761     [pwned, house, gop, dominates, twitter, youtub...
 1010    [employment, hours, earnings, current, employm...
 418     [senate, report, admits, clinton, gifted, chil...
 Name: title, Length: 844, dtype: object,
 641     1
 936     1
 551     1
 580     1
 609     1
        ..
 1035    1
 221     0
 761     1
 1010    1
 418     0
 Name: label, Length: 844, dtype: int64)

In [19]:
!pip install --upgrade tensorflow-hub tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting tensorflow<2.20,>=2.19.0 (from tensorflow_text)
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19.0->tensorflow_text)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow<2.20,>=2.19.0->tensorflow_text)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.
Collecting tf-keras>=2.14.1 (from tensorflow-hub)
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tensorflow_text-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [20]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
# TF Hub handles for the uncased English BERT preprocessing model and the
# matching L-12 / H-768 / A-12 encoder.
# NOTE(review): the `tensorflow_text` import above looks unused but is
# presumably needed to register the ops the preprocessing model uses - confirm
# before removing it.
bert_preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
bert_encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

preprocessor = hub.KerasLayer(bert_preprocess_url)
encoder = hub.KerasLayer(bert_encoder_url, trainable=True)



In [21]:
X_train1.shape

(1689,)

In [None]:
def get_bert_embeddings(texts, batch_size=32):
    """Return the BERT ``pooled_output`` embedding for each tokenized text.

    Every token list is joined back into one space-separated string, run
    through the module-level ``preprocessor`` and ``encoder`` layers, and the
    encoder's ``pooled_output`` is collected. Texts are processed in batches
    so that a whole split is never pushed through the encoder in one call.

    Args:
        texts: Iterable of token lists (e.g. a pandas Series of lists of str).
        batch_size: Number of texts encoded per forward pass. Must be >= 1.

    Returns:
        A tensor with one row per input text, in input order (presumably of
        shape ``(len(texts), 768)`` for the H-768 encoder - confirm).

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Join the list of words into a single string for each text
    sentences = [" ".join(tokens) for tokens in texts]
    if not sentences:
        raise ValueError("texts must contain at least one item")

    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch_input = tf.constant(sentences[start:start + batch_size])
        encoder_inputs = preprocessor(batch_input)
        encoder_outputs = encoder(encoder_inputs)
        pooled_batches.append(encoder_outputs['pooled_output'])

    # Stitch the per-batch results back together along the sample axis.
    return tf.concat(pooled_batches, axis=0)

# Get BERT embeddings for the GossipCop training split
train_embeddings1 = get_bert_embeddings(X_train1)

print(train_embeddings1.shape)
print(type(train_embeddings1))


In [None]:
# Embed the GossipCop test split with get_bert_embeddings.
test_embeddings1 = get_bert_embeddings(X_test1)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM

# model = Sequential()
# lstm_layer = tf.keras.layers.LSTM(units=256, input_shape=(None, 768))


# Description
# Lstm layer = (768 x 256)
# Feed-forward layer = (256 x 128)
# Batch normalization = 128
# Dropout layer with ratio = 0.6
# Feed-forward layer = (128 x 32)
# Feed-forward layer = (32 x 2)
# output = fake/real


In [None]:
# Embed the PolitiFact training and test splits with get_bert_embeddings.
train_embeddings2 = get_bert_embeddings(X_train2)
test_embeddings2 = get_bert_embeddings(X_test2)