In [45]:
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertModel

**Importing Dataset**

In [46]:
# Read the fake-news and real-news CSV files into two separate frames.
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/fake-news-detection/Fake.csv",
        "../input/fake-news-detection/True.csv",
    )
)

In [47]:
# Preview the first rows of the fake-news frame.
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [48]:
# NOTE(review): this cell repeats the previous `df_fake.head()` cell verbatim;
# `df_true.head()` may have been intended here — confirm.
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [49]:
# Tag every row with its class: 0 for the fake-news frame, 1 for the real-news frame.
df_fake = df_fake.assign(label=0)
df_true = df_true.assign(label=1)

In [50]:
# Show (rows, columns) for both frames now that the label column has been added.
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

**Merging True and Fake Dataframes**

In [51]:
# Stack the fake rows on top of the real rows. Each frame keeps its own
# original row index here; the index is rebuilt later when the data is shuffled.
df_merge = pd.concat([df_fake, df_true], axis="index")
df_merge.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [52]:
# List the columns of the merged frame before dropping the unused ones.
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

**Removing columns which are not required**

In [53]:
# Keep only the article body and its label; title, subject and date are not used.
df = df_merge.drop(columns=["title", "subject", "date"])
df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [54]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

text     0
label    0
dtype: int64

**Randomly shuffling the DataFrame**

In [55]:
# df1 is bound to the same DataFrame object as df (plain assignment, no copy).
df1=df
df1.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [56]:
# Draw every row once in random order (seeded for reproducibility), then
# replace the old row labels with a fresh 0..n-1 index.
shuffled_data = df1.sample(frac=1, random_state=42)
shuffled_data = shuffled_data.reset_index(drop=True)
shuffled_data.head()

Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [57]:
# df2 is a second name for shuffled_data (plain assignment, no copy);
# the bare expression below displays the frame.
df2=shuffled_data
df2

Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1
...,...,...
44893,,0
44894,LONDON/TOKYO (Reuters) - British Prime Ministe...,1
44895,BERLIN (Reuters) - Chancellor Angela Merkel sa...,1
44896,Jesus f*cking Christ our President* is a moron...,0


In [58]:
# Pull out the text of every article labelled 1 (real) and print the Series.
is_real = df2["label"] == 1
news = df2.loc[is_real, "text"]
print(news)

1        WASHINGTON (Reuters) - U.S. President Donald T...
2        (Reuters) - Puerto Rico Governor Ricardo Rosse...
4        GLASGOW, Scotland (Reuters) - Most U.S. presid...
8        WASHINGTON (Reuters) - The State Department sa...
9         (This version of the story corrects the figur...
                               ...                        
44886    WASHINGTON (Reuters) - President Donald Trump ...
44888    PARIS (Reuters) - French businessman and art p...
44890    NAIROBI (Reuters) - Burundi s main opposition ...
44894    LONDON/TOKYO (Reuters) - British Prime Ministe...
44895    BERLIN (Reuters) - Chancellor Angela Merkel sa...
Name: text, Length: 21417, dtype: object


In [59]:
def wordopt(text):
    """Normalize a raw article string into lowercase word-only text.

    Steps, in order:
      1. lowercase the text;
      2. drop square-bracketed spans such as ``[edit]``;
      3. drop URLs (``http://``, ``https://`` or ``www.`` prefixed);
      4. drop HTML-style tags (``<...>``);
      5. replace every remaining non-word character with a space;
      6. delete leftover punctuation (only ``_`` survives step 5);
      7. delete any token that contains a digit.

    URL and tag removal must run before step 5: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces those patterns can no longer
    match.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave runs of spaces behind.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters, so this also turns them into spaces.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

**Defining dependent and independent variables**

In [60]:
# Independent variable (article text) and dependent variable (label).
x, y = df2["text"], df2["label"]

**Tokenize and Clean the Text Data**

In [61]:
# Maximum sequence length for tokenization.
max_len = 128
# Tokenizer loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")



In [66]:
def preprocess_text(texts, tokenizer, max_len):
    """Run a batch of texts through the tokenizer and return its tensors.

    Parameters
    ----------
    texts : pandas.Series or other
        Texts to encode. A Series is converted to a plain list first; any
        other value is handed to the tokenizer unchanged.
    tokenizer : callable
        Called as ``tokenizer(texts, max_length=..., padding=...,
        truncation=..., return_tensors=...)``; its result must support
        lookup by ``"input_ids"`` and ``"attention_mask"``.
    max_len : int
        Passed as ``max_length``; with ``padding="max_length"`` and
        ``truncation=True`` every sequence is requested at this length.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    batch = texts.tolist() if isinstance(texts, pd.Series) else texts

    tokenizer_options = dict(
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )
    encoding = tokenizer(batch, **tokenizer_options)

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    return input_ids, attention_mask

**Split the data into train and test sets**

In [71]:
# Hold out 20% of the shuffled articles for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df2["text"],
    df2["label"],
    random_state=42,
    test_size=0.2,
)

Tokenize training and testing data

In [72]:
# Tokenize both splits. preprocess_text converts a pandas Series to a list
# itself, so X_train and X_test are passed in as-is.
X_train_ids, X_train_mask = preprocess_text(X_train, tokenizer, max_len)
X_test_ids, X_test_mask = preprocess_text(X_test, tokenizer, max_len)

In [73]:
# Convert labels to TensorFlow tensors.
# The names y_train / y_test are rebound here, so the conversion must also
# accept its own output: np.asarray works on the pandas Series coming from
# train_test_split and on an already-converted tensor, whereas `.values`
# would fail if this cell were run a second time.
y_train = tf.convert_to_tensor(np.asarray(y_train))
y_test = tf.convert_to_tensor(np.asarray(y_test))


**Build BERT Classification Model**

BERT (Bidirectional Encoder Representations from Transformers) is a powerful pre-trained machine learning model used for understanding language. It reads text and represents each word according to the context in which it appears. This notebook uses DistilBERT, a smaller distilled variant of BERT.

For example:

In the sentence "The bank is on the river," BERT knows "bank" means a riverbank.

In "I need to go to the bank," it understands "bank" means a financial institution.

BERT achieves this by looking at the words on both the left and the right of each word at the same time (hence "bidirectional") to grasp the complete context, making it great for tasks like answering questions, translating languages, or finding the sentiment in a review.

In [74]:
def build_model(transformer, max_len):
    """Assemble a binary classifier on top of a DistilBERT encoder.

    Parameters
    ----------
    transformer : callable
        Encoder invoked as ``transformer(ids, attention_mask=mask)``; the
        first element of its output is used as the per-token output.
    max_len : int
        Length of the two integer input sequences.

    Returns
    -------
    tf.keras.Model
        Uncompiled model with inputs named ``"input_word_ids"`` and
        ``"attention_mask"`` and a single sigmoid output unit.
    """
    layers = tf.keras.layers

    # Two int32 inputs of length max_len: token ids and the attention mask.
    token_ids_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    mask_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    encoder_outputs = transformer(token_ids_in, attention_mask=mask_in)
    hidden_states = encoder_outputs[0]

    # Position 0 of every sequence holds the [CLS] token output.
    first_token_vec = hidden_states[:, 0, :]
    probability = layers.Dense(1, activation="sigmoid")(first_token_vec)

    return tf.keras.Model(inputs=[token_ids_in, mask_in], outputs=probability)


In [75]:
# Load the pre-trained DistilBERT encoder ("distilbert-base-uncased" checkpoint)
transformer = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


 *Build the classification model*

In [76]:
# Wrap the loaded encoder with the classification head defined in build_model.
model = build_model(transformer, max_len)


In [77]:
# Configure training: Adam at a 5e-5 learning rate, binary cross-entropy loss
# for the single sigmoid output, and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
)

**Train the Model**

In [78]:
# Batch size shared by the training and evaluation pipelines.
BATCH_SIZE = 16

# Training pipeline. Keras does not shuffle tf.data inputs itself, so shuffle
# here: a buffer covering the whole training set gives a full reshuffle on
# every epoch, and the seed keeps runs repeatable.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((
        {"input_word_ids": X_train_ids, "attention_mask": X_train_mask}, y_train
    ))
    .shuffle(buffer_size=int(X_train_ids.shape[0]), seed=42)
    .batch(BATCH_SIZE)
)

# Evaluation pipeline. Order is deliberately left untouched so that the
# predictions made from this dataset stay aligned with y_test.
test_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_word_ids": X_test_ids, "attention_mask": X_test_mask}, y_test
)).batch(BATCH_SIZE)


In [83]:
# Fine-tune for 10 epochs, evaluating on the held-out dataset after each one.
model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7ca31f5a8070>

**Evaluate the Model's Performance**

In [84]:
# Score the test set, flatten the predictions to 1-D, and threshold at 0.5
# to obtain hard 0/1 labels.
y_pred = (model.predict(test_dataset).reshape(-1) > 0.5).astype(int)



In [91]:
# Per-class precision / recall / F1 (label 0 -> "Fake", label 1 -> "Real").
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=["Fake", "Real"],
    )
)

              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00      4710
        Real       1.00      1.00      1.00      4270

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

