# Fake News Detection Model Using TensorFlow

 Importing libraries and dataset

In [2]:
import pandas as pd #For data handling
import numpy as np #For mathematical operations
import tensorflow as tf # For deeplearning model creation
import nltk # Natural language processing(stopwords)
from  nltk.corpus import stopwords #Remove common words like is,and,the.
import re #Regular expression(text cleaning)
import string #Punctuation handling
from sklearn.feature_extraction.text import TfidfVectorizer #Convert text to numerical vectors
from sklearn.preprocessing import LabelEncoder #Convert REAL/FAKE into 0/1
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential #Keras model type
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout # Neural network layers
from sklearn.metrics import accuracy_score,classification_report # Model evaluation
df=pd.read_csv("/content/news.csv",usecols=["text","label"]) # Loads only text and label columns
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [3]:
df.shape

(6335, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6335 non-null   object
 1   label   6335 non-null   object
dtypes: object(2)
memory usage: 99.1+ KB


In [5]:
df['label'].value_counts() #Checking class balance

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
REAL,3171
FAKE,3164


In [6]:
df.isna().sum() #Checking missing values

Unnamed: 0,0
text,0
label,0


In [7]:
df['text']=df['text'].astype(str)#convert all text to string,ensure all text is string type.

In [8]:
nltk.download('stopwords') # Download stopwords
stop_words=set(stopwords.words('english')) #converts stopwords into a set for faster lookup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Text cleaning function

In [9]:
def clean_text(text):
 text=text.lower() #Convert text to lowercase
 text=re.sub(r'\d+','',text) #Remove the occurences of one or more digits
 tokens=text.translate(str.maketrans('','',string.punctuation))# Remove punctuation
 text=re.sub(r'[^a-zA-Z\s]','',text) # Remove whitespace character,uppercase,lowercase
 words=text.split() # split into words
 words=[w for w in words if w not in stop_words] #Remove stopwords
 return " ".join(words) #Join words back into cleaned sentence




In [10]:
df['clean_text']=df['text'].apply(clean_text)# Apply cleaning to dataset

In [11]:
print(df[['text','clean_text']].head())

                                                text  \
0  Daniel Greenfield, a Shillman Journalism Fello...   
1  Google Pinterest Digg Linkedin Reddit Stumbleu...   
2  U.S. Secretary of State John F. Kerry said Mon...   
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...   
4  It's primary day in New York and front-runners...   

                                          clean_text  
0  daniel greenfield shillman journalism fellow f...  
1  google pinterest digg linkedin reddit stumbleu...  
2  us secretary state john f kerry said monday st...  
3  kaydee king kaydeeking november lesson tonight...  
4  primary day new york frontrunners hillary clin...  


Encode labels

In [12]:
label_encoder=LabelEncoder()
df['label_encoded']=label_encoder.fit_transform(df['label'])

Split Features and Target

In [13]:
x=df['clean_text']
y=df['label_encoded']

Train-Test split

In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

TF-IDF vectorization

In [15]:
#Convert text into numbers
tfidf=TfidfVectorizer(
    max_features=5000, #Top 5000important words/most freequent terms
    ngram_range=(1,2) #unigrams & bigrams
)

In [16]:
#Transform text to vectors
x_train_tfidf=tfidf.fit_transform(x_train).toarray()
x_test_tfidf=tfidf.transform(x_test).toarray()

Build the neural network and compile the model

In [17]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train_tfidf.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
#Train the model
history = model.fit(
    x_train_tfidf,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test_tfidf, y_test)
)

Epoch 1/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.7338 - loss: 0.5365 - val_accuracy: 0.9242 - val_loss: 0.1946
Epoch 2/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9477 - loss: 0.1391 - val_accuracy: 0.9321 - val_loss: 0.1787
Epoch 3/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9868 - loss: 0.0574 - val_accuracy: 0.9329 - val_loss: 0.1834
Epoch 4/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9935 - loss: 0.0290 - val_accuracy: 0.9361 - val_loss: 0.1979
Epoch 5/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9980 - loss: 0.0129 - val_accuracy: 0.9290 - val_loss: 0.2248
Epoch 6/10
[1m159/159[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9990 - loss: 0.0066 - val_accuracy: 0.9329 - val_loss: 0.2464
Epoch 7/10
[1m159/159

In [19]:
y_pred = (model.predict(x_test_tfidf) > 0.5).astype(int) #Evaluate model performance

print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2))
print("Classification Report:\n", classification_report(y_test, y_pred))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Accuracy: 93.21
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93       633
           1       0.93      0.93      0.93       634

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



Predicting new unseen articles

In [20]:
new_articles = [
    "Government launches new digital education initiative",
    "Aliens officially land in Delhi confirms secret document",
    "Everybody said sun rises in the west "
]
# Clean and vectorize new text
clean_new = [clean_text(text) for text in new_articles]
new_tfidf = tfidf.transform(clean_new).toarray()

preds = (model.predict(new_tfidf) )



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


In [21]:
probs = model.predict(new_tfidf)#Predict probabilities
#Final prediction output
for i, text in enumerate(new_articles):
    print("\nArticle:", text)
    print("Probability:", probs[i][0])
    print("Prediction:", "FAKE ❌" if probs[i][0] > 0.5 else "REAL ✅")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Article: Government launches new digital education initiative
Probability: 0.0110256
Prediction: REAL ✅

Article: Aliens officially land in Delhi confirms secret document
Probability: 0.00089917996
Prediction: REAL ✅

Article: Everybody said sun rises in the west 
Probability: 0.99434954
Prediction: FAKE ❌
