#### Fake news detection using machine learning

In [1]:
# Loading librairies
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import streamlit as st

In [2]:
# Loading the dataset
# Link to the dataset
datalink = 'https://raw.githubusercontent.com/kueyram/Fake-News-Detection/refs/heads/main/data/news.csv'

# Importing the dataset into pandas
news_data = pd.read_csv(datalink, sep=',')

# Checking the size of the dataset
news_data.shape

# Printing random rows from the dataset
news_data.sample(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,class
22253,22244.0,Boiler Room #95 – Weapons of Mass Penetration,Tune in to the Alternate Current Radio Network...,US_News,3-Feb-17,0.0
30045,17850.0,"In rightward shift, Dutch PM seals new governm...",THE HAGUE (Reuters) - Dutch Prime Minister Mar...,worldnews,10-Oct-17,1.0
16342,16339.0,A VOTE FOR HILLARY IS A VOTE FOR WW III: Are Y...,,Government News,28-Oct-16,0.0
9249,9249.0,CREEPY BERNIE Calls For Trump To Step Down…Wha...,"Yesterday, Senator Bernie Sanders, I-Vt., went...",politics,8-Dec-17,0.0
8123,8123.0,Watch: Trump Doesn’t Know What To Do During M...,You may have seen the latest Republican debate...,News,14-Feb-16,0.0


In [3]:
# Checking missing values
# Dropping rows with missing values in critical columns
# The critical columns are 'title', 'text' and 'class'
news_data = news_data.dropna(subset=['title', 'text', 'class'])

# Checking the shape after dropping the columns with missing values
news_data.shape

(33594, 6)

In [4]:
# Data cleaning
# Function to clean the data
# We are going to remove urls, remove non-words characters and extra spcaces

def clean_text(text):
    text = re.sub(r'https?://\S+', '', text)  # Removing urls
    text = re.sub(r'\W', ' ', text)  # Removing non-word characters
    text = re.sub(r'\s+', ' ', text)  # Removing extra spaces
    return text.lower().strip() # Converting to lower cases

# Applying the function to the title and text columns
news_data['title'] = news_data['title'].apply(clean_text)
news_data['text'] = news_data['text'].apply(clean_text)

In [5]:
# We are going to combine title and text for better feature extraction
news_data['combined_text'] = news_data['title'] + " " + news_data['text']

# Features and labels
features = news_data['combined_text']
labels = news_data['class']

In [6]:
# Splitting the dataset into test and training set
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

# We are going to convert the text to numerical data using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # We are limitting to top 5000 features for efficiency

# Fit and transform the training data
features_train_tfidf = tfidf.fit_transform(features_train)

# Transforming test data
features_test_tfidf = tfidf.transform(features_test) 

In [7]:
# Logistic regression
# Training a simple regression model
# Initialising the model
model = LogisticRegression()

# Training the model on the training set
model.fit(features_train_tfidf, labels_train)  # Train the model on the training set

In [8]:
# Making predictions
labels_pred = model.predict(features_test_tfidf)

In [9]:
# Model evaluation
print("Accuracy:", accuracy_score(labels_test, labels_pred))
print("Classification Report:\n", classification_report(labels_test, labels_pred))

Accuracy: 0.9897306146748028
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      4453
         1.0       0.99      0.98      0.98      2266

    accuracy                           0.99      6719
   macro avg       0.99      0.99      0.99      6719
weighted avg       0.99      0.99      0.99      6719



In [10]:
# Function to test the model with new data
# The function will predict whether a given text is fake or real news.

def predict_fake_news(text):
    cleaned_text = clean_text(text)
    transformed_text = tfidf.transform([cleaned_text])
    prediction = model.predict(transformed_text)
    return "Fake News" if prediction[0] == 1 else "Real News"

In [11]:
# Testing the function
example_text1 = "Breaking: The government announces a major breakthrough in COVID-19 vaccine research."
print(predict_fake_news(example_text1))

Real News


In [None]:
# Streamlit interface
st.title("Fake News Detection App")
st.write("Enter the news article text below to determine if it is real or fake.")

# Input field
news_input = st.text_area("Enter News Text", placeholder="Type the news text here...")

if st.button("Analyze"):
    if news_input.strip():
        # Make prediction
        result = predict_fake_news(news_input)
        st.subheader(f"The news article is: **{result}**")
    else:
        st.error("Please enter a text to analyze.")