# Data Exploration for Fake News Detection

This notebook provides comprehensive data exploration for the fake news detection dataset.


In [None]:

# %pip installs into the environment of the running kernel; nltk is added because this notebook imports it
%pip install pandas scikit-learn matplotlib joblib openpyxl nltk

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np


In [7]:
# Read both CSV files, tag each with its class label, then merge and shuffle
print("Loading datasets...")

# Fake articles carry label 0
fake_df = pd.read_csv('Fake.csv')
fake_df.loc[:, 'label'] = 0

# True articles carry label 1
true_df = pd.read_csv('True.csv')
true_df.loc[:, 'label'] = 1

print(f"Fake news dataset shape: {fake_df.shape}")
print(f"True news dataset shape: {true_df.shape}")

# Stack the two frames under a fresh 0..n-1 index
df = pd.concat((fake_df, true_df), ignore_index=True)
print(f"Combined dataset shape: {df.shape}")

# Sampling 100% of the rows with a fixed seed gives a reproducible shuffle
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

print("Datasets loaded and combined successfully!")


Loading datasets...
Fake news dataset shape: (23481, 5)
True news dataset shape: (21417, 5)
Combined dataset shape: (44898, 5)
Datasets loaded and combined successfully!


# Missing Values Check

This section performs a comprehensive check for missing values in the dataset.


In [8]:
# Simple Missing Values Check

def check_missing_values(df):
    """
    Print a missing-value report for a DataFrame.

    The report covers the frame's size and column names, the count and share
    of missing values per column, the index labels of the rows in which each
    column is missing (at most the first 20 per column), overall totals, and
    any rows in which every column is missing.

    Args:
        df: pandas DataFrame to inspect. It is not modified.

    Returns:
        None. The report is written to standard output.
    """
    max_listed_rows = 20  # cap on the row labels printed per column

    total_rows = len(df)
    total_columns = len(df.columns)

    # Build the null mask once; every section of the report reuses it.
    null_mask = df.isnull()
    missing_counts = null_mask.sum()

    print("="*80)
    print("MISSING VALUES ANALYSIS")
    print("="*80)

    print("\n📊 DATASET INFO:")
    print(f"Total rows: {total_rows:,}")
    print(f"Total columns: {total_columns}")
    print(f"Columns: {list(df.columns)}")

    # Columns are addressed by position so duplicate column names still work.
    print("\n📋 MISSING VALUES COUNT:")
    for position, column in enumerate(df.columns):
        missing_count = int(missing_counts.iloc[position])
        if missing_count > 0:
            # A positive count implies at least one row, so the division is safe.
            print(f"🔴 {column}: {missing_count:,} missing values ({missing_count/total_rows*100:.2f}%)")
        else:
            print(f"✅ {column}: No missing values")

    # Show specific row numbers (index labels) for missing values
    print("\n📍 ROW NUMBERS WITH MISSING VALUES:")
    has_missing = False

    for position, column in enumerate(df.columns):
        column_mask = null_mask.iloc[:, position].to_numpy()
        missing_rows = df.index[column_mask].tolist()
        if not missing_rows:
            continue
        has_missing = True
        print(f"\n🔴 {column} (missing in {len(missing_rows)} rows):")
        if len(missing_rows) <= max_listed_rows:
            print(f"   Row numbers: {missing_rows}")
        else:
            print(f"   First 20 rows: {missing_rows[:max_listed_rows]}")
            print(f"   ... and {len(missing_rows) - max_listed_rows} more rows")

    if not has_missing:
        print("✅ No missing values found in any column!")

    # Summary statistics
    total_missing = int(missing_counts.sum())
    total_cells = total_rows * total_columns
    # A frame with no cells has nothing missing; report 0% instead of dividing by zero.
    missing_percentage = total_missing / total_cells * 100 if total_cells else 0.0

    print("\n📈 SUMMARY:")
    print(f"Total missing values: {total_missing:,}")
    print(f"Total cells in dataset: {total_cells:,}")
    print(f"Percentage of missing data: {missing_percentage:.4f}%")

    # Check for completely empty rows
    empty_row_mask = null_mask.all(axis=1).to_numpy()
    empty_rows = int(empty_row_mask.sum())
    if empty_rows > 0:
        empty_row_indices = df.index[empty_row_mask].tolist()
        print(f"\n🚨 COMPLETELY EMPTY ROWS: {empty_rows}")
        print(f"   Empty row numbers: {empty_row_indices}")
    else:
        print("\n✅ No completely empty rows found")

# Print the missing-value report for the dataset held in `df`
check_missing_values(df)


MISSING VALUES ANALYSIS

📊 DATASET INFO:
Total rows: 44,898
Total columns: 5
Columns: ['title', 'text', 'subject', 'date', 'label']

📋 MISSING VALUES COUNT:
✅ title: No missing values
✅ text: No missing values
✅ subject: No missing values
✅ date: No missing values
✅ label: No missing values

📍 ROW NUMBERS WITH MISSING VALUES:
✅ No missing values found in any column!

📈 SUMMARY:
Total missing values: 0
Total cells in dataset: 224,490
Percentage of missing data: 0.0000%

✅ No completely empty rows found


# Text Preprocessing and Cleaning

This section contains comprehensive text cleaning functions for preprocessing the news articles.


In [10]:
# Import additional libraries for text processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Download required NLTK data
print("Downloading NLTK data...")

# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resources recent
# NLTK releases load for word_tokenize() and pos_tag(); the older 'punkt' and
# 'averaged_perceptron_tagger' packages are kept for earlier releases.
nltk_resources = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
]

# nltk.download() signals failure through its boolean return value, so
# collect the resources that could not be fetched instead of assuming success.
failed_resources = [
    resource for resource in nltk_resources
    if not nltk.download(resource, quiet=True)
]

if failed_resources:
    print(f"WARNING: could not download NLTK resources: {failed_resources}")
else:
    print("NLTK data downloaded successfully!")


Downloading NLTK data...
NLTK data downloaded successfully!


In [11]:
# Basic Text Cleaning Functions

def remove_urls(text):
    """Strip http(s):// links and www.-prefixed addresses from a string.

    Each match runs up to the next whitespace character and is replaced by
    the empty string; surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html_tags(text):
    """Delete every `<...>` span from a string.

    The match is non-greedy and does not cross line breaks, so a `<` and `>`
    on different lines are not treated as one tag.
    """
    return re.sub(r'<.*?>', '', text)

def remove_emails(text):
    """Delete every whitespace-delimited token that has an `@` between other characters."""
    return re.sub(r'\S+@\S+', '', text)

def remove_special_characters(text, keep_apostrophes=True):
    """Drop every character that is not an ASCII letter, a digit, or whitespace.

    When `keep_apostrophes` is truthy the straight apostrophe (') is also
    preserved so contractions stay intact.
    """
    disallowed = r'[^a-zA-Z0-9\s\']' if keep_apostrophes else r'[^a-zA-Z0-9\s]'
    return re.compile(disallowed).sub('', text)



In [14]:
# Comprehensive Text Cleaning Pipeline

def clean_text_basic(text):
    """
    Basic text cleaning pipeline
    Suitable for models that need to preserve some original structure

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip email
    addresses, then drop every character that is not an ASCII letter, digit,
    whitespace, or apostrophe.

    Args:
        text: Value to clean; anything that is not already a string is
            converted with str().

    Returns:
        The cleaned string, or "" when `text` is missing (pd.isna).
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # HTML tags must go first: the URL pattern consumes everything up to the
    # next whitespace, so on '<a href="http://x.com">link</a>' it would eat
    # the closing '>' and the link text, leaving an unterminated tag behind.
    text = remove_html_tags(text)
    text = remove_urls(text)
    text = remove_emails(text)

    # Remove special characters but keep apostrophes
    text = remove_special_characters(text, keep_apostrophes=True)

    return text


In [15]:
# Run the basic cleaning pipeline over the title and text columns

print("Applying text cleaning to the dataset...")
print("This may take a few minutes for large datasets...")

# Keep copies of the raw columns so cleaned output can be compared against them
df['title_original'] = df['title'].copy()
df['text_original'] = df['text'].copy()

# Basic cleaning level (good for neural networks)
title_basic = df['title'].apply(clean_text_basic)
text_basic = df['text'].apply(clean_text_basic)
df['title_basic'] = title_basic
df['text_basic'] = text_basic
del title_basic, text_basic  # the values now live on df; drop the temporaries

print("Text cleaning completed!")
print(f"Dataset shape after adding cleaned columns: {df.shape}")
print(f"New columns: {[col for col in df.columns if 'title_' in col or 'text_' in col]}")


Applying text cleaning to the dataset...
This may take a few minutes for large datasets...
Text cleaning completed!
Dataset shape after adding cleaned columns: (44898, 9)
New columns: ['title_original', 'text_original', 'title_basic', 'text_basic']


In [17]:
# Preview the first five rows. A bare last expression uses the notebook's
# table rendering; embedding the frame in an f-string pushed the first header
# line out of alignment with the rows beneath it.
df.head(5)

first five rows                                                title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  \
0    February 13, 2017      0   
1       April 5, 2017       1   
2  September 27, 2017       1   
3         May 22, 2017      0   
4       June 24, 2016       1   

                           

In [None]:
# Advanced Text Preprocessing Functions

# English stopwords, held as a set for the membership tests in remove_stopwords
stop_words = {*stopwords.words('english')}

# Shared lemmatizer and stemmer instances used by the helper functions
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with nltk.pos_tag, and the first letter of
    that tag selects adjective, noun, verb, or adverb. Any other tag falls
    back to noun.
    """
    (_, pos_label), = nltk.pos_tag([word])
    initial = pos_label[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun
    return wordnet.NOUN

def remove_stopwords(text, custom_stopwords=None):
    """Lower-case and tokenize `text`, drop stopwords, and re-join with single spaces.

    `custom_stopwords`, when given and non-empty, is an iterable of extra
    words to drop in addition to the module-level `stop_words` set.
    """
    tokens = word_tokenize(text.lower())

    # Extend the default set only when the caller supplied extra words
    if custom_stopwords:
        blocked = stop_words.union(set(custom_stopwords))
    else:
        blocked = stop_words

    kept = []
    for token in tokens:
        if token not in blocked:
            kept.append(token)
    return ' '.join(kept)

def stem_text(text):
    """Lower-case and tokenize `text`, stem each token, and re-join with single spaces."""
    tokens = word_tokenize(text.lower())
    return ' '.join(map(stemmer.stem, tokens))

def lemmatize_text(text):
    """Apply lemmatization to text with POS tagging

    The text is lower-cased and tokenized, the whole token list is POS-tagged
    in a single nltk.pos_tag call (so each word is tagged in the context of
    its neighbours, and the tagger is invoked once per text rather than once
    per word), and each token is lemmatized with the WordNet POS that matches
    the first letter of its tag.

    Args:
        text: String to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces ("" for empty input).
    """
    words = word_tokenize(text.lower())

    # First letter of the tag -> WordNet POS; anything else is treated as a noun
    tag_to_wordnet = {"J": wordnet.ADJ,
                      "N": wordnet.NOUN,
                      "V": wordnet.VERB,
                      "R": wordnet.ADV}

    lemmatized_words = [
        lemmatizer.lemmatize(word, tag_to_wordnet.get(tag[:1].upper(), wordnet.NOUN))
        for word, tag in nltk.pos_tag(words)
    ]
    return ' '.join(lemmatized_words)

def remove_short_words(text, min_length=2):
    """Tokenize `text` and keep only tokens of at least `min_length` characters, joined by single spaces."""
    long_enough = filter(lambda token: len(token) >= min_length, word_tokenize(text))
    return ' '.join(long_enough)

def clean_text_advanced(text, remove_stops=True, use_lemmatization=True):
    """
    Advanced text cleaning pipeline
    Suitable for traditional ML models with TF-IDF

    Runs clean_text_basic, optionally removes stopwords, then lemmatizes
    (use_lemmatization=True) or stems (use_lemmatization=False), and finally
    drops tokens shorter than two characters. Missing input yields "".
    """
    if pd.isna(text):
        return ""

    cleaned = clean_text_basic(text)

    if remove_stops:
        cleaned = remove_stopwords(cleaned)

    # Exactly one of the two normalisers runs
    normalise = lemmatize_text if use_lemmatization else stem_text
    cleaned = normalise(cleaned)

    return remove_short_words(cleaned, min_length=2)

