# Fake News Project - Data Preparation

In [3]:
# imports
import pandas as pd
import re
import os
import multiprocess as mp
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Downloading NLTK resources
nltk.download("punkt")  # Standard tokenizer
nltk.download("stopwords")  # Common stopwords
nltk.download('punkt_tab')  # Specific requirement for your setup


[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading punkt_tab: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

## Part 1. Data Processing.

In [4]:
#Common functions definitions

# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return text  # Keep NaN as is
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+', '<URL>', text)  # Replace URLs
    text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '<DATE>', text)  # Replace Dates
    text = re.sub(r'\b\d+\b', '<NUM>', text)  # Replace Numbers
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '<EMAIL>', text)  # Replace Emails
    return text

# Function to process text: Tokenization + Stopword Removal + Stemming
def process_text(text):
    if pd.isna(text):
        return []
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    stemmed_tokens = [ps.stem(word) for word in tokens]
    return stemmed_tokens


In [5]:
# Initialization

chunk_size = 50000  # Process 50,000 rows at a time
num_cores = os.cpu_count()  # Use all available CPU cores

Task 1

In [6]:
# Load CSV into df
print("Loading CSV...")
df = pd.read_csv("news_sample.csv")  # Ensure correct file name
print(f"CSV Loaded. Total rows in df: {len(df)}")

df.rename(columns={'Unnamed: 0': 'Unnamed'}, inplace=True)
print(df.columns)

# Parallel processing for text cleaning
print("Cleaning text...")
with mp.Pool(num_cores) as pool:
    cleaned_content = pool.map(clean_text, df["content"])
print("Text cleaning complete.")

# Create new DataFrame with cleaned data
print("Creating clean_df...")
clean_df = df.copy()  # Preserve original df
clean_df["content"] = cleaned_content  # Assign cleaned text

# Standardize other columns
clean_df["label"] = df["type"].str.lower()
clean_df["domain"] = df["domain"].str.lower()
clean_df["title"] = df["title"].str.lower()
clean_df["authors"] = df["authors"].str.lower()
print("Standardization complete.")

# Apply filtering after cleaning
print("Applying filtering...")
clean_df = clean_df[
    (clean_df["label"] != "unknown") &  # Remove 'unknown' labels
    (clean_df["type"].notna()) &  # Remove null labels
    (clean_df["content"].notna()) &  # Remove null content
    (clean_df["content"].str.len() > 10) &  # Remove very short articles
    (clean_df["content"].apply(lambda x: len(re.findall(r'\b\w+\b', str(x))) > 1))  # Ensure multi-word articles
]

print(f"Filtering complete. Total rows in clean_df: {len(clean_df)}")

# Save clean_df to CSV
print("Saving clean_df to clean250.csv...")
clean_df.to_csv("clean250.csv", index=False)
print("File saved as clean250.csv")

Loading CSV...
CSV Loaded. Total rows in df: 250
Index(['Unnamed', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary'],
      dtype='object')
Cleaning text...
Text cleaning complete.
Creating clean_df...
Standardization complete.
Applying filtering...
Filtering complete. Total rows in clean_df: 232
Saving clean_df to clean250.csv...
File saved as clean250.csv


In [7]:
# Define input and output files
input_file = "clean250.csv"
output_file = "nltk250.csv"

# Load stopwords
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()  # Initialize stemmer



# Open output file to write processed chunks
print(f"Processing {input_file} in chunks of {chunk_size} rows...")

for i, chunk in enumerate(pd.read_csv(input_file, chunksize=chunk_size)):
    print(f"Processing Chunk {i+1}...")

    # Parallel processing
    with mp.Pool(num_cores) as pool:
        chunk["tokens"] = pool.map(process_text, chunk["content"])

    # Drop original content column to save space
    chunk.drop(columns=["content"], inplace=True)

    # Write the processed chunk to the output file (append after first write)
    mode = "w" if i == 0 else "a"
    header = (i == 0)  # Write header only for the first chunk
    chunk.to_csv(output_file, index=False, mode=mode, header=header)

    print(f"Chunk {i+1} processed and saved!")

print(f"\n Processing complete! Tokenized & stemmed data saved to {output_file}")

Processing clean250.csv in chunks of 50000 rows...
Processing Chunk 1...


Chunk 1 processed and saved!

 Processing complete! Tokenized & stemmed data saved to nltk250.csv


Task 3. Located before Task 2 because original and cleaned datasets should be compared.

In [8]:
# Load CSV into df
print("Loading CSV...")
df = pd.read_csv("995,000_rows.csv")  # Ensure correct file name
print(f"CSV Loaded. Total rows in df: {len(df)}")

df.rename(columns={'Unnamed: 0': 'Unnamed'}, inplace=True)
print(df.columns)

# Parallel processing for text cleaning
print("Cleaning text in parallel...")
with mp.Pool(num_cores) as pool:
    cleaned_content = pool.map(clean_text, df["content"])
print("Text cleaning complete.")

# Create new DataFrame with cleaned data
print("Creating clean_df...")
clean_df = df.copy()  # Preserve original df
clean_df["content"] = cleaned_content  # Assign cleaned text

# Standardize other columns
clean_df["label"] = df["type"].str.lower()
clean_df["domain"] = df["domain"].str.lower()
clean_df["title"] = df["title"].str.lower()
clean_df["authors"] = df["authors"].str.lower()
print("Standardization complete.")

# Apply filtering after cleaning
print("Applying filtering...")
clean_df = clean_df[
    (clean_df["label"] != "unknown") &  # Remove 'unknown' labels
    (clean_df["type"].notna()) &  # Remove null labels
    (clean_df["content"].notna()) &  # Remove null content
    (clean_df["content"].str.len() > 10) &  # Remove very short articles
    (clean_df["content"].apply(lambda x: len(re.findall(r'\b\w+\b', str(x))) > 1))  # Ensure multi-word articles
]

print(f"Filtering complete. Total rows in clean_df: {len(clean_df)}")

# Save clean_df to CSV
print("Saving clean_df to 995k-cleaned.csv...")
clean_df.to_csv("995k-cleaned.csv", index=False)
print("File saved as 995k-cleaned.csv")

Loading CSV...


  df = pd.read_csv("995,000_rows.csv")  # Ensure correct file name


CSV Loaded. Total rows in df: 995000
Index(['Unnamed', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Cleaning text in parallel...
Text cleaning complete.
Creating clean_df...
Standardization complete.
Applying filtering...
Filtering complete. Total rows in clean_df: 903679
Saving clean_df to 995k-cleaned.csv...
File saved as 995k-cleaned.csv


In [9]:
# Input/output settings
input_file = "995k-cleaned.csv"
output_file = "995pre.csv"
chunk_size = 50000
num_cores = os.cpu_count()

# Load stopwords and stemmer
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

# Label mapping
fake_labels = [
    "fake", "conspiracy", "bias", "unreliable", "rumor",
    "clickbait", "junksci", "hate", "satire", "political"
]
real_labels = ["reliable"]

def map_label(label):
    label = str(label).lower()
    if label in fake_labels:
        return 0
    elif label in real_labels:
        return 1
    return None

# Start batch processing
if os.path.exists(output_file):
    os.remove(output_file)  # Clear old output if exists

print(f"Processing {input_file} in batches of {chunk_size} rows...")

reader = pd.read_csv(input_file, chunksize=chunk_size)
for i, chunk in enumerate(reader):
    print(f" Chunk {i+1}")

    # Clean labels → binary
    chunk["binary_label"] = chunk["label"].apply(map_label)
    chunk = chunk[chunk["binary_label"].notna()].copy()
    chunk["binary_label"] = chunk["binary_label"].astype(int)

    # Tokenize + stem in parallel
    with mp.Pool(num_cores) as pool:
        chunk["tokens"] = pool.map(process_text, chunk["content"])

    # Create 'text' column
    chunk["text"] = chunk["tokens"].apply(lambda x: " ".join(x))

    # Keep only required + useful columns
    keep_cols = ["tokens", "text", "binary_label"] + [col for col in ["title", "domain", "authors", "source", "type", "label"] if col in chunk.columns]
    chunk_out = chunk[keep_cols]

    # Write to output
    mode = "w" if i == 0 else "a"
    header = (i == 0)
    chunk_out.to_csv(output_file, index=False, mode=mode, header=header)

    print(f"Chunk {i+1} saved.")

print(f"\n All done! Final preprocessed file saved as {output_file}")

Processing 995k-cleaned.csv in batches of 50000 rows...
 Chunk 1
Chunk 1 saved.
 Chunk 2
Chunk 2 saved.
 Chunk 3
Chunk 3 saved.
 Chunk 4
Chunk 4 saved.
 Chunk 5
Chunk 5 saved.
 Chunk 6
Chunk 6 saved.
 Chunk 7
Chunk 7 saved.
 Chunk 8
Chunk 8 saved.
 Chunk 9
Chunk 9 saved.
 Chunk 10
Chunk 10 saved.
 Chunk 11
Chunk 11 saved.
 Chunk 12
Chunk 12 saved.
 Chunk 13
Chunk 13 saved.
 Chunk 14
Chunk 14 saved.
 Chunk 15
Chunk 15 saved.
 Chunk 16
Chunk 16 saved.
 Chunk 17


  for i, chunk in enumerate(reader):


Chunk 17 saved.
 Chunk 18
Chunk 18 saved.
 Chunk 19
Chunk 19 saved.

 All done! Final preprocessed file saved as 995pre.csv


## Preprocessing BBC dataset

In [10]:

# Setup
input_file = "articles_full.csv"
output_file = "articles_full_pre.csv"

# Load stopwords
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()  # Initialize stemmer


# Load file
df = pd.read_csv(input_file)
print(f"Loaded {len(df)} rows from {input_file}")

# Clean 'text' using existing clean_text()
print("Cleaning text...")
with mp.Pool(num_cores) as pool:
    df["text"] = pool.map(clean_text, df["text"])

# Tokenize and stem using existing process_text()
print("Processing tokens...")
with mp.Pool(num_cores) as pool:
    df["tokens"] = pool.map(process_text, df["text"])

# Reconstruct final 'text' and add label
df["text"] = df["tokens"].apply(lambda x: " ".join(x))
df["binary_label"] = 1  # All articles are considered reliable

# Keep only what we need
df_out = df[["tokens", "text", "binary_label"]]

# Save output
df_out.to_csv(output_file, index=False)
print(f"Saved preprocessed file to {output_file}")


Loaded 707 rows from articles_full.csv
Cleaning text...
Processing tokens...
Saved preprocessed file to articles_full_pre.csv
