In [1]:
import pandas as pd

# Load the two raw article dumps; each file holds a single class.
true = pd.read_csv("../data/raw/True.csv")
fake = pd.read_csv("../data/raw/Fake.csv")

# Binary target: 0 = real article, 1 = fake article.
true['label'] = 0   # real
fake['label'] = 1   # fake

# Stack both classes and shuffle the rows. The shuffle is seeded so that a
# Restart & Run All reproduces the same row order (and therefore the same
# processed CSV written at the end of the notebook).
df = (
    pd.concat([true, fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [2]:
# One free-text field per article: headline, a single space, then the body.
# Missing titles/bodies are replaced by empty strings before joining.
df['content'] = (
    df['title'].fillna('')
    .str.cat(df['text'].fillna(''), sep=" ")
)

In [3]:
# Case-fold the combined text; the later letters-only filter matches a-z only.
df = df.assign(content=df['content'].str.lower())

In [4]:
import re

def clean_text(text):
    """Normalise a document into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, replace URLs with a space, replace every
    character that is not ``a-z`` or whitespace with a space, then collapse
    runs of whitespace into one space.

    Args:
        text: Document to clean. Non-string values (``None``, a float NaN
            coming out of pandas, ...) are treated as an empty document.

    Returns:
        str: The cleaned text without leading/trailing whitespace. May be
        the empty string.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase here rather than relying on the caller: the letters-only
    # filter below matches a-z only, so uppercase characters would otherwise
    # be silently deleted ("Hello" -> "ello") and "HTTP://..." would escape
    # the URL pattern.
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", " ", text)  # remove urls
    text = re.sub(r"[^a-z\s]", " ", text)        # keep only letters
    text = re.sub(r"\s+", " ", text)             # remove extra spaces
    return text.strip()

# Run the regex clean-up over every document in the content column.
df['content'] = df['content'].map(clean_text)

In [5]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available, then hold the English list
# as a set for the membership checks below.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Rebuild each document from its whitespace-delimited tokens, skipping any
# token that appears in the stop-word set.
df['content'] = df['content'].map(
    lambda doc: " ".join(token for token in doc.split() if token not in stop_words)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kausalya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import WordNetLemmatizer
# Make sure the WordNet corpus is available for the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token. `lemmatize` is called without a part-of-speech
# argument, so the library's default tag applies to every token.
df['content'] = df['content'].map(
    lambda doc: " ".join(map(lemmatizer.lemmatize, doc.split()))
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kausalya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Number of whitespace-delimited tokens left in each cleaned document.
df['word_count'] = df['content'].str.split().str.len()

In [8]:
from sklearn.model_selection import train_test_split

# Features are the cleaned documents; the target is the 0/1 real-vs-fake label.
X, y = df['content'], df['label']

# 80/20 hold-out split, seeded for reproducibility and stratified on the label
# so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check on the partition sizes.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (35918,)
Test size: (8980,)


In [9]:
import os

# Persist the full processed frame (all columns, no index column), creating
# the output folder first if it does not exist yet.
# NOTE(review): documents that ended up empty after cleaning would come back
# as NaN under pd.read_csv defaults -- confirm how downstream notebooks load this.
processed_dir = "../data/processed"
cleaned_csv = "../data/processed/cleaned_fake_news.csv"

os.makedirs(processed_dir, exist_ok=True)
df.to_csv(cleaned_csv, index=False)