In [10]:
import pandas as pd
import numpy as np
import re # for regular expressions
import nltk
import spacy
import joblib
from tqdm.auto import tqdm # for progress bars

# Load the spacy model
nltk.download('punkt_tab')
nlp = spacy.load('en_core_web_sm')

print("Libraries imported.")

# Load the dataset from the interim folder
df = pd.read_csv('../data/interim/news_dataset.csv')
df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HPi5_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Libraries imported.


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [3]:
# Handle potential missing values just in case, filling them with an empty string
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')

# Combine title and text into a new 'full_text' column
df['full_text'] = df['title'] + " " + df['text']
df[['full_text', 'label']].head()

Unnamed: 0,full_text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,1
1,Trump drops Steve Bannon from National Securit...,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,1
4,Donald Trump heads for Scotland to reopen a go...,0


In [4]:
# Get the list of English stopwords
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_text(text):
    """
    Cleans and preprocesses a single text string.
    """
    # 1. Lowercase the text
    text = text.lower()

    # 2. Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)

    # 3. Tokenize the text into words
    tokens = nltk.word_tokenize(text)

    # 4. Lemmatize and remove stopwords
    # We use Spacy for more accurate lemmatization
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc if token.text not in stop_words]

    # 5. Join tokens back into a single string
    processed_text = " ".join(lemmatized_tokens)

    return processed_text

In [5]:
# This lets us use .progress_apply() which shows a progress bar
tqdm.pandas()

print("Starting text preprocessing on the 'full_text' column...")
# Apply the function. This will take some time.
df['clean_text'] = df['full_text'].progress_apply(preprocess_text)
print("Preprocessing complete.")

# View the original vs. cleaned text
df[['full_text', 'clean_text', 'label']].head()

Starting text preprocessing on the 'full_text' column...


  0%|          | 0/44898 [00:00<?, ?it/s]

  text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)


Preprocessing complete.


Unnamed: 0,full_text,clean_text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,ben stein call th circuit court commit coup dt...,1
1,Trump drops Steve Bannon from National Securit...,trump drop steve bannon national security coun...,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,puerto rico expect we lift jones act shipping ...,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,oops trump accidentally confirm leak israeli i...,1
4,Donald Trump heads for Scotland to reopen a go...,donald trump head scotland reopen golf resort ...,0


In [6]:
from sklearn.model_selection import train_test_split

# Define our features (X) and target (y)
X = df['clean_text']
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,       # 20% of data will be for testing
    random_state=42,     # Ensures the split is the same every time
    stratify=y           # IMPORTANT: Keeps the same % of fake/real in both sets
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 35918
Test set size: 8980


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Consider both single words and two-word phrases (bigrams)
    max_features=10000,  # Limit the vocabulary size to the top 10,000 features
    min_df=5,            # Ignore terms that appear in less than 5 documents
    max_df=0.8           # Ignore terms that appear in more than 80% of documents
)

# --- IMPORTANT ---
# Fit the vectorizer and transform the TRAINING data
print("Fitting TF-IDF on training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# ONLY transform the TEST data
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF vectorization complete.")
print(f"Shape of training TF-IDF matrix: {X_train_tfidf.shape}")
print(f"Shape of testing TF-IDF matrix: {X_test_tfidf.shape}")

Fitting TF-IDF on training data...
Transforming test data...
TF-IDF vectorization complete.
Shape of training TF-IDF matrix: (35918, 10000)
Shape of testing TF-IDF matrix: (8980, 10000)


In [8]:
from scipy.sparse import save_npz
import pandas as pd

# Save the TF-IDF matrices
save_npz('../data/processed/X_train_tfidf.npz', X_train_tfidf)
save_npz('../data/processed/X_test_tfidf.npz', X_test_tfidf)

# Save the labels
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

print("Processed data saved successfully.")

Processed data saved successfully.


In [None]:
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.joblib')

print("Vectorizer saved successfully.")

Vectorizer saved successfully.
