In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
import joblib

In [27]:
# --- 1. Load Data and Define Sample Size ---
SAMPLE_SIZE_PER_CLASS = 10000
TOTAL_SAMPLE_SIZE = SAMPLE_SIZE_PER_CLASS * 2

FAKE_CSV_PATH = '/kaggle/input/fake-and-real-news-dataset/Fake.csv'
TRUE_CSV_PATH = '/kaggle/input/fake-and-real-news-dataset/True.csv'

try:
    # nrows makes the parser stop after the first SAMPLE_SIZE_PER_CLASS records,
    # so the remainder of each file is never loaded (unlike read-all + .head()).
    df_fake = pd.read_csv(FAKE_CSV_PATH, nrows=SAMPLE_SIZE_PER_CLASS)
    df_true = pd.read_csv(TRUE_CSV_PATH, nrows=SAMPLE_SIZE_PER_CLASS)
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops "Run All" and shows what went wrong.
    raise FileNotFoundError(
        f"Could not find the dataset. Expected '{FAKE_CSV_PATH}' and "
        f"'{TRUE_CSV_PATH}'; attach the 'fake-and-real-news-dataset' input "
        "or update the paths."
    ) from err

print(f"Loaded {len(df_fake)} fake samples and {len(df_true)} true samples.")


Loaded 10000 fake samples and 10000 true samples.


In [28]:
# --- 2. Labeling and Combination ---
# Tag each source frame with its class: 0 marks fake articles, 1 marks true ones.
for source_frame, class_label in ((df_fake, 0), (df_true, 1)):
    source_frame['label'] = class_label

# Stack both classes, shuffle all rows with a fixed seed so fake and true
# articles are interleaved reproducibly, then rebuild a clean 0..n-1 index.
df = (
    pd.concat([df_fake, df_true], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Total dataset size after sampling and combining: {len(df)} rows.")



Total dataset size after sampling and combining: 20000 rows.


In [29]:
# --- 3. Text Preprocessing and Feature Engineering (Addressing Leakage) ---
# Drop the 'subject' and 'date' metadata so the classifier only sees article
# wording. errors='ignore' keeps this cell re-runnable once the columns are gone.
df = df.drop(columns=['subject', 'date'], errors='ignore')

# Combine 'title' and 'text' into a single feature for classification.
# A missing title or text would turn the whole concatenation into NaN for that
# row, so missing fields are treated as empty strings instead.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Load the English stopword list; if the NLTK corpus is not installed yet,
# download it once and retry.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

# A set gives constant-time membership checks during per-word filtering.
stop_words = set(english_stopwords)

def preprocess_text(text, stopword_set=None):
    """Normalize raw article text before TF-IDF vectorization.

    Every character that is not an ASCII letter or whitespace is deleted
    (digits and punctuation included), the remainder is lowercased, and
    stopwords are dropped. Surviving words are re-joined with single spaces.

    Args:
        text: Raw text. Non-string values (e.g. NaN from a missing CSV field)
            are treated as empty documents.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned text as a ``str``; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Returning the value unchanged would pass NaN/None on to the
        # vectorizer, which expects string documents.
        return ''
    if stopword_set is None:
        stopword_set = stop_words
    # Characters are deleted, not replaced by a space, so "Cat's" becomes "cats".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in letters_only.split() if word not in stopword_set)

# Clean every combined title+text document and store the result back in 'content'
cleaned_content = df['content'].apply(preprocess_text)
df['content'] = cleaned_content

In [30]:
# --- 4. Split Data and Vectorize ---
X, y = df['content'], df['label']

# Hold out 20% of the combined dataset for testing; stratifying on the label
# keeps the fake/true ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# TF-IDF turns each document into a numeric vector over unigrams and bigrams,
# capped at 5000 features. The vocabulary is learned from the training split
# only; the test split is transformed with that same fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Number of features (TF-IDF vector length): {X_train_tfidf.shape[1]}")



Training set size: 16000 samples
Testing set size: 4000 samples
Number of features (TF-IDF vector length): 5000


In [31]:
# --- 5. Train Random Forest Model ---
print("\nStarting Random Forest training...")
# 100 trees with a fixed seed for repeatable results; n_jobs=-1 uses all cores.
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_tfidf, y_train)
print("Training complete.")



Starting Random Forest training...
Training complete.


In [32]:
# --- 6. Model Evaluation ---
# Score the held-out test split with the trained forest.
# NOTE(review): the recorded run reports ~0.998 accuracy; a score that close to
# perfect may indicate source-specific markers still present in the article
# text rather than genuine style cues. TODO confirm, e.g. by inspecting
# rf_model.feature_importances_ against the vectorizer vocabulary.
y_pred = rf_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Fake (0)', 'True (1)'],
)

print(f"\n--- Model Evaluation (Random Forest on {TOTAL_SAMPLE_SIZE} samples) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report_text)



--- Model Evaluation (Random Forest on 20000 samples) ---
Accuracy: 0.9980

Classification Report:
              precision    recall  f1-score   support

    Fake (0)       1.00      1.00      1.00      2000
    True (1)       1.00      1.00      1.00      2000

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



In [33]:
# --- 7. Save the Model and Vectorizer (MLOPS ARTIFACTS) ---
MODEL_FILENAME = 'rf_fake_news_model.joblib'
VECTORIZER_FILENAME = 'tfidf_vectorizer.joblib'

# Persist the classifier first, then the fitted vectorizer. Both are needed at
# inference time: new text must go through the same fitted vectorizer before
# the model can score it.
for artifact, destination, saved_message in (
    (rf_model, MODEL_FILENAME, f"\nModel successfully saved to: {MODEL_FILENAME}"),
    (tfidf_vectorizer, VECTORIZER_FILENAME, f"Vectorizer successfully saved to: {VECTORIZER_FILENAME}"),
):
    joblib.dump(artifact, destination)
    print(saved_message)



Model successfully saved to: rf_fake_news_model.joblib
Vectorizer successfully saved to: tfidf_vectorizer.joblib


In [34]:
# --- OPTIONAL: Demonstrate Loading the Model ---
# Round-trips the saved artifacts the way a serving process would: load both
# files, clean the incoming text with preprocess_text, vectorize it with the
# loaded vectorizer, and predict with the loaded model.

print("\n--- Demonstration of Loading and Predicting ---")
# joblib.load unpickles the file contents, so only load artifacts you trust.
loaded_vectorizer = joblib.load(VECTORIZER_FILENAME)
loaded_model = joblib.load(MODEL_FILENAME)

# Two hand-written samples: an obviously fabricated story and a wire-style headline.
new_articles = [
    "BREAKING NEWS: The President was seen flying a gigantic purple dragon over the White House this morning, witnesses confirm.",
    "United Nations Security Council votes unanimously to impose new sanctions on the country of North Korea after a recent missile launch, according to reports from Reuters.",
]

# New text gets the same cleaning that was applied to the training data.
new_articles_processed = list(map(preprocess_text, new_articles))
new_articles_vectorized = loaded_vectorizer.transform(new_articles_processed)

# Class 0 is reported as 'Fake'; any other predicted class as 'True'.
predictions = loaded_model.predict(new_articles_vectorized)
prediction_labels = []
for predicted_class in predictions:
    prediction_labels.append('True' if predicted_class != 0 else 'Fake')

for index, article in enumerate(new_articles):
    label = prediction_labels[index]
    print(f"Article: '{article[:70]}...' -> Prediction: {label}")


--- Demonstration of Loading and Predicting ---
Article: 'BREAKING NEWS: The President was seen flying a gigantic purple dragon ...' -> Prediction: Fake
Article: 'United Nations Security Council votes unanimously to impose new sancti...' -> Prediction: True
