# Fake News Detection – Training Notebook
This notebook trains a simple **TF-IDF + Logistic Regression** model and saves it as a scikit-learn **Pipeline** to `models/fake_news_pipeline.pkl`.

In [1]:
# Install (if needed) and imports
# %pip install -r ../requirements.txt

import os, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

import sys
sys.path.append("..")
from utils.data_utils import load_dataset

# All locations are relative paths that start one directory above the
# current working directory.

# Model output: create the folder up front (a no-op when it already exists),
# then derive the pickle's path inside it.
MODEL_DIR = os.path.join("..", "models")
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, "fake_news_pipeline.pkl")

# Input data folder handed to the dataset loader.
DATA_DIR = os.path.join("..", "data")


ModuleNotFoundError: No module named 'joblib'

In [None]:
# Load dataset
df = load_dataset(DATA_DIR)

# Quick sanity check: the first rows plus the number of examples per label.
df_preview = df.head()
label_counts = df['label'].value_counts()
df_preview, label_counts


In [None]:
# Train / test split
texts = df['text']
labels = df['label']

# Hold out 20% of the rows for evaluation. Stratifying on the labels keeps
# the class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
len(X_train), len(X_test)


In [None]:
# Build pipeline
# Text features: lowercased unigrams and bigrams with English stop words
# removed, skipping terms that occur in more than 90% of the documents or in
# fewer than 2 of them.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=2,
)
# Classifier: logistic regression with the solver capped at 200 iterations.
clf_step = LogisticRegression(max_iter=200)
pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

# Train
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(classification_report(y_test, y_pred))


In [None]:
# Save pipeline
# Serialize the pipeline object to MODEL_PATH, then echo that path as the
# cell's output so the reader can see where the file went.
joblib.dump(value=pipeline, filename=MODEL_PATH)
MODEL_PATH
