In [2]:
!pip install pandas numpy scikit-learn shap joblib



Collecting shap
  Downloading shap-0.48.0-cp310-cp310-win_amd64.whl (544 kB)
     -------------------------------------- 544.3/544.3 kB 8.5 MB/s eta 0:00:00
Collecting slicer==0.0.8
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.48.0 slicer-0.0.8


In [3]:
import pandas as pd
import numpy as np
import joblib
import shap

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


In [4]:
# Read the two article dumps from the working directory.
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

# Tag each frame with its class id: 1 for True.csv rows, 0 for Fake.csv rows.
for frame, class_id in ((true, 1), (fake, 0)):
    frame["label"] = class_id

# Stack both frames under a fresh 0..n-1 index, then permute every row with a
# fixed seed so the shuffle is reproducible.
stacked = pd.concat([true, fake], ignore_index=True)
df = stacked.sample(frac=1, random_state=42)

df.head()


Unnamed: 0,title,text,subject,date,label
22216,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",0
27917,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",0
25007,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",0
1377,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",1
32476,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,"Apr 25, 2017",0


In [5]:
# One document per article: headline and body joined by a single space.
# Missing values become empty strings, and outer whitespace is trimmed.
headline = df["title"].fillna("")
body = df["text"].fillna("")
texts = (headline + " " + body).str.strip()
y = df["label"].values

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# balance equal in both partitions, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [6]:
# Text features: unigram + bigram TF-IDF, English stop words removed, terms
# seen in fewer than 2 documents dropped, vocabulary capped at 50,000 entries.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=50000,
    stop_words="english",
)

# Classifier: logistic regression with a raised iteration cap.
clf_step = LogisticRegression(max_iter=1000)

# Chain both stages so raw strings can be passed straight to fit/predict.
pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

pipe.fit(X_train, y_train)


In [7]:
# Score the held-out texts; column 1 of predict_proba is kept as the
# probability of label 1.
class_probs = pipe.predict_proba(X_test)
y_prob = class_probs[:, 1]

# Turn probabilities into hard 0/1 labels with a 0.5 cut-off.
y_pred = (y_prob >= 0.5).astype(int)

# Compute the three evaluation summaries first, then print them in order.
report = classification_report(y_test, y_pred, digits=4)
auc = roc_auc_score(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)

print(report)
print("ROC-AUC:", auc)
print("Confusion Matrix:\n", cm)


              precision    recall  f1-score   support

           0     0.9934    0.9881    0.9907      4696
           1     0.9870    0.9928    0.9899      4284

    accuracy                         0.9903      8980
   macro avg     0.9902    0.9904    0.9903      8980
weighted avg     0.9903    0.9903    0.9903      8980

ROC-AUC: 0.9991015358443207
Confusion Matrix:
 [[4640   56]
 [  31 4253]]


In [8]:
# Persist the fitted pipeline next to the notebook.
joblib.dump(pipe, "fake_news_pipeline.joblib")

# Draw a seeded random sample of at most 500 training documents and persist
# it as a plain list; it serves as the background set for the SHAP explainer.
train_series = pd.Series(X_train)
bg_size = len(X_train) if len(X_train) < 500 else 500
background_texts = train_series.sample(n=bg_size, random_state=42).tolist()
joblib.dump(background_texts, "background_texts.joblib")


['background_texts.joblib']

In [9]:
# Reload the persisted pipeline and the background sample from disk.
pipe = joblib.load("fake_news_pipeline.joblib")
background_texts = joblib.load("background_texts.joblib")

# Pull the two pipeline stages out by name so each can be used on its own.
vectorizer, clf = (pipe.named_steps[step] for step in ("tfidf", "clf"))

# Vectorize the background sample and build a linear SHAP explainer around
# the classifier, using that matrix as the reference data.
background_X = vectorizer.transform(background_texts)
explainer = shap.LinearExplainer(
    clf,
    background_X,
    feature_perturbation="interventional",
)




In [11]:
# Classify one user-supplied article and explain the prediction with SHAP.

# Take user input
user_text = input("Enter news article text: ")
if not user_text.strip():
    # A blank document vectorizes to an all-zero row, so the "prediction"
    # would only reflect the classifier's intercept; reject it explicitly.
    raise ValueError("No article text entered; nothing to classify.")

# Prediction: column 1 of predict_proba is reported as the "True" class.
proba_true = pipe.predict_proba([user_text])[0, 1]
pred_label = "Real (True)" if proba_true >= 0.5 else "Fake"

print(f"\nPrediction: {pred_label}")
print(f"Confidence (True class): {proba_true:.3f}")

# SHAP values for the single vectorized document
X_u = vectorizer.transform([user_text])
shap_vals = explainer.shap_values(X_u)

# Compute the vocabulary and the dense TF-IDF row once and reuse them for
# both the contribution table and the plot (the vocabulary can hold up to
# 50,000 entries, so repeating these calls is wasted work).
feature_names = vectorizer.get_feature_names_out()
dense_row = X_u.toarray()

# Focus on terms that actually occur in this text
present_idx = X_u.nonzero()[1]
sv = np.asarray(shap_vals)[0]   # first (and only) row: one value per feature
tfidf_row = dense_row[0]

# Create dataframe of contributions, ranked by absolute SHAP value
contrib = pd.DataFrame({
    "term": feature_names[present_idx],
    "shap_value": sv[present_idx],
    "tfidf": tfidf_row[present_idx]
})
contrib["impact"] = contrib["shap_value"].abs()
top = contrib.sort_values("impact", ascending=False).head(20)

print("\nTop contributing terms:")
display(top[["term", "shap_value", "tfidf"]])

# Interactive SHAP force plot over the full feature vector; it is the cell's
# last expression so the notebook renders it.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_vals,
    dense_row,
    feature_names=feature_names
)


Enter news article text: The European Union announced a new set of climate policies on Tuesday aimed at reducing carbon emissions by 55% before 2030. The measures include stricter vehicle emission standards, incentives for renewable energy projects, and a phased-out plan for coal-based power plants. EU leaders emphasized the urgency of action as climate change continues to cause extreme weather events across the continent.

Prediction: Real (True)
Confidence (True class): 0.561

Top contributing terms:


Unnamed: 0,term,shap_value,tfidf
4,tuesday,0.295315,0.069671
28,eu,0.204276,0.111103
26,european,0.138418,0.095756
33,continues,-0.128509,0.107915
25,european union,0.116304,0.107878
29,energy,0.114254,0.109331
19,measures,0.109249,0.108499
3,union,0.088693,0.090958
46,aimed,0.081606,0.112136
20,leaders,0.078996,0.081631


In [12]:
import os

# Create the "model" output folder; exist_ok=True turns the call into a
# no-op when the folder is already there, so the cell can be re-run safely.
os.makedirs(name="model", exist_ok=True)


In [13]:
import os
import shutil

# Relocate the saved artifacts from the working directory into model/.
# The cell is idempotent: an artifact that is already in model/ is skipped
# instead of raising on a second run, and model/ is created if it is missing.
os.makedirs("model", exist_ok=True)

for artifact in ("fake_news_pipeline.joblib", "background_texts.joblib"):
    destination = f"model/{artifact}"
    if os.path.exists(artifact):
        shutil.move(artifact, destination)
    elif not os.path.exists(destination):
        # Neither the source nor an earlier moved copy exists: fail loudly.
        raise FileNotFoundError(
            f"{artifact} not found in the working directory or in model/"
        )


'model/background_texts.joblib'

In [None]:
# Gather the project's files into a single folder.
# Destination folder name; it must be a string literal (the original bare,
# unquoted name was a syntax error).
project_dir = "Fake news Detection NLP (Project)"

# Entries to relocate. Each entry needs its own trailing comma: without one
# after "model", Python concatenates adjacent string literals and the first
# two entries fuse into a single non-existent path.
# NOTE(review): the .ipynb entry looks like this notebook itself; moving it
# while it is open may confuse the Jupyter session - confirm this is intended.
files_to_move = [
    "model",
    "NLP_Fake news Detection.ipynb",
    "Fake.csv",
    "True.csv",
    "merged_news.csv",
]

# shutil.move does not create missing parent directories, so make sure the
# destination exists first.
os.makedirs(project_dir, exist_ok=True)

for file in files_to_move:
    # Entries that are absent from the working directory are skipped.
    if os.path.exists(file):
        shutil.move(file, os.path.join(project_dir, file))