In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import string
import re

In [2]:
# 2. Load Dataset
# Read both halves of the Kaggle corpus; unpacking order is (fake, real).
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fake-and-real-news-dataset/Fake.csv",
        "/kaggle/input/fake-and-real-news-dataset/True.csv",
    )
)

In [3]:
# Add labels
# Target encoding used by every later cell: 0 = fake, 1 = real.
fake_df["label"], true_df["label"] = 0, 1

In [4]:
# Combine the data
# Stack fake rows on top of real rows and renumber the index from 0.
data = pd.concat([fake_df, true_df], ignore_index=True)

In [5]:
# 3. Clean the Text
def clean_text(text):
    """Normalise raw article text before TF-IDF vectorisation.

    Steps, in order: lowercase; drop ``[...]`` spans; drop ``http...`` and
    ``www.`` URLs; drop ``<...>`` tags; drop every character that is not
    ``a-z`` or whitespace; replace newlines with spaces.

    Args:
        text: Raw text. Non-string values (``None``, or the float NaN that
            pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    # A missing cell would otherwise fail on ``.lower()``.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)
    # The dot after "www" is escaped so only real "www." prefixes count as
    # URLs; unescaped it matched any character and deleted ordinary words
    # that merely start with "www".
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    # Characters are removed, not replaced by a space, so hyphenated words
    # are fused (e.g. "crash-landed" -> "crashlanded").
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\n", " ", text)
    return text

# Overwrite the raw article bodies with their cleaned form, element by element.
data["text"] = data["text"].map(clean_text)

In [6]:
# 4. Split Dataset
# Features are the cleaned article bodies; the target is the 0/1 label.
X, y = data["text"], data["label"]

# Hold out 20% for testing. Stratifying on y keeps the class ratio equal in
# both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# 5. Text Vectorization
# Terms found in more than 70% of training documents are ignored, along with
# the built-in English stop-word list.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Learn the vocabulary/IDF weights from the training split only, then reuse
# the fitted vectorizer unchanged on the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [8]:
# 6. Train Model
# Logistic regression with default hyper-parameters; fit() returns the
# estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train_vec, y_train)
model

In [9]:
# 7. Evaluate Model
# Score the held-out split and print per-class precision / recall / F1.
y_pred = model.predict(X_test_vec)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [10]:
# 8. Predict on Real-World News
def predict_news(text):
    """Classify one article with the TF-IDF + logistic-regression model.

    Reads the module-level ``vectorizer`` and ``model`` at call time. Later
    cells in this notebook redefine ``predict_news`` and rebind ``model``, so
    this version is only valid until those cells run.

    Args:
        text: Raw article text; it is passed through ``clean_text`` first.

    Returns:
        ``"REAL"`` when the predicted label is 1, otherwise ``"FAKE"``.
    """
    features = vectorizer.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "REAL"
    return "FAKE"

In [22]:
#  Install Dependencies (once)
!pip install transformers torch --quiet

#  Imports
import torch
from transformers import pipeline

#  Load Pretrained Fake News Classifier
MODEL = "jy46604790/Fake-News-Bert-Detect"
classifier = pipeline("text-classification", model=MODEL, tokenizer=MODEL)

#  Prediction Function
def predict_news(text):
    """Classify one piece of text with the Hugging Face ``classifier`` pipeline.

    Args:
        text: News text to classify.

    Returns:
        ``"REAL (s)"`` when the pipeline's label is ``"LABEL_1"``, otherwise
        ``"FAKE (s)"``, where ``s`` is the pipeline score with two decimals.
    """
    # The pipeline only truncates when asked to. Without these tokenizer
    # arguments an article longer than the model's input window is passed
    # through at full length instead of being cut to 512 tokens.
    res = classifier(text, truncation=True, max_length=512)[0]
    # NOTE(review): "LABEL_1" is treated as the real-news class here; confirm
    # against the checkpoint's model card.
    verdict = "REAL" if res["label"] == "LABEL_1" else "FAKE"
    return f"{verdict} ({res['score']:.2f})"

#  Test with Your Example
# Smoke test: a short, plausible incident report.
sample = """
A passenger flight in Ahmedabad crash-landed due to technical failure on Tuesday morning.
No injuries were reported. Officials say an investigation is underway.
"""
print(f"Prediction: {predict_news(sample)}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use cuda:0


Prediction: FAKE (1.00)


In [23]:
# Test with Your Example
# Variant of the earlier sample: no time reference, and injuries are reported.
sample = """
A passenger flight in Ahmedabad crash-landed due to technical failure 
Injuries were reported. Officials say an investigation is underway.
"""
print(f"Prediction: {predict_news(sample)}")


Prediction: FAKE (1.00)


In [13]:
# Three one-line headlines, classified one after another.
articles = [
    "Ahmedabad plane crash lands today after engine failure.",
    "NASA confirms presence of water on the Moon's surface.",
    "Aliens landed in Rajasthan desert, say local farmers.",
]
for headline in articles:
    print(predict_news(headline))

FAKE (1.00)
FAKE (1.00)
REAL (1.00)


In [24]:
#  Install dependencies (run once)
!pip install transformers torch --quiet

#  Imports
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#  Device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)

#  Load the public Roberta model (fine‑tuned on Fake/Real news)
MODEL = "hamzab/roberta-fake-news-classification"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

#  Define prediction function
def predict_news(text, title=None):
    """Classify a news item with the RoBERTa sequence-classification model.

    The prompt is built as ``<title>...<content>...<end>``. Uses the
    module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Article body (or headline) placed in the content slot.
        title: Optional headline for the title slot. When omitted, ``text`` is
            used for both slots, which is what this function always did before
            the parameter existed.

    Returns:
        ``"REAL (p)"`` or ``"FAKE (p)"``, where ``p`` is the softmax
        probability of the chosen class formatted with two decimals.
    """
    if title is None:
        title = text
    input_str = "<title>" + title + "<content>" + text + "<end>"
    # Calling the tokenizer directly replaces the legacy encode_plus method and
    # takes the same arguments. Input is cut/padded to exactly 512 tokens.
    inputs = tokenizer(
        input_str,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1)[0]
    label = torch.argmax(probs).item()
    # NOTE(review): class index 1 is treated as REAL; confirm the label order
    # against the checkpoint's model card.
    label_str = "REAL" if label == 1 else "FAKE"
    confidence = float(probs[label])
    return f"{label_str} ({confidence:.2f})"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on: cuda


In [26]:
#  Test on your two cases
# case1: short incident report; case2: a single headline.
case1 = (
    "A passenger flight in Ahmedabad crash-landed due to technical failure. "
    "No injuries were reported. Officials say an investigation is underway."
)
case2 = (
    "Air India Ahmedabad plane crash: Government constituted high-level panel meets, looks into possible causes."
)

print(f"Case 1: {predict_news(case1)}")
print(f"Case 2: {predict_news(case2)}")

Case 1: FAKE (0.55)
Case 2: REAL (1.00)


In [16]:
# Single-headline check.
news = "Liverpool forward Diogo Jota passes away at 28 in road accident"
print(f"Prediction: {predict_news(news)}")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Prediction: REAL (1.00)


In [17]:
# Single-headline check.
news = "‘The missiles represented hope’: Palestinians in Gaza react to Iran bombing Israel"
print(f"Prediction: {predict_news(news)}")


Prediction: REAL (1.00)


In [21]:

# Single-headline check.
news = "IND vs ENG 2025: 'I'd be worried ...': Stuart Broad issues stern warning for England ahead of Lord's Test"
print(f"Prediction: {predict_news(news)}")

Prediction: REAL (1.00)


In [25]:

# Single-headline check.
news = "Supreme Court to hear on July 10 pleas challenging Special Intensive Revision in Bihar"
print(f"Prediction: {predict_news(news)}")

Prediction: REAL (1.00)
