In [9]:
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix


In [10]:
def preprocess(text):
    """Normalize raw article text before vectorization.

    The text is lowercased, every run of non-word characters (anything other
    than letters, digits and underscore) is replaced by a single space, and
    leading/trailing whitespace is removed.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned, lowercased, single-spaced text ('' for missing input).
    """
    # Missing values have no .lower(); NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Every whitespace character is also a non-word character, so a single
    # `\W+` pass both removes punctuation and collapses whitespace runs.
    return re.sub(r'\W+', ' ', text.lower()).strip()


In [11]:
# Read the two source CSVs: one holds real articles, the other fake ones.
true_csv, fake_csv = 'data/True.csv', 'data/Fake.csv'
true = pd.read_csv(true_csv)
fake = pd.read_csv(fake_csv)


In [12]:
# Tag every row with its class: 1 for real articles, 0 for fake ones.
for frame, label_value in ((true, 1), (fake, 0)):
    frame['label'] = label_value


In [13]:
# Clean the article body of both frames, overwriting the raw 'text' column.
for articles in (true, fake):
    articles['text'] = articles['text'].apply(preprocess)


In [14]:
# Downsample both classes to the size of the smaller one so the labels are
# balanced; the fixed seed keeps the selection reproducible.
min_len = min(true.shape[0], fake.shape[0])
true = true.sample(n=min_len, random_state=42)
fake = fake.sample(n=min_len, random_state=42)



In [15]:
# Stack both classes, shuffle all rows with a fixed seed, and renumber the index.
df = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Features are the cleaned article texts; targets are the 0/1 labels.
X, y = df['text'], df['label']

In [16]:
# Hold out 20% of the rows for evaluation (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams with English stop words removed; terms in
# more than 90% of documents or in fewer than 2 documents are dropped.
tfidf_options = dict(
    stop_words='english',
    max_df=0.9,
    min_df=2,
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [17]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB().fit(X_train_vec, y_train)
model  # last expression, so the notebook still displays the fitted estimator


In [18]:
# Score the classifier on the held-out split.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
# Rows are true labels and columns are predicted labels (see the
# scikit-learn confusion_matrix documentation for the ordering).
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)


Accuracy: 0.9540095716119995
Confusion Matrix:
 [[3921  284]
 [ 110 4252]]


In [19]:
# Persist the fitted classifier and the fitted vectorizer. The `with` blocks
# guarantee each file is flushed and closed even if pickling raises; the
# previous bare `open(...)` calls left the handles for the garbage collector.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# The vectorizer is saved alongside the model because new text has to be
# transformed by this same fitted vectorizer before the model can score it.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [None]:
# Sample news list
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# executes; the cell merely evaluates to (and displays) that string. The
# disabled code would run six sample headlines through `preprocess`, the
# fitted `vectorizer` and `model`, and print "Real News" when the predicted
# label is 1 and "Fake News" otherwise. Remove the opening and closing `"""`
# to run it.
"""sample_news = [
    "NASA launches Artemis mission to return to the Moon.",
    "Chocolate cures cancer, scientists claim in new study.",
    "Government confirms alien contact in leaked documents.",
    "Apple unveils new iPhone with revolutionary AI chip.",
    "United Nations announces climate change mitigation fund.",
    "Time traveler from 3030 visits Earth to warn of zombie apocalypse."
]

# Preprocess and predict
processed = [preprocess(news) for news in sample_news]
vec = vectorizer.transform(processed)
preds = model.predict(vec)

for news, pred in zip(sample_news, preds):
    print(f"News: {news}\nPrediction: {'Real News ✅' if pred == 1 else 'Fake News ❌'}\n")
"""

'sample_news = [\n    "NASA launches Artemis mission to return to the Moon.",\n    "Chocolate cures cancer, scientists claim in new study.",\n    "Government confirms alien contact in leaked documents.",\n    "Apple unveils new iPhone with revolutionary AI chip.",\n    "United Nations announces climate change mitigation fund.",\n    "Time traveler from 3030 visits Earth to warn of zombie apocalypse."\n]\n\n# Preprocess and predict\nprocessed = [preprocess(news) for news in sample_news]\nvec = vectorizer.transform(processed)\npreds = model.predict(vec)\n\nfor news, pred in zip(sample_news, preds):\n    print(f"News: {news}\nPrediction: {\'Real News ✅\' if pred == 1 else \'Fake News ❌\'}\n")\n'