In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Load the two source files. Every row of a given file receives the same
# label below, so the file a row came from determines its class.
fake_news = pd.read_csv('Fake.csv')
true_news = pd.read_csv('True.csv')

# Add a 'label' column indicating if the news is fake (1) or true (0)
fake_news['label'] = 1
true_news['label'] = 0

# Combine the datasets. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it both inputs keep their own row labels, so the
# labels overlap and a label-based lookup such as data.loc[0] returns one
# row from each file instead of a single row.
data = pd.concat([fake_news, true_news], axis=0, ignore_index=True)


In [None]:
def clean_text(text):
    """Normalise a piece of text for vectorisation.

    Every character that is not a word character is replaced by a space,
    runs of whitespace are collapsed to a single space, and the result is
    lower-cased. Leading/trailing spaces are not stripped.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            what pandas produces for empty CSV cells) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The normalised string (``''`` for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # \W matches anything that is not a letter, digit or underscore, so
    # underscores survive this step.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = text.lower()  # Convert to lowercase
    return text

# Run clean_text over every entry of the 'text' column and store the result
# back in that same column.
cleaned_text = data['text'].apply(clean_text)
data['text'] = cleaned_text


In [None]:
# Inputs are the article text column; targets are the 'label' column.
X = data['text']
y = data['label']

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Fit the TF-IDF vocabulary (capped at 5000 features) on the training split
# only, then reuse the fitted vectorizer to transform the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)


In [None]:
# Train a logistic-regression classifier, with the library's default
# settings, on the TF-IDF training matrix and its labels.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Predict on the held-out split, then print the per-class report followed by
# the overall accuracy.
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print('Accuracy:', accuracy)


In [None]:
import joblib

# Write both fitted objects to disk. The vectorizer is saved alongside the
# model because new text must go through the same fitted transform before the
# model can score it.
MODEL_PATH = 'fake_news_detector_model.pkl'
VECTORIZER_PATH = 'tfidf_vectorizer.pkl'
joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)
