In [1]:
import pandas as pd

# Load the fake and real news datasets
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

In [2]:
# Add a 'label' column: 0 for Fake, 1 for Real
fake_df['label'] = 0
real_df['label'] = 1

In [3]:
# Combine the datasets into a single DataFrame
combined_df = pd.concat([fake_df, real_df], ignore_index=True)

In [4]:
# Shuffle the dataset to mix real and fake news
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Display basic information
print("Dataset shape:", combined_df.shape)
print("Class distribution:\n", combined_df['label'].value_counts())
print("First few rows:")
print(combined_df.head())

Dataset shape: (44898, 5)
Class distribution:
 label
0    23481
1    21417
Name: count, dtype: int64
First few rows:
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1

In [6]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
# Download NLTK resources (only once)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [8]:
# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



In [12]:
# Choose the text column to process — you can also use 'title' + 'text'
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
        # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

In [14]:
# Create a new column for cleaned text
combined_df['cleaned_text'] = combined_df['text'].apply(preprocess_text)

# Display sample cleaned text
print(combined_df[['text', 'cleaned_text']].head())


                                                text  \
0  21st Century Wire says Ben Stein, reputable pr...   
1  WASHINGTON (Reuters) - U.S. President Donald T...   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...   
3  On Monday, Donald Trump once again embarrassed...   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...   

                                        cleaned_text  
0  st century wire say ben stein reputable profes...  
1  washington reuters u president donald trump re...  
2  reuters puerto rico governor ricardo rossello ...  
3  monday donald trump embarrassed country accide...  
4  glasgow scotland reuters u presidential candid...  


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,       # Use top 5000 words
    stop_words='english',    # Remove English stopwords
    ngram_range=(1, 2)       # Unigrams and bigrams
)

In [17]:
# Fit and transform the cleaned text
X = tfidf.fit_transform(combined_df['cleaned_text'])

In [18]:
# Target variable
y = combined_df['label']

In [19]:
# Show shape of resulting feature matrix
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (44898, 5000)


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Split the dataset into training and testing sets
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Ensures balanced label distribution
)

In [22]:
# Print the size of each set
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 35918
Testing set size: 8980


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [24]:
# Initialize models
log_reg = LogisticRegression(max_iter=1000)
nb_model = MultinomialNB()

In [25]:
# Train Logistic Regression
log_reg.fit(X_train, y_train)

In [26]:
# Train Naive Bayes
nb_model.fit(X_train, y_train)

print("✅ Models trained successfully!")


✅ Models trained successfully!


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Function to evaluate a model
def evaluate_model(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)

    print(f"\n🔍 Evaluation Report for {model_name}:")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Precision:", round(precision_score(y_test, y_pred), 4))
    print("Recall:", round(recall_score(y_test, y_pred), 4))
    print("F1 Score:", round(f1_score(y_test, y_pred), 4))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["Fake", "Real"]))

# Evaluate both models
evaluate_model(log_reg, X_test, y_test, model_name="Logistic Regression")
evaluate_model(nb_model, X_test, y_test, model_name="Naive Bayes")



🔍 Evaluation Report for Logistic Regression:
Accuracy: 0.9872
Precision: 0.9826
Recall: 0.9907
F1 Score: 0.9866

Confusion Matrix:
[[4621   75]
 [  40 4244]]

Detailed Classification Report:
              precision    recall  f1-score   support

        Fake       0.99      0.98      0.99      4696
        Real       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


🔍 Evaluation Report for Naive Bayes:
Accuracy: 0.946
Precision: 0.9418
Recall: 0.9451
F1 Score: 0.9435

Confusion Matrix:
[[4446  250]
 [ 235 4049]]

Detailed Classification Report:
              precision    recall  f1-score   support

        Fake       0.95      0.95      0.95      4696
        Real       0.94      0.95      0.94      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95   

In [28]:
import joblib

# Save Logistic Regression model
joblib.dump(log_reg, "logistic_regression_model.pkl")

# Save Naive Bayes model
joblib.dump(nb_model, "naive_bayes_model.pkl")

print("✅ Models saved successfully as .pkl files!")


✅ Models saved successfully as .pkl files!


In [29]:
# Save the TF-IDF vectorizer
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']