<a href="https://colab.research.google.com/github/kmahatma/ssmi-patternrecognition/blob/main/w06_03_nb_fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Google Drive setup: mount the user's Drive under /content/drive.
# The google.colab import means this cell is expected to run inside Colab.
from google.colab import drive
drive.mount('/content/drive')

# Set the default working directory
# NOTE(review): no directory change is actually performed in this cell, so the
# relative CSV paths used when loading the dataset resolve against whatever
# the kernel's current directory is — confirm where Fake.csv / True.csv live.

Mounted at /content/drive


# Step 1: Install & Import Required Libraries

In [None]:
!pip install pandas numpy scikit-learn nltk


In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') reads it
# later in the notebook when the stopword set is built.
# NOTE(review): presumably a no-op when the corpus is already present and
# up to date — confirm against the NLTK downloader documentation.
nltk.download('stopwords')


# Step 2: Load the Dataset

In [None]:
# Read both CSVs and tag every row with its class: 1 = Fake, 0 = Real
df_fake = pd.read_csv("Fake.csv").assign(label=1)
df_real = pd.read_csv("True.csv").assign(label=0)

# Stack the fake rows on top of the real ones and renumber rows from 0
df = pd.concat([df_fake, df_real], ignore_index=True)

# Peek at the first rows to confirm the structure
print(df.head())


# Step 3: Preprocess the Text Data
Since BernoulliNB works with binary features, we preprocess the text:

* Remove special characters and convert the text to lowercase.
* Remove stopwords.
* Use CountVectorizer (binary=True) to convert words into a presence/absence matrix.

In [None]:
# Function to clean text
def clean_text(text):
    """Normalise a raw document into lowercase, single-space-separated tokens.

    Every run of non-word characters (anything other than letters, digits
    and underscore) is replaced by one space, and leading/trailing spaces
    are removed.

    Args:
        text: The raw string to clean. Non-string values (e.g. None, or the
            NaN that pandas produces for a missing field) are treated as an
            empty document instead of raising AttributeError.

    Returns:
        The cleaned string; "" for empty or non-string input.
    """
    if not isinstance(text, str):
        return ""
    # \W also matches whitespace, so a single \W+ pass both replaces special
    # characters and collapses the resulting runs of spaces.
    return re.sub(r'\W+', ' ', text.lower()).strip()

# Keep an untouched copy of the article body so this cell can be re-run safely:
# without it, every re-run would prepend the title to already-combined text.
if 'text_raw' not in df.columns:
    df['text_raw'] = df['text']

# Combine title & content, then clean. Missing titles/bodies become empty
# strings so the concatenation never yields NaN.
df['text'] = (df['title'].fillna('') + " " + df['text_raw'].fillna('')).apply(clean_text)

# English stopword list from NLTK, held as a set for fast membership tests
stop_words = {*stopwords.words('english')}

# Remove stopwords function
def remove_stopwords(text, stopword_set=None):
    """Drop stopwords from a whitespace-tokenised string.

    Args:
        text: The string to filter; it is split on whitespace.
        stopword_set: Optional collection of words to remove. When omitted,
            the notebook-level ``stop_words`` set is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
        Matching is exact and case-sensitive.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return " ".join(word for word in text.split() if word not in stopword_set)

# Strip stopwords from every cleaned document
df['text'] = df['text'].map(remove_stopwords)


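# Step 4: Convert Text into Binary Features

Using `CountVectorizer(binary=True)` ensures that each word is represented as 1 if it is present in a document and 0 if it is absent. The same cell then splits the data into training (80%) and test (20%) sets.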

In [None]:
# Target variable (labels)
y = df['label']

# Split the raw text into training (80%) and testing (20%) BEFORE vectorizing,
# so the vocabulary is learned from training documents only. Fitting the
# vectorizer on the full corpus would leak information about the test set
# into the feature space.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Convert text into binary (presence/absence) feature matrices
vectorizer = CountVectorizer(binary=True, stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(text_train)  # learn vocabulary on train only
X_test = vectorizer.transform(text_test)        # reuse that vocabulary

# Full feature matrix, kept for any later cell that still refers to X
X = vectorizer.transform(df['text'])


# Step 5: Train the Bernoulli Naïve Bayes Model

In [None]:
# Build the Bernoulli Naïve Bayes classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain)
model = BernoulliNB().fit(X_train, y_train)

# Class predictions for the held-out test split
y_pred = model.predict(X_test)


# Step 6: Evaluate the Model

In [None]:
# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class report, printed under its heading in a single call
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=['Real', 'Fake']),
    sep="\n",
)


# Step 7: Test with a Custom Fake News Example

In [None]:
def predict_fake_news(text):
    """Classify one raw news string with the fitted vectorizer and model.

    The text is passed through clean_text and remove_stopwords, transformed
    by the notebook-level ``vectorizer``, and scored by the notebook-level
    ``model``.

    Returns:
        "FAKE NEWS" when the predicted label equals 1, otherwise "REAL NEWS".
    """
    prepared = remove_stopwords(clean_text(text))
    features = vectorizer.transform([prepared])
    label = model.predict(features)[0]
    if label == 1:
        return "FAKE NEWS"
    return "REAL NEWS"

# Two hand-written headlines to try the classifier on
news1 = "Breaking: Government Announces New COVID-19 Restrictions!"
news2 = "Shocking: Scientists Discover Secret Cure for Cancer!"

# Print each headline next to the model's verdict
for headline in (news1, news2):
    print(f"News: {headline} → {predict_fake_news(headline)}")
