<a href="https://colab.research.google.com/github/kmahatma/ssmi-patternrecognition/blob/main/w04_04_nb_bernoulli_fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Drive setup: mount the user's Drive inside the Colab VM so the dataset
# CSVs stored under /content/drive can be read later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): no working directory is actually set here. Files written with a
# relative path (e.g. "submit.csv") therefore land in the kernel's current directory.

Mounted at /content/drive


# Step 1: Install & Import Required Libraries

In [None]:
!pip install pandas numpy scikit-learn nltk




In [None]:
!pip uninstall -y nltk
!pip install nltk


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK stopword corpus used by stopwords.words(...) in the
# preprocessing step. When the corpus is already present NLTK only reports
# that the package is up to date.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Step 2: Load the Dataset

In [None]:
# Folder on the mounted Google Drive that holds train.csv and test.csv
data_path = "/content/drive/MyDrive/@-ssmi-pattern-recognition/prak-06/"

# Load train and test datasets, replacing missing values with empty strings so
# that later string concatenation never propagates NaN. fillna() returns a new
# frame, so re-running this cell always starts again from the CSV contents.
df_train = pd.read_csv(data_path + "train.csv").fillna("")
df_test = pd.read_csv(data_path + "test.csv").fillna("")

# NOTE: 'title' is deliberately NOT merged into 'text' here. The preprocessing
# step builds its input as title + " " + text itself; merging here as well made
# every title appear twice in the cleaned text and overwrote the raw 'text' column.

# Check dataset structure
print(df_train.head())
print(df_test.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  FLYNN: Hillary Clinton, Big Woman on Campus - ...      0  
2  Why the Truth Might Get You Fired Why the Trut...      1  
3  15 Civilians Killed In Single US Airstrike Hav...      1  
4  Iranian woman jailed for fictional unpublished...      1  
      id                                              title  \
0  20800  Specter of Trump Loosens Tongues, if Not Purse...   
1  20801  Russian war

# Step 3: Preprocess the Text Data
Since BernoulliNB works with binary features, we preprocess the text:

* Remove special characters and convert the text to lowercase.
* Remove English stopwords.
* Use `CountVectorizer(binary=True)` to convert words into a presence/absence matrix.

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Ensure the stopword corpus is available before stopwords.words(...) is called
# below. This repeats the download done in the import cell; when the corpus is
# already present NLTK only reports that the package is up to date.
nltk.download('stopwords')

# Normalise one raw document into lowercase, single-space-separated tokens
def clean_text(text):
    """Return `text` lowercased with non-word characters turned into spaces.

    Every character matched by the regex class ``\\W`` is replaced by a space,
    runs of whitespace are collapsed to a single space, and the result is
    stripped. Any non-string input (e.g. NaN or None) yields "".
    """
    # Guard clause: anything that is not a string is treated as missing text
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    spaced = re.sub(r'\W', ' ', lowered)      # non-word characters -> spaces
    collapsed = re.sub(r'\s+', ' ', spaced)   # squeeze repeated whitespace
    return collapsed.strip()

# English stopword list from NLTK, stored as a set for fast membership checks
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` dropped.

    The surviving tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

# Build the 'clean_text' column for both datasets: join title and body with a
# space, normalise the result with clean_text, then drop stopwords.
for frame in (df_train, df_test):
    combined = frame['title'].astype(str) + " " + frame['text'].astype(str)
    frame['clean_text'] = combined.apply(clean_text).apply(remove_stopwords)

# Show the first rows of each cleaned column as a sanity check
print(df_train[['clean_text']].head())
print(df_test[['clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                          clean_text
0  house dem aide even see comey letter jason cha...
1  flynn hillary clinton big woman campus breitba...
2  truth might get fired truth might get fired tr...
3  15 civilians killed single us airstrike identi...
4  iranian woman jailed fictional unpublished sto...
                                          clean_text
0  specter trump loosens tongues purse strings si...
1  russian warships ready strike terrorists near ...
2  nodapl native american leaders vow stay winter...
3  tim tebow attempt another comeback time baseba...
4  keiser report meme wars e995 keiser report mem...


In [None]:
# The previous cell already defines clean_text / remove_stopwords and stores the
# cleaned title + text of every article in the 'clean_text' column. This cell
# used to redefine both helpers and recompute exactly the same strings; reuse
# the existing column instead so there is a single definition of the cleaning
# logic and the corpus is only processed once.
df_train['content'] = df_train['clean_text']
df_test['content'] = df_test['clean_text']

# Preview cleaned text
print("\nCleaned Text Sample (Train):")
print(df_train[['content']].head())

print("\nCleaned Text Sample (Test):")
print(df_test[['content']].head())



Cleaned Text Sample (Train):
                                             content
0  house dem aide even see comey letter jason cha...
1  flynn hillary clinton big woman campus breitba...
2  truth might get fired truth might get fired tr...
3  15 civilians killed single us airstrike identi...
4  iranian woman jailed fictional unpublished sto...

Cleaned Text Sample (Test):
                                             content
0  specter trump loosens tongues purse strings si...
1  russian warships ready strike terrorists near ...
2  nodapl native american leaders vow stay winter...
3  tim tebow attempt another comeback time baseba...
4  keiser report meme wars e995 keiser report mem...


# Step 4: Convert Text into Binary Features
We will use `CountVectorizer` to:

* Convert text into a binary format (presence = 1, absence = 0).
* Keep only the 5,000 most frequent words as features.

In [None]:
# Target labels of the labelled training file
y = df_train['label']

# Split the cleaned documents into train (80%) and validation (20%) BEFORE
# vectorising. Fitting the vectorizer on all rows first would let validation
# documents influence which 5000 words are kept (data leakage), which makes the
# validation score optimistic. test_size and random_state are unchanged.
texts_train, texts_val, y_train, y_val = train_test_split(
    df_train['content'], y, test_size=0.2, random_state=42
)

# Binary presence/absence features; the vocabulary is learned from the training
# split only and then applied unchanged to the validation split.
vectorizer = CountVectorizer(binary=True, stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_val = vectorizer.transform(texts_val)


# Step 5: Train the Bernoulli Naïve Bayes Model

In [None]:
# Initialize Bernoulli Naïve Bayes Classifier with its default hyper-parameters
model = BernoulliNB()

# Train the model on the training split only; the validation split is kept
# aside for the evaluation step. Being the last expression of the cell, the
# value returned by fit() is also what the notebook renders as cell output.
model.fit(X_train, y_train)


# Step 6: Evaluate the Model

In [None]:
# Predict on validation set
y_pred = model.predict(X_val)

# Accuracy score (fraction of validation rows predicted correctly)
accuracy = accuracy_score(y_val, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Classification report. `labels` is passed explicitly so that each display name
# is tied to a specific class id instead of relying on the implicit sorted order
# of whatever labels happen to occur in y_val / y_pred.
# NOTE(review): the mapping 0 -> 'Real', 1 -> 'Fake' is taken from the original
# target_names; confirm it against the dataset description.
print("\nClassification Report:")
print(classification_report(y_val, y_pred, labels=[0, 1], target_names=['Real', 'Fake']))


Model Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

        Real       0.78      0.65      0.71      2132
        Fake       0.68      0.81      0.74      2028

    accuracy                           0.73      4160
   macro avg       0.73      0.73      0.72      4160
weighted avg       0.73      0.73      0.72      4160



# Step 7: Predict Labels for test.csv
Since `test.csv` does not contain labels, we will:

* Transform the test text using the fitted `CountVectorizer`.
* Predict labels using the trained Naïve Bayes model.

In [None]:
# Transform test data with the already-fitted vectorizer (transform, not
# fit_transform) so the test matrix uses the same vocabulary/columns the model
# was trained on.
X_test = vectorizer.transform(df_test['content'])

# Predict one label per test article
test_predictions = model.predict(X_test)


# Step 8: Save Predictions to submit.csv

In [None]:
# Pair every test article id with its predicted label
df_submission = df_test[['id']].assign(label=test_predictions)

# Write the two columns to disk without the DataFrame index
df_submission.to_csv("submit.csv", index=False)

print("✅ Submission file 'submit.csv' is saved successfully!")


✅ Submission file 'submit.csv' is saved successfully!


In [None]:
# Colab-only helper: ask the browser to download the submission file that the
# previous cell wrote to "submit.csv".
from google.colab import files
files.download("submit.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>