In [14]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/168e3115-98f4-49ea-a4b1-
[nltk_data]     316c09e46f4d/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/168e3115-98f4-49ea-a4b1-
[nltk_data]     316c09e46f4d/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/168e3115-98f4-49ea-a4b1-
[nltk_data]     316c09e46f4d/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [18]:
# Load dataset
data = pd.read_csv('fake_or_real_news.csv')
# Apply preprocessing to the text column
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Feature Extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display evaluation metrics
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8926598263614838
Confusion Matrix:
[[541  87]
 [ 49 590]]
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89       628
        REAL       0.87      0.92      0.90       639

    accuracy                           0.89      1267
   macro avg       0.89      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267



In [20]:
# trying ensemble model to increase the accuracy

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the individual models
nb = MultinomialNB(alpha=1.0)  # Naive Bayes
lr = LogisticRegression(solver='liblinear',C=20)  # Logistic Regression
dt = DecisionTreeClassifier(random_state=42,max_depth=40)  # Decision Tree

# Create a Voting Classifier (with hard or soft voting)
# Hard Voting: Majority class prediction
# Soft Voting: Averages the predicted probabilities
ensemble_model = VotingClassifier(estimators=[
    ('nb', nb), ('lr', lr), ('dt', dt)],
    voting='hard')  # change 'hard' to 'soft' for soft voting

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ensemble_model.predict(X_test)

# Calculate the accuracy of the ensemble model
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble Model Accuracy: {accuracy}')


Ensemble Model Accuracy: 0.909234411996843
