In [39]:
# Importing necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
import re  # For regular expressions, used in text preprocessing
import nltk  # Natural Language Toolkit for text processing
from nltk.corpus import stopwords  # Stopwords list from NLTK
from nltk.stem.porter import PorterStemmer  # PorterStemmer for stemming words
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF Vectorizer for text to numeric conversion
from sklearn.model_selection import train_test_split, GridSearchCV  # For dataset splitting and hyperparameter tuning
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Evaluation metrics
import joblib  # For saving the trained model
import logging  # For logging the process

In [21]:
# Setting up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Downloading necessary NLTK data
nltk.download('stopwords')

# Displaying the stopwords in English
logging.info(f"Stopwords from NLTK: {stopwords.words('english')}")

# Utility function for text preprocessing
def preprocess_text(content):
    """
    Clean, lower, split, and stem the input text using regular expressions, stopwords removal, and stemming.
    """
    # Initialize the PorterStemmer
    port_stem = PorterStemmer()
    
    # Remove all non-alphabetical characters
    content = re.sub('[^a-zA-Z]', ' ', content)
    # Convert to lowercase
    content = content.lower()
    # Split the text into individual words
    words = content.split()
    # Stem words and remove stopwords
    words = [port_stem.stem(word) for word in words if word not in stopwords.words('english')]
    
    # Rejoin words into a single string
    processed_content = ' '.join(words)
    
    return processed_content

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kunalk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-08-21 22:53:10,981 - INFO - Stopwords from NLTK: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'th

In [23]:
# Loading the dataset into a pandas DataFrame
file_path = 'train.csv'  # Path to the dataset
try:
    news_dataset = pd.read_csv(file_path)
    logging.info(f"Dataset loaded successfully. Shape: {news_dataset.shape}")
except FileNotFoundError:
    logging.error(f"File not found. Please check the path: {file_path}")
    raise

# Checking for missing values in the dataset
missing_values = news_dataset.isnull().sum()
logging.info(f"Missing values in the dataset:\n{missing_values}")

# Replacing missing values with empty strings to avoid issues during text processing
news_dataset.fillna('', inplace=True)
logging.info("Missing values have been replaced with empty strings.")

# Merging the 'author' and 'title' columns to create a new 'content' column
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

# Applying text preprocessing to the 'content' column
news_dataset['content'] = news_dataset['content'].apply(preprocess_text)

# Separating the features (X) and the target label (Y)
X = news_dataset['content'].values
Y = news_dataset['label'].values

# Displaying the first few rows of preprocessed data
logging.info(f"Preprocessed 'content' column:\n{news_dataset['content'].head()}")

2024-08-21 22:54:08,764 - INFO - Dataset loaded successfully. Shape: (20800, 5)
2024-08-21 22:54:08,769 - INFO - Missing values in the dataset:
id           0
title      558
author    1957
text        39
label        0
dtype: int64
2024-08-21 22:54:08,773 - INFO - Missing values have been replaced with empty strings.
2024-08-21 22:54:17,794 - INFO - Preprocessed 'content' column:
0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnew com truth might get fire
3    jessica purkiss civilian kill singl us airstri...
4    howard portnoy iranian woman jail fiction unpu...
Name: content, dtype: object


In [24]:
# Converting textual data to numerical data using TF-IDF Vectorizer

# Initialize the TfidfVectorizer with optimized parameters
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))

# Fit the vectorizer on the text data and transform the content into vectors
X = vectorizer.fit_transform(X)

# Displaying the shape of the transformed data to verify the conversion
logging.info(f"Shape of the transformed data (X): {X.shape}")

# Display the first few feature names generated by the vectorizer for inspection
logging.info(f"First 10 feature names generated by the vectorizer:\n{vectorizer.get_feature_names_out()[:10]}")

# Splitting the dataset into training and test sets using stratified sampling
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

# Displaying the shape of the training and test sets
logging.info(f"Training set shape: X_train: {X_train.shape}, Y_train: {Y_train.shape}")
logging.info(f"Test set shape: X_test: {X_test.shape}, Y_test: {Y_test.shape}")

2024-08-21 22:54:35,687 - INFO - Shape of the transformed data (X): (20800, 5000)
2024-08-21 22:54:35,689 - INFO - First 10 feature names generated by the vectorizer:
['aaron' 'aaron carrol' 'aaron kesel' 'aaron klein' 'abandon' 'abbi'
 'abbi goodnough' 'abc' 'abduct' 'abe']
2024-08-21 22:54:35,889 - INFO - Training set shape: X_train: (16640, 5000), Y_train: (16640,)
2024-08-21 22:54:35,889 - INFO - Test set shape: X_test: (4160, 5000), Y_test: (4160,)


In [25]:
# Initializing the Logistic Regression model
model = LogisticRegression()

# Setting up hyperparameters for tuning
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500]
}

# Using GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Extracting the best model from the grid search
best_model = grid_search.best_estimator_
logging.info(f"Best model parameters: {grid_search.best_params_}")

# Training the best model on the entire training data
best_model.fit(X_train, Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


2024-08-21 22:54:58,294 - INFO - Best model parameters: {'C': 10, 'max_iter': 100, 'solver': 'saga'}


In [26]:
# Evaluating the model's performance on the training data

# Predicting the labels for the training set
X_train_prediction = best_model.predict(X_train)

# Calculating the accuracy score on the training data
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
logging.info(f"Accuracy score on the training data: {training_data_accuracy:.4f}")

# Evaluating the model's performance on the test data

# Predicting the labels for the test set
X_test_prediction = best_model.predict(X_test)

# Calculating the accuracy score on the test data
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
logging.info(f"Accuracy score on the test data: {test_data_accuracy:.4f}")

# Generating a classification report for detailed performance metrics
classification_report_str = classification_report(Y_test, X_test_prediction, target_names=['Real', 'Fake'])
logging.info(f"Classification Report:\n{classification_report_str}")

# Generating a confusion matrix to see the distribution of predictions
conf_matrix = confusion_matrix(Y_test, X_test_prediction)
logging.info(f"Confusion Matrix:\n{conf_matrix}")

2024-08-21 22:55:17,023 - INFO - Accuracy score on the training data: 0.9987
2024-08-21 22:55:17,026 - INFO - Accuracy score on the test data: 0.9925
2024-08-21 22:55:17,034 - INFO - Classification Report:
              precision    recall  f1-score   support

        Real       0.99      0.99      0.99      2077
        Fake       0.99      0.99      0.99      2083

    accuracy                           0.99      4160
   macro avg       0.99      0.99      0.99      4160
weighted avg       0.99      0.99      0.99      4160

2024-08-21 22:55:17,036 - INFO - Confusion Matrix:
[[2057   20]
 [  11 2072]]


In [27]:
# Saving the trained model to a file
model_filename = 'best_logistic_regression_model.pkl'
joblib.dump(best_model, model_filename)
logging.info(f"Model saved to {model_filename}")

# Saving the vectorizer to a file
vectorizer_filename = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_filename)
logging.info(f"TF-IDF vectorizer saved to {vectorizer_filename}")

2024-08-21 22:55:42,393 - INFO - Model saved to best_logistic_regression_model.pkl
2024-08-21 22:55:42,553 - INFO - TF-IDF vectorizer saved to tfidf_vectorizer.pkl


In [28]:
# Loading the model and vectorizer
loaded_model = joblib.load(model_filename)
loaded_vectorizer = joblib.load(vectorizer_filename)

# Function to predict whether news is real or fake
def predict_news(news_text):
    """
    Predict whether a piece of news is real or fake.
    :param news_text: str, the news content
    :return: str, 'Real' or 'Fake'
    """
    # Preprocess the text
    processed_text = preprocess_text(news_text)
    # Vectorize the processed text
    vectorized_text = loaded_vectorizer.transform([processed_text])
    # Predict the label
    prediction = loaded_model.predict(vectorized_text)
    
    return 'Real' if prediction[0] == 0 else 'Fake'

# Example usage
sample_news = "The President announces new policies to boost economy."
result = predict_news(sample_news)
logging.info(f"Prediction for the sample news: {result}")

2024-08-21 22:55:54,724 - INFO - Prediction for the sample news: Fake


In [38]:
# Example Real News Article
real_news = """The latest report from the United Nations Intergovernmental Panel on Climate Change (IPCC) warns that global warming is accelerating at an unprecedented rate. The report, which is based on extensive scientific research, indicates that the world is likely to exceed the 1.5°C warming threshold within the next two decades if urgent measures are not taken.

The findings underscore the need for immediate and ambitious actions to reduce greenhouse gas emissions. According to the report, the consequences of inaction could be catastrophic, leading to more frequent and severe weather events, rising sea levels, and disruptions to ecosystems and food supplies. The report calls on governments worldwide to strengthen their commitments under the Paris Agreement and to invest in sustainable energy solutions.

The UN Secretary-General, António Guterres, described the report as "a code red for humanity," urging world leaders to act swiftly to avert the worst impacts of climate change."""

# Use the predict_news function from your model to test this article
result = predict_news(real_news)

# Print the prediction result
print(f"Prediction for the fake news: {result}")

Prediction for the fake news: Real


In [30]:
# Example usage to test a fake news article
fake_news = "Scientists Discover a Cure for All Cancers in Pineapple Juice. A group of scientists has claimed that drinking pineapple juice every day can cure all types of cancers. They say that the natural enzymes found in the fruit can completely eliminate cancer cells in just weeks."
result = predict_news(fake_news)
print(f"Prediction for the fake news: {result}")

Prediction for the fake news: Fake
