# **DATA PREPROCESSING**



Our goal is to preprocess the text data to prepare it for machine learning models aimed at detecting fake news.



*   Import Necessary Libraries
*   Load the Dataset
*   Inspect the Data
*   Combine Datasets and Add Labels
*   Shuffle the data
*   Text Cleaning Function
*   Apply Text Cleaning
*   Feature Extraction Using TF-IDF
*   Split the Data into Training and Testing Sets
*   Save the Preprocessed Data









# Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources
# Fetch the two corpora the cleaning step depends on: the English stop-word
# list (read via stopwords.words('english')) and the WordNet data that
# WordNetLemmatizer looks words up in.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# LOAD THE DATASET

In [None]:
from google.colab import files
# Open Colab's upload widget. The two read_csv calls below expect True.csv and
# Fake.csv to be present in the working directory afterwards.
# NOTE(review): the recorded run of this cell ended in a TypeError — presumably
# raised by files.upload() when no interactive Colab front end is attached;
# confirm before relying on this cell outside Colab.
uploaded = files.upload()
# Load datasets
# engine="python" selects pandas' pure-Python CSV parser instead of the C parser.
true_news = pd.read_csv('True.csv', encoding='utf-8', engine="python")
fake_news = pd.read_csv('Fake.csv', encoding='utf-8', engine="python")


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Preview the first rows of both raw tables, one heading per table
for heading, frame in (
    ("True News Sample:", true_news),
    ("\nFake News Sample:", fake_news),
):
    print(heading)
    print(frame.head())

# COMBINE DATASETS AND ADD LABELS

We’ll add a label to each dataset: 1 for true news and 0 for fake news. Then, we’ll combine them into a single DataFrame.

In [None]:
# Tag every article with its class before merging: 1 = true news, 0 = fake news
for frame, class_id in ((true_news, 1), (fake_news, 0)):
    frame['label'] = class_id

# Stack both labelled tables row-wise into one frame with a fresh 0..n-1 index
data = pd.concat((true_news, fake_news), ignore_index=True)

# Shuffle the Data

We shuffle the dataset to prevent order bias during model training. The original dataset may have all fake news in one section and all real news in another, which could cause the model to learn patterns based on position rather than content. Shuffling ensures that:


*   Training and testing sets are well-distributed across both categories.

*   The model generalizes better rather than memorizing sequence-based patterns.
*   The model avoids overfitting to any unintentional ordering biases in the dataset.




This step is essential for a fair and unbiased learning process.

In [None]:
# Shuffle the combined dataset
# frac=1 draws every row exactly once (a full permutation), random_state=42
# makes that order reproducible, and reset_index(drop=True) discards the
# pre-shuffle row labels so the index runs 0..n-1 again.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Text Cleaning Function

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once at cell level so clean_text does not rebuild them per document.
# `http\S+` already covers https URLs, so no separate https alternative is needed.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_DIGITS_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw article string for vectorization.

    Steps, in order: lowercase; delete tokens starting with ``http`` or
    ``www``; delete every character in ``string.punctuation``; delete digit
    runs; split on whitespace, drop English stop words and lemmatize what
    remains.

    Args:
        text: The raw document. ``None`` and NaN (how pandas represents a
            missing cell) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (``''`` for missing input).
    """
    # A missing CSV cell arrives here as float NaN, which has no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Remove URLs
    text = _URL_PATTERN.sub('', text)
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers
    text = _DIGITS_PATTERN.sub('', text)
    # Remove stopwords and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [None]:
# Combine title and text columns
# fillna('') first: concatenating a string with NaN yields NaN, so an article
# missing either field would otherwise lose all of its content.
data['content'] = data['title'].fillna('') + " " + data['text'].fillna('')

# Apply the cleaning function
data['cleaned_content'] = data['content'].apply(clean_text)



# Feature Extraction Using TF-IDF

In [None]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust the number of features as needed

# Fit and transform the cleaned text
X = tfidf_vectorizer.fit_transform(data['cleaned_content'])

# Labels
y = data['label'].values

# Split the Data into Training and Testing Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Print a sample of training data
# Sanity check only: shows the first five TF-IDF rows and their labels.
print("Training Data Sample:")
print(X_train[:5].toarray())  # Convert sparse matrix to array for readability
print("Training Labels Sample:", y_train[:5])

# Print a sample of testing data
# Same five-row preview for the held-out split.
print("\nTesting Data Sample:")
print(X_test[:5].toarray())  # Convert sparse matrix to array for readability
print("Testing Labels Sample:", y_test[:5])

# Save the Preprocessed Data

In [None]:
import pickle

# Each entry pairs an output path with the object pickled into it: first the
# TF-IDF vectorizer, then the train-test split as a single 4-tuple.
artifacts = (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('train_test_data.pkl', (X_train, X_test, y_train, y_test)),
)
for path, payload in artifacts:
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)

## Baseline Model

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc
)

# -------------------------------
# Load preprocessed data
# -------------------------------
# Reads back the (X_train, X_test, y_train, y_test) tuple written to
# train_test_data.pkl by the preprocessing section.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point this at a file produced locally by this notebook.
with open('train_test_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# -------------------------------
# Baseline Model: Logistic Regression with Hyperparameter Tuning
# -------------------------------
# max_iter=1000 raises the solver's iteration cap; random_state fixes any
# solver randomness so runs are repeatable.
logreg = LogisticRegression(random_state=42, max_iter=1000)

# Only the regularization strength C is actually searched (five values on a
# log scale); the solver entry has a single option and so stays fixed.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']  # Suitable for small-medium datasets
}

# 5-fold cross-validation on the training split, ranking candidates by accuracy.
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# The estimator configured with the winning parameters; used for all
# evaluation and saving below.
best_model = grid_search.best_estimator_

# -------------------------------
# Evaluation on Test Data
# -------------------------------
# Predict once on the held-out split, then score those predictions with each
# metric in turn (label 1 = real news is the positive class).
y_pred = best_model.predict(X_test)
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Test Accuracy:", acc),
    ("Test Precision:", prec),
    ("Test Recall:", rec),
    ("Test F1 Score:", f1),
):
    print(caption, value)

# Plot Confusion Matrix
# Rows are true classes and columns are predicted classes, ordered 0 (Fake)
# then 1 (Real), drawn as a blue heat map with a colour scale.
cm = confusion_matrix(y_test, y_pred)
tick_marks = np.arange(2)
class_names = ['Fake', 'Real']

fig, ax = plt.subplots(figsize=(6, 5))
heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title("Confusion Matrix")
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

# ROC Curve
# Column 1 of predict_proba is the probability of class 1 (real news); the
# curve sweeps the decision threshold over those scores.
y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal = chance-level classifier, for reference
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

# -------------------------------
# Save the Best Model
# -------------------------------
# Persist the tuned logistic-regression estimator so it can be reloaded with
# pickle.load without repeating the grid search.
with open('best_logreg_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Model Development

In [None]:
# Install the two Hugging Face packages imported by the BERT section below
# (`datasets` and `evaluate`). The leading "!" runs the line as a shell command.
!pip install datasets --quiet
!pip install evaluate
import os
# Set before any training code runs.
# NOTE(review): presumably this stops the Trainer's Weights & Biases
# integration from prompting for a login — confirm against the W&B docs.
os.environ["WANDB_MODE"] = "disabled"

# **Advanced Model**

In [None]:
import pandas as pd
import re
import string
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK
# Same two corpora as in the preprocessing section; repeated so this section
# can run on its own in a fresh runtime.
nltk.download('stopwords')
nltk.download('wordnet')

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

# -------------------------------
# Define Text Cleaning Function
# -------------------------------
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once at cell level so clean_text does not rebuild them per document.
# `http\S+` already covers https URLs, so no separate https alternative is needed.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_DIGITS_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw article string before tokenization.

    Steps, in order: lowercase; delete tokens starting with ``http`` or
    ``www``; delete every character in ``string.punctuation``; delete digit
    runs; split on whitespace, drop English stop words and lemmatize what
    remains.

    Args:
        text: The raw document. ``None`` and NaN (how pandas represents a
            missing cell) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (``''`` for missing input).
    """
    # A missing CSV cell arrives here as float NaN, which has no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = _URL_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_PATTERN.sub('', text)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )


# Add labels: 1 for true news, 0 for fake news
true_news['label'] = 1
fake_news['label'] = 0

# Combine the datasets
data = pd.concat([true_news, fake_news], axis=0).reset_index(drop=True)

# Combine title and text columns, then clean the text.
# fillna('') first: concatenating a string with NaN yields NaN, so an article
# missing either field would otherwise lose all of its content.
# NOTE(review): stop-word, punctuation and digit removal were designed for the
# TF-IDF baseline; presumably BERT would do at least as well on the raw text —
# worth confirming with an ablation.
data['content'] = data['title'].fillna('') + " " + data['text'].fillna('')
data['cleaned_content'] = data['content'].apply(clean_text)

# Split the data into training and testing sets using the cleaned text
train_df, test_df = train_test_split(data[['cleaned_content', 'label']], test_size=0.2, random_state=42)

# Rename column for compatibility with Hugging Face Datasets
train_df = train_df.rename(columns={'cleaned_content': 'text'})
test_df = test_df.rename(columns={'cleaned_content': 'text'})

# Create Hugging Face Datasets.
# preserve_index=False: after the shuffled split each frame carries a
# non-contiguous index, which from_pandas would otherwise store as an extra
# column alongside 'text' and 'label'.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


In [None]:

# -------------------------------
# Tokenization and Model Setup
# -------------------------------
# Load the pretrained uncased BERT vocabulary and a two-class classification
# model built on the same checkpoint.
# NOTE: `model` is created again with AutoModelForSequenceClassification in the
# "Training Arguments and Trainer Setup" cell, and that later instance is the
# one handed to the Trainer — the object created here is not the one trained.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose only the tensors the model consumes, as PyTorch tensors.
# set_format changes the dataset in place.
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# -------------------------------
# Define Evaluation Metrics
# -------------------------------
# Load each metric once at cell level; compute_metrics reuses these objects on
# every evaluation pass instead of reloading them.
metric_accuracy = evaluate.load("accuracy")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Convert a (logits, labels) pair into the metrics reported per evaluation.

    The predicted class is the arg-max over the last logits axis. Accuracy is
    computed as is; precision, recall and F1 use ``average='weighted'``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1``, in that order.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    shared = {"predictions": predicted, "references": labels}

    scores = {"accuracy": metric_accuracy.compute(**shared)["accuracy"]}
    weighted_metrics = (
        ("precision", metric_precision),
        ("recall", metric_recall),
        ("f1", metric_f1),
    )
    for key, metric in weighted_metrics:
        scores[key] = metric.compute(average='weighted', **shared)[key]
    return scores


In [None]:
# -------------------------------
# Training Arguments and Trainer Setup
# -------------------------------
import torch
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
# This is the instance the Trainer below optimizes.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# bfloat16 mixed precision is only enabled where the GPU supports it;
# requesting it on an unsupported device makes TrainingArguments raise.
# Without it, training runs in full precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — check the installed version if this argument is rejected.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",      # evaluate at the end of every epoch
    save_strategy="epoch",            # must match the evaluation cadence for load_best_model_at_end
    load_best_model_at_end=True,
    report_to=[],                     # no external experiment trackers
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    metric_for_best_model="accuracy",
    bf16=use_bf16
)

# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# selected on the same data that is later reported as the test result —
# consider carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
!nvidia-smi
# -------------------------------
# Train and Evaluate the Model
# -------------------------------
from transformers import Trainer

trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)

# -------------------------------
# Save the Fine-Tuned Model and Tokenizer
# -------------------------------
model.save_pretrained("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")