# 📰 NLP Mini Project: Fake News Classifier

In [1]:


# 1. Import Libraries

import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# 2. Load Data

train_df = pd.read_csv("../project-3-nlp/dataset/training_data.csv", sep="\t", names=["label", "text"])
test_df = pd.read_csv("../project-3-nlp/dataset/testing_data.csv", sep="\t", names=["label", "text"])

print(train_df.head())


   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   34152 non-null  int64 
 1   text    34152 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.8+ KB


In [None]:


# 3. Preprocessing

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(r"[^a-z\s]", " ", text)  # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)


In [None]:


# 4. Baseline Model (MultinomialNB)

X_train, X_val, y_train, y_val = train_test_split(
    train_df["clean_text"], train_df["label"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred = nb.predict(X_val_vec)

print("🔹 Baseline MultinomialNB Results")
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


🔹 Baseline MultinomialNB Results
Accuracy: 0.9354413702239789
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      3529
           1       0.94      0.92      0.93      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



In [None]:


# # 6. Predict on Test Data

# preds = trainer.predict(test_dataset)
# pred_labels = preds.predictions.argmax(axis=-1)

# # Replace '2' with predictions
# test_df["label"] = pred_labels
# test_df[["label", "text"]].to_csv("final_predictions.csv", sep="\t", index=False)

# print("🎯 Saved predictions to final_predictions.csv")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)


# 0. Debug env

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# 1. Load data

# df = pd.read_csv("/content/training_data.csv",  sep="\t", names=["label", "text"])

# Clean labels
train_df["label"] = train_df["label"].astype(str).str.strip().str.replace("\ufeff", "").astype(int)

# Train/test split (80/20)
train_df_splt, test_df_splt = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df["label"])

print("📊 Train size:", len(train_df))
print("📊 Test size:", len(test_df))


# 2. Label remapping → 0..K-1

all_labels = sorted(set(train_df_splt["label"].unique()))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: str(label) for label, idx in label2id.items()}
num_labels = len(label2id)

print("🔑 Label mapping:", label2id)

train_df_splt["label"] = train_df_splt["label"].map(label2id)
test_df_splt["label"] = test_df_splt["label"].map(label2id)


# 3. HuggingFace Datasets

train_dataset = Dataset.from_pandas(train_df_splt[["text", "label"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df_splt[["text", "label"]], preserve_index=False)


# 4. Tokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

# Ensure labels are ints
train_dataset = train_dataset.map(lambda x: {"labels": int(x["labels"])})
test_dataset = test_dataset.map(lambda x: {"labels": int(x["labels"])})


# 5. Sanity checks

unique_train_labels = sorted(set(train_dataset["labels"]))
unique_test_labels = sorted(set(test_dataset["labels"]))

print("✅ Unique train labels:", unique_train_labels)
print("✅ Unique test labels:", unique_test_labels)
print("✅ Model will be trained with num_labels =", num_labels)

assert min(unique_train_labels) >= 0
assert max(unique_train_labels) < num_labels

# 6. Torch format

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# 7. Load model

label2id = {int(k): int(v) for k, v in label2id.items()}
id2label = {int(k): str(v) for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


# 8. Metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# 9. Training arguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=500,
    report_to=[],
)


# 10. Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# 11. Train & Evaluate

print("🚀 Training Transformer Model...")
trainer.train()

print("✅ Final Evaluation:")
results = trainer.evaluate()
print(results)


📊 Train size: 34152
📊 Test size: 9984
🔑 Label mapping: {np.int64(0): 0, np.int64(1): 1}


Map:   0%|          | 0/27321 [00:00<?, ? examples/s]

Map:   0%|          | 0/6831 [00:00<?, ? examples/s]

Map:   0%|          | 0/27321 [00:00<?, ? examples/s]

Map:   0%|          | 0/6831 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Unique train labels: [0, 1]
✅ Unique test labels: [0, 1]
✅ Model will be trained with num_labels = 2


  trainer = Trainer(


🚀 Training Transformer Model...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1255,0.04409,0.985214,0.985229,0.985214,0.985215
2,0.0281,0.048519,0.985507,0.985512,0.985507,0.985508


✅ Final Evaluation:


{'eval_loss': 0.048518937081098557, 'eval_accuracy': 0.9855072463768116, 'eval_precision': 0.9855117331511769, 'eval_recall': 0.9855072463768116, 'eval_f1': 0.9855077778906379, 'eval_runtime': 23.5887, 'eval_samples_per_second': 289.588, 'eval_steps_per_second': 9.072, 'epoch': 2.0}


In [None]:
# %pip install --upgrade accelerate transformers