In [4]:
!pip install -q scikit-learn pandas nltk joblib matplotlib seaborn

In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
!ls /kaggle/input/fake-news-datasets


fake_news_datasets


In [17]:
import os

base_folder = "/kaggle/input/fake-news-datasets"
print(os.listdir(base_folder))

['fake_news_datasets']


In [18]:
import os

# base folder for Kaggle dataset
base_folder = "/kaggle/input/fake-news-datasets"

# list all files in the dataset folder
for file in os.listdir(base_folder):
    print(file)


fake_news_datasets


In [19]:
import os

base_folder = "/kaggle/input/fake-news-datasets"

# list all top-level folders/files
print("Top-level contents:")
top_contents = os.listdir(base_folder)
for item in top_contents:
    print(item)

# if you see a folder like 'archive-4', list its contents
for folder in top_contents:
    folder_path = os.path.join(base_folder, folder)
    if os.path.isdir(folder_path):
        print(f"\nContents of {folder}:")
        print(os.listdir(folder_path))


Top-level contents:
fake_news_datasets

Contents of fake_news_datasets:
['archive (5)', 'archive (4)', 'archive (6)']


In [23]:
import os
import pandas as pd

# ---- Base paths for each dataset ----
base_folder = "/kaggle/input/fake-news-datasets/fake_news_datasets"

# ---- Dataset 1 (archive 4) ----
folder1 = os.path.join(base_folder, "archive (4)")
df1_fake = pd.read_csv(os.path.join(folder1, "Fake.csv"))
df1_true = pd.read_csv(os.path.join(folder1, "True.csv"))
df1_fake['label'] = 0
df1_true['label'] = 1
df1 = pd.concat([df1_fake, df1_true], ignore_index=True)
df1['content'] = df1['title'].fillna('') + " " + df1['text'].fillna('')

# ---- Dataset 2 (archive 5) ----
folder2 = os.path.join(base_folder, "archive (5)")
df2 = pd.read_csv(os.path.join(folder2, "WELFake_Dataset.csv"))

# Map labels to 0/1 if they are text
if df2['label'].dtype == object:
    df2['label'] = df2['label'].map({'FAKE': 0, 'REAL': 1})

# Combine title and text if available
if "title" in df2.columns and "text" in df2.columns:
    df2['content'] = df2['title'].fillna('') + " " + df2['text'].fillna('')
else:
    df2.rename(columns={df2.columns[0]: "content"}, inplace=True)

# ---- Dataset 3 (archive 6) ----
folder3 = os.path.join(base_folder, "archive (6)")
df3_fake = pd.read_csv(os.path.join(folder3, "Fake.csv"))
df3_true = pd.read_csv(os.path.join(folder3, "True.csv"))
df3_fake['label'] = 0
df3_true['label'] = 1
df3 = pd.concat([df3_fake, df3_true], ignore_index=True)
df3['content'] = df3['title'].fillna('') + " " + df3['text'].fillna('')

# ---- Combine All Datasets ----
df_all = pd.concat([
    df1[['content', 'label']],
    df2[['content', 'label']],
    df3[['content', 'label']]
], ignore_index=True)

# ---- Summary ----
print("Combined dataset shape:", df_all.shape)
print("Label distribution:\n", df_all['label'].value_counts())
print(df_all.head())
print(df_all.tail())



Combined dataset shape: (161930, 2)
Label distribution:
 label
0    81990
1    79940
Name: count, dtype: int64
                                             content  label
0   Donald Trump Sends Out Embarrassing New Year’...      0
1   Drunk Bragging Trump Staffer Started Russian ...      0
2   Sheriff David Clarke Becomes An Internet Joke...      0
3   Trump Is So Obsessed He Even Has Obama’s Name...      0
4   Pope Francis Just Called Out Donald Trump Dur...      0
                                                  content  label
161925  'Fully committed' NATO backs new U.S. approach...      1
161926  LexisNexis withdrew two products from Chinese ...      1
161927  Minsk cultural hub becomes haven from authorit...      1
161928  Vatican upbeat on possibility of Pope Francis ...      1
161929  Indonesia to buy $1.14 billion worth of Russia...      1


In [24]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('punkt_tab') # Download the required resource

stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+',' ', text)             # remove URLs
    text = re.sub(r'[^a-z\s]',' ', text)            # keep letters
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop and len(t)>1]
    return " ".join(tokens)

df = df_all.dropna(subset=['content'])  # ensure content exists
df['clean'] = df_all['content'].apply(clean_text)
df['clean'].head()

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


0    donald trump sends embarrassing new year eve m...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes internet joke thr...
3    trump obsessed even obama name coded website i...
4    pope francis called donald trump christmas spe...
Name: clean, dtype: object

In [25]:
# If labels are 'fake'/'real' in a 'label' column, map them:
if 'label' in df.columns and df['label'].dtype == object:
    unique = df['label'].unique()
    print("Unique labels:", unique)
    mapping = {}
    if set(['FAKE','REAL']).issubset(set([u.upper() for u in unique])):
        mapping = {u: 0 if u.upper()=='FAKE' else 1 for u in unique}
        df['label_num'] = df['label'].map(mapping)
    else:
        # if already numeric
        try:
            df['label_num'] = pd.to_numeric(df['label'])
        except:
            print("Please edit mapping manually for your labels.")
else:
    if 'label_num' not in df.columns:
        # create labels if e.g. there is a 'type' column
        possible = [c for c in df.columns if 'label' in c.lower() or 'target' in c.lower() or 'class' in c.lower()]
        print("Possible label cols:", possible)

# For many sample CSVs 'label_num' might not exist; set manually:
# df['label_num'] = (df['label_text']=='REAL').astype(int)

# Quick train-test split
X = df['clean']
y = df['label_num'] if 'label_num' in df.columns else df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape)


Possible label cols: ['label']
(129544,) (32386,)


In [27]:
!pip install transformers torch scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [28]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm
import numpy as np

# ---------------- 1. Load tokenizer & model ----------------
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # set model to evaluation mode

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# ---------------- 2. Function to get embeddings ----------------
def get_embeddings(texts, batch_size=16):
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i+batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Use [CLS] token representation
            batch_embeddings = outputs.last_hidden_state[:,0,:].cpu().numpy()
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# ---------------- 3. Prepare embeddings ----------------
X_train_emb = get_embeddings(X_train.tolist())
X_test_emb  = get_embeddings(X_test.tolist())

# ---------------- 4. Train Logistic Regression ----------------
clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear', C=2.0, random_state=42)
clf.fit(X_train_emb, y_train)

# ---------------- 5. Evaluate ----------------
y_pred = clf.predict(X_test_emb)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


2025-09-21 04:40:51.313230: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758429651.669570      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758429651.771666      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 8097/8097 [39:36<00:00,  3.41it/s]
100%|██████████| 2025/2025 [10:03<00:00,  3.36it/s]


Accuracy: 0.6519174952139813

Classification Report:
               precision    recall  f1-score   support

           0     0.6554    0.6590    0.6572     16398
           1     0.6483    0.6446    0.6464     15988

    accuracy                         0.6519     32386
   macro avg     0.6519    0.6518    0.6518     32386
weighted avg     0.6519    0.6519    0.6519     32386

Confusion Matrix:
 [[10807  5591]
 [ 5682 10306]]


In [29]:
!pip install -q transformers datasets accelerate evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
bigframes 2.8.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframes 2.8.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.[0m[31m
[0m

In [None]:
import os

# Try Kaggle Secrets first
api_key = None
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("API_KEY")
    print("✅ Using Kaggle Secrets")
except Exception as e:
    # Fallback: try .env file
    try:
        from dotenv import load_dotenv
        load_dotenv()
        api_key = os.getenv("API_KEY")
        if api_key:
            print("✅ Using .env file")
        else:
            print("⚠️ No API key found in .env")
    except ImportError:
        print("⚠️ python-dotenv not installed, and Kaggle Secrets unavailable")

# Final check
if not api_key:
    raise ValueError("❌ No API key found. Please set it in Kaggle Secrets or in a .env file.")


In [32]:
# ✅ Setup W&B in Kaggle using secrets
from kaggle_secrets import UserSecretsClient
import wandb
import os

# Access Kaggle secrets
secrets = UserSecretsClient()
wandb_api_key = secrets.get_secret("WANDB_API_KEY")  # use the exact name of your secret

if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("✅ Logged in to Weights & Biases successfully.")
else:
    print("⚠️ WANDB_API_KEY not found. W&B logging will be disabled.")
    os.environ["WANDB_DISABLED"] = "true"  # disable if no key


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mharleenkaur240305[0m ([33mharleenkaur240305-igdtuw-ac-in[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ Logged in to Weights & Biases successfully.


In [33]:
from datasets import Dataset
import evaluate
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare HuggingFace dataset from pandas
train_df = pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()})
test_df  = pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()})
hf_train = Dataset.from_pandas(train_df)
hf_test  = Dataset.from_pandas(test_df)

def tokenize_fn(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)

hf_train = hf_train.map(tokenize_fn, batched=True)
hf_test  = hf_test.map(tokenize_fn, batched=True)

hf_train.set_format('torch')
hf_test.set_format('torch')
hf_train = hf_train.remove_columns(['text'])
hf_test  = hf_test.remove_columns(['text'])

args = TrainingArguments(
    output_dir='./distilbert-fake-news',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    report_to="wandb"  # logs to W&B now that API key is set
)

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=preds, references=labels)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=hf_train,
    eval_dataset=hf_test,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/129544 [00:00<?, ? examples/s]

Map:   0%|          | 0/32386 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]





Step,Training Loss
50,0.694
100,0.6719
150,0.6655
200,0.6663
250,0.663
300,0.6483
350,0.6367
400,0.6304
450,0.6074
500,0.6345




{'eval_loss': 0.5675325989723206,
 'eval_accuracy': 0.7109244735379485,
 'eval_runtime': 164.6898,
 'eval_samples_per_second': 196.648,
 'eval_steps_per_second': 12.296,
 'epoch': 2.0}

In [38]:
# Directory to save
output_dir = "/kaggle/working/distilbert-fake-news-final"

# Save model
trainer.model.save_pretrained(output_dir)

# Save tokenizer using your tokenizer variable (NOT trainer.tokenizer)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model and tokenizer saved to {output_dir}")


✅ Model and tokenizer saved to /kaggle/working/distilbert-fake-news-final


In [39]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

save_path = "/kaggle/working/distilbert-fake-news-final"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [40]:
def predict_label(text, model, tokenizer, device, max_length=128):
    """
    Predict whether a given text is FAKE or REAL.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=max_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        pred_id = torch.argmax(logits, dim=-1).item()

    return "REAL" if pred_id == 1 else "FAKE"


In [41]:
sample_text = "This is a test article about a political event."
predicted_label = predict_label(sample_text, model, tokenizer, device)

print(f"Sample Text: {sample_text}")
print(f"Predicted Label: {predicted_label}")


Sample Text: This is a test article about a political event.
Predicted Label: REAL
