In [None]:
import pandas as pd

# Parquet files of the Hugging Face dataset, keyed by split name
splits = {
    'train': 'data/train-00000-of-00001-31aecafc15fe32e0.parquet',
    'test': 'data/test-00000-of-00001-1d49200ad03190fd.parquet'
}

# Read the train and test splits into separate DataFrames via hf:// paths
train_df = pd.read_parquet("hf://datasets/mohammadjavadpirhadi/fake-news-detection-dataset-english/" + splits["train"])
test_df = pd.read_parquet("hf://datasets/mohammadjavadpirhadi/fake-news-detection-dataset-english/" + splits["test"])
print("traindf:")
# Last expression of the cell: renders the first rows of the training frame
train_df.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


traindf:


Unnamed: 0,title,text,subject,date,label
0,"Saudi Arabia says foils Islamic State bomb, fo...",DUBAI (Reuters) - Saudi Arabia said on Tuesday...,worldnews,"September 11, 2017",0
1,GOP Just Demanded Hillary Clinton Give Them A...,"On Wednesday, the GOP in an effort to deflec...",News,"July 5, 2017",1
2,AUDIT REVEALS: 150 AFGHAN TROOPS Have Gone AWO...,The new report on the number of missing Afghan...,politics,"Oct 20, 2017",1
3,Watch Elizabeth Warren DESTROY Donald Trump i...,Senator Elizabeth Warren (D-MA) lit into Repub...,News,"March 21, 2016",1
4,Mohammed Dahlan speaks about Palestinian unity...,"GAZA (Reuters) - Mohammad Dahlan, who played a...",worldnews,"October 4, 2017",0


In [None]:
# Preview the first rows of the test split for comparison with the training split
print("testdf:")
test_df.head()

testdf:


Unnamed: 0,title,text,subject,date,label
0,British foreign secretary 'concerned' about pl...,LONDON (Reuters) - British Foreign Secretary B...,worldnews,"December 6, 2017",0
1,Trump Gets ROASTED By The White House For Emb...,"Up until recently, we ve all been witnessing D...",News,"April 27, 2016",1
2,House Committee Uncovers DAMNING BOMBSHELL – ...,It s easy to think the whole brouhaha with Mic...,News,"March 18, 2017",1
3,HUH? GERMANS BOMBED PEARL HARBOR? Congressman ...,The video from 2012 is a little blurry but the...,politics,"Dec 8, 2016",1
4,Kellyanne Conway Shamelessly Begs For Money A...,If you voted for Trump hoping for financial pr...,News,"January 2, 2017",1


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# tokenization function
def tokenize_data(df):
    """Tokenize the ``title`` column of ``df`` in one batch.

    Args:
        df: Frame-like object whose ``title`` column holds the headline strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of titles,
        requested as PyTorch tensors (``return_tensors="pt"``).
    """
    titles = df['title'].tolist()
    encode_options = {
        "padding": True,         # pad every title to the longest one in the batch
        "truncation": True,      # cut titles that exceed the model's maximum length
        "return_tensors": "pt",
    }
    return tokenizer(titles, **encode_options)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize the headlines of each split
train_encodings = tokenize_data(train_df)
test_encodings = tokenize_data(test_df)

# Extract the 'label' column of each split as an array
# 0 - real, 1 - fake
train_labels = train_df['label'].values
test_labels = test_df['label'].values
# Last expression of the cell: display the training labels
train_labels

array([0, 1, 1, ..., 1, 1, 0])

In [None]:
# Display the test labels as well
test_labels

array([0, 1, 1, ..., 0, 0, 1])

In [None]:
import torch

# Define a custom Dataset class that inherits from PyTorch's Dataset base class
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer output with per-sample labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``,
            ``attention_mask``) to a container indexable by sample position.
        labels: Sequence of ground-truth labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus ``labels``."""
        sample = {field: values[idx] for field, values in self.encodings.items()}
        # The label is wrapped in a tensor and stored under the 'labels' key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training headlines and their labels in a NewsDataset
train_dataset = NewsDataset(train_encodings, train_labels)

# Wrap the test split the same way
test_dataset = NewsDataset(test_encodings, test_labels)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load DistilBERT with a 2-class sequence-classification head (0 = real, 1 = fake).
# The head weights are not in the checkpoint, so the "newly initialized" warning
# printed below is expected until the model has been fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )

    return dict(
        accuracy=acc,          # Overall share of correct predictions
        precision=precision,   # Share of predicted positives that are correct
        recall=recall,         # Share of actual positives that were found
        f1=f1,                 # Harmonic mean of precision and recall
    )


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Define training configurations
training_args = TrainingArguments(
    output_dir="./results",                      # Directory to save model checkpoints
    num_train_epochs=5,                          # Upper bound on epochs; training may stop earlier
    per_device_train_batch_size=16,              # Training batch size per GPU/CPU
    per_device_eval_batch_size=16,               # Evaluation batch size
    eval_strategy="epoch",                       # Evaluate at the end of every epoch
    save_strategy="epoch",                       # Save a checkpoint at the end of each epoch
    learning_rate=1e-5,                          # Peak learning rate, reached once warm-up finishes
    weight_decay=0.01,                           # Weight decay applied by the optimizer
    logging_dir="./logs",                        # Directory to save logs
    logging_steps=50,                            # Log training loss every 50 steps
    load_best_model_at_end=True,                 # Load best checkpoint at end based on `metric_for_best_model`
    metric_for_best_model="f1",                  # Use F1-score to determine best model
    greater_is_better=True,                      # Higher F1 is better
    report_to="none",                            # Disable external experiment trackers (W&B, TensorBoard, ...)

    # Mixed precision requires a CUDA device; hard-coding True makes this cell
    # fail on CPU-only machines, so enable it only when a GPU is present.
    # (torch is imported in an earlier cell.)
    fp16=torch.cuda.is_available(),

    # Limit disk usage: at most 2 checkpoints are kept; with
    # load_best_model_at_end the best checkpoint is always among them
    save_total_limit=2,

    # Learning-rate warm-up for stable initial training
    warmup_ratio=0.1,                            # Warm up over the first 10% of total steps

    # Gradient clipping to prevent exploding gradients
    max_grad_norm=1.0,                           # Clip the gradient norm at 1.0

    logging_first_step=True                      # Also log the very first training step
)

# Create the Trainer with model, data, training arguments, and metrics
trainer = Trainer(
    model=model,                                 # Model to fine-tune
    args=training_args,                          # Training configuration defined above
    train_dataset=train_dataset,                 # Preprocessed training dataset
    # NOTE(review): the *test* split is used as the evaluation set, so early
    # stopping and best-checkpoint selection are tuned on the same data the
    # final metrics are reported on; those metrics are therefore optimistic.
    # Consider holding out a validation split from the training data instead.
    eval_dataset=test_dataset,                   # Evaluated at the end of every epoch
    compute_metrics=compute_metrics,              # Custom metric function (must return 'f1')
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] # Stop after 2 evaluations without improvement

)


In [None]:
# Run fine-tuning; per-epoch evaluation metrics are shown in the table below
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0964,0.076495,0.979287,0.977055,0.983323,0.980179
2,0.0375,0.055432,0.98608,0.987366,0.985888,0.986627
3,0.0347,0.067369,0.985523,0.985065,0.987171,0.986117
4,0.01,0.084366,0.987862,0.988452,0.98824,0.988346
5,0.0107,0.087447,0.987973,0.989291,0.987599,0.988444


TrainOutput(global_step=11225, training_loss=0.05141839775155176, metrics={'train_runtime': 668.9164, 'train_samples_per_second': 268.479, 'train_steps_per_second': 16.781, 'total_flos': 2927263023147960.0, 'train_loss': 0.05141839775155176, 'epoch': 5.0})

In [None]:
# Persist the fine-tuned model and its tokenizer in the same directory so
# both can later be reloaded from "./results" with from_pretrained
trainer.save_model("./results")  # Saves model
tokenizer.save_pretrained("./results")  # Also save tokenizer


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [None]:
# With no arguments, evaluate() scores the eval_dataset given to the Trainer
# (the test split here) and returns the compute_metrics values prefixed "eval_"
results = trainer.evaluate()
print(results)

{'eval_loss': 0.08744722604751587, 'eval_accuracy': 0.9879732739420936, 'eval_precision': 0.9892910687513387, 'eval_recall': 0.9875988881761814, 'eval_f1': 0.988444254226407, 'eval_runtime': 4.79, 'eval_samples_per_second': 1874.72, 'eval_steps_per_second': 117.327, 'epoch': 5.0}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned model and tokenizer from the directory they were saved to
model = AutoModelForSequenceClassification.from_pretrained("./results")
tokenizer = AutoTokenizer.from_pretrained("./results")
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA (torch is imported in an earlier cell)
model.to("cuda" if torch.cuda.is_available() else "cpu")


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
def predict_title(title):
    """Classify a single news headline as FAKE or REAL.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        title: Headline text to classify.

    Returns:
        dict with ``"label"`` (``"FAKE"`` when class 1 has the highest
        probability, otherwise ``"REAL"``) and ``"confidence"`` (softmax
        probability of the predicted class, rounded to 3 decimals).
    """
    # Tokenize input
    inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True)

    # Remove token_type_ids if model doesn't support them
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    # Move inputs to same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the first (only) row explicitly instead of squeeze(), so the class
    # index below always refers to the class axis
    probs = logits.softmax(dim=-1)[0]
    pred = probs.argmax().item()
    label = "FAKE" if pred == 1 else "REAL"
    return {"label": label, "confidence": round(probs[pred].item(), 3)}


In [None]:
import torch
# Report whether PyTorch can see a CUDA device
print("CUDA available:", torch.cuda.is_available())


CUDA available: True


In [None]:
# Smoke test: run the classifier on a few hand-picked headlines.
# For interactive use, read a headline from the user instead:
# title = input("Please enter a news title: ")
news_title = [
    # ✅ Real News (expected label: REAL)
    "NASA's Perseverance Rover Discovers Organic Molecules on Mars",
    "UN Declares Global Climate Emergency Amid Record-Breaking Heatwaves",
    "WHO Approves New Malaria Vaccine for Widespread Use in Africa",

    # ❌ Fake News (expected label: FAKE)
    "Drinking Bleach Can Cure COVID-19, Claims Viral Facebook Post",
    "Elon Musk Buys the Moon to Build Private Resort",
    "COVID-19 Pandemic Is a Hoax Created to Control the Population, Experts Say"
]


# Print each headline next to its predicted label and confidence
for nt in news_title:
  result = predict_title(nt)
  print(nt, result)

# ...and classify the headline read via input() above:
# predict_title(title)

NASA's Perseverance Rover Discovers Organic Molecules on Mars {'label': 'REAL', 'confidence': 1.0}
UN Declares Global Climate Emergency Amid Record-Breaking Heatwaves {'label': 'REAL', 'confidence': 0.825}
WHO Approves New Malaria Vaccine for Widespread Use in Africa {'label': 'REAL', 'confidence': 1.0}
Drinking Bleach Can Cure COVID-19, Claims Viral Facebook Post {'label': 'FAKE', 'confidence': 0.756}
Elon Musk Buys the Moon to Build Private Resort {'label': 'FAKE', 'confidence': 0.997}
COVID-19 Pandemic Is a Hoax Created to Control the Population, Experts Say {'label': 'REAL', 'confidence': 0.686}
