In [None]:
import torch
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

Install libraries that we are going to use

In [None]:
# Shell escape: install the Hugging Face packages imported further down (no versions are pinned).
!pip install datasets transformers huggingface_hub evaluate

Install Git LFS so that the model files can be pushed to the Hugging Face Hub later in the notebook

In [None]:
# Shell escape: install the git-lfs system package through apt.
!apt-get install git-lfs

# Upload the Kaggle API token (kaggle.json) and download the Kaggle dataset

In [None]:
from google.colab import files
# Uses the google.colab `files` helper; the following cell expects the uploaded file to be named kaggle.json.
files.upload()  # This will prompt you to upload the kaggle.json file.

In [None]:
# Move the uploaded token into ~/.kaggle and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the kaggle CLI; the next cell unzips fake-news-detection-datasets.zip.
!kaggle datasets download -d emineyetm/fake-news-detection-datasets

In [None]:
import zipfile
import os

# Extract the downloaded archive into a sibling directory.
archive_path = "fake-news-detection-datasets.zip"
extract_dir = "fake-news-detection-datasets"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path=extract_dir)

# List the top-level entries to confirm the extraction worked.
extracted_entries = os.listdir(extract_dir)
print(extracted_entries)

# Define the function to remove stopwords

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

# Keep the English stop words in a set; remove_stopwords tests membership against it.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
def remove_stopwords(example):
    """Strip stop words from ``example['text']`` in place and return the example.

    The text is split on whitespace, every word whose lowercase form is in the
    module-level ``stop_words`` set is dropped, and the remaining words are
    re-joined with single spaces.

    Args:
        example: mapping with a ``'text'`` entry (one dataset row).

    Returns:
        The same ``example`` object with ``'text'`` replaced.
    """
    text = example['text']
    if not isinstance(text, str):
        # A missing value (None/NaN, e.g. from concatenating a missing title or
        # body in pandas) has no .split(); treat it as empty text instead of
        # crashing the whole Dataset.map call.
        text = ''
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    example['text'] = ' '.join(kept_words)
    return example

# Merge 3 different datasets to best measure the generalizability of the model, and remove the stopwords from them

In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split

# First source: the Kaggle CSV files extracted above.
# Rows from Fake.csv get label 0, rows from True.csv get label 1.
fakes = pd.read_csv("/content/fake-news-detection-datasets/News _dataset/Fake.csv").assign(label=0)
trues = pd.read_csv("/content/fake-news-detection-datasets/News _dataset/True.csv").assign(label=1)

# Stack both frames, then fold the title into the text as "<title> : <text>".
combined_df = pd.concat([fakes, trues], ignore_index=True)
combined_df["text"] = combined_df["title"] + " : " + combined_df["text"]

# Drop the columns that are no longer needed and shuffle with a fixed seed.
merged_df = (
    combined_df
    .drop(columns=["subject", "date", "title"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Convert to a Hugging Face Dataset and keep the first 10000 shuffled rows.
pol_dataset = Dataset.from_pandas(merged_df).select(range(10000))
print(pol_dataset.column_names)  # Check column names in the split

# Second source: the "noahgift/fake-news" dataset loaded through `datasets`.
ds = load_dataset("noahgift/fake-news")
print(ds.column_names)  # Check column names in the split

# Map the string labels to the same integer convention as above (Fake=0, Real=1).
label_mapping2 = {"Real": 1, "Fake": 0}


def _encode_label(example):
    """Return the integer label for one row, looked up in label_mapping2."""
    return {"label": label_mapping2[example["label"]]}


# Keep only text/label, take the first 2000 rows, then convert the labels.
ds1 = (
    ds["train"]
    .select_columns(["text", "label"])
    .select(range(2000))
    .map(_encode_label)
)
print(ds1.column_names)  # Check column names in the split

# Third source: the "Cartinoe5930/Politifact_fake_news" dataset.
dataset = load_dataset("Cartinoe5930/Politifact_fake_news")

# Rename the "news" column to "text" so it lines up with the other sources,
# keep only text/label, and take the first 10000 rows.
dataset1 = (
    dataset["train"]
    .rename_column("news", "text")
    .select_columns(["text", "label"])
    .select(range(10000))
)
print(dataset1.column_names)  # Check column names in the split



# Combine the three sources (pol_dataset, dataset1, ds1) into a single frame.
source_frames = [part.to_pandas() for part in (pol_dataset, dataset1, ds1)]
combined_frame = pd.concat(source_frames, ignore_index=True)

# Shuffle with a fixed seed so rows from the different sources are interleaved.
shuffled_frame = combined_frame.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert back to a Dataset. NOTE: despite its name, `merged_df` is a
# `datasets.Dataset` from here on, not a DataFrame.
merged_df = Dataset.from_pandas(shuffled_frame)
print(merged_df.column_names)  # Check column names in the split
print(merged_df[0])  # first row before stop-word removal

# Apply remove_stopwords to every row and show the same row afterwards.
merged_df = merged_df.map(remove_stopwords)
print(merged_df[0])


# Hold out 20% of the merged data as the test set (fixed seed).
dataset_split = merged_df.train_test_split(test_size=0.2, seed=42)
train_df, test_df = dataset_split["train"], dataset_split["test"]

# Sanity check: column names of both splits first, then their shapes.
for split in (train_df, test_df):
    print(split.column_names)  # Check column names in the split
for split in (train_df, test_df):
    print(split.shape)


# Load the tokenizer



In [None]:
from transformers import AutoTokenizer
# Tokenizer for the "distilbert-base-uncased" checkpoint, the same checkpoint name the classifier is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded

# Tokenize both splits in batches with preprocess_function.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train_df, test_df)
)

# Show the shape of each tokenized split.
for split in (tokenized_train, tokenized_test):
    print(split.shape)

# Show the first 5 rows of each tokenized split.
for split in (tokenized_train, tokenized_test):
    print(split[:5])

**Use data_collator to speed up training**

In [None]:
from transformers import DataCollatorWithPadding
# Collator built around the tokenizer; it is handed to the Trainer as `data_collator` further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the DistilBERT model to fine-tune

In [None]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification model on top of "distilbert-base-uncased" with two output labels (0 = fake, 1 = real in this notebook).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

**Evaluation Metrics**

In [None]:
import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. Precision/recall/F1 use ``average="binary"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    # The helper returns (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(gold, predicted, average="binary")
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }



In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; the Trainer below is configured with push_to_hub=True.
notebook_login()

# COMMAND 1

# **Train the distilBERT model with created dataset and evaluate it. This is our baseline.**

In [None]:
from transformers import TrainingArguments, Trainer

# Name used both as the local output directory and for the logging sub-folder.
repo_name = "fake-news-model-22000-samples"

# Carve a validation set (20%, fixed seed) out of the tokenized training data.
# NOTE: this rebinds `dataset_split`, which previously held the train/test split.
dataset_split = tokenized_train.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

# Hyper-parameters for the fine-tuning run, collected in one place.
training_config = dict(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",  # checkpoint at the end of each epoch
    eval_strategy="epoch",  # evaluate at the end of each epoch
    logging_dir=f"{repo_name}/logs",  # directory for logs
    logging_strategy="epoch",  # log metrics at the end of each epoch
    push_to_hub=True,
)
training_args = TrainingArguments(**training_config)

# Wire the model, data, tokenizer, collator and metric function together.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)


In [None]:
# Run fine-tuning with the arguments configured above (10 epochs, evaluation and checkpoint every epoch).
trainer.train()

# Push to hub to use again.

In [None]:
# Upload the trained model to the Hub; the returned CommitInfo is shown as the cell output.
trainer.push_to_hub()

events.out.tfevents.1735040137.4e57bcf02596.557.0:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ilhamiuturkkan/fake-news-model-22000-samples/commit/0882a7c0536d45d54aff9429af40ccb3f974f030', commit_message='End of training', commit_description='', oid='0882a7c0536d45d54aff9429af40ccb3f974f030', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ilhamiuturkkan/fake-news-model-22000-samples', endpoint='https://huggingface.co', repo_type='model', repo_id='ilhamiuturkkan/fake-news-model-22000-samples'), pr_revision=None, pr_num=None)

# Load the trained model


In [None]:
from transformers import pipeline
# Load the fine-tuned checkpoint from the Hub as an inference pipeline.
# Use GPU 0 when CUDA is available and fall back to CPU (-1) otherwise, instead
# of hard-coding device=0, which fails on a runtime without a GPU.
fake_news_model = pipeline(
    model="ilhamiuturkkan/fake-news-model-22000-samples",
    device=0 if torch.cuda.is_available() else -1,
)

# DistilBERT accepts at most 512 tokens per input, so truncate each text to that limit.

In [None]:
def truncate_texts_to_max_length(texts, tokenizer, max_length=512):
    """Shorten each text by a tokenize -> truncate -> decode round trip.

    Args:
        texts: iterable of strings.
        tokenizer: callable accepting ``(text, max_length=..., truncation=True)``
            and returning a mapping with ``"input_ids"``; must also provide
            ``decode(ids, skip_special_tokens=True)``.
        max_length: token limit passed to the tokenizer (default 512).

    Returns:
        list with one decoded string per input text, in the same order.
    """
    shortened = []
    for text in texts:
        # Tokenize with truncation so at most max_length ids are produced.
        encoding = tokenizer(text, max_length=max_length, truncation=True)
        token_ids = encoding["input_ids"]
        # Turn the ids back into plain text, leaving special tokens out.
        shortened.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return shortened

# Test the trained distilBERT model

In [None]:
from datasets import load_dataset

# Truncate texts to the model's maximum token length
test_texts = truncate_texts_to_max_length(tokenized_test["text"], tokenizer)
test_labels = tokenized_test["label"]

# Also ask the pipeline's own tokenizer to truncate: re-tokenizing the decoded
# text is not guaranteed to give the same number of tokens, and an input longer
# than 512 tokens would make the model call fail.
outputs = fake_news_model(test_texts, truncation=True, max_length=512)

# The pipeline reports string labels; map them back to the integer classes.
label_mapping = {"LABEL_0": 0, "LABEL_1": 1}

# Convert predictions to integers by accessing the "label" key
predictions = [label_mapping[output["label"]] for output in outputs]
ground_truths = test_labels

# Compute accuracy on the held-out test split
accuracy = accuracy_score(ground_truths, predictions)
print(f"Accuracy with trained model: {accuracy}")


# COMMAND 2

# Now, work on the LSTM model

In [None]:
# Shared tokenizer settings: PyTorch tensors, every text padded/truncated to 256 tokens.
_lstm_encode_kwargs = dict(return_tensors="pt", padding='max_length', truncation=True, max_length=256)

# Encode the train / validation / test texts and keep their labels alongside.
# NOTE: `test_texts` and `test_labels` rebind the names used in the pipeline evaluation cell.
train_texts = tokenizer(train_dataset["text"], **_lstm_encode_kwargs)
train_labels = train_dataset["label"]
val_texts = tokenizer(eval_dataset["text"], **_lstm_encode_kwargs)
val_labels = eval_dataset["label"]
test_texts = tokenizer(tokenized_test["text"], **_lstm_encode_kwargs)
test_labels = tokenized_test["label"]

**First get the embeddings of the train set texts**

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split
# Bundle the training input ids, attention masks and labels into one tensor dataset.
labels_tensor = torch.tensor(train_labels)
train_tensors = (train_texts.input_ids, train_texts.attention_mask, labels_tensor)
dataset = TensorDataset(*train_tensors)

# Iterate in batches of 32. No shuffling is requested, so batches presumably
# follow dataset order, keeping the embeddings aligned with train_labels.
loader = DataLoader(dataset=dataset, batch_size=32)

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
# Load the pretrained DistilBERT encoder used to produce embeddings (its last_hidden_state is read below).
# This rebinds `model`, which until now referred to the classifier given to the Trainer.
# NOTE(review): `num_labels=2` looks unnecessary here, since only last_hidden_state is consumed — confirm and drop.
model = DistilBertModel.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
import torch

# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Run the encoder over every batch without tracking gradients and collect the
# per-token hidden states as NumPy arrays on the CPU.
embeddings = []
with torch.no_grad():
    for input_ids, attention_mask, _ in loader:
        # Inputs must live on the same device as the model.
        encoded = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        embeddings.append(encoded.last_hidden_state.cpu().numpy())


In [None]:
# Stack the per-batch arrays along the first (sample) axis.
train_embeddings = np.concatenate(embeddings)

# Print one arbitrary sample (index 12) as a quick sanity check.
sample_embedding = train_embeddings[12]
print(sample_embedding)

**Now, get the embeddings of evaluation set**

In [None]:
# Build a batch loader over the validation input ids, attention masks and labels.
labels_tensor = torch.tensor(val_labels)
dataset = TensorDataset(val_texts.input_ids, val_texts.attention_mask, labels_tensor)
loader = DataLoader(dataset, batch_size=32)

# Keep the model on the already-selected `device`. The hard-coded CUDA index 0
# failed on CPU-only runtimes and could disagree with where the inputs are
# moved inside the loop.
model = model.to(device)

embeddings = []
for batch in loader:
    # Move inputs to the same device as the model (labels in batch[2] are not needed here)
    batch_input_ids, batch_attention_mask = batch[0].to(device), batch[1].to(device)
    with torch.no_grad():
        # Get the embeddings from the model
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = outputs.last_hidden_state.cpu().numpy()  # Move embeddings to CPU for storage
        embeddings.append(batch_embeddings)

In [None]:
# Stack the per-batch validation arrays along the first (sample) axis.
val_embeddings = np.concatenate(embeddings, axis=0)

# Now, define the bidirectional LSTM model and train it with the obtained embeddings.

In [None]:
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from transformers import Trainer, AutoTokenizer, AdamW, TrainingArguments

# Regularisation settings shared by the layers below.
dropout_rate = 0.2
l2_strength = 0.005

# Each sample is a (tokens, hidden size) matrix taken from the embedding array.
input_shape = (train_embeddings.shape[1], train_embeddings.shape[2])
print(input_shape)

keras_layers = tf.keras.layers
lstm_cell = keras_layers.LSTM(
    128,
    return_sequences=True,  # keep every time step for the pooling layer
    dropout=dropout_rate,
    recurrent_dropout=dropout_rate,
    kernel_regularizer=l2(l2_strength),
)
layer_stack = [
    tf.keras.Input(shape=input_shape),
    # (an optional Dense(256, relu) projection was tried here and is disabled)
    keras_layers.Bidirectional(lstm_cell),
    keras_layers.GlobalMaxPooling1D(),
    keras_layers.Dense(64, activation='relu', kernel_regularizer=l2(l2_strength)),
    keras_layers.Dropout(dropout_rate),
    # Two softmax outputs, matching the one-hot labels used for training.
    keras_layers.Dense(2, activation='softmax', kernel_regularizer=l2(l2_strength)),
]
bidirectional_model = tf.keras.Sequential(layer_stack)


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

# Compile the BiLSTM classifier.
# Precision and recall are restricted to class 1 via `class_id=1`, so they
# describe the positive class like the baseline's average="binary" metrics.
# Without `class_id` both one-hot columns are pooled together, which does not
# give positive-class precision/recall.
bidirectional_model.compile(
    optimizer=Adam(learning_rate=0.0001),  # Lower learning rate for finer updates
    loss=CategoricalCrossentropy(label_smoothing=0.1),  # Label smoothing for better generalization
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision', class_id=1),
        tf.keras.metrics.Recall(name='recall', class_id=1),
        # NOTE(review): no `average` is given, so this presumably reports one F1 value per class — confirm.
        tf.keras.metrics.F1Score(name='f1_score')
    ]
)


In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels (2 classes) for the categorical cross-entropy loss.
train_y_one_hot, val_y_one_hot = (
    to_categorical(split_labels, num_classes=2)
    for split_labels in (train_labels, val_labels)
)

In [None]:
# Check the shapes of the inputs and one-hot targets before training.
arrays_to_check = (
    train_embeddings,
    val_embeddings,
    np.array(train_y_one_hot),
    np.array(val_y_one_hot),
)
for array in arrays_to_check:
    print(array.shape)

(14080, 256, 768)
(3520, 256, 768)
(14080, 2)
(3520, 2)


In [None]:
import matplotlib.pyplot as plt

# Training schedule for the BiLSTM classifier.
batch_size = 64
num_epochs = 10

# Fit on the training embeddings and evaluate on the validation embeddings after every epoch.
validation_pair = (val_embeddings, np.array(val_y_one_hot))
train_history = bidirectional_model.fit(
    x=train_embeddings,
    y=np.array(train_y_one_hot),
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=validation_pair,
)

# Pull the per-epoch accuracy curves out of the Keras history.
train_accuracy = train_history.history['accuracy']
val_accuracy = train_history.history['val_accuracy']
epochs = range(1, len(train_accuracy) + 1)

# Draw both curves on one axis using the object-oriented matplotlib interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(epochs, train_accuracy, marker='o', linestyle='-', label="Train Accuracy")
ax.plot(epochs, val_accuracy, marker='o', linestyle='-', label="Validation Accuracy", color="orange")

# Title, axis labels, one tick per epoch, and a fixed 0-1 accuracy range.
ax.set_title("Training and Validation Accuracy per Epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_xticks(epochs)
ax.set_ylim(0, 1)
ax.grid(True)
ax.legend()

# Display the plot
fig.tight_layout()
plt.show()


In [None]:
# Build a batch loader over the test input ids, attention masks and labels.
labels_tensor = torch.tensor(test_labels)
dataset = TensorDataset(test_texts.input_ids, test_texts.attention_mask, labels_tensor)
loader = DataLoader(dataset, batch_size=32)

# Keep the encoder on the already-selected `device`. The hard-coded CUDA index 0
# failed on CPU-only runtimes and could disagree with where the inputs are
# moved inside the loop.
model = model.to(device)

embeddings = []
for batch in loader:
    # Move inputs to the same device as the model (labels in batch[2] are not needed here)
    batch_input_ids, batch_attention_mask = batch[0].to(device), batch[1].to(device)
    with torch.no_grad():
        # Get the embeddings from the model
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = outputs.last_hidden_state.cpu().numpy()  # Move embeddings to CPU for storage
        embeddings.append(batch_embeddings)
# Stack the per-batch arrays along the first (sample) axis.
test_embeddings = np.concatenate(embeddings, axis=0)

# One-hot encode the test labels to match the 2-unit softmax output.
test_y_one_hot = to_categorical(test_labels, num_classes=2)

# Evaluate the trained BiLSTM on the held-out test embeddings.
test_history = bidirectional_model.evaluate(test_embeddings, np.array(test_y_one_hot))

In [None]:
# Print the Keras summary of the BiLSTM classifier.
bidirectional_model.summary()