# Imports

In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from pymongo import MongoClient

In [29]:
# Ask PyTorch whether a CUDA device is usable; the boolean is shown as the cell output.
torch.cuda.is_available()

False

# Data Preprocessing

In [None]:
# Connect to the local MongoDB server and select the "articles" collection
# of the "fakenews" database.
client = MongoClient("mongodb://localhost:27017")
database = client.get_database("fakenews")
coll = database.get_collection("articles")

In [30]:
# Language of the articles to load: "en" (English) or "de" (German)
language = "en"
# Fetch every document stored for that language and turn the documents into rows
matching_articles = list(coll.find({"language": language}))
df = pd.DataFrame(matching_articles)

In [None]:
# Remap label to numeric for training; any label other than "fake"/"real"
# becomes NaN and is dropped together with the other missing values below.
df["label"] = df["label"].map({"fake": 0, "real": 1})
# Combine title and text into a single model input
df["content"] = df["title"] + " [SEP] " + df["text"]

# Clean formatting errors: repair mis-encoded German umlauts.
# The patterns are plain substrings, so they are replaced literally
# (regex=False) instead of being compiled as regular expressions.
# Replacements are applied in insertion order.
encoding_fixes = {
    "Ã¤": "ä",
    "Ã¼": "ü",
    "Ã¶": "ö",
    "kã": "kä",
    "fã¼r": "für",
}
content = df["content"]
for broken, fixed in encoding_fixes.items():
    content = content.str.replace(broken, fixed, regex=False)
df["content"] = content

# Remove missing values and duplicates *after* the cleaning step, so that
# articles which only become identical once the encoding is repaired are
# de-duplicated as well. The trailing copy() makes the filtered frame
# independent of the frame it was selected from.
df = (
    df.dropna(subset=["content", "label"])
    .drop_duplicates(subset=["content"])
    .copy()
)

In [None]:
# Balance dataset by undersampling: draw as many rows from every label as the
# rarest label has.
df["label"] = df["label"].astype(int)
smallest_class_size = df.groupby("label").size().min()
# A fixed random_state keeps the sampled subset (and therefore every metric
# computed downstream) reproducible across kernel restarts.
df = df.groupby("label").sample(n=smallest_class_size, random_state=42)

# Baseline Model to identify the most important words
The primary objective of this baseline model is to help prevent overfitting.
It identifies specific words or meta topics that the final model might use as a shortcut during learning. Removing them keeps that model from focusing on the specific format of this dataset and pushes it to focus on the nature of misinformation instead.

In [None]:
# Bag-of-words features, vocabulary capped at 10,000 terms
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(df["content"])
y = df["label"]

# Hold out 20% of the rows to score the baseline
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and training are chained
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

baseline_accuracy = clf.score(X_test, y_test)
print(f"Baseline Accuracy: {baseline_accuracy:.4f}")

In [None]:
# Calculate coefficients: one weight per vocabulary term of the baseline model
feature_names = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]

# Rank all terms once by coefficient (ascending), then take the Top 20 of each
# end. Both lists are ordered strongest-first so they read the same way.
ranked = np.argsort(coefs)
top_positive = ranked[-20:][::-1]  # largest coefficients, most positive first
top_negative = ranked[:20]  # smallest coefficients, most negative first

print("Real")
print([feature_names[j] for j in top_positive])

print("\nFake")
print([feature_names[j] for j in top_negative])

# Training the model

In [None]:
# Prepare training and validation datasets: 80/20 split, stratified by label
all_texts = df["content"].tolist()
all_labels = df["label"].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    stratify=all_labels,
    random_state=141,
    test_size=0.2,
)

In [None]:
# Pick the pretrained DistilBERT checkpoint that matches the article language;
# any language without a dedicated checkpoint falls back to the multilingual one.
# The identifiers must match the hub names exactly (no typos, no stray
# whitespace), otherwise from_pretrained() cannot resolve them.
if language == "en":
    bert_identifier = "distilbert-base-cased"
elif language == "de":
    bert_identifier = "distilbert-base-german-cased"
else:
    bert_identifier = "distilbert-base-multilingual-cased"

tokenizer = DistilBertTokenizer.from_pretrained(bert_identifier)

# Two output labels: 0 = fake, 1 = real
model = DistilBertForSequenceClassification.from_pretrained(
    bert_identifier, num_labels=2
)

In [None]:
# Use the maximum length possible for news articles (512 tokens); longer
# inputs are truncated and shorter ones are padded.
encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

In [None]:
# Setup dataset
class FakeNewsDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence indexable per sample.
        labels: Sequence of per-sample labels; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a "labels" entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap the tokenized texts and their labels for the training and validation splits
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FakeNewsDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Define metrics for validation while training
def compute_metrics(eval_pred):
    """Compute validation metrics from a (logits, labels) pair.

    Args:
        eval_pred: Pair that unpacks into the model logits and the reference labels.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall"; the last
        three use the "weighted" average over the classes.
    """
    logits, labels = eval_pred

    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(logits, axis=-1)

    weighted_scores = precision_recall_fscore_support(
        y_true=labels, y_pred=predicted_classes, average="weighted"
    )
    # The fourth element (support) is not reported
    precision, recall, f1 = weighted_scores[:3]

    metrics = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted_classes)}
    metrics["f1"] = f1
    metrics["precision"] = precision
    metrics["recall"] = recall
    return metrics

In [None]:
# Parameters might need tuning for the language and size of the dataset used
training_args = TrainingArguments(
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=2,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate every 1000 steps, log every 50 steps
    eval_strategy="steps",
    eval_steps=1000,
    logging_steps=50,
    # Output locations; no intermediate checkpoints are written
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="no",
)

In [None]:
# Bundle the model, its training arguments, both datasets and the metric
# callback into a single Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run the fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

## English Validation Results (Epoch 2/2)
```
Step	Training Loss	Validation Loss	Accuracy	F1	Precision	Recall
1000	0.068600	0.081274	0.978367	0.978366	0.978516	0.978367
2000	0.056800	0.045051	0.989317	0.989317	0.989340	0.989317
3000	0.025000	0.033110	0.992789	0.992789	0.992793	0.992789
4000	0.031600	0.028730	0.994213	0.994213	0.994229	0.994213
5000	0.025100	0.022759	0.993857	0.993857	0.993858	0.993857
```

## German Validation Results (Epoch 2/2)
```
Step	Training Loss	Val Loss	Accuracy	F1	Precision	Recall
1000	0.124400	0.122266	0.955190	0.955167	0.956129	0.955190
2000	0.105400	0.081241	0.970214	0.970213	0.970249	0.970214
3000	0.063900	0.067688	0.976405	0.976403	0.976559	0.976405
4000	0.069200	0.059439	0.979547	0.979547	0.979578	0.979547
5000	0.072900	0.053993	0.980440	0.980439	0.980529	0.980440
6000	0.034500	0.052662	0.983192	0.983192	0.983199	0.983192
7000	0.035000	0.047028	0.986074	0.986073	0.986098	0.986074
8000	0.022300	0.049871	0.986036	0.986036	0.986043	0.986036
9000	0.043000	0.041755	0.986873	0.986873	0.986903	0.986873
10000	0.021900	0.046021	0.987245	0.987245	0.987250	0.987245
11000	0.032800	0.041483	0.988454	0.988453	0.988471	0.988454
12000	0.030500	0.041880	0.988305	0.988305	0.988312	0.988305
13000	0.032300	0.040375	0.988565	0.988565	0.988567	0.988565
```

In [None]:
# Persist the fine-tuned model and its tokenizer in separate, language-specific folders
model_dir = f"./models/fake_news_bert_model_{language}"
tokenizer_dir = f"./models/fake_news_bert_tokenizer_{language}"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)