In [3]:
#Import Libraries
import pandas as pd
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Load Data
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

In [5]:
fake["label"] = 0
true["label"] = 1

In [6]:
df = pd.concat([fake, true]).sample(frac=1, random_state=42)
df["text"] = df["title"].fillna('') + " " + df["text"].fillna('')

In [7]:
import re
def light_clean(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    text = text.encode("ascii", "ignore").decode("utf-8")
    return text.strip()

df["text"] = df["text"].apply(light_clean)

In [8]:
df = df[["text", "label"]]
df.reset_index(drop=True, inplace=True)

In [9]:
#Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

In [10]:
#Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 44898/44898 [01:01<00:00, 730.63 examples/s]


In [12]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [None]:
import transformers
print(transformers.__version__)

In [13]:
#Load Model & Trainer
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="bert_fakenews_final",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(axis=1) == p.label_ids).mean()
    }
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
trainer.train()

In [None]:
model.save_pretrained("bert_fakenews_final")
tokenizer.save_pretrained("bert_fakenews_final")

In [None]:
#Evaluation
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)
print(classification_report(y_true, y_pred))