In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split


# Load Data
# The dataset folder defaults to the original location but can be redirected
# with the FAKE_NEWS_DATA_DIR environment variable, so the notebook is not
# tied to one machine's directory layout.
DATA_DIR = os.environ.get('FAKE_NEWS_DATA_DIR', '/home/jparep/proj/nlp-deberta/data')
real_df = pd.read_csv(os.path.join(DATA_DIR, 'true.csv'))
fake_df = pd.read_csv(os.path.join(DATA_DIR, 'fake.csv'))

In [2]:
# Add a 'label' column: 1 for rows of real_df, 0 for rows of fake_df
real_df['label'] = 1
fake_df['label'] = 0
# Stack both frames and shuffle the rows. The fixed random_state makes the
# shuffled order (and everything derived from it, such as the train/test
# split) identical on every run.
df = (
    pd.concat([real_df, fake_df], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,title,text,subject,date,label
0,MUSLIM TEENS Stage Fake Terrorist Attack In MN...,"Oh and as an added bonus, according to one of ...",left-news,"Jan 8, 2016",0
1,MUSLIM SCHOLAR CRITICIZES OBAMA…Explains Why A...,".@MissDiagnosis: ""He's our president and I thi...",politics,"Nov 20, 2016",0
2,"BOILER ROOM – EP #44 – Dig, Dug, Dirt!",Tune in to the Alternate Current Radio Network...,Middle-east,"February 19, 2016",0
3,South Korea imposes sanctions on 18 North Kore...,SEOUL (Reuters) - South Korea imposed unilater...,worldnews,"November 6, 2017",1
4,Another Pervert Out At Fox News Over Allegati...,Fox News has a problem with sexual predators i...,News,"September 8, 2017",0


In [3]:
# Keep only the model input ('text') and the target variable ('label')
df = df.loc[:, ['text', 'label']]

In [4]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Fetch the tokenizer for the 'microsoft/deberta-base' checkpoint
from transformers import DebertaTokenizer
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base',
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch
# Step 7: Tokenize the Data
def tokenize_data(df, tokenizer, max_length=512):
    """Tokenize the ``text`` column of ``df`` in a single batched call.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``.
        max_length: Maximum sequence length forwarded to the tokenizer,
            together with ``truncation=True``.

    Returns:
        Whatever ``tokenizer`` returns for the batch; it is called with
        ``return_tensors='pt'``.
    """
    # Missing values become empty strings and non-string values are converted
    # with str(), so a stray NaN or number in the column cannot make the
    # tokenizer reject the whole batch.
    texts = df['text'].fillna('').astype(str).tolist()
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# Encode the training split first, then the test split, with the same tokenizer
train_encodings = tokenize_data(df=train_df, tokenizer=tokenizer)
test_encodings = tokenize_data(df=test_df, tokenizer=tokenizer)

In [7]:
# Step 8: Create a Dataset Class
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values can be indexed by example
    position and yield tensors; ``labels`` is a sequence with one entry per
    example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy each encoded field for this example so the returned tensors
        # do not share storage or autograd history with the stored batch.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and its labels (as a plain list) in a dataset
train_dataset = FakeNewsDataset(
    encodings=train_encodings,
    labels=train_df['label'].tolist(),
)
test_dataset = FakeNewsDataset(
    encodings=test_encodings,
    labels=test_df['label'].tolist(),
)

In [8]:
# Step 9: Load the DeBERTa Model
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Sequence-classification model built from the 'microsoft/deberta-base' checkpoint
model = DebertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base',
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Length of training and optimiser schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and save a checkpoint at the end of every epoch, then reload
    # the checkpoint that scored best on "accuracy" once training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)




In [10]:
# Step 11: Create Trainer Instance
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (true class ids), as handed to
            ``compute_metrics`` by the ``Trainer``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores = eval_pred.predictions
    # If the predictions arrive as a tuple of outputs, take the first entry,
    # which is assumed to hold the class scores — TODO confirm for other models.
    if isinstance(scores, tuple):
        scores = scores[0]
    # The predictions arrive as NumPy arrays, which torch.argmax does not
    # accept, so use the array's own argmax to pick the top class per example.
    predicted = scores.argmax(axis=1)
    labels = eval_pred.label_ids
    # One call yields precision, recall and F1 together (support is unused).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='binary'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


In [11]:
# Step 12: Train the Model
# Runs the training loop of the `trainer` object built in the previous cell.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt,
# so the run captured in this notebook did not finish.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the Model
# Calls trainer.evaluate() with no arguments and prints whatever it returns.
results = trainer.evaluate()
print(f"Evaluate Results {results}")

In [None]:
# Save the model
# Model and tokenizer are written to the SAME directory so both can be
# reloaded from a single path. The tokenizer used to go to a differently
# spelled './fake-news_deberta_model' folder.
MODEL_DIR = './fake_news_deberta_model'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)