In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_csv('labeled_fulldata.csv')

df = df.sample(frac=1, random_state=42)

Pre-processing data

In [38]:
from torch.utils.data import DataLoader
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoModel, Trainer, TrainingArguments
from datasets import Dataset

df = df[0:1000]  
X = df['text'].values
y = df['fake_news_flag'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42)


# Load pre-trained BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
#model = AutoModel.from_pretrained('distilbert-base-uncased')
model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased')
print(list(X_train)[0])

tokens_train = tokenizer.batch_encode_plus(list(X_train),max_length = 128, padding=True, truncation = True)

tokens_test = tokenizer.batch_encode_plus(list(X_test), max_length = 128, padding=True, truncation = True)


train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(list(y_train))

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(list(y_test))



You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0

SEOUL (Reuters) - South Korea s President Moon Jae-in said on Wednesday he and Russian President Vladimir Putin shared an understanding that resolving the North Korea nuclear issue is a top priority for development in East Asia. Moon, making the comments at a joint media conference with Putin after a meeting in Russia, said the Russian president expressed his full support for South Korea s efforts to handle issues related to North Korea. Moon and Putin met on the sidelines of an Eastern Economic Forum in the Russian city of Vladivostok, that began on Wednesday.  (Story refiles to add dropped word  he  in headline.) 


In [44]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(X_train)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = RandomSampler(X_test)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [45]:
from tqdm import tqdm
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 4e-5)

epochs = 4

model.train()

for epoch in range(epochs):
    total_loss = 0
    
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        batch = [r for r in batch]
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)

        #print(outputs[:2])
        loss, logits = outputs[:2]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # preds = torch.argmax(logits, dim=1)
        # print("Preds:", preds.tolist())
        # print("Labels:", labels.tolist())

        optimizer.step()
        

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())
        progress.update(1)
    print(f"Epoch {epoch+1} completed. Avg loss: {total_loss / len(train_dataloader):.4f}")


Epoch 4:  46%|████▌     | 23/50 [01:38<01:55,  4.27s/it, loss=0.701]
Epoch 1: 100%|██████████| 25/25 [03:07<00:00,  7.15s/it, loss=0.715]

Epoch 1 completed. Avg loss: 0.7024


Epoch 1: 100%|██████████| 25/25 [03:08<00:00,  7.52s/it, loss=0.715]


Epoch 2 completed. Avg loss: 0.7075


Epoch 2: 100%|██████████| 25/25 [03:53<00:00,  9.34s/it, loss=0.723]
Epoch 3: 100%|██████████| 25/25 [03:30<00:00,  9.66s/it, loss=0.714]

Epoch 3 completed. Avg loss: 0.6690


Epoch 3: 100%|██████████| 25/25 [03:30<00:00,  8.41s/it, loss=0.714]


Epoch 4 completed. Avg loss: 0.6285


In [46]:
model.eval()

from sklearn.metrics import accuracy_score, classification_report

all_preds = []
all_labels = []

with torch.no_grad():
    progress = tqdm(test_dataloader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(test_dataloader):
        #print("step, batch")
        

        batch = [r for r in batch]
        input_ids, attention_mask, labels = batch
        #print("Getting outputs")

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        #print(outputs.logits)
        logits = outputs.logits

        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        progress.set_postfix(loss=loss.item())
        progress.update(1)

print("Accuracy:", accuracy_score(all_labels, all_preds))
print("Classification Report:")
print(classification_report(all_labels, all_preds))



Epoch 4: 100%|██████████| 25/25 [04:00<00:00,  9.61s/it, loss=0.624]
Epoch 4: 100%|██████████| 7/7 [00:09<00:00,  1.24s/it, loss=0.624]

Accuracy: 0.625
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.76      0.67       101
           1       0.67      0.48      0.56        99

    accuracy                           0.62       200
   macro avg       0.63      0.62      0.62       200
weighted avg       0.63      0.62      0.62       200



In [13]:
from collections import Counter
print(Counter(y_train))

Counter({np.int64(0): 409, np.int64(1): 391})
