In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_csv('labeled_fulldata.csv')

df = df.sample(frac=1, random_state=42)

Pre-processing data

In [15]:
from torch.utils.data import DataLoader
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoModel, Trainer, TrainingArguments
from datasets import Dataset

df = df[0:10000]  
X = df['text'].values
y = df['fake_news_flag'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Load pre-trained BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
#model = AutoModel.from_pretrained('distilbert-base-uncased')
model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased')
print(list(X_train)[0])

tokens_train = tokenizer.batch_encode_plus(list(X_train),max_length = 128, padding=True, truncation = True)

tokens_test = tokenizer.batch_encode_plus(list(X_test), max_length = 128, padding=True, truncation = True)


train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(list(y_train))

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(list(y_test))



You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0

Фото: kremlin.ru 
Россия старается уважать право собственности, это касается и собственности Порошенко, подчеркнул российский лидер. 
"Мы стремимся к тому, чтобы уважать право собственности", - сказал президент России, выразив согласие с известной позицией экс-министра финансов РФ Алексея Кудрина о том, что "это один из столпов экономической политики". "У нас далеко не всегда так получается и нам практику нужно еще поправлять и в законодательстве многое еще нужно сделать, но мы всегда будем к этому стремиться", - сказал глава государства. 
"То же самое касается и наших иностранных инвесторов, в том числе и украинских инвесторов. Петр Алексеевич Порошенко является одним из таких инвесторов, имея в виду, что он является собственником достаточно крупного предприятия в Липецкой области - фабрики "Рошен", - отметил Путин. 
Он обратил при этом внимание, что речь идет даже о двух предприятиях, одно из которых занимается реализацией продукции. "Там есть и некоторые проблемы, связанные с невозв

In [16]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(X_train)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = RandomSampler(X_test)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
from tqdm import tqdm
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 3e-5)

epochs = 2

model.train()

for epoch in range(epochs):
    total_loss = 0
    
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(train_dataloader):

        batch = [r for r in batch]
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)

        #print(outputs[:2])
        loss, logits = outputs[:2]
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} completed. Avg loss: {total_loss / len(train_dataloader):.4f}")




In [12]:
model.eval()

from sklearn.metrics import accuracy_score, classification_report

all_preds = []
all_labels = []

with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        print("step, batch")
        progress = tqdm(test_dataloader, desc=f"Epoch {epoch+1}")

        batch = [r for r in batch]
        input_ids, attention_mask, labels = batch
        print("Getting outputs")

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        #print(outputs.logits)
        logits = outputs.logits

        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print("Classification Report:")
print(classification_report(all_labels, all_preds))



step, batch


Epoch 2:   0%|          | 0/50 [08:21<?, ?it/s, loss=0.727]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





step, batch


Epoch 2:   0%|          | 0/13 [00:00<?, ?it/s]

Getting outputs





Accuracy: 0.485
Classification Report:
              precision    recall  f1-score   support

           0       0.48      1.00      0.65        97
           1       0.00      0.00      0.00       103

    accuracy                           0.48       200
   macro avg       0.24      0.50      0.33       200
weighted avg       0.24      0.48      0.32       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from collections import Counter
print(Counter(y_train))

Counter({np.int64(0): 409, np.int64(1): 391})
