In [1]:
import pandas as pd
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('Fake.br-Corpus.csv',sep=',')
df_train['label'] = df_train['label'].apply(lambda x: 0 if x == "fake" else 1)

In [3]:
df_train.drop(columns=['index'],inplace = True)
df_train.rename(columns={'preprocessed_news': 'text'}, inplace = True)
df_train.rename(columns={'label': 'labels'}, inplace = True)
df_train = df_train[['text', 'labels']]

In [4]:
import numpy as np
df_train = df_train.iloc[np.random.permutation(len(df_train))].reset_index(drop=True)
df_train

Unnamed: 0,text,labels
0,procuradores eua poderao pedir prisao lula dil...,0
1,ha risco guerra potencias internacionais siria...,1
2,governo conta alta arrecadacao reduzir restric...,1
3,planalto preocupado vai aguardar efeitos delac...,1
4,planalto busca nome pmdb assumir secretaria ju...,1
...,...,...
7195,invertendo papeis defesa lula vai stf provar c...,0
7196,casa endereco sigiloso porto alegre acolhe vit...,1
7197,apos cuspir bolsonaro jean wyllys recebe punic...,0
7198,poder militar russia ameaca conflito contra uc...,1


In [5]:
device = torch.device("cuda:0" if (torch.cuda.is_available())else "cpu")

In [6]:
kf = KFold(n_splits=10)

In [7]:
results = []

model_args = ClassificationArgs()
model_args.overwrite_output_dir=True
model_args.num_train_epochs = 1
model_args.num_labels = 2
model_args.silent = True

logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
model = ClassificationModel('bert', 'neuralmind/bert-base-portuguese-cased', args=model_args, use_cuda=False)

for train_index, val_index in kf.split(df_train):  
    train_df = df_train.iloc[train_index]
    val_df = df_train.iloc[val_index]
    model.train_model(train_df)
    
    result, model_output, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)     

    predictions, raw_outputs  = model.predict(val_df['text'].values.tolist())
    print(classification_report(predictions,val_df["labels"], digits=4))
    print(f"Accuracy: {result['acc']}")

    results.append(result['acc'])

              precision    recall  f1-score   support

           0     0.9785    0.9406    0.9592       387
           1     0.9339    0.9760    0.9545       333

    accuracy                         0.9569       720
   macro avg     0.9562    0.9583    0.9568       720
weighted avg     0.9579    0.9569    0.9570       720

Accuracy: 0.9569444444444445
              precision    recall  f1-score   support

           0     0.9861    0.9516    0.9685       372
           1     0.9501    0.9856    0.9676       348

    accuracy                         0.9681       720
   macro avg     0.9681    0.9686    0.9680       720
weighted avg     0.9687    0.9681    0.9681       720

Accuracy: 0.9680555555555556
              precision    recall  f1-score   support

           0     0.9867    0.9789    0.9828       379
           1     0.9767    0.9853    0.9810       341

    accuracy                         0.9819       720
   macro avg     0.9817    0.9821    0.9819       720
weighted avg    