In [None]:
!pip install simpletransformers tensorboardX emoji

In [None]:
!pip install --upgrade scikit-learn scipy matplotlib imblearn

In [None]:
#!pip3 install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html

Instalação do NVIDIA Apex

In [None]:
!git clone https://github.com/NVIDIA/apex

In [None]:
import os

# Enter the cloned repository; the later install cell targets `./`.
# Nothing visible in this notebook changes the directory back, so relative
# paths used afterwards resolve inside `apex/`.
# NOTE(review): the path is relative, so running this cell a second time in
# the same kernel looks for `apex` inside the current `apex` directory.
os.chdir('apex')

In [None]:
!nvcc --version

In [None]:
!pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

Bibliotecas básicas e função para determinismo (repetibilidade)

In [None]:
import numpy as np
#from numpy.random import seed
#seed(42)
rng = np.random.RandomState(42)

import torch
import random

def set_seed(seed):
    """Seed every random number generator used here and pin the cuDNN flags.

    Seeds Python's `random`, NumPy's global generator and PyTorch (default
    generator plus all CUDA devices) with `seed`, then sets
    `cudnn.deterministic` to True and `cudnn.benchmark` to False.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(42)

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging

In [None]:
# Root logging at INFO, while the "transformers" logger is limited to
# WARNING and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# NOTE(review): hard-coded absolute path (presumably the Colab upload
# directory); adjust it when running in another environment.
train_data = pd.read_csv('/content/dataset_FinalVersion.csv')

In [None]:
train_data

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
train_data = train_data.drop('Unnamed: 0', axis=1, errors='ignore')

In [None]:
# Positional rename to the names the later cells read ('text', 'labels').
# assumes exactly two columns remain, in text/label order — TODO confirm
train_data.columns = ["text", "labels"]

In [None]:
#from sklearn.preprocessing import LabelEncoder

#encoder = LabelEncoder()
#train_data['labels'] = encoder.fit_transform(train_data['labels'])

In [None]:
#encoder.classes_

In [None]:
# Text and label columns; both are passed to `kf.split` to build the
# stratified folds.
X = train_data['text']
Y = train_data['labels']

In [None]:
#!pip install mlrose

In [None]:
import six
import sys
# Register `six` under the legacy `sklearn.externals.six` path before mlrose
# is imported (presumably mlrose still imports it from there — TODO confirm).
sys.modules['sklearn.externals.six'] = six
# NOTE(review): the `pip install mlrose` cell is commented out and nothing
# visible in this notebook uses mlrose; confirm this import is still needed.
import mlrose

import sklearn.neighbors._base
# Alias the private module under its old public path (presumably for a
# dependency that still imports `sklearn.neighbors.base` — TODO confirm).
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

# Same aliasing for the old `sklearn.metrics.classification` path.
# NOTE(review): `sklearn.metrics` is not imported explicitly before this line;
# it is only reachable if an earlier import already loaded it.
sys.modules['sklearn.metrics.classification'] = sklearn.metrics._classification

from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, matthews_corrcoef, f1_score, precision_score, recall_score, balanced_accuracy_score
from imblearn.metrics import specificity_score

In [None]:
# Number of cross-validation folds.
n = 10
# An integer seed is used instead of the shared `rng` RandomState instance:
# with an instance, every call to `kf.split` consumes generator state, so
# re-running the cross-validation cell would silently produce different folds.
# With an int, the fold assignment is the same on every call.
kf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True)

In [None]:
# One list per metric; the cross-validation loop appends one score per fold
# and the final cells average them.
model_mcc = []
model_f1 = []
model_precision = []
model_recall = []
model_bacc = []
model_spec = []

In [None]:
#class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(Y), y=Y)

Hiperparâmetros do modelo

In [None]:
# Training configuration shared by every ClassificationModel created in the
# cross-validation loop (passed there as `args=model_args`).
model_args = ClassificationArgs()

model_args.train_batch_size = 32
model_args.eval_batch_size = 32
model_args.learning_rate = 1e-5
model_args.adam_epsilon = 1e-8
model_args.num_train_epochs = 3
# presumably required because each fold writes to the same output directory
# — TODO confirm against the Simple Transformers defaults
model_args.overwrite_output_dir = True
model_args.dataloader_num_workers = 0
#model_args.gradient_accumulation_steps = 2
model_args.fp16 = True
#model_args.warmup_ratio = 0.1 # https://aclanthology.org/2021.acl-long.178.pdf

# Passed as `use_cuda` when each model is built.
cuda_available = torch.cuda.is_available()

Treinamento e cross-validation

In [None]:
# Extra evaluation metrics, passed to `model.eval_model` as keyword arguments
# and read back from `result` under the same keys. They are defined once here
# instead of being re-created on every fold.

def mcc(y_true, y_pred):
    """Matthews correlation coefficient of the predictions."""
    return matthews_corrcoef(y_true, y_pred)


def f1(y_true, y_pred):
    """F1 score computed with average='weighted'."""
    return f1_score(y_true, y_pred, average='weighted')


def precision(y_true, y_pred):
    """Precision computed with average='weighted'."""
    return precision_score(y_true, y_pred, average='weighted')


def recall(y_true, y_pred):
    """Recall computed with average='weighted'."""
    return recall_score(y_true, y_pred, average='weighted')


def bacc(y_true, y_pred):
    """Balanced accuracy of the predictions."""
    return balanced_accuracy_score(y_true, y_pred)


def spec(y_true, y_pred):
    """Specificity computed with average='weighted'."""
    return specificity_score(y_true, y_pred, average='weighted')


# Start from empty score lists so that re-running this cell (for example after
# an interrupted run) does not append a second set of folds to stale results,
# which would distort the means reported afterwards.
model_mcc, model_f1, model_precision = [], [], []
model_recall, model_bacc, model_spec = [], [], []

# Metric key in `result` -> list collecting that metric across folds.
# Insertion order fixes the order in which the scores are printed.
fold_scores = {
    'mcc': model_mcc,
    'f1': model_f1,
    'precision': model_precision,
    'recall': model_recall,
    'bacc': model_bacc,
    'spec': model_spec,
}

for train_index, val_index in kf.split(X, Y):
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    # Balanced class weights are derived from the training fold only, so the
    # validation labels do not influence the loss weighting.
    train_labels = train_df['labels']
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels,
    )
    print(class_weights)

    # A fresh model per fold, so no weights carry over between folds.
    model = ClassificationModel(
        "distilbert",
        "Geotrend/distilbert-base-pt-cased",
        num_labels=2,
        args=model_args,
        use_cuda=cuda_available,
        weight=list(class_weights),
    )
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(
        val_df, mcc=mcc, f1=f1, precision=precision, recall=recall,
        bacc=bacc, spec=spec,
    )
    print(model_outputs)

    # Report this fold's scores and keep them for the cross-fold means.
    for metric_name, scores in fold_scores.items():
        print(result[metric_name])
        scores.append(result[metric_name])

In [None]:
# Average MCC over the cross-validation folds.
mean_mcc = sum(model_mcc) / len(model_mcc)
print(f"Mean-MCC: {mean_mcc:.4f}")

In [None]:
# Average F1 over the cross-validation folds.
mean_f1 = sum(model_f1) / len(model_f1)
print(f"Mean-F1: {mean_f1:.4f}")

In [None]:
# Average precision over the cross-validation folds.
mean_precision = sum(model_precision) / len(model_precision)
print(f"Mean-Precision: {mean_precision:.4f}")

In [None]:
# Average recall over the cross-validation folds.
mean_recall = sum(model_recall) / len(model_recall)
print(f"Mean-Recall: {mean_recall:.4f}")

In [None]:
# Average balanced accuracy over the cross-validation folds.
mean_bacc = sum(model_bacc) / len(model_bacc)
print(f"Mean-BACC: {mean_bacc:.4f}")

In [None]:
# Average specificity over the cross-validation folds.
mean_spec = sum(model_spec) / len(model_spec)
print(f"Mean-Specificity: {mean_spec:.4f}")