Train and cross-validate classification models on the ToLD-BR and HateBR datasets

In [1]:
from simpletransformers.classification import ClassificationModel
from hate_nlp.dataset import ToldBRDataset, HateBRDataset
from hate_nlp.preprocess import PreprocessToldBR
import sklearn
from sklearn.model_selection import StratifiedKFold

def train_model_cv(model_name, dataset, folds=2):
    """Run stratified k-fold cross-validation for one model configuration.

    For every fold a fresh model is obtained from ``build_model``, trained on
    the fold's training rows and evaluated on its held-out rows.

    Args:
        model_name: Configuration key forwarded unchanged to ``build_model``.
        dataset: Object whose ``get_dataframe()`` returns a frame containing
            ``'text'`` and ``'labels'`` columns.
        folds: Number of stratified splits (default 2).

    Returns:
        A list holding the first element returned by ``eval_model`` for each
        fold, in fold order.
    """
    frame = dataset.get_dataframe()
    splitter = StratifiedKFold(n_splits=folds)
    texts = frame['text']
    targets = frame['labels']

    fold_results = []
    for train_rows, test_rows in splitter.split(texts, targets):
        # A new model per fold, so nothing learned on one fold carries over.
        fold_model = build_model(model_name)
        train_part = frame.iloc[train_rows]
        held_out_part = frame.iloc[test_rows]

        fold_model.train_model(train_part)
        # `acc=` hands sklearn's accuracy_score to eval_model; only the first
        # of the three returned values is kept.
        fold_result, _model_outputs, _wrong_predictions = fold_model.eval_model(
            held_out_part, acc=sklearn.metrics.accuracy_score
        )
        fold_results.append(fold_result)

    return fold_results

def build_model(model_name):
    """Create a new, untrained classifier for the named configuration.

    Args:
        model_name: Configuration key. Only ``'distilbert-toldbr'`` is
            implemented.

    Returns:
        A ``ClassificationModel`` wrapping the
        ``distilbert-base-multilingual-cased`` checkpoint.

    Raises:
        NotImplementedError: If ``model_name`` is not a known configuration.
    """
    if model_name != 'distilbert-toldbr':
        raise NotImplementedError(f"Unknown model configuration: {model_name!r}")

    return ClassificationModel(
        "distilbert", "distilbert-base-multilingual-cased",
        args={
            'num_train_epochs': 1,
            'evaluate_during_training': False,
            'overwrite_output_dir': True,
            # The checkpoint is cased, so input text must keep its casing.
            'do_lower_case': False,
            # NOTE(review): presumably set this high to avoid intermediate
            # checkpoints during a short run — confirm.
            'save_steps': 100000,
            'no_cache': True,
            'n_gpu': 1,
            'train_batch_size': 32,
            # Simple Transformers names this option `max_seq_length`; the
            # former `max_seq_len` key is not an option it recognises, so the
            # library default stayed in effect instead of the intended 512.
            'max_seq_length': 512,
            'silent': True,
            "reprocess_input_data": True,
        },
    )

  from .autonotebook import tqdm as notebook_tqdm


Apply the same preprocessing used in ToLD-BR to both datasets

In [2]:
# Machine-specific directory holding the raw CSV files; change it here (one
# place) when running the notebook on another machine.
RAW_DATA_DIR = '/home/jose/Programas/HateSpeech-NLP/data/raw'

# Each dataset gets its own PreprocessToldBR instance, so both corpora go
# through the ToLD-BR preprocessing.
toldbr_dataset = ToldBRDataset(f'{RAW_DATA_DIR}/ToLD-BR.csv', PreprocessToldBR())
hatebr_dataset = HateBRDataset(f'{RAW_DATA_DIR}/HateBR.csv', PreprocessToldBR())

Use DistilBERT, as in the ToLD-BR classification examples

In [3]:
# 5-fold stratified cross-validation of the DistilBERT configuration on ToLD-BR.
results_toldbr = train_model_cv('distilbert-toldbr', toldbr_dataset, folds=5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
# Same model configuration and fold count, evaluated on HateBR.
results_hatebr = train_model_cv('distilbert-toldbr', hatebr_dataset, folds=5)