# Import Libraries

In [2]:
from pathlib import Path

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import MultiLabelTextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

07/01/2020 22:58:30 - INFO - transformers.file_utils -   PyTorch version 1.5.0+cu92 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Logging

# Global Settings

In [3]:
# Seed first so that everything created afterwards is reproducible.
set_all_seeds(seed=42)

# Device selection. use_cuda=False pins this run to the CPU.
# NOTE(review): if a GPU is available, switching to use_cuda=True would
# shorten training considerably; confirm the CPU choice is intentional.
device, n_gpu = initialize_device_settings(use_cuda=False)

# Pretrained language model and the casing the tokenizer should apply.
lang_model, do_lower_case = "bert-base-uncased", True

# Training schedule; these values are handed to the DataSilo / Trainer cells.
n_epochs, batch_size = 1, 32
evaluate_every = 500

07/01/2020 22:58:33 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


# Create BERT Tokenizer

In [4]:
# Load the tokenizer that belongs to the configured language model,
# using the same casing setting as the global configuration.
tokenizer_options = {
    "pretrained_model_name_or_path": lang_model,
    "do_lower_case": do_lower_case,
}
tokenizer = Tokenizer.load(**tokenizer_options)

07/01/2020 22:58:34 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
07/01/2020 22:58:34 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/rohit/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


# Load training data

In [5]:
# Target labels; multilabel=True below means one sample may carry several.
label_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
metric = "acc"

# Processor configuration: where the TSV files live, which column holds the
# labels, and how each text is converted (tokenizer, max. sequence length).
# dev_filename is given explicitly and dev_split is 0; no test file is used.
processor_settings = dict(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir=Path("./data/toxic-comments"),
    train_filename="train.tsv",
    dev_filename="val.tsv",
    test_filename=None,
    dev_split=0,
    label_list=label_list,
    label_column_name="label",
    multilabel=True,
    metric=metric,
    quote_char='"',
)
processor = TextClassificationProcessor(**processor_settings)

# 3. The DataSilo loads the datasets (train/dev/test), exposes DataLoaders
#    for them and computes a few descriptive statistics.
data_silo = DataSilo(processor=processor, batch_size=batch_size)

07/01/2020 22:58:35 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
07/01/2020 22:58:35 - INFO - farm.data_handler.data_silo -   Loading train set from: data/toxic-comments/train.tsv 
07/01/2020 22:58:36 - INFO - farm.data_handler.data_silo -   Got ya 7 parallel workers to convert 159571 dictionaries to pytorch datasets (chunksize = 2000)...
07/01/2020 22:58:36 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0 
07/01/2020 22:58:36 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
07/01/2020 22:58:36 - INFO - farm.data_handler.data_silo -   /'\  / \  /'\  /'\  / \  / \  /'\
07/01/2020 22:58:36 - INFO - farm.data_handler.data_silo -               
Preprocessing Dataset data/toxic-comments/train.tsv:   0%|          | 0/159571 [00:00<?, ? Dicts/s]07/01/2020 22:58:52 - IN

07/01/2020 23:01:57 - INFO - farm.data_handler.data_silo -   /|\  /w\  /w\  /w\  /w\  /w\  /w\
07/01/2020 23:01:57 - INFO - farm.data_handler.data_silo -   /'\  /'\  /'\  /'\  /'\  / \  /'\
07/01/2020 23:01:57 - INFO - farm.data_handler.data_silo -               
Preprocessing Dataset data/toxic-comments/val.tsv:   0%|          | 0/10000 [00:00<?, ? Dicts/s]07/01/2020 23:02:00 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
07/01/2020 23:02:00 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 230-0
Clear Text: 
 	text: 

# Define Model and Prediction Head

In [6]:
# 4. Build the AdaptiveModel from two parts:
#    a) the pretrained language model that serves as the base ...
language_model = LanguageModel.load(lang_model)
#    b) ... and a multi-label classification head with one output per label.
num_labels = len(label_list)
prediction_head = MultiLabelTextClassificationHead(num_labels=num_labels)

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    lm_output_types=["per_sequence"],
    embeds_dropout_prob=0.1,
    device=device,
)

# 5. Create the optimizer and learning-rate schedule; the schedule is sized
#    from the number of training batches and epochs.
n_train_batches = len(data_silo.loaders["train"])
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=3e-5,
    n_batches=n_train_batches,
    n_epochs=n_epochs,
    device=device,
)

# 6. Hand everything to the Trainer, which runs the training loop and
#    evaluates the model from time to time (see evaluate_every).
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    lr_schedule=lr_schedule,
    data_silo=data_silo,
    epochs=n_epochs,
    evaluate_every=evaluate_every,
    n_gpu=n_gpu,
    device=device,
)

07/01/2020 23:03:33 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /home/rohit/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
07/01/2020 23:03:36 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 6]
07/01/2020 23:03:36 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
07/01/2020 23:03:36 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
07/01/2020 23:03:36 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 498.70000000000005, 'num_training_steps': 4987}'


# Train

In [None]:
# 7. Start training with the Trainer configured above.
# On CPU this is slow: the captured progress bar shows ~10 s per iteration
# over 4987 iterations (an estimate of roughly 14 hours for the single epoch).
trainer.train()

07/01/2020 23:07:51 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `

Train epoch 0/0 (Cur. train loss: 0.8174):   0%|          | 1/4987 [00:10<14:13:40, 10.27s/it]

# Save Models

In [None]:
# 8. Training is done: store the model and the processor in the same
#    directory so that both can be found together later.
save_dir = Path("../models/bert-multi-toxic-comment")
for artifact in (model, processor):
    artifact.save(save_dir)