<a href="https://colab.research.google.com/github/jakob-ra/financial_news/blob/master/PL_multi_label_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Crash the runtime on purpose to get more RAM (uncomment the two lines below to use):
# import torch
# torch.tensor([10.]*10000000000)

In [2]:
# Install the third-party packages that the import cell below relies on.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# different releases than the ones this notebook was written against.
!pip install transformers
!pip install pytorch_lightning
!pip install simpletransformers
!pip install nlp



In [3]:
import torch
import pytorch_lightning as pl
from simpletransformers.classification import MultiLabelClassificationModel
import pandas as pd
import nlp
import transformers
from transformers import RobertaModel



In [4]:
# Colab-only helper: prompts for files to upload into the runtime.
# NOTE(review): presumably used to provide train_small.csv, which the
# configuration cell reads with pd.read_csv -- confirm.
from google.colab import files
files.upload()

{}

In [5]:
# --- Run configuration ---------------------------------------------------
DEBUG = True            # passed to the Trainer as fast_dev_run
EPOCHS = 1              # passed to the Trainer as max_epochs
BATCH_SIZE = 8          # batch size of both data loaders
LR = 1e-4               # learning rate of the AdamW optimizer
MOMENTUM = .9           # NOTE(review): not referenced by the code visible in this notebook
MODEL = 'roberta-large' # pretrained checkpoint for both tokenizer and encoder
SEQ_LENGTH = 64         # max_length used when tokenizing comments
TEST_SIZE = 0.1
# Single source of truth for the label columns. These are the six columns the
# label vector has always been built from; the previous three-element list
# disagreed with the vector actually used for training.
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# --- Data ----------------------------------------------------------------
df = pd.read_csv('train_small.csv')
# One list of 0/1 indicators per row, ordered like LABEL_COLS.
df['label'] = df[LABEL_COLS].values.tolist()
# Derived from the column list so it is well-defined even for an empty frame.
NUM_LABELS = len(LABEL_COLS)

In [6]:
# Device string used for the dataset tensors and to decide whether the Trainer
# gets a GPU: "cuda" when PyTorch reports one as available, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
from sklearn.metrics import f1_score

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and multi-label targets.

    Args:
        outputs: Un-activated logits, one value per (example, label) pair.
        targets: 0/1 label indicators with the same number of elements as
            ``outputs``; any dtype and any singleton dimensions are accepted.

    Returns:
        Scalar tensor holding the mean loss, on the same device and with the
        same dtype as ``outputs``.
    """
    # reshape() instead of squeeze(): squeeze() also removes a batch dimension
    # of size one, which makes the loss fail on a single-example batch.
    # Casting to the logits' device/dtype keeps the computation where the
    # model lives instead of forcing a round-trip through the CPU.
    targets = targets.reshape(outputs.shape).to(device=outputs.device, dtype=outputs.dtype)
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

class RobertaMultiLabelClass(torch.nn.Module):
    """RoBERTa encoder followed by a linear multi-label classification head.

    Args:
        model_name: Name or path of the pretrained RoBERTa checkpoint.
            Defaults to the module-level ``MODEL``.
        num_labels: Number of output logits. Defaults to the module-level
            ``NUM_LABELS``.
    """

    def __init__(self, model_name=None, num_labels=None):
        super().__init__()
        if model_name is None:
            model_name = MODEL
        if num_labels is None:
            num_labels = NUM_LABELS
        self.l1 = RobertaModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so the class also works with checkpoints of another width.
        self.l2 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return one logit per label for every sequence in the batch.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head applied to the encoder's second
            output (its pooled representation).
        """
        # Index rather than tuple-unpack: newer transformers releases return
        # a dict-like ModelOutput whose iteration yields keys, not tensors,
        # while integer indexing works for both plain tuples and ModelOutput.
        pooled = self.l1(ids, attention_mask=mask)[1]
        return self.l2(pooled)


class RobertaMultiLabelFinetuner(pl.LightningModule):
    """LightningModule that fine-tunes ``RobertaMultiLabelClass`` on ``df``.

    Relies on the notebook-level configuration (``MODEL``, ``SEQ_LENGTH``,
    ``TEST_SIZE``, ``BATCH_SIZE``, ``LR``, ``DEVICE``) and on the module-level
    DataFrame ``df``, which must provide a ``comment_text`` column and a
    ``label`` column holding one list of 0/1 indicators per row.
    """

    def __init__(self):
        super().__init__()
        # No hard-coded .cuda(): it crashes on CPU-only runtimes, and the
        # Trainer already moves the module to the device it is configured for.
        self.model = RobertaMultiLabelClass()
        # RoBERTa does not pad with id 0 (that id is the <s> token), so the
        # padding id is read from the encoder config instead of being assumed.
        self.pad_token_id = self.model.l1.config.pad_token_id

    def prepare_data(self):
        """Split ``df`` into train/validation parts and tokenize both."""
        tokenizer = transformers.RobertaTokenizer.from_pretrained(
            MODEL,
            do_lower_case=False,
            add_special_tokens=True,
            max_length=SEQ_LENGTH,
            pad_to_max_length=True)

        def _tokenize(x):
            # Pad every example to SEQ_LENGTH. Padding only to the longest
            # example of the current map() batch would give rows from
            # different map batches different lengths, which cannot be
            # stacked into one tensor by the DataLoader.
            x['input_ids'] = tokenizer.batch_encode_plus(
                x['comment_text'], max_length=SEQ_LENGTH,
                truncation=True, padding='max_length')['input_ids']
            return x

        def _prepare_ds(frame):
            # Fresh 0..n-1 index so the sampled index is not carried over
            # into the dataset.
            ds = nlp.Dataset.from_pandas(frame.reset_index(drop=True))
            ds = ds.map(_tokenize, batched=True)
            ds.set_format(type='torch', columns=['input_ids', 'label'], device=DEVICE)
            return ds

        # Hold out TEST_SIZE of the rows for validation. Previously both
        # datasets were built from the complete frame, so validation ran on
        # the training data. Assumes df has a unique index (the default after
        # pd.read_csv).
        test_df = df.sample(frac=TEST_SIZE, random_state=42)
        train_df = df.drop(test_df.index)
        self.train_ds, self.test_ds = map(_prepare_ds, (train_df, test_df))

    def forward(self, input_ids):
        """Return the classifier logits for a batch of token ids."""
        # Attend to every position that is not padding.
        mask = (input_ids != self.pad_token_id).float()
        logits = self.model(input_ids, mask)
        return logits

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as ``train_loss``."""
        logits = self.forward(batch['input_ids'])
        loss = loss_fn(logits, batch['label']).mean()
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        """Compute loss and micro-averaged F1 for one validation batch.

        The F1 score is returned under the key ``val_acc``.
        """
        logits = self.forward(batch['input_ids'])
        val_loss = loss_fn(logits, batch['label'])
        # The model emits raw logits, so convert to probabilities before
        # applying the 0.5 decision threshold.
        predicted_vals = torch.sigmoid(logits) > 0.5
        val_acc = f1_score(batch['label'].cpu(), predicted_vals.cpu(), average='micro')
        return {'val_loss': val_loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation metrics and log them.

        Args:
            outputs: List of the dicts returned by ``validation_step``.
        """
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        # f1_score returns plain floats, which torch.stack cannot handle.
        val_acc = torch.tensor([float(x['val_acc']) for x in outputs]).mean()
        out = {'val_loss': val_loss, 'val_acc': val_acc}
        return {**out, 'log': out}

    def train_dataloader(self):
        """Shuffled training loader; the incomplete last batch is dropped."""
        return torch.utils.data.DataLoader(
                self.train_ds,
                batch_size=BATCH_SIZE,
                drop_last=True,
                shuffle=True,
                )

    def val_dataloader(self):
        """Validation loader over the held-out rows, in fixed order."""
        return torch.utils.data.DataLoader(
                self.test_ds,
                batch_size=BATCH_SIZE,
                drop_last=False,
                shuffle=False,
                )

    def configure_optimizers(self):
        """Optimize all parameters with AdamW at learning rate ``LR``."""
        return torch.optim.AdamW(self.parameters(), lr=LR)


def main():
    """Build the fine-tuning module and a Trainer, then run the fit loop.

    The Trainer writes below ``logs``, uses one GPU when ``DEVICE`` is
    ``'cuda'`` (none otherwise), runs for ``EPOCHS`` epochs and takes its
    ``fast_dev_run`` flag from ``DEBUG``.
    """
    finetuner = RobertaMultiLabelFinetuner()

    tb_logger = pl.loggers.TensorBoardLogger('logs/', name='toxic', version=0)
    gpu_count = 1 if DEVICE == 'cuda' else 0
    trainer_options = dict(
        default_root_dir='logs',
        gpus=gpu_count,
        max_epochs=EPOCHS,
        fast_dev_run=DEBUG,
        logger=tb_logger,
    )

    pl.Trainer(**trainer_options).fit(finetuner)

if __name__ == '__main__':
    main()

Running in fast_dev_run mode: will run a full train, val and test loop using a single batch
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


  | Name  | Type                   | Params
-------------------------------------------------
0 | model | RobertaMultiLabelClass | 355 M 








HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

  average, "true nor predicted", 'F-score is', len(true_sum)
