### Imports

In [None]:
%%capture

import torch

from pandas import read_csv, set_option

from sklearn.metrics import accuracy_score, f1_score

from sklearn.model_selection import train_test_split

from torch.nn import BCEWithLogitsLoss

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm, trange

from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW

In [None]:
# Limit how much of a DataFrame pandas prints: at most 25 rows, 25 columns
# and 25 characters per cell, within a 225-character wide display
for option_name, option_value in (
    ('display.max_rows', 25),
    ('display.max_columns', 25),
    ('display.max_colwidth', 25),
    ('display.width', 225),
):
    set_option(option_name, option_value)

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Number of CUDA devices visible to this process (0 on CPU-only machines)
number_gpus = torch.cuda.device_count()

print('Device: ', device, end='\n\n')

print('Number of GPUs: ', number_gpus)

In [None]:
# torch.cuda.get_device_name(0)

### Dataset

#### Paths

In [None]:
# All CSV files are read from the same relative directory
DATA_DIRECTORY = './data'

TRAIN_DATA_PATH = f'{DATA_DIRECTORY}/train.csv'

TEST_DATA_PATH = f'{DATA_DIRECTORY}/test.csv'

TEST_DATA_LABELS_PATH = f'{DATA_DIRECTORY}/test_label.csv'

#### Attributes

In [None]:
# Column holding the raw text fed to the model
FEATURE_COLUMNS = [
    'comment_text',
]

# Binary target columns; one comment may carry several of them at once
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

#### Read

In [None]:
# Load the training CSV into a DataFrame; the cells below read its
# `comment_text` column and the columns listed in LABEL_COLUMNS
train_data = read_csv(TRAIN_DATA_PATH)

#### Visualize

In [None]:
# Show the first rows to check that the columns were parsed as expected
preview_rows = train_data.head()

print('Overview: \n\n', preview_rows)

In [None]:
# True when at least one cell anywhere in the frame is missing
has_null_values = train_data.isnull().values.any()

print('Null values: ', has_null_values)

# To inspect the affected rows:
# train_data[train_data.isna().any(axis=1)]

In [None]:
# True only when the number of distinct comments equals the number of rows
distinct_comments = train_data['comment_text'].nunique()

print('Unique sentences: ', distinct_comments == len(train_data))

In [None]:
# Whitespace-separated word count of every comment, computed once and
# reused for both statistics
words_per_comment = train_data['comment_text'].str.split().str.len()

print('Average sentence length: ', words_per_comment.mean(), end='\n\n')

print('Standard deviation sentence length: ', words_per_comment.std())

In [None]:
print('Plots: \n')

# One histogram per label column; the trailing semicolon keeps the notebook
# from echoing the returned axes
train_data.hist(column=LABEL_COLUMNS);

In [None]:
# Positive and negative counts per label; a strong imbalance may call for
# down-sampling or up-sampling
label_frame = train_data[LABEL_COLUMNS]

print('Count of 1 per label: \n\n', label_frame.sum(), '\n')

print('Count of 0 per label: \n\n', (label_frame == 0).sum())

#### Process

In [None]:
# Shuffle rows. The seed matches the one passed to train_test_split further
# down, so the whole preprocessing pipeline is reproducible across runs
train_data = train_data.sample(frac=1, random_state=10).reset_index(drop=True)

In [None]:
# Store each row's label columns as one multi-hot vector (a NumPy array with
# one entry per label in LABEL_COLUMNS)
label_matrix = train_data[LABEL_COLUMNS].to_numpy()

train_data['one_hot_labels'] = list(label_matrix)

train_data.head()

In [None]:
# Pull the targets and the raw text out of the frame as plain Python lists
number_labels = len(LABEL_COLUMNS)

labels = list(train_data['one_hot_labels'].to_numpy())

sentences = list(train_data['comment_text'].to_numpy())

In [None]:
# Create the tokenizer that matches the pretrained checkpoint. The *uncased*
# vocabulary only contains lowercase word pieces, so the input has to be
# lowercased as well; disabling it would turn capitalised words into unknown
# or badly fragmented tokens
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Transform the sentences into token ids. Every sentence is truncated to at
# most `max_length` tokens and the batch is padded to its longest sequence.
# NOTE(review): 10 tokens is far below the average comment length printed in
# the "Visualize" section - presumably a quick-experiment value; confirm
# before a real training run
max_length = 10

# Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
encodings = tokenizer(
    sentences,
    max_length=max_length,
    truncation=True,
    padding=True,
)

In [None]:
# List the fields produced by the tokenizer; the next cell reads
# 'input_ids', 'token_type_ids' and 'attention_mask' from them
print('Tokenizer outputs: ', encodings.keys())

In [None]:
# Split the tokenizer output into its three parallel sequences:
# encoded sentences, token type ids and attention masks
input_ids, token_type_ids, attention_masks = (
    encodings[field_name]
    for field_name in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [None]:
# Find the rows whose label combination occurs exactly once in the data.
# They are taken out before the stratified split below and added back to the
# training set afterwards.
# Each label vector is compared through its string form.
label_signatures = train_data['one_hot_labels'].astype(str)

label_counts = label_signatures.value_counts()

one_frequency = label_counts[label_counts == 1].index

# Descending order, so the rows can later be popped from the parallel lists
# without shifting the positions that are still to be removed
is_singleton = label_signatures.isin(one_frequency)

one_frequency_idxs = sorted(train_data.index[is_singleton], reverse=True)

In [None]:
# Row indices (sorted in descending order) whose label combination appears
# only once in the training data
print('Train label indices with only one instance: ', one_frequency_idxs)

In [None]:
# Remove the single-instance rows from every parallel list and keep them
# aside; they are appended to the training set after the stratified split.
# `one_frequency_idxs` is in descending order, so each pop leaves the
# positions that are still to be removed untouched
one_frequency_input_ids = []
one_frequency_token_types = []
one_frequency_attention_masks = []
one_frequency_labels = []

for row_position in one_frequency_idxs:
    one_frequency_input_ids.append(input_ids.pop(row_position))
    one_frequency_token_types.append(token_type_ids.pop(row_position))
    one_frequency_attention_masks.append(attention_masks.pop(row_position))
    one_frequency_labels.append(labels.pop(row_position))

#### Split data

In [None]:
# Hold out 10% of the rows for validation, stratified on the label vectors.
# train_test_split returns [train, validation] pairs in the order the inputs
# were passed, i.e. even positions are train parts and odd positions are
# validation parts
split_parts = train_test_split(
    input_ids,
    labels,
    token_type_ids,
    attention_masks,
    test_size=0.10,
    random_state=10,
    stratify=labels,
)

train_inputs, train_labels, train_token_types, train_masks = split_parts[0::2]

(
    validation_inputs,
    validation_labels,
    validation_token_types,
    validation_masks,
) = split_parts[1::2]

#### Extend data

In [None]:
# Put the single-instance rows that were set aside before the split back
# into the training lists
for training_part, singleton_part in (
    (train_inputs, one_frequency_input_ids),
    (train_labels, one_frequency_labels),
    (train_masks, one_frequency_attention_masks),
    (train_token_types, one_frequency_token_types),
):
    training_part.extend(singleton_part)

#### Convert data

In [None]:
%%capture

# The model consumes Torch tensors, so convert every Python list produced by
# the split, first for the training parts and then for the validation parts
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(part)
    for part in (train_inputs, train_labels, train_masks, train_token_types)
)

validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(part)
    for part in (validation_inputs, validation_labels, validation_masks, validation_token_types)
)

In [None]:
# Mini-batch size shared by both loaders. 32 is a common choice when
# fine-tuning BERT and keeps memory usage manageable
batch_size = 32

# Bundle the tensors so that every dataset item is the tuple
# (input ids, attention mask, labels, token type ids)
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)

# Training batches are drawn in random order, validation batches in their
# stored order
train_sampler = RandomSampler(train_data)

validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# torch.save(validation_dataloader,'validation_data_loader')

# torch.save(train_dataloader,'train_data_loader')

#### Create model

In [None]:
# Sequence classification fits this task because the set of classes is the
# same for every sample. (The alternative, multiple choice, learns to pick
# among options that vary from sample to sample.)
# The classification head gets one output per label in LABEL_COLUMNS.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=number_labels)

# The training loop moves every batch to `device`, so the model has to live
# on the same device or the forward pass fails with a device mismatch
model = model.to(device)

In [None]:
# Set custom optimization parameters. You may implement a scheduler here as well.
parameter_optimizer = list(model.named_parameters())

# Name fragments of parameters that must not be decayed: biases and the
# LayerNorm scale. Hugging Face BERT names the LayerNorm parameters
# `LayerNorm.weight` / `LayerNorm.bias`; 'gamma' and 'beta' are kept in case
# a checkpoint still uses the legacy names
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']

# Two parameter groups: everything not matched by `no_decay` gets weight
# decay, the rest gets none. The per-group key read by AdamW is
# `weight_decay`; any other key is silently ignored and leaves the group on
# the optimizer's default
optimizer_grouped_parameters = [
    {
        'params': [
            parameters
            for layer, parameters in parameter_optimizer
            if not any(fragment in layer for fragment in no_decay)
        ],
        'weight_decay': 0.01,
    },
    {
        'params': [
            parameters
            for layer, parameters in parameter_optimizer
            if any(fragment in layer for fragment in no_decay)
        ],
        'weight_decay': 0.0,
    },
]

In [None]:
# Build the optimizer over the two parameter groups defined above, with a
# learning rate of 2e-5 and bias correction enabled.
# NOTE(review): `AdamW` is the class imported from `transformers` at the top
# of the notebook; it is presumably deprecated in favour of
# `torch.optim.AdamW` in recent releases - confirm against the installed
# version
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=True)

# Simpler alternative, one group with the default settings:
# optimizer = AdamW(model.parameters(),lr=2e-5) # Default optimization

#### Train model

In [None]:
# Fine-tune the model and evaluate it on the validation set after every epoch.

# Per-batch training losses across all epochs, kept for plotting
train_loss_set = []

# Number of training epochs (the BERT authors recommend between 2 and 4)
epochs = 2

# Probability above which a label counts as predicted during validation
threshold = 0.50

# Multi-label objective: a sigmoid followed by binary cross-entropy, applied
# to every label independently. The module holds no state, so it is built
# once instead of once per batch
loss_function = BCEWithLogitsLoss()

# Every batch is moved to `device` below, so the model has to be there too
# (this does nothing when it already is)
model.to(device)

for _ in trange(epochs, desc="Epoch"):
    #
    # Training
    #

    # Set model to training mode
    model.train()

    # Tracking variables
    training_loss = 0  # Running sum of the batch losses

    training_examples, training_steps = 0, 0

    # Train on the data for one epoch
    for batch in train_dataloader:
        # Move the batch to the same device as the model
        batch = tuple(item.to(device) for item in batch)

        # Tensor order matches the TensorDataset: ids, mask, labels, token types
        batch_input_ids, batch_input_mask, batch_labels, batch_token_types = batch

        # Clear out the gradients left over from the previous step
        optimizer.zero_grad()

        # Multi-label forward pass: no `labels` are passed, so the first
        # output holds the logits and the loss is computed here
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)

        logits = outputs[0]

        # The labels are cast to the logits' float type, as the loss requires
        loss = loss_function(
            logits.view(-1, number_labels),
            batch_labels.type_as(logits).view(-1, number_labels),
        )

        batch_loss = loss.item()

        train_loss_set.append(batch_loss)

        # Backward pass
        loss.backward()

        # Update parameters using the computed gradients
        optimizer.step()

        # Update tracking variables
        training_loss += batch_loss

        training_examples += batch_input_ids.size(0)

        training_steps += 1

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
    print(f'Train loss: {training_loss / max(training_steps, 1):.4}')

    #
    # Validation
    #

    # Set model to evaluation mode to evaluate on the validation set
    model.eval()

    # Variables to gather the full output of this epoch
    logit_predictions, true_labels, predicted_labels, tokenized_sentences = [], [], [], []

    # Predict
    for batch in validation_dataloader:
        batch = tuple(item.to(device) for item in batch)

        batch_input_ids, batch_input_mask, batch_labels, batch_token_types = batch

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)

            batch_logit_prediction = outputs[0]

            predicted_label = torch.sigmoid(batch_logit_prediction)

        # Everything that is accumulated is moved to the CPU, so GPU memory
        # is released after each batch
        tokenized_sentences.append(batch_input_ids.to('cpu'))

        logit_predictions.append(batch_logit_prediction.to('cpu').numpy())

        true_labels.append(batch_labels.to('cpu').numpy())

        predicted_labels.append(predicted_label.to('cpu').numpy())

    # Flatten the per-batch arrays into one entry per validation example
    predicted_labels = [item for sublist in predicted_labels for item in sublist]

    true_labels = [item for sublist in true_labels for item in sublist]

    # Turn probabilities and targets into boolean label vectors
    predicted_bools = [predicted_label > threshold for predicted_label in predicted_labels]

    true_bools = [true_label == 1 for true_label in true_labels]

    # Micro-averaged F1 over all labels, and accuracy over the full label vectors
    value_f1_accuracy = f1_score(true_bools, predicted_bools, average='micro') * 100

    value_flat_accuracy = accuracy_score(true_bools, predicted_bools) * 100

    print('F1 validation accuracy: ', value_f1_accuracy)

    print('Flat validation accuracy: ', value_flat_accuracy)