In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

import torch 
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers 
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

import datasets
from tqdm import trange
import tensorflow as tf
from collections import Counter
import seaborn as sns

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install tqdm



**Load data**

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

full_clean_df = pd.read_excel("../data/full_clean_df.xlsx", index_col=0)

In [None]:
full_clean_df.head(10)

Unnamed: 0,Tweets,NotHate,Racist,Sexist,Homophobe,Religion,OtherHate,tweets_train,tweets_emoji_train,tweets_nig_train
0,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,1,0,0,0,0,0,nigga momma youngboy spit real shit nigga,nigga momma youngboy spit real shit nigga,momma youngboy spit real shit
1,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,1,1,0,0,0,0,xxsugvngxx ran holy nigga today,xxsugvngxx ran holy nigga today loudly_crying_...,xxsugvngxx ran holy today
2,“EVERYbody calling you Nigger now!” https://t....,1,1,0,0,0,0,everybody call nigger,everybody call nigger,everybody call nigger
3,“ real ass bitch give a fuck boutta nigga” htt...,1,0,0,0,0,0,real bitch give fuck boutta nigga,real bitch give fuck boutta nigga,real bitch give fuck boutta
4,@WhiteHouse @realDonaldTrump Fuck ice. White s...,0,1,0,0,0,1,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage,fuck ice white supremacist trash racist garbage
5,@DefNotJerm So.... you turn to twitter for it ...,1,0,0,0,0,0,turn twitter instead beating nigga contact police,turn twitter instead beating nigga contact police,turn twitter instead beating contact police
6,@WhatUpJT I swear I was waiting for her to mou...,1,0,0,0,0,0,swear wait mouth word nigger,swear wait mouth word nigger face_with_tears_o...,swear wait mouth word nigger
7,I’m 💯 behind you nigga u my thug brother🖤 http...,1,0,0,0,0,0,behind nigga u thug brother,hundred_points behind nigga u thug brother bla...,behind u thug brother
8,bf: move your bighead 😅😂 gf: NIGGA WHAT?? THA...,0,1,1,0,0,0,bf move bighead gf nigga daddy left,bf move bighead grinning_face_with_sweat face_...,bf move bighead gf daddy left
9,@OriginalSlimC This fat nigga slander is getti...,1,0,0,1,0,0,fat nigga slander get outta hand,fat nigga slander get outta hand loudly_crying...,fat slander get outta hand


In [None]:
labels_name = ['NotHate', 'Racist', 'Sexist', 'Homophobe', 'Religion', 'OtherHate']

# Preprocess Data

In [None]:
df = full_clean_df[['tweets_train']]

In [None]:
df['labels'] = full_clean_df[labels_name].values.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df.head(10)

Unnamed: 0,tweets_train,labels
0,nigga momma youngboy spit real shit nigga,"[1, 0, 0, 0, 0, 0]"
1,xxsugvngxx ran holy nigga today,"[1, 1, 0, 0, 0, 0]"
2,everybody call nigger,"[1, 1, 0, 0, 0, 0]"
3,real bitch give fuck boutta nigga,"[1, 0, 0, 0, 0, 0]"
4,fuck ice white supremacist trash racist garbage,"[0, 1, 0, 0, 0, 1]"
5,turn twitter instead beating nigga contact police,"[1, 0, 0, 0, 0, 0]"
6,swear wait mouth word nigger,"[1, 0, 0, 0, 0, 0]"
7,behind nigga u thug brother,"[1, 0, 0, 0, 0, 0]"
8,bf move bighead gf nigga daddy left,"[0, 1, 1, 0, 0, 0]"
9,fat nigga slander get outta hand,"[1, 0, 0, 1, 0, 0]"


## Split Data

In [None]:
X, X_test, y, y_test = train_test_split(df['tweets_train'], df['labels'], test_size = 0.33, random_state=12)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.33, random_state = 12, stratify = y.values)

In [None]:
X_train.shape

(64316,)

In [None]:
X_val.shape

(31679,)

In [None]:
X_test.shape

(47282,)

## Preprocess Data for Training
* Tokenize and Encode tweets
* Convert to Torch tensors
* Convert to Torch Dataset





In [None]:
bert_path = 'bert-base-uncased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(bert_path)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
def preprocess_data(X, y):
  X = list(X.values)
  y = list(y.values)

  tokenizer = BertTokenizer.from_pretrained(bert_path)
  encodings = tokenizer.batch_encode_plus(X, add_special_tokens=True, max_length = 30, padding = 'max_length', truncation=True)

  input_ids = encodings['input_ids'] # tokenized and encoded sentences
  attention_masks = encodings['attention_mask'] # attention masks
  token_type_ids = encodings['token_type_ids'] # token type ids

  # Convert all of our data into torch tensors
  data_inputs = torch.tensor(input_ids)
  data_masks = torch.tensor(attention_masks)
  data_labels = torch.tensor(y)
  data_token_types = torch.tensor(token_type_ids)

  batch_size = 32

  # Convert to Tensor Dataset type
  data_dataset = TensorDataset(data_inputs, data_masks, data_labels, data_token_types)
  data_sampler = SequentialSampler(data_dataset)
  data_dataloader = DataLoader(data_dataset, sampler=data_sampler, batch_size=batch_size)

  return data_dataloader

In [None]:
train_dataloader = preprocess_data(X_train, y_train)
validation_dataloader = preprocess_data(X_val, y_val)

## Load and Set Model Parameters

In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Found GPU at: /device:GPU:0


'Tesla K80'

In [None]:
num_labels = 6
model = BertForSequenceClassification.from_pretrained(bert_path, num_labels = num_labels).to('cuda')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# setting custom optimization parameters.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)

### Train Model

In [None]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss() 
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  threshold = 0.50
  pred_bools = [pl>=threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='macro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.2763483518243429


Epoch:  33%|███▎      | 1/3 [14:01<28:03, 841.84s/it]

F1 Validation Accuracy:  45.55624103991566
Flat Validation Accuracy:  48.77994886202216
Train loss: 0.2561619575936996


Epoch:  67%|██████▋   | 2/3 [28:05<14:02, 842.25s/it]

F1 Validation Accuracy:  49.378458303125896
Flat Validation Accuracy:  49.04510874711954
Train loss: 0.24486723345755346


Epoch: 100%|██████████| 3/3 [42:08<00:00, 842.92s/it]

F1 Validation Accuracy:  51.098441018441434
Flat Validation Accuracy:  48.868335490387956





In [None]:
# load saved model

torch.save(model.state_dict(), '/content/drive/My Drive/Colab Notebooks/Group Project/bert_baseline_model')

In [None]:
# baseline classification report, threshold = 0.5

print(classification_report(true_bools, pred_bools, target_names = labels_name, digits=4))

              precision    recall  f1-score   support

     NotHate     0.9387    0.9878    0.9626     28995
      Racist     0.6732    0.2066    0.3161      9861
      Sexist     0.5509    0.4208    0.4772      4318
   Homophobe     0.6970    0.6560    0.6759      2349
    Religion     0.5122    0.0882    0.1505       476
   OtherHate     0.6614    0.3811    0.4836      4694

   micro avg     0.8574    0.7076    0.7753     50693
   macro avg     0.6722    0.4568    0.5110     50693
weighted avg     0.8131    0.7076    0.7302     50693
 samples avg     0.8927    0.7718    0.7964     50693



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def optimal_threshold(true, predictions):
  thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
  threshold_df = pd.DataFrame({})
  
  for thres in thresholds:
    prediction = predictions.copy()

    pred = [pl >= thres for pl in pred_labels]

    precision = precision_score(true, pred, average='macro')
    recall = recall_score(true, pred, average='macro')
    f1 = f1_score(true, pred, average='macro')

    df = pd.DataFrame({
        'threshold': round(thres, 4),
        'prediction': round(precision, 4),
        'recall': round(recall, 4),
        'f1': round(f1, 4)
    }, index=[0])

    threshold_df = threshold_df.append(df)
  
  return threshold_df.reset_index(drop=True)

In [None]:
optimal_threshold(true_bools, pred_labels)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,threshold,prediction,recall,f1
0,0.1,0.4616,0.7262,0.543
1,0.2,0.527,0.6796,0.5728
2,0.3,0.5617,0.6397,0.5791
3,0.4,0.6038,0.554,0.5607
4,0.5,0.6722,0.4568,0.511
5,0.6,0.716,0.4073,0.4731
6,0.7,0.728,0.3627,0.432
7,0.8,0.6793,0.2845,0.3646
8,0.9,0.6065,0.1509,0.1799


In [None]:
# optimal threshold = 0.3
pred = [pl >= 0.3 for pl in pred_labels]

# classification report

print(classification_report(true_bools, pred, target_names = labels_name, digits=4))

              precision    recall  f1-score   support

     NotHate     0.9368    0.9919    0.9636     28995
      Racist     0.4641    0.8112    0.5904      9861
      Sexist     0.4806    0.6139    0.5391      4318
   Homophobe     0.6760    0.6752    0.6756      2349
    Religion     0.3567    0.1176    0.1769       476
   OtherHate     0.4562    0.6285    0.5287      4694

   micro avg     0.7049    0.8680    0.7780     50693
   macro avg     0.5617    0.6397    0.5791     50693
weighted avg     0.7440    0.8680    0.7938     50693
 samples avg     0.7306    0.8979    0.7764     50693



  _warn_prf(average, modifier, msg_start, len(result))
