# 0. Import Dependencies

In [None]:
import os
import csv
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Preprocessing

In [None]:
# Read the training set from disk and report its dimensions.

data_path = 'new_train.csv'
data_raw = pd.read_csv(data_path)

n_rows, n_cols = data_raw.shape
print("Number of rows in data =", n_rows)
print("Number of columns in data =", n_cols)
print("\n")
print("**Sample data:**")

# The 'index' column is discarded before the preview is displayed.
data_raw = data_raw.drop(columns='index')
data_raw.head()

In [None]:
# Show total number of comments for each label

categories = list(data_raw.columns.values)

# Per-column totals, computed once and reused for both the bar heights and the
# text annotations.  The first two columns are skipped (assumed to be non-label
# columns such as the id and the text -- TODO confirm against the CSV).
labels = data_raw.iloc[:,2:].sum().values

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=categories[2:], y=labels)
plt.title("Comments in each category", fontsize=24)
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Comment Type ', fontsize=18)

# Write each total just above its bar.
rects = ax.patches
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

plt.show()

In [None]:
# Counting the number of comments having multiple labels

# Number of labels attached to each comment.
rowSums = data_raw.iloc[:,2:].sum(axis=1)

# Drop unlabeled comments explicitly (instead of assuming they are the most
# frequent bucket) and order the counts by number of labels, so the text
# annotations below line up with the bars whatever order the bars are drawn in.
multiLabel_counts = rowSums[rowSums > 0].value_counts().sort_index()

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=multiLabel_counts.index, y=multiLabel_counts.values)
plt.title("Comments having multiple labels ")
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Number of labels', fontsize=18)

# Write each count just above its bar.
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()

In [None]:
# Generate a summary column "category".  The column contains "1" if the comment is labeled at least once.
# Otherwise, the column will take on a value of "0".

# Sum the six label columns per row and test the sum directly.  This avoids the
# 0/0 division and the frame-wide fillna(0), which also overwrote missing
# values in columns unrelated to the labels.
label_sums = data_raw.iloc[:,2:8].sum(axis=1)
data_raw["category"] = (label_sums != 0).astype(int)
print("Total number of labeled comments is %d." %data_raw.category.sum())

Total number of labeled comments is 14602.


In [None]:
data = data_raw

In [None]:
# Data Pre-processing

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

# Silence all warnings unless warning options were supplied to the interpreter
# (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

def cleanHtml(sentence):
    """Replace every ``<...>`` tag found in *sentence* with a single space.

    The argument is converted with ``str()`` first, so non-string values are
    accepted.  A tag must open and close on the same line to be matched.
    """
    return re.sub(r'<.*?>', ' ', str(sentence))

def cleanPunc(sentence):
    """Strip punctuation and special characters from *sentence*.

    The characters ``? ! ' " # |`` are deleted, the characters ``. , ( ) /``
    are replaced by a space, surrounding whitespace is trimmed and any
    remaining newline becomes a space.
    """
    without_marks = re.sub('[?!\'"#|]', '', sentence)
    spaced_out = re.sub(r'[.,()/|]', ' ', without_marks)
    # Trimming happens before the newline replacement, as in the original order.
    return spaced_out.strip().replace("\n", " ")

def keepAlpha(sentence):
    """Keep only ASCII letters in each whitespace-separated word of *sentence*.

    Within every word, each run of characters other than letters and spaces is
    replaced by one space.  The words are then joined with single spaces and
    the result is trimmed.
    """
    cleaned_words = (re.sub('[^a-z A-Z]+', ' ', word) for word in sentence.split())
    return " ".join(cleaned_words).strip()

# Lower-case the comments, then run the three cleaners in sequence.
data['comment_text'] = (
    data['comment_text']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

# 2. Model

The rest of the code is adapted to work with the Coursera code.

DO NOT USE THIS CODE WITH OTHER EMBEDDINGS OR MODELS.

In [None]:
# Retain relevant columns from the preprocessed dataset.
# .copy() makes 'data' an independent frame, so the assignments below do not
# write into a slice of another frame (chained assignment).
data = data[['id', 'comment_text', 'category']].copy()

# Replace values in the column 'category' by {0: non-toxic, 1: toxic}.
# Mapping the whole column at once avoids writing strings into an integer
# column piece by piece.
data['category'] = data['category'].map({0: 'non-toxic', 1: 'toxic'})

# Use the 'id' column as the index.
data = data.set_index('id')

In [None]:
data.head()

In [None]:
data.category.value_counts()

In [None]:
# From here on, the code is adapted from the Coursera tutorial.

In [None]:
possible_labels = data.category.unique()
possible_labels

In [None]:
# Map each category name to an integer id, numbered in the order in which the
# names appear in possible_labels.  A distinct loop variable is used so that
# possible_labels itself is not overwritten.
label_dict = {label: index for index, label in enumerate(possible_labels)}

In [None]:
label_dict

In [None]:
# Translate category names to their integer ids.  map() returns the integer
# values directly and does not depend on replace() downcasting an object column.
data['label'] = data.category.map(label_dict)

In [None]:
data.head(20)

# 3. Training / Validation Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the sample ids (the frame index) and their labels, keeping the class
# proportions equal in both parts.
# NOTE(review): test_size=0.9 sends 90% of the rows to validation and only 10%
# to training -- confirm this is intended.
sample_ids = data.index.values
sample_labels = data.label.values

X_train, X_val, y_train, y_val = train_test_split(
    sample_ids,
    sample_labels,
    test_size=0.9,
    random_state=42,
    stratify=sample_labels,
)

In [None]:
# Every row starts as 'not_set'; the split membership is filled in afterwards.
data['data_type'] = 'not_set'

In [None]:
# Tag each row with the split its index value was assigned to.
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

In [None]:
data.groupby(['category', 'label', 'data_type']).count()

# 4. Load Tokenizer and Encode Data

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint;
# do_lower_case=True asks it to lower-case its input.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

In [None]:
max_length = 200

# Tokenizer settings shared by the training and validation encodings.
# padding='max_length' replaces the deprecated pad_to_max_length=True: every
# sequence is padded (or, with truncation=True, cut) to exactly max_length.
encode_kwargs = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=max_length,
    truncation=True,
    return_tensors='pt',
)

# Rows belonging to each split, selected once and reused for text and labels.
train_rows = data[data.data_type == 'train']
val_rows = data[data.data_type == 'val']

encode_data_train = tokenizer.batch_encode_plus(
    train_rows.comment_text.tolist(),
    **encode_kwargs
)

encode_data_val = tokenizer.batch_encode_plus(
    val_rows.comment_text.tolist(),
    **encode_kwargs
)

input_ids_train = encode_data_train['input_ids']
attention_masks_train = encode_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encode_data_val['input_ids']
attention_masks_val = encode_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

In [None]:
# Example encoding.  Note that each wordpiece is encoded with an unique ID and
# the entire sentence is padded to a maximum length of 'max_length'.
encode_data_train['input_ids'][0]

In [None]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Fraction of all samples that ended up in the validation set.  It should match
# the 'test_size' argument passed to 'train_test_split'.

len(dataset_val)/(len(dataset_val)+len(dataset_train))

# 5. Setup BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint into a sequence-classification
# model with one output per entry in label_dict.  The fine-tuning itself happens
# in the training loop further down.

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False
)

# 6. Create Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
batch_size = 32

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation does not need shuffling: a sequential sampler keeps the
# predictions in dataset order, so they can be matched back to the rows.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size
)

# 7. Setup Optimizer and Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# This is from HuggingFace.

# AdamW is the optimizer that updates the model parameters during training.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5, # 2e-5 > 5e-5
    eps=1e-8
)

In [None]:
epochs = 4

# The scheduler is stepped once per training batch, so the total number of
# steps is (batches per epoch) * epochs.  No warm-up steps are used.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs
)

# 8. Define Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# The F1 score is a better metric here because the classes are imbalanced.

def f1_score_func(preds, labels):
    """Return the weighted F1 score of *preds* against *labels*.

    *preds* holds one row of per-class scores per sample; the predicted class
    is the position of the largest score in each row.  *labels* holds the true
    class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in *labels*, how many of its samples were
    predicted correctly.

    *preds* holds one row of per-class scores per sample; *labels* holds the
    true class ids.  Class names are looked up through the global label_dict.
    """
    names_by_id = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_correct = np.count_nonzero(predicted[in_class] == class_id)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {names_by_id[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

# 9. Create Training Loop

This approach is adapted from an older version of HuggingFace's `run_glue.py` script.

In [None]:
import random

# Seed Python's, NumPy's and PyTorch's random number generators with one value.
# NOTE(review): the model is created in an earlier cell, i.e. before these
# seeds are set when the notebook is run top to bottom -- confirm that is acceptable.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)  # this has to do with using GPUs.

In [None]:
torch.cuda.empty_cache()

In [None]:
# Determine which device is used: cuda (GPU) if available, otherwise cpu.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [None]:
# Evaluation Function

def evaluate(dataloader_val):
    """Run the global ``model`` over every batch of *dataloader_val*.

    Each batch is expected to hold, in this order, the input ids, the
    attention mask and the labels.

    Returns a tuple ``(average loss per batch, logits, true label ids)``; the
    last two are NumPy arrays concatenated over all batches along axis 0.
    """
    # Switch the model to evaluation mode.  This does not freeze the weights;
    # gradient tracking is turned off separately by torch.no_grad() below.
    model.eval()

    running_loss = 0
    all_logits = []
    all_labels = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(b.to(device) for b in batch)
        label_ids = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=label_ids,
            )

        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        # Move results to the CPU before converting them to NumPy arrays.
        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(label_ids.cpu().numpy())

    average_loss = running_loss/len(dataloader_val)

    return (
        average_loss,
        np.concatenate(all_logits, axis=0),
        np.concatenate(all_labels, axis=0),
    )

In [None]:
# Training Loop
#
# For every epoch: train on all training batches, save a checkpoint, evaluate
# on the validation set and record the training loss, validation loss and
# weighted F1 score.

training_loss_tracker = []
val_loss_tracker = []
val_f1_tracker = []


for epoch in tqdm(range(1, epochs+1)):

    model.train()  # set the model to training mode

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:

        # Clear the gradients left over from the previous batch.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids':      batch[0],
            'attention_mask': batch[1],
            'labels':         batch[2]
        }

        outputs = model(**inputs)

        # Because labels are supplied, the first output is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()  # Backpropagation.

        # Limit the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is.  It used to be divided by len(batch),
        # which is the number of tensors in the batch tuple (3), not a
        # number of samples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # One checkpoint per epoch, named after the current settings.
    torch.save(model.state_dict(), f'BERT_S_L{max_length}_B{batch_size}_E{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)

    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

    training_loss_tracker.append(loss_train_avg)
    val_loss_tracker.append(val_loss)
    val_f1_tracker.append(val_f1)

In [None]:
# training_loss_tracker
# val_loss_tracker
val_f1_tracker

# 10. Load and Evaluate Models

Load and process the test data

In [None]:
# Read the test set from disk and report its dimensions.

data_path = 'new_test.csv'
data_raw = pd.read_csv(data_path)

n_rows, n_cols = data_raw.shape
print("Number of rows in data =", n_rows)
print("Number of columns in data =", n_cols)
print("\n")
print("**Sample data:**")

# The 'index' column is discarded before the preview is displayed.
data_raw = data_raw.drop(columns='index')
data_raw.head()

In [None]:
# Show total number of comments for each label

categories = list(data_raw.columns.values)

# Per-column totals, computed once and reused for both the bar heights and the
# text annotations.  The first two columns are skipped (assumed to be non-label
# columns such as the id and the text -- TODO confirm against the CSV).
labels = data_raw.iloc[:,2:].sum().values

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=categories[2:], y=labels)
plt.title("Comments in each category", fontsize=24)
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Comment Type ', fontsize=18)

# Write each total just above its bar.
rects = ax.patches
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

plt.show()

In [None]:
# Counting the number of comments having multiple labels

# Number of labels attached to each comment.
rowSums = data_raw.iloc[:,2:].sum(axis=1)

# Drop unlabeled comments explicitly (instead of assuming they are the most
# frequent bucket) and order the counts by number of labels, so the text
# annotations below line up with the bars whatever order the bars are drawn in.
multiLabel_counts = rowSums[rowSums > 0].value_counts().sort_index()

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=multiLabel_counts.index, y=multiLabel_counts.values)
plt.title("Comments having multiple labels ")
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Number of labels', fontsize=18)

# Write each count just above its bar.
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()

In [None]:
# Generate a summary column "category".  The column contains "1" if the comment is labeled at least once.
# Otherwise, the column will take on a value of "0".

# Sum the six label columns per row and test the sum directly.  This avoids the
# 0/0 division and the frame-wide fillna(0), which also overwrote missing
# values in columns unrelated to the labels.
label_sums = data_raw.iloc[:,2:8].sum(axis=1)
data_raw["category"] = (label_sums != 0).astype(int)
print("Total number of labeled comments is %d." %data_raw.category.sum())

In [None]:
test_data = data_raw

In [None]:
# Data Pre-processing

# Lower-case the comments, then run the same three cleaners used for training.
test_data['comment_text'] = (
    test_data['comment_text']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [None]:
# Retain relevant columns from the preprocessed dataset.
# .copy() makes 'test_data' an independent frame, so the assignments below do
# not write into a slice of another frame (chained assignment).
test_data = test_data[['id', 'comment_text', 'category']].copy()

# Replace values in the column 'category' by {0: non-toxic, 1: toxic}.
# Mapping the whole column at once avoids writing strings into an integer
# column piece by piece.
test_data['category'] = test_data['category'].map({0: 'non-toxic', 1: 'toxic'})

# Use the 'id' column as the index.
test_data = test_data.set_index('id')

In [None]:
test_data.head()

In [None]:
test_data.category.value_counts()

In [None]:
# Translate category names to the integer ids used during training.  map()
# returns the integer values directly and does not depend on replace()
# downcasting an object column.
test_data['label'] = test_data.category.map(label_dict)

In [None]:
test_data[test_data['category'] == 'toxic'].head()

In [None]:
# Encode the test comments with the same settings as the training data.
# padding='max_length' replaces the deprecated pad_to_max_length=True: every
# sequence is padded (or, with truncation=True, cut) to exactly max_length.
encode_data_test = tokenizer.batch_encode_plus(
    test_data.comment_text.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=max_length,
    truncation=True,
    return_tensors='pt'
)

In [None]:
input_ids_test = encode_data_test['input_ids']
attention_masks_test = encode_data_test['attention_mask']
labels_test = torch.tensor(test_data.label.values)

In [None]:
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [None]:
# Evaluation does not need shuffling: a sequential sampler keeps the
# predictions in dataset order, so they can be matched back to the test rows.
dataloader_test = DataLoader(
    dataset_test,
    sampler=SequentialSampler(dataset_test),
    batch_size=batch_size
)

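The model used in this section must first be trained with the training loop of Section 9, using a GPU in Colab.  The `torch.save` call in the training loop (line 40 of that cell) saves the model after every epoch; give the file an appropriate name.  Here is the naming convention: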

> BERT_X_L###_B####_E#.model

> X     : S or M for single class or multi-class

> L###  : maximum length of each tokenized comment_text (from Section 4)

> B#### : batch size used in the training loop (from Section 6)

> E#    : epoch number when the model is trained (from Section 7)

Example:

> BERT_S_L256_B1024_E2.model is a model trained with single class, a maximum length of 256 tokens, a batch size of 1024, and on the second epoch.

In [None]:
# Re-create the model architecture from the 'bert-base-uncased' checkpoint;
# the trained weights are loaded into it from the saved state dict below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
model.to(device)
pass

In [None]:
# The checkpoint file name depends on the training settings, so it may differ
# from the one used here.  Try running training on Google Colab using GPUs.

model.load_state_dict(
    torch.load('/content/BERT_S_L200_B32_E2.model',
               map_location=torch.device(device)))

In [None]:
_, predictions, true_vals = evaluate(dataloader_test)

In [None]:
accuracy_per_class(predictions, true_vals)