In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import BertTokenizer, BertModel
from bs4 import BeautifulSoup
import torch
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# unzip the files
import zipfile

datasets = ("sample_submission.csv", "test.csv", "test_labels.csv", "train.csv")

# Each CSV ships as "<name>.zip" in the competition input folder; extract
# every archive into the current working directory.
archive_dir = "../input/jigsaw-toxic-comment-classification-challenge/"
for csv_name in datasets:
    archive_path = archive_dir + csv_name + ".zip"
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(".")

In [None]:
# Read the train and test CSVs from the Kaggle working directory.
train = pd.read_csv("/kaggle/working/train.csv")
test = pd.read_csv("/kaggle/working/test.csv")

In [None]:
# names of the six target columns in train; used below for the label balance check and to build y
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## Some EDA

In [None]:
# preview the first few comments before any cleaning is applied
train['comment_text'].head()

In [None]:
# check for any missing values: number of NaN entries per column
train.isna().sum()

In [None]:
# The id column is neither a feature nor a label in this notebook, so drop it.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# display the frame after the id column has been dropped
train

In [None]:
# check balancing of the labels: mean of each label column, shown as a percentage
for label in labels:
    pct = round(100*train[label].mean(),2)
    print(f"Percent of {label}: {pct}")

### Pre-processing & data cleaning

In [None]:
import re

def pre_process(text):
    """Clean one raw comment into lowercase, space-separated alphabetic words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lowercase, and collapse runs of whitespace.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment as a single string (empty if nothing alphabetic
        remains).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about guessing), so the
    # extracted text could differ between environments.
    text = BeautifulSoup(text, "html.parser").get_text()
    # keep alphabetic characters only; everything else becomes a space
    text = re.sub("[^a-zA-Z]", " ", text)
    # convert text to lower case
    text = text.lower()
    # split on whitespace and re-join to collapse repeated spaces
    return " ".join(text.split())

In [None]:
# clean every comment; this overwrites the raw text in the comment_text column
train['comment_text'] = train['comment_text'].map(pre_process)

### stop words

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list, so `word not in stop` scans it for every single word.
# A frozenset gives the same membership answers with constant-time lookups.
stop_set = frozenset(stop)
# Drop every stop word and re-join the remaining words with single spaces.
train['comment_text'] = train['comment_text'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set)
)

### stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer instance used by stemming() below
stemmer = SnowballStemmer("english")
def stemming(sentence):
    """Return `sentence` with each whitespace-separated word replaced by its stem.

    Words are stemmed with the module-level `stemmer` and re-joined with
    single spaces; surrounding whitespace is stripped from the result.
    """
    stems = [stemmer.stem(word) for word in sentence.split()]
    return " ".join(stems).strip()

# stem every comment; this overwrites the comment_text column again
train['comment_text'] = train['comment_text'].apply(stemming)

### Investigate text lengths

In [None]:
# Count whitespace-separated words per comment. split() with no argument
# returns [] for an empty comment, whereas split(" ") returns [''] and would
# report one word for a comment that the cleaning steps left empty.
train['word_count'] = train['comment_text'].apply(lambda text: len(str(text).split()))

In [None]:
train.hist(column='word_count', bins = 15)
# no text is longer than 512, ok for Bert model
# NOTE(review): word_count counts whitespace-separated words, not BERT word-piece tokens,
# so this is only a proxy for the 512-token limit; the tokenizer call further down
# truncates to max_length=128 in any case.

## Split data into training, validation and test datasets

In [None]:
# targets: the label columns listed in `labels`
y = train[labels]
# features: the double brackets keep comment_text as a one-column DataFrame rather than a Series
x = train[['comment_text']]

### Sample 10,000 rows to demo and get the model to run

In [None]:
import random
from random import sample
# Draw a seeded sample of distinct row positions so the demo subset is the
# same on every run.
SAMPLE_SIZE = 10000
random.seed(4)
i = random.sample(range(len(x)), SAMPLE_SIZE)

In [None]:
# select the sampled row positions from features and targets (same positions for both, so rows stay aligned)
X_new = x.iloc[i]
y_new = y.iloc[i]

## stratified sampling to handle imbalanced data for multilabel classification
- useful [reference](https://github.com/scikit-multilearn/scikit-multilearn/issues/194) to get data into right format

In [None]:
from skmultilearn.model_selection import iterative_train_test_split
# First hold out 10% of the sampled rows as the test split.
x_temp, y_temp, x_test, y_test = iterative_train_test_split(
    X_new.values,
    y_new.values,
    test_size=0.1,
)

# Then split the remaining rows into training and validation (20% validation).
x_train, y_train, x_val, y_val = iterative_train_test_split(
    x_temp,
    y_temp,
    test_size=0.2,
)

In [None]:
# Cast the label arrays of all three splits to float.
# NOTE(review): presumably needed because the multi-label loss expects float targets — confirm.
y_train, y_val, y_test = (
    split_labels.astype(float) for split_labels in (y_train, y_val, y_test)
)

## Format data for BERT model

In [None]:
# Set up device for GPU usage for neural network training
from torch import cuda

# 'cuda' when torch reports an available GPU, otherwise 'cpu'
# NOTE(review): `device` is not referenced again in the visible part of the notebook.
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

# Wrap each split in a Hugging Face Dataset (text under 'comment_text',
# targets under 'labels') and collect the three splits in one DatasetDict.
split_arrays = {
    'train': (x_train, y_train),
    'val': (x_val, y_val),
    'test': (x_test, y_test),
}
dataset = DatasetDict({
    split_name: Dataset.from_dict({'comment_text': texts, 'labels': targets})
    for split_name, (texts, targets) in split_arrays.items()
})

In [None]:
# Label names plus the two lookup tables (index -> name, name -> index)
# that are handed to the model configuration below.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

### tokenize for Transformer

In [None]:
from transformers import AutoTokenizer

# load the tokenizer that matches the bert-base-uncased checkpoint used for the model below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def format_data(data):
    """Tokenize a batch of comments and attach the batch's labels.

    Every entry is padded or truncated to exactly 128 tokens.

    Args:
        data: Mapping with a "comment_text" entry (the texts to encode) and a
            "labels" entry (the targets, copied through unchanged).

    Returns:
        The tokenizer's encoding with an additional "labels" entry.
    """
    # NOTE(review): is_split_into_words=True tells the tokenizer each text is
    # already a list of words. Presumably this is needed because the text
    # column was built from a one-column 2-D array, so each row arrives as a
    # one-element list — confirm before changing.
    encoding = tokenizer(
        data["comment_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        is_split_into_words=True,
    )
    encoding["labels"] = data["labels"]
    return encoding

In [None]:
# tokenize all three splits; batched=True hands format_data a batch of rows per call
encoded_dataset = dataset.map(format_data, batched =True)

### inspect the encoded data

In [None]:
# take the first encoded training row and list the fields it contains
example = encoded_dataset['train'][0]
print(example.keys())

In [None]:
# turn the token ids back into text to eyeball what the model will receive
tokenizer.decode(example['input_ids'])

In [None]:
# label vector of the same example, ordered like `labels`
example['labels']

In [None]:
# names of the labels whose entry equals 1 for this example
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1]

#### set the format of our training, validation and test datasets into PyTorch datasets

In [None]:
# switch the output format of all splits to PyTorch tensors (modifies encoded_dataset in place)
encoded_dataset.set_format("torch")

In [None]:
# input ids of the first training row, to check the new output format
encoded_dataset['train']['input_ids'][0]

### Model Definition

Using weights from the pre-trained base model, `bert-base-uncased`.

The multi-label problem can be specified using:
- `problem_type = "multi_label_classification"`
- `num_labels = len(labels)`

In [None]:
from transformers import AutoModelForSequenceClassification

# Load bert-base-uncased with a sequence-classification head configured for
# the multi-label problem: one output per entry in `labels`, plus the
# index <-> name mappings stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Train model

In [None]:
# model config: hyper-parameters passed to TrainingArguments below

BATCH_SIZE = 32  # per-device batch size, used for both training and evaluation
METRIC_NAME = 'f1'  # key of the metrics dict used to pick the best checkpoint
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 5
WEIGHT_DECAY = 0.01

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch; when training ends, reload the
# checkpoint that scored best on METRIC_NAME.
args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_NAME,
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw model outputs (logits); a sigmoid is applied
            element-wise. Assumed to be (n_samples, num_labels) — TODO confirm.
        labels: Ground-truth indicator array with the same layout.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # logits -> probabilities, as a NumPy array for scikit-learn
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # hard 0/1 predictions for the threshold-based metrics
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC measures ranking quality, so it must be scored on the
    # probabilities. Feeding it the thresholded 0/1 predictions collapses the
    # curve to a single operating point and misreports the score.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return metrics as dictionary
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapter passed to the Trainer: unpack an EvalPrediction and score it.

    When `p.predictions` is a tuple, only its first element is used.
    Returns the metrics dict produced by multi_label_metrics.
    """
    preds = p.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    return multi_label_metrics(predictions=preds, labels=p.label_ids)

In [None]:
# labels of the first training row with a leading batch dimension added
encoded_dataset['train']['labels'][0].unsqueeze(0)

In [None]:
# tensor type of the first row's input ids
encoded_dataset['train']['input_ids'][0].type()

In [None]:
# forward pass on a single example as a sanity check before training
# Fetch the row once instead of loading a whole column to read one entry.
first_example = encoded_dataset['train'][0]
# no_grad: this is a throwaway check, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(
        input_ids=first_example['input_ids'].unsqueeze(0),
        # The input is padded to max_length; without the mask the model would
        # attend to the padding tokens and the check would not match training.
        attention_mask=first_example['attention_mask'].unsqueeze(0),
        labels=first_example['labels'].unsqueeze(0),
    )
outputs

In [None]:
# Assemble the Trainer: train on the training split, evaluate on the
# validation split, and score each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# run fine-tuning with the arguments configured above
trainer.train()

# Evaluate on test dataset

In [None]:
# score the trained model on the held-out test split using compute_metrics
trainer.evaluate(eval_dataset=encoded_dataset["test"])