In [12]:
import datasets
from datasets import load_dataset
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertModel, BertConfig
from torch.utils.data import DataLoader

In [2]:
""" Label formatting
"""
# Read the raw training table, then fold every column from the third one
# onward into a single list-valued "labels" entry per row.
df = pd.read_csv("./data/train.csv")
label_columns = df.columns[2:]
df = (
    df.assign(labels=df[label_columns].to_numpy().tolist())
      .loc[:, ["comment_text", "labels"]]
      .copy()
)
# Preview the first rows of the reduced frame
df.head()

Unnamed: 0,comment_text,labels
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [3]:
""" Load Huggingface dataset from In-memory data
"""
# Wrap the prepared DataFrame in a Dataset object
dataset = Dataset.from_pandas(df=df)

In [6]:
""" Encoding dataset (TOKENIZATION) for later use with BERT model
 - input_ids
 - attention_mask
 - token_type_ids
"""
# Name of the pretrained checkpoint the tokenizer is loaded from
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
    tokenizer_checkpoint
)
# Passed to the tokenizer as `max_length` inside tokenize()
MAX_LEN = 200

def tokenize(samples):
    """ Encode the "comment_text" entries of `samples` with the module-level tokenizer

    The tokenizer is called with truncation enabled, "max_length" padding,
    `max_length=MAX_LEN`, and token type ids requested; its result is
    returned unchanged.
    """
    return tokenizer(
        text=samples["comment_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_token_type_ids=True,
    )

In [58]:
""" Tokenize texts in dataset
"""
# Apply tokenize() over the dataset in batches, using 8 worker processes
encoded_dataset = dataset.map(
    tokenize,
    batched=True,
    num_proc=8,
)

In [59]:
""" Train/Test split
"""
SEED = 42
TEST_SIZE = 0.2

# From here on `encoded_dataset` refers to the split result (indexed by
# "train" / "test" below), no longer to the un-split tokenized dataset.
# NOTE(review): re-running this cell on its own presumably fails because of
# this rebinding -- confirm, and re-run the tokenization cell first if so.
encoded_dataset = encoded_dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)
train_dataset, test_dataset = encoded_dataset["train"], encoded_dataset["test"]

# Switch both splits to the "torch" output format (in place)
for _split in (train_dataset, test_dataset):
    _split.set_format("torch")

In [66]:
""" Create Train/Test Dataloader (use in training loop)
"""
BATCH_SIZE = 16

# Settings shared by the train and test loaders
_loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": 4}

# Only the training loader asks for shuffling
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_dataloader = DataLoader(test_dataset, **_loader_kwargs)

In [38]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import (
    DataLoader, RandomSampler,
    SequentialSampler
)

In [5]:
from torch import cuda
# Use CUDA when torch reports it as available, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'