In [1]:
import pandas as pd
import numpy as np
import torch

# Loading the data

First, let's examine what the data looks like.

(Note: This repo does not contain the full data. To get the full data, go to the [Kaggle competition page](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) and download it yourself.)

In [2]:
# Peek at the first two training rows to see which columns are available.
pd.read_csv("data/train.csv").head(n=2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [3]:
# The validation file is previewed the same way as the training file.
pd.read_csv("data/valid.csv").head(n=2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,000eefc67a2c930f,Radial symmetry \n\nSeveral now extinct lineag...,0,0,0,0,0,0
1,000f35deef84dc4a,There's no need to apologize. A Wikipedia arti...,0,0,0,0,0,0


Apparently we have to predict 6 labels

In [4]:
# Preview the test file; the output below shows it carries no label columns.
pd.read_csv("data/test.csv").head(n=2)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...


### Declaring Fields

The Field class determines how the data is preprocessed and converted into a numeric format

In [5]:
from torchtext.data import Field

We want the comment_text field to be converted to lowercase and tokenized on whitespace as part of preprocessing, so we tell that to the Field.

In [6]:
def tokenize(x):
    """Split a raw comment into tokens on runs of whitespace."""
    return x.split()


# Text field: a token sequence, split with `tokenize` and lower-cased.
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

That was simple. The preprocessing of the labels is even easier, since they are already converted into a binary encoding.
All we need to do is to tell the Field class that the labels are already processed. We do this by passing the use_vocab=False keyword to the constructor

In [7]:
# Label field: a single value per example (not a sequence) that is already
# numeric in the CSV, so no vocabulary is built for it.
LABEL = Field(sequential=False, use_vocab=False)

### Creating the Dataset

We'll use the TabularDataset class to read our data, since it is in csv format (TabularDataset handles csv, tsv, and json files as of now)

In [8]:
from torchtext.data import TabularDataset

For the train and validation data, we need to process the labels. The fields we pass in must be in the same order as the columns. For fields we don't use, we pass in a tuple where the second element is None

In [9]:
%%time
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]

trn, vld = TabularDataset.splits(
        path="data", # the root directory where the data lies
        train='train.csv', validation="valid.csv",
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tv_datafields)

CPU times: user 4.57 ms, sys: 1.81 ms, total: 6.37 ms
Wall time: 5.25 ms


For the test data, we don't have any labels

In [10]:
%%time
tst_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", TEXT)
]

tst = TabularDataset(
        path="data/test.csv", # the file path
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=tst_datafields)

CPU times: user 1.98 ms, sys: 877 µs, total: 2.86 ms
Wall time: 2.17 ms


For the TEXT field to convert words into integers, it needs to be told what the entire vocabulary is. To do this, we run TEXT.build_vocab, passing in the dataset to build the vocabulary on.

In [11]:
%%time
# Build the TEXT vocabulary from the training split only.
TEXT.build_vocab(trn)

CPU times: user 2.29 ms, sys: 207 µs, total: 2.5 ms
Wall time: 2.54 ms


Let's take a look at what the vocab looks like.

The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.

In [12]:
# Ten most frequent tokens in the training vocabulary, with their counts.
TEXT.vocab.freqs.most_common(n=10)

[('the', 78),
 ('to', 41),
 ('you', 33),
 ('of', 30),
 ('and', 26),
 ('a', 26),
 ('is', 24),
 ('that', 22),
 ('i', 20),
 ('if', 19)]

It is also instructive to take a look inside the Dataset. Datasets can be indexed like normal lists, so we'll look at the first element.

In [13]:
# Index the dataset like a list to inspect its first example.
trn[0]

<torchtext.data.example.Example at 0x11023a860>

Each element of the dataset is an Example object that bundles the attributes of a single data point together.

In [14]:
# List the attribute names stored on the first example.
vars(trn[0]).keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

We see that the comment text is already tokenized for us.

In [15]:
# First three tokens of the first training comment.
trn[0].comment_text[:3]

['explanation', 'why', 'the']

Looking good. Now, let's build the Iterator which will allow us to load the data into our model.

### Creating the Iterator

In [16]:
from torchtext.data import Iterator, BucketIterator

During training, we'll be using a special kind of Iterator, called the **BucketIterator**.

When we pass data into a neural network, we want the sequences to be padded to the same length so that we can process them in batches:

e.g.
\[ 
\[3, 15, 2, 7\],
\[4, 1\], 
\[5, 5, 6, 8, 1\] 
\] -> \[ 
\[3, 15, 2, 7, **0**\],
\[4, 1, **0**, **0**, **0**\], 
\[5, 5, 6, 8, 1\] 
\] 

If the sequences differ greatly in length, the padding will waste a lot of memory and time.

The BucketIterator groups sequences of similar lengths together for each batch to minimize padding. Handy, right?

In [17]:
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),                                # the datasets the iterators draw data from
    batch_sizes=(64, 64),                      # one batch size per dataset, in the same order
    device=-1,                                 # to use the GPU, specify the GPU number here instead
    sort_key=lambda ex: len(ex.comment_text),  # group examples by comment length (number of tokens)
    sort_within_batch=False,
    repeat=False,                              # no endless repetition, because we wrap this iterator ourselves
)

Let's take a look at what the output of the BucketIterator looks like

In [18]:
# Pull a single batch from the training iterator and display it.
batch = next(iter(train_iter))
batch


[torchtext.data.batch.Batch of size 25]
	[.comment_text]:[torch.LongTensor of size 494x25]
	[.toxic]:[torch.LongTensor of size 25]
	[.severe_toxic]:[torch.LongTensor of size 25]
	[.threat]:[torch.LongTensor of size 25]
	[.obscene]:[torch.LongTensor of size 25]
	[.insult]:[torch.LongTensor of size 25]
	[.identity_hate]:[torch.LongTensor of size 25]

The batch has all the fields we passed to the Dataset as attributes. The batch data can be accessed through the attribute with the same name.

In [19]:
# Attribute names available on the batch object.
vars(batch).keys()

dict_keys(['batch_size', 'dataset', 'train', 'fields', 'comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [20]:
# Predictions are later written back row-by-row onto test.csv, so the test
# examples must come out in file order. Iterator is constructed with its
# default train=True, under which shuffling is on unless disabled, so we pass
# shuffle=False explicitly in addition to sort=False.
test_iter = Iterator(tst, batch_size=64, device=-1, sort=False, sort_within_batch=False, repeat=False, shuffle=False)

### Wrapping the Iterator

Currently, the iterator returns a custom datatype called torchtext.data.Batch. This makes code reuse difficult (since each time the column names change, we need to modify the code), and makes torchtext hard to use with other libraries for some use cases (like torchsample and fastai). 

I hope this will be dealt with in the future (I'm considering filing a PR if I can decide what the API should look like), but in the meantime, we'll hack on a simple wrapper to make the batches easy to use. 

Concretely, we'll convert the batch to a tuple in the form (x, y) where x is the independent variable (the input to the model) and y is the dependent variable (the supervision data).

In [21]:
class BatchWrapper:
    """Turn an iterable of attribute-style batches into ``(x, y)`` tuples.

    Parameters
    ----------
    dl : iterable supporting ``len``
        Source of batches; each batch exposes its fields as attributes.
    x_var : str
        Name of the batch attribute used as the model input (a single input
        is assumed).
    y_vars : list of str or None
        Names of the batch attributes used as targets. When ``None``, a
        placeholder tensor ``torch.zeros((1))`` is yielded as ``y``.
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl = dl
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)

            if self.y_vars is None:
                # no targets available: yield a dummy so callers can still unpack (x, y)
                targets = torch.zeros((1))
            else:
                # one column per target attribute, in the order given by y_vars
                columns = [getattr(batch, name) for name in self.y_vars]
                targets = torch.stack(columns, dim=1).float()

            yield (inputs, targets)

    def __len__(self):
        # number of batches, delegated to the wrapped iterable
        return len(self.dl)

We'll use this to wrap the BucketIterator

In [22]:
# The target columns of y follow the order of the label names listed here.
train_dl = BatchWrapper(
    train_iter,
    "comment_text",
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
valid_dl = BatchWrapper(
    val_iter,
    "comment_text",
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
# The test set has no labels, so no target attributes are passed.
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [23]:
# Fetch one (x, y) pair from the wrapped training iterator.
next(iter(train_dl))

(Variable containing:
   606   354   334  ...     63    15    15
   693    63    55  ...      4   601    29
   584     4   520  ...    664   242    21
        ...          ⋱          ...       
     1     1     1  ...      1     1    84
     1     1     1  ...      1     1   118
     1     1     1  ...      1     1    15
 [torch.LongTensor of size 494x25], Variable containing:
     0     0     0     0     0     0
     1     1     0     1     1     0
     1     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     1     0     0     0    

Now we're ready to start training a model!

# Training a Text Classifier

We'll use a simple LSTM as a baseline example.

In [24]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [25]:
class SimpleBiLSTMBaseline(nn.Module):
    """Baseline classifier: embedding -> LSTM -> optional linear stack -> 6 logits.

    Parameters
    ----------
    hidden_dim : int
        Size of the LSTM hidden state and of every intermediate linear layer.
    emb_dim : int
        Size of the token embeddings.
    spatial_dropout : float
        Accepted for interface compatibility; not used anywhere in this class.
    recurrent_dropout : float
        Forwarded to ``nn.LSTM`` as its ``dropout`` argument.
    num_linear : int
        ``num_linear - 1`` hidden-to-hidden linear layers are inserted before
        the final predictor.

    Notes
    -----
    NOTE(review): despite the class name, the LSTM is created without
    ``bidirectional=True``, and the intermediate linear layers are applied
    back to back with no activation in between.
    """

    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__()  # required before registering sub-modules
        # the embedding table is sized from the vocabulary built on TEXT
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_linear - 1)]
        )
        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        """Return the 6 raw logits per example for a batch of token indices.

        ``seq`` is presumably shaped (seq_len, batch), as produced by the
        iterators in this notebook - TODO confirm for other callers.
        """
        embedded = self.embedding(seq)
        outputs, _ = self.encoder(embedded)
        # keep only the encoder output at the last position along dim 0
        feature = outputs[-1]
        for layer in self.linear_layers:
            feature = layer(feature)
        return self.predictor(feature)

In [26]:
# Hyperparameters: embedding size and LSTM hidden size.
em_sz, nh = 100, 500
# NOTE(review): nl is defined but never passed to the model below, so
# num_linear keeps its default value.
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz)
model

SimpleBiLSTMBaseline(
  (embedding): Embedding(784, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList(
  )
  (predictor): Linear(in_features=500, out_features=6)
)

If you're using a GPU, remember to call model.cuda() to move your model to the GPU.

In [27]:
# model.cuda()

### The training loop

In [28]:
import tqdm

In [29]:
# The model emits raw logits, so the loss is the logits variant of binary cross-entropy.
loss_func = nn.BCEWithLogitsLoss()
# Adam over all model parameters.
opt = optim.Adam(model.parameters(), lr=1e-2)

In [30]:
# Number of passes over the training data made by the training loop.
epochs = 2

In [31]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data[0] * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

  

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 13.7826, Validation Loss: 2.0098


100%|██████████| 1/1 [00:13<00:00, 13.78s/it]


Epoch: 2, Training Loss: 3.9747, Validation Loss: 1.9117
CPU times: user 1min 21s, sys: 3.43 s, total: 1min 25s
Wall time: 32.1 s


# Writing Predictions

Finally, we output the data in the format required by the competition

In [32]:
# Display the wrapped test iterator (only its object repr is shown).
test_dl

<__main__.BatchWrapper at 0x110269c18>

In [33]:
test_preds = []
model.eval()  # make sure we predict in evaluation mode, even if this cell is run on its own
for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    # .cpu() is a no-op for CPU tensors and moves GPU tensors back to host memory
    preds = preds.data.cpu().numpy()
    # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
# Each element is (batch, 6). Stack along rows so the result is (n_examples, 6);
# np.hstack would instead place successive batches side by side as extra columns.
test_preds = np.vstack(test_preds)

  



In [34]:
df = pd.read_csv("data/test.csv")
# Column k of test_preds holds the predicted probability for the k-th label below.
for col_idx, label in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    df[label] = test_preds[:, col_idx]

# if you want to write the submission file to disk, uncomment and run the below code
# df.drop("comment_text", axis=1).to_csv("submission.csv", index=False)

In [35]:
# Show the first three test rows together with their predicted label probabilities.
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.068491,0.014765,0.00251,0.012214,0.014872,0.002175
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.068491,0.014765,0.00251,0.012214,0.014872,0.002175
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.068491,0.014765,0.00251,0.012214,0.014872,0.002175
