In [1]:
import warnings
warnings.filterwarnings('ignore')

import random
import time
from pathlib import Path
import os
import multiprocessing as mp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

import mxnet as mx
from mxnet import nd, gluon, autograd
import gluonnlp as nlp

In [2]:
random.seed(666)
np.random.seed(666)
mx.random.seed(666)

In [3]:
class MeanPoolingLayer(gluon.HybridBlock):
    """A block for mean pooling of encoder features"""
    def __init__(self, prefix=None, params=None):
        super(MeanPoolingLayer, self).__init__(prefix=prefix, params=params)

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        """Forward logic"""
        # Data will have shape (T, N, C)
        masked_encoded = F.SequenceMask(data,
                                        sequence_length=valid_length,
                                        use_sequence_length=True)
        agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
                                    F.expand_dims(valid_length, axis=1))
        return agg_state


class SentimentNet(gluon.HybridBlock):
    """Network for sentiment analysis."""
    def __init__(self, dropout, prefix=None, params=None):
        super(SentimentNet, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.embedding = None # will set with lm embedding later
            self.encoder = None # will set with lm encoder later
            self.agg_layer = MeanPoolingLayer()
            self.output = gluon.nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(gluon.nn.Dropout(dropout))
                self.output.add(gluon.nn.Dense(1, flatten=False))

    def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
        encoded = self.encoder(self.embedding(data))  # Shape(T, N, C)
        agg_state = self.agg_layer(encoded, valid_length)
        out = self.output(agg_state)
        return out

In [4]:
dropout = 0
language_model_name = 'standard_lstm_lm_200'
pretrained = True
learning_rate, batch_size = 0.00025, 128
bucket_num, bucket_ratio = 10, 0.2
epochs = 5
grad_clip = None
log_interval = 10000

In [5]:
context = mx.gpu(0)

### Load pre-trained model

In [6]:
lm_model, vocab = nlp.model.get_model(name=language_model_name,
                                      dataset_name='wikitext-2',
                                      pretrained=pretrained,
                                      ctx=context,
                                      dropout=dropout)

### Create SA model from pre-trained model

In [7]:
net = SentimentNet(dropout=dropout)
net.embedding = lm_model.embedding
net.encoder = lm_model.encoder
net.hybridize()
net.output.initialize(mx.init.Xavier(), ctx=context)
print(net)

SentimentNet(
  (embedding): HybridSequential(
    (0): Embedding(33278 -> 200, float32)
  )
  (encoder): LSTM(200 -> 200, TNC, num_layers=2)
  (agg_layer): MeanPoolingLayer(
  
  )
  (output): HybridSequential(
    (0): Dropout(p = 0, axes=())
    (1): Dense(None -> 1, linear)
  )
)


## Data pipeline

In [8]:
path = Path('data/jigsaw')
os.listdir(path)

['test.csv', 'tmp', 'train.csv', 'sample_submission.csv']

In [9]:
train_csv = path/'train.csv'
test_csv = path/'test.csv'
sample_csv = path/'sample_submission.csv'

In [10]:
train_df = pd.read_csv(train_csv)

In [11]:
test_df = pd.read_csv(test_csv)

In [12]:
train_df['comment_text'] = train_df['comment_text'].astype(str)
train_df['target']=(train_df['target']>=0.5).astype(int)

In [13]:
train_data_raw = train_df[['comment_text', 'target']].values

In [14]:
test_df['comment_text'] = test_df['comment_text'].astype(str)
test_data_raw = test_df[['comment_text']].values

In [15]:
test_data_raw[0][0]

"Jeff Sessions is another one of Trump's Orwellian choices. He believes and has believed his entire career the exact opposite of what the position requires."

In [16]:
# tokenizer takes as input a string and outputs a list of tokens.
tokenizer = nlp.data.SpacyTokenizer('en')

# length_clip takes as input a list and outputs a list with maximum length 500.
length_clip = nlp.data.ClipSequence(500)

def preprocess(x):
    data, label = x
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    data = vocab[length_clip(tokenizer(data))]
    return data, label

def test_preprocess(x):
    data = x[0]
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    data = vocab[length_clip(tokenizer(data))]
    return data

def get_length(x):
    return float(len(x[0]))

In [17]:
start = time.time()
pool = mp.Pool()
# Each sample is processed in an asynchronous manner.
train_dataset = gluon.data.SimpleDataset(pool.map(preprocess, train_data_raw))
train_data_lengths = gluon.data.SimpleDataset(pool.map(get_length, train_dataset))

end = time.time()

In [18]:
print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(train_dataset)))

Done! Tokenizing Time=129.93s, #Sentences=1804874


In [19]:
test_dataset = gluon.data.SimpleDataset(pool.map(test_preprocess, test_data_raw))
#test_dataset = gluon.data.SimpleDataset(test_data_raw)
#test_dataset = test_dataset.transform(test_preprocess)

In [20]:
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, ret_length=True),
    nlp.data.batchify.Stack(dtype='float32'))
batch_sampler = nlp.data.sampler.FixedBucketSampler(
    train_data_lengths,
    batch_size=batch_size,
    num_buckets=bucket_num,
    ratio=bucket_ratio,
    shuffle=True)
print(batch_sampler.stats())
train_dataloader = gluon.data.DataLoader(
    dataset=train_dataset,
    batch_sampler=batch_sampler,
    batchify_fn=batchify_fn)

FixedBucketSampler:
  sample_num=1804874, batch_num=10121
  key=[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
  cnt=[1020105, 421486, 178152, 131888, 52962, 253, 16, 8, 2, 2]
  batch_size=[256, 128, 128, 128, 128, 128, 128, 128, 128, 128]


In [21]:
test_batchify_fn = nlp.data.batchify.Pad(axis=0, ret_length=True)
test_dataloader = gluon.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    batchify_fn=test_batchify_fn)

## Training

### Evaluation using loss and accuracy

In [22]:
trainer = gluon.Trainer(net.collect_params(), 'ftml',
                        {'learning_rate': learning_rate})
loss = gluon.loss.SigmoidBCELoss()

parameters = net.collect_params().values()

In [26]:
# Training/Testing
for epoch in range(epochs):
    # Epoch training stats
    start_epoch_time = time.time()
    epoch_L = 0.0
    epoch_sent_num = 0
    epoch_wc = 0
    # Log interval training stats
    start_log_interval_time = time.time()
    log_interval_wc = 0
    log_interval_sent_num = 0
    log_interval_L = 0.0

    for i, ((data, length), label) in tqdm_notebook(enumerate(train_dataloader), total=len(train_dataloader), leave=False):
        L = 0
        wc = length.sum().asscalar()
        log_interval_wc += wc
        epoch_wc += wc
        log_interval_sent_num += data.shape[1]
        epoch_sent_num += data.shape[1]
        with autograd.record():
            output = net(data.as_in_context(context).T,
                         length.as_in_context(context)
                               .astype(np.float32))
            L = L + loss(output, label.as_in_context(context)).mean()
        L.backward()
        # Clip gradient
        if grad_clip:
            gluon.utils.clip_global_norm(
                [p.grad(context) for p in parameters],
                grad_clip)
        # Update parameter
        trainer.step(1)
        log_interval_L += L.asscalar()
        epoch_L += L.asscalar()
        if (i + 1) % log_interval == 0:
            print(
                '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
                'avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, len(train_dataloader),
                    time.time() - start_log_interval_time,
                    log_interval_L / log_interval_sent_num, log_interval_wc
                    / 1000 / (time.time() - start_log_interval_time)))
            # Clear log interval training stats
            start_log_interval_time = time.time()
            log_interval_wc = 0
            log_interval_sent_num = 0
            log_interval_L = 0
    end_epoch_time = time.time()

HBox(children=(IntProgress(value=0, description='avg loss: 0.000000', max=13163, style=ProgressStyle(descripti…

[Epoch 0 Batch 10000/13163] elapsed 220.42 s, avg loss 0.002249, throughput 336.11K wps


HBox(children=(IntProgress(value=0, description='avg loss: 0.000000', max=13163, style=ProgressStyle(descripti…

[Epoch 1 Batch 10000/13163] elapsed 221.57 s, avg loss 0.002069, throughput 333.60K wps


HBox(children=(IntProgress(value=0, description='avg loss: 0.000000', max=13163, style=ProgressStyle(descripti…

[Epoch 2 Batch 10000/13163] elapsed 222.49 s, avg loss 0.001924, throughput 333.79K wps


HBox(children=(IntProgress(value=0, description='avg loss: 0.000000', max=13163, style=ProgressStyle(descripti…

[Epoch 3 Batch 10000/13163] elapsed 220.90 s, avg loss 0.001815, throughput 332.99K wps


HBox(children=(IntProgress(value=0, description='avg loss: 0.000000', max=13163, style=ProgressStyle(descripti…

[Epoch 4 Batch 10000/13163] elapsed 222.20 s, avg loss 0.001689, throughput 333.52K wps


In [23]:
net.load_parameters('lm-20190624-2')

In [27]:
net.save_parameters('lm-20190624-2')

## Prediction

In [50]:
net(
    mx.nd.reshape(
        mx.nd.array(vocab[['This', 'movie', 'is', 'amazing']], ctx=context),
        shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid()


[[0.00679768]]
<NDArray 1x1 @gpu(0)>

In [28]:
results = []
for _, (data, length) in tqdm_notebook(enumerate(test_dataloader), total=len(test_dataloader),leave=False):
    output = net(data.as_in_context(context).T,
                 length.as_in_context(context)
                       .astype(np.float32))
    results.extend(out for out in output)

HBox(children=(IntProgress(value=0, max=761), HTML(value='')))



In [29]:
results[4].sigmoid().asnumpy()[0]

0.975501

In [30]:
predictions = [result.sigmoid().asnumpy()[0] for result in results]

In [31]:
submission = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': predictions
})

In [32]:
submission.to_csv('submission.csv', index=False)