#### REF
* https://github.com/dmlc/gluon-nlp/blob/master/docs/api/notes/data_api.rst

## Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
import collections
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, auc
from mxnet import gluon
import gluonnlp as nlp

import time, re
import multiprocessing as mp
import itertools
from tqdm import tqdm, tqdm_notebook

### Word as token

In [39]:
MAX_SENTENCE_LENGTH = 40
MAX_FEATURES = 2000

In [40]:
train_dataset, test_dataset = [nlp.data.IMDB(segment = segment) for segment in ('train', 'test')]

train_seqs = [sample[0] for sample in train_dataset]
# => 2,5000 sentences

counter = nlp.data.count_tokens(list(itertools.chain.from_iterable((re.split(' |\n', x) for x in train_seqs))))
# => 281158 tokens

vocab = nlp.Vocab(counter, max_size=MAX_FEATURES, padding_token=None, bos_token=None, eos_token=None)
# => 2000 vocabs

In [42]:
print("Tokenize using spaCy...")
# tokenizer takes as input a string and outputs a list of tokens.
tokenizer = nlp.data.SpacyTokenizer('en')
# length_clip takes as input a list and outputs a list with maximum length 500.
length_clip = nlp.data.ClipSequence(MAX_SENTENCE_LENGTH)

def preprocess(x):
    data, label = x
    # In the labeled train/test sets, a negative review has a score <= 4
    # out of 10, and a positive review has a score >= 7 out of 10. Thus
    # reviews with more neutral ratings are not included in the train/test
    # sets. We labeled a negative review whose score <= 4 as 0, and a
    # positive review whose score >= 7 as 1. As the neural ratings are not
    # included in the datasets, we can simply use 5 as our threshold.
    label = int(label > 5)
    # A token index or a list of token indices is
    # returned according to the vocabulary.
    data = vocab[length_clip(tokenizer(data))]
    return data, label, float(len(data))

def get_length(x):
    return float(len(x[0]))

def preprocess_dataset(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
        lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
    end = time.time()
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)

Tokenize using spaCy...
Done! Tokenizing Time=9.08s, #Sentences=25000
Done! Tokenizing Time=7.82s, #Sentences=25000


### Parameters

In [43]:
num_gpus = 0
batch_size = 64
bucket_num = 10
bucket_ratio = .2

dropout = 0
learning_rate = .0002
epochs = 1
log_interval = 100

In [44]:
# (data, label, length): (64 * 500), (64,), (64,)

# Pad data, stack label and lengths
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0),
                                          nlp.data.batchify.Stack(),
                                          nlp.data.batchify.Stack())

batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
                                                        batch_size=batch_size,
                                                        num_buckets=bucket_num,
                                                        ratio=bucket_ratio,
                                                        shuffle=True)
train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
                                             batch_sampler=batch_sampler,
                                             batchify_fn=batchify_fn)
test_dataloader = gluon.data.DataLoader(dataset=test_dataset,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            batchify_fn=batchify_fn)

In [45]:
for i, (a, b, c) in enumerate(train_dataloader):
    a, b, c = a, b, c
    if i == 0:
        break

a.shape

(64, 40)

### Another Data preparation
* Input data shape::$ (batch \times word \times vocab )$ 
* Split data: training & validation
* Create data iterator for training

In [19]:
word_freq = collections.Counter()
max_len = 0
num_rec = 0

with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        label, sentence = line.decode('utf8').strip().split('\t')
        words = nltk.word_tokenize(sentence.lower())
        if len(words) > max_len:
            max_len = len(words)
        for word in words:
            word_freq[word] += 1
        num_rec += 1

# most_common output -> list
word2idx = {x[0]: i+2 for i, x in enumerate(word_freq.most_common(MAX_FEATURES - 2))}
word2idx ['PAD'] = 0
word2idx['UNK'] = 1

idx2word= {i:v for v, i in word2idx.items()}
vocab_size = len(word2idx)

y = []
x = []
origin_txt = []
with open('../data/umich-sentiment-train.txt', 'rb') as f:
    for line in f:
        _label, _sentence = line.decode('utf8').strip().split('\t')
        origin_txt.append(_sentence)
        y.append(int(_label))
        words = nltk.word_tokenize(_sentence.lower())
        _seq = []
        for word in words:
            if word in word2idx.keys():
                _seq.append(word2idx[word])
            else:
                _seq.append(word2idx['UNK'])
        if len(_seq) < MAX_SENTENCE_LENGTH:
            _seq.extend([0] * ((MAX_SENTENCE_LENGTH) - len(_seq)))
        else:
            _seq = _seq[:MAX_SENTENCE_LENGTH]
        x.append(_seq)

pd.DataFrame(y, columns = ['yn']).reset_index().groupby('yn').count().reset_index()

## Sentence representation: Average of BOW

def one_hot(x, vocab_size):
    res = np.zeros(shape = (vocab_size))
    res[x] = 1
    return res

x_1 = np.array([np.sum(np.array([one_hot(word, MAX_FEATURES) for word in example]), axis = 0) for example in x])

## Data process - tr/va split and define iterator

tr_idx = np.random.choice(range(x_1.shape[0]), int(x_1.shape[0] * .8))
va_idx = [x for x in range(x_1.shape[0]) if x not in tr_idx]

tr_x = x_1[tr_idx, :]
tr_y = [y[i] for i in tr_idx]
va_x = x_1[va_idx, :]
va_y = [y[i] for i in va_idx]

import mxnet as mx
batch_size = 16
train_data = mx.io.NDArrayIter(data=[tr_x, tr_y], batch_size=batch_size, shuffle = False)
valid_data = mx.io.NDArrayIter(data=[va_x, va_y], batch_size=batch_size, shuffle = False)

## Define Relation Network

  * Apply the following calculation to every pair of words in the sentence
  
  $$ f(x_i, x_j ) =W \phi(U_{left} e_i + U_{right} e_j)$$
  
  * It is hard to distinguish between sentence representation step and classification one, since they needs to be done simultaneously
     - For RN, sentence representation and classification needs to be trained at the same time
     - They are connected as parts of the network
  * For your information, please note that
    - BOW: Simply take sum or average of one-hot tokenvector as sentence representation. Various classifier can be applied to the resulting sentence representation
    - CBOW: improves BoW by reducing its dimensionality using various embedding techiniques. But still ignores order information. Use embedding vector as input of classifier


In [48]:
from mxnet import gluon, autograd, nd
from mxnet.gluon import nn
import mxnet as mx
context = mx.cpu()

In [132]:
class RN_Classifier(nn.HybridBlock):
    def __init__(self, HIDDEN_DIM, SENTENCE_LENGTH, **kwargs):
        super(RN_Classifier, self).__init__(**kwargs)
        self.SENTENCE_LENGTH = SENTENCE_LENGTH
        self.HIDDEN_DIM = HIDDEN_DIM
        with self.name_scope():
            self.embed = nn.Embedding(SENTENCE_LENGTH, HIDDEN_DIM)
            self.g_fc1 = nn.Dense(256,activation='relu')
            #self.g_fc2 = nn.Dense(256,activation='relu')
            #self.g_fc3 = nn.Dense(256,activation='relu')
            #self.g_fc4 = nn.Dense(256,activation='relu')

            self.fc1 = nn.Dense(128, activation = 'relu') # 256 * 128
            self.fc2 = nn.Dense(1) # 128 * 2
            # 1253632 param : 약 20MB
    def hybrid_forward(self, F, x):
        # (x_i, x_j)의 pair를 만들기
        # 64 배치를 가정하면
        
        x = self.embed(x)
        x_i = x.expand_dims(1) # 64 * 1* 40 * 2000* : 0.02GB
        x_i = F.repeat(x_i,repeats= self.SENTENCE_LENGTH, axis=1) # 64 * 40 * 40 * 2000: 1.52GB
        x_j = x.expand_dims(2) # 64 * 40 * 1 * 2000
        x_j = F.repeat(x_j,repeats= self.SENTENCE_LENGTH, axis=2) # 64 * 40 * 40 * 2000: 1.52GB
        x_full = F.concat(x_i,x_j,dim=3) # 64 * 40 * 40 * 4000: 3.04GB
        
        # batch*sentence_length*sentence_length개의 batch를 가진 2*VOCABULARY input을 network에 feed
        _x = x_full.reshape((-1, 2 * self.HIDDEN_DIM))
        
        _x = self.g_fc1(_x) # (64 * 40 * 40) * 256: .1GB 추가메모리는 안먹나?
        
        #_x = self.g_fc2(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        #_x = self.g_fc3(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        #_x = self.g_fc4(_x) # (64 * 40 * 40) * 256: .1GB (reuse)
        
        # sentence_length*sentence_length개의 결과값을 모두 합해서 sentence representation으로 나타냄
        x_g = _x.reshape((-1, self.SENTENCE_LENGTH * self.SENTENCE_LENGTH,256)) # (64, 40*40, 256) : .1GB
        sentence_rep = x_g.sum(1) # (64, 256): ignorable
        
        # Check dimension
        #print('shape = {}'.format(x.shape))
        #print('shape = {}'.format(x.shape))
        #print('shape = {}'.format(x_i.shape))
        #print('shape = {}'.format(x_i.shape))
        #print('shape = {}'.format(x_j.shape))
        #print('shape = {}'.format(x_j.shape))
        #print('x_full shape = {}'.format(x_full.shape))
        #print('_x shape = {}'.format(_x.shape))
        #print('g_fc1 shape = {}'.format(_x.shape))
        #print('shape = {}'.format(x_g.shape))
        
        # 여기서부터는 classifier
        clf = self.fc1(sentence_rep)
        clf = self.fc2(clf)
        
        return clf

In [127]:
z = nd.random.uniform(shape = (64, 40), ctx = context)
rn = RN_Classifier(200, 40)
rn.collect_params().initialize(mx.init.Xavier(), ctx = context)
#print(rn(z))
#print(rn)

In [133]:
rn = RN_Classifier(100, MAX_SENTENCE_LENGTH)
rn.collect_params().initialize(mx.init.Xavier(), ctx = context)
loss = gluon.loss.SigmoidBCELoss()
trainer = gluon.Trainer(rn.collect_params(), 'adam', {'learning_rate': 1e-3})

In [134]:
n_epoch = 10

In [135]:
def evaluate(net, dataloader, context):
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    print('Begin Testing...')
    for i, (data, label, valid_length) in enumerate(dataloader):
        data =  data.as_in_context(context)
        valid_length = valid_length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(data)
        L = loss(output, label)
        pred = (output > 0.5).reshape(-1)
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        total_correct_num += (pred == label).sum().asscalar()
        if (i + 1) % log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader),
                time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc

In [None]:
for epoch in tqdm_notebook(range(n_epoch), desc = 'epoch'):
    ## Training
    train_data.reset()
    # Epoch training stats
    start_epoch_time = time.time()
    epoch_L = 0.0
    epoch_sent_num = 0
    epoch_wc = 0
    # Log interval training stats
    start_log_interval_time = time.time()
    log_interval_wc = 0
    log_interval_sent_num = 0
    log_interval_L = 0.0
    
    for i, (data, label, length) in enumerate(train_dataloader):
        _data = data.as_in_context(context)
        _label = label.as_in_context(context)
        _length = length.as_in_context(context)
        L = 0
        wc = length.sum().asscalar()
        log_interval_wc += wc
        epoch_wc += wc
        log_interval_sent_num += _data.shape[1]
        epoch_sent_num += _data.shape[1]
        with autograd.record():
            _out = rn(_data)
            L = L + loss(_out, _label).mean().as_in_context(context)
        L.backward()
        trainer.step(_data.shape[0])
        log_interval_L += L.asscalar()
        epoch_L += L.asscalar()
        if (i + 1) % log_interval == 0:
            tqdm.write('[Epoch {} Batch {}/{}] elapsed {:.2f} s, \
                    avg loss {:.6f}, throughput {:.2f}K wps'.format(
                    epoch, i + 1, len(train_dataloader),
                    time.time() - start_log_interval_time,
                    log_interval_L / log_interval_sent_num,
                    log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
            # Clear log interval training stats
            start_log_interval_time = time.time()
            log_interval_wc = 0
            log_interval_sent_num = 0
            log_interval_L = 0
        end_epoch_time = time.time()
        test_avg_L, test_acc = evaluate(rn, test_dataloader, context)
        tqdm.write('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, \
        test avg loss {:.6f}, throughput {:.2f}K wps'.format(
            epoch, epoch_L / epoch_sent_num,
            test_acc, test_avg_L, epoch_wc / 1000 /
            (end_epoch_time - start_epoch_time)))

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

Begin Testing...
[Batch 100/391] elapsed 87.98 s


In [206]:
va_txt = pd.DataFrame(np.array([origin_txt[idx] for idx in va_idx]), columns = ['txt'])
pred_rn = rn(va_txt)
pred_rn_pd = pd.DataFrame(pred_rn, columns  = ['pred_rn'])
label_pd = pd.DataFrame(va_y, columns = ['label'])
result = pd.concat([va_txt, pred_rn, label_pd], axis = 1)

### Result

* Only 33 comments are mis-classified

In [210]:
result[result['pred_rn'] != result['label']].shape

(33, 3)