In [12]:
import gluonnlp as nlp
import numpy as np
from mxnet.gluon import nn, HybridBlock, Trainer, loss
import mxnet as mx
from mxnet import init, autograd, ndarray as nd
import re
import random
import csv
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

In [2]:
def _clean_str(string):
    """

    :param string: a raw data string
    :return: cleaned string
    """
    string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string)
    string = re.sub(r'\s{2,}', ' ', string)
    string = re.sub(r'\'s', ' \'s', string)
    string = re.sub(r'\'ve', ' \'ve', string)
    string = re.sub(r'n\'t', ' n\'t', string)
    string = re.sub(r'\'re', ' \'re', string)
    string = re.sub(r'\'d', ' \'d', string)
    string = re.sub(r'\'ll', ' \'ll', string)
    string = re.sub(r',', ' , ', string)
    string = re.sub(r'!', ' ! ', string)
    string = re.sub(r'\(', ' ( ', string)
    string = re.sub(r'\)', ' ) ', string)
    string = re.sub(r'\?', ' ? ', string)
    string = re.sub(r'\s{2,}', ' ', string)
    return string.strip().lower()

In [3]:
def build_vocabulary(snts):
    """
    build vocabulary from training, validation and test data
    :param tr_array: data array
    :param val_array: data array
    :param tst_array: data array
    :return: vocab in gluon
    """
    flag = 0
    all_tokens = []
    for snt, _ in snts:
        tokens = snt.split()
        if len(tokens) > flag:
            flag = len(tokens)
        all_tokens.extend(tokens)
    print(flag)
    counter = nlp.data.count_tokens(all_tokens)
    vocab = nlp.Vocab(counter)
    return vocab

In [4]:
file_name = "../../gold_standard/gold_standard_final.txt"
snts = []
label_set = set()
with open(file_name, 'r') as f:
    for row in f:
        snt, label = row.rsplit(':', 1)
        snts.append((_clean_str(snt.strip()), label.strip()))
        label_set.add(label.strip())
print((snts[0]))
print(label_set)

('i am miss louisa christopher the only daughter of mr christopher from republic of zimbabwe in desire to get somebody who will safe guard my interest , that of my junior brother ( louis ) and this money', 'SELF-INTRO')
{'RAPPORT', 'BENEFITS', 'SELF-INTRO', 'MOTIVATION', 'OTHER', 'PURPOSE'}


In [5]:
{label: idx for idx, label in enumerate(label_set)}

{'RAPPORT': 0,
 'BENEFITS': 1,
 'SELF-INTRO': 2,
 'MOTIVATION': 3,
 'OTHER': 4,
 'PURPOSE': 5}

In [6]:
label_set = {'SELF-INTRO': 0, 'OTHER': 1, 'RAPPORT': 2, 'BENEFITS': 3, 'MOTIVATION': 4, 'PURPOSE': 5}

In [7]:
vocab = build_vocabulary(snts)


98


In [8]:
def preprocess(vocab, instance, max_snt_len=98):
    snt, label = instance
    word_indices = vocab.to_indices(snt.split())
    label_index = label_set[label]
    return word_indices, label_index

In [9]:
preprocess(vocab, snts[4])

([36, 10, 460, 18, 15, 409, 171, 255, 6, 203, 32, 336, 11, 149, 64], 5)

In [48]:
def creat_dataset(snts, max_snt_len=98):
    all_data = []
    padding = nlp.data.PadSequence(max_snt_len, pad_val=1, clip=True)  # in the vocabulary, index 1 is mapped to <pad>
    for instance in snts:
        word_indices, label_index = preprocess(vocab, instance)
        all_data.append((padding(word_indices), label_index))
    return all_data


In [73]:
all_data = creat_dataset(snts, max_snt_len=98)
np.random.shuffle(all_data)
train_data = all_data[:round(len(all_data) * 0.9)]
test_data = all_data[round(len(all_data) * 0.9):]

In [81]:
train_dataloader = nlp.data.ShardedDataLoader(all_data, batch_size=4, shuffle=True)
test_dataloader = nlp.data.ShardedDataLoader(all_data, batch_size=4, shuffle=True)
for inputs, label in train_dataloader:
    print(inputs)
    print(label)
    break


[[  11  131  258  534  270   86  191   56   75  261  321  625   14  133
   131  278  459    7   40  797  502    4  737  142  377  127    8   77
   594  282  377  127    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1]
 [  25  135   26   58 1463 1698   64  496  767 1319   30 1228  633   28
   649    7  285 1335 1283  715 1676  767  989    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1    1    1
     1    1    1    1    1    1    1    1    1    1    1    1 

In [82]:
class DeceptionClassifier(HybridBlock):

    def __init__(self, emb_input_dim, emb_output_dim, num_classes=6, prefix=None, params=None):
        super(DeceptionClassifier, self).__init__(prefix=prefix, params=params)
        with self.name_scope():
            self.embedding = nn.Embedding(emb_input_dim, emb_output_dim)
            self.output = nn.HybridSequential()
            with self.output.name_scope():
                self.output.add(nn.Dense(32))
                self.output.add(nn.Dense(64))
                self.output.add(nn.Dense(num_classes))

    def hybrid_forward(self, F, data):
        return self.output(self.embedding(data))

In [83]:
model = DeceptionClassifier(emb_input_dim=len(vocab.idx_to_token), emb_output_dim=100, num_classes=len(label))
model.initialize(init=init.Xavier(), force_reinit=True)
# model.embedding.collect_params().setattr('grad_req', 'null')
model.hybridize()  # OPTIONAL for efficiency - perhaps easier to comment this out during debugging
trainer = Trainer(model.collect_params(), 'adam', {'learning_rate': 0.00025})

In [84]:
loss_fn = loss.SoftmaxCrossEntropyLoss()
for epoch in range(16):
        # begin training
        train_loss, train_acc = 0., 0.
        for i, data in enumerate(train_dataloader):
            inputs = nd.array(data[0], dtype='float32')
            labels = data[1]
            with autograd.record(train_mode=True):
                # the default mode is train mode
                output = model(inputs)
                l = loss_fn(output, labels).mean()
            l.backward()
            trainer.step(2)
            train_loss += l.asscalar()
        print(train_loss)
#         print(evaluate(model, train_dataloader)[0])


86.9509200155735
72.67099912464619
65.41851350665092
51.79095705598593
36.14811047166586
21.59899842273444
12.928983259014785
6.923688088776544
4.8888769769109786
3.4103482521604747
2.4007068312494084
1.9241070479620248
1.5733393294503912
1.2867386759025976
1.0786546732997522
0.9765101024149772


In [85]:
def _acc(output, label):
    """
    helper function, compute the accuracy
    :param output: predicted result
    :param label: golden result
    :return:
    """
    return (output.argmax(axis=1) == label.astype('float32')).mean().asscalar()

In [86]:
def evaluate(model, dataloader):
    """
    evaluate the result
    :param model: trained model
    :param dataloader:
    :param ctx:
    :return:
    """
    labels, preds = [], []
    for i, (data, label) in enumerate(dataloader):
        data = nd.array(data, dtype='float32')
        output = mx.nd.softmax(model(data))
        pred = output.argmax(axis=1)
        for j in range(pred.shape[0]):
            lab = int(label[j].asscalar())
            pre = int(pred[j].asscalar())
            labels.append(lab)  # a list of correct labels
            preds.append(pre)  # a list of predicted labels
    return accuracy_score(labels, preds), precision_score(labels, preds, average='macro'), \
           recall_score(labels, preds, average='macro'), f1_score(labels, preds, average='macro')

In [87]:
evaluate(model, test_dataloader)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0.45791245791245794,
 0.5126703499079189,
 0.6627906976744186,
 0.5292913204885915)