In [1]:
import math
import os
import time
import numpy as np
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn
import random

In [2]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [41]:
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path + 'train.txt')
        random.shuffle(self.train)
        temp = int(0.20 * self.train.shape[0])
        self.valid = self.train[:temp]
        self.train = self.train[temp:]
        self.test = self.tokenize(path + 'test.txt')

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = np.zeros((tokens,), dtype='int32')
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return mx.nd.array(ids, dtype='int32')
    
    def to_word(self, list):
        a = ["dn"]
        print(self.dictionary.idx2word[20])
        for word in list:
            #print(word[0])
            a.append(self.dictionary.idx2word[word[0]])
        return a;
                

In [24]:
class RNNModel(gluon.Block):
    """A model with an encoder, recurrent layer, and a decoder."""

    def __init__(self, mode, vocab_size, num_embed, num_hidden,
                 num_layers, dropout=0.5, tie_weights=False, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed,
                                        weight_initializer = mx.init.Uniform(0.1))
            if mode == 'rnn_relu':
                self.rnn = rnn.RNN(num_hidden, num_layers, activation='relu', dropout=dropout,
                                   input_size=num_embed)
            elif mode == 'rnn_tanh':
                self.rnn = rnn.RNN(num_hidden, num_layers, dropout=dropout,
                                   input_size=num_embed)
            elif mode == 'lstm':
                self.rnn = rnn.LSTM(num_hidden, num_layers, dropout=dropout,
                                    input_size=num_embed)
            elif mode == 'gru':
                self.rnn = rnn.GRU(num_hidden, num_layers, dropout=dropout,
                                   input_size=num_embed)
            else:
                raise ValueError("Invalid mode %s. Options are rnn_relu, "
                                 "rnn_tanh, lstm, and gru"%mode)
            if tie_weights:
                self.decoder = nn.Dense(vocab_size, in_units = num_hidden,
                                        params = self.encoder.params)
            else:
                self.decoder = nn.Dense(vocab_size, in_units = num_hidden)
            self.num_hidden = num_hidden

    def forward(self, inputs, hidden):
        emb = self.drop(self.encoder(inputs))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.reshape((-1, self.num_hidden)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [25]:
args_data = ''
args_model = 'rnn_relu'
args_emsize = 100
args_nhid = 100
args_nlayers = 2
args_lr = 1.0
args_clip = 0.2
args_epochs = 10
args_batch_size = 32
args_bptt = 5
args_dropout = 0.2
args_tied = True
args_cuda = 'store_true'
args_log_interval = 500
args_save = 'model.param'

In [42]:
context = mx.cpu() # this notebook takes too long on cpu
corpus = Corpus(args_data)

In [26]:
def batchify(data, batch_size):
    """Reshape data into (num_example, batch_size)"""
    nbatch = data.shape[0] // batch_size
    data = data[:nbatch * batch_size]
    data = data.reshape((batch_size, nbatch)).T
    return data

train_data = batchify(corpus.train, args_batch_size).as_in_context(context)
val_data = batchify(corpus.valid, args_batch_size).as_in_context(context)
test_data = batchify(corpus.test, args_batch_size).as_in_context(context)

In [54]:
import string
import re

import nltk
import numpy as np
import pandas as pd
from mxnet import gluon
from mxnet import nd
from mxnet.contrib import text
import collections

args_lr = 0.001
args_batch_size = 32

train_data = pd.read_fwf('train.txt',header=None)
test_data = pd.read_fwf('test.txt')

train = train_data.values

train_data = train_data[0]

def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    regex = re.compile('[^a-zA-Z]')
    doco_clean = [regex.sub("", word.lower()) for word in doco_clean if len(word) > 0];
    
    return doco_clean;

data_to_clean = [line[0] for line in train_data];

data_clean = [clean_document(doc) for doc in train_data]
sentences = [' '.join(r) for r in data_clean]

for line in sentences:
    line = re.sub(' +', ' ', line)

counter = text.utils.count_tokens_from_str('\n'.join(sentences))
my_vocab = text.vocab.Vocabulary(counter, unknown_token='&lt;unk&gt;', 
                                 reserved_tokens=['&lt;pad&gt;'])

my_embedding = text.embedding.create('fasttext', pretrained_file_name='wiki.simple.vec', 
                                     vocabulary=my_vocab)

# my_embedding = text.embedding.create('glove', pretrained_file_name='glove.6B.50d.txt', 
#                                      vocabulary=my_vocab)

  'skipped.' % (line_num, token, elems))


In [53]:
corpus.train[1]


[214]
<NDArray 1 @cpu(0)>

In [7]:
ntokens = len(corpus.dictionary)

model = RNNModel(args_model, ntokens, args_emsize, args_nhid,
                       args_nlayers, args_dropout, args_tied)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args_lr, 'momentum': 0, 'wd': 0})
loss = gluon.loss.SoftmaxCrossEntropyLoss()


In [8]:
def get_batch(source, i):
    seq_len = min(args_bptt, source.shape[0] - 1 - i)
    data = source[i : i + seq_len]
    target = source[i + 1 : i + 1 + seq_len]
    return data, target.reshape((-1,))

def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden

In [9]:
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context)
    for i in range(0, data_source.shape[0] - 1, args_bptt):
        data, target = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal

In [10]:
def train():
    args_lr = 1.0
    best_val = float("Inf")
    for epoch in range(args_epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx = context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args_bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, args_clip * args_bptt * args_batch_size)

            trainer.step(args_batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args_log_interval == 0 and ibatch > 0:
                cur_L = total_L / args_bptt / args_batch_size / args_log_interval
                print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)

        print('[Epoch %d] time cost %.2fs, validation loss %.2f, validation perplexity %.2f' % (
            epoch + 1, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args_save)
            print('test loss %.2f, test perplexity %.2f' % (test_L, math.exp(test_L)))
        else:
            args_lr = args_lr * 0.25
            trainer._init_optimizer('sgd',
                                    {'learning_rate': args_lr,
                                     'momentum': 0,
                                     'wd': 0})
            model.load_parameters(args_save, context)

In [11]:
train()
model.load_parameters(args_save, context)
test_L = eval(test_data)
print('Best test loss %.2f, test perplexity %.2f'%(test_L, math.exp(test_L)))

[Epoch 1] time cost 12.01s, validation loss 7.19, validation perplexity 1319.99
test loss 7.16, test perplexity 1283.61
[Epoch 2] time cost 12.55s, validation loss 6.94, validation perplexity 1030.70
test loss 6.94, test perplexity 1032.37
[Epoch 3] time cost 12.75s, validation loss 6.96, validation perplexity 1057.54
[Epoch 4] time cost 11.48s, validation loss 6.86, validation perplexity 950.20
test loss 6.90, test perplexity 991.87
[Epoch 5] time cost 12.18s, validation loss 6.85, validation perplexity 941.05
test loss 6.91, test perplexity 1005.63
[Epoch 6] time cost 11.57s, validation loss 6.84, validation perplexity 934.57
test loss 6.93, test perplexity 1022.18
[Epoch 7] time cost 11.78s, validation loss 6.84, validation perplexity 931.16
test loss 6.94, test perplexity 1036.45
[Epoch 8] time cost 11.85s, validation loss 6.84, validation perplexity 930.86
test loss 6.96, test perplexity 1054.59
[Epoch 9] time cost 11.06s, validation loss 6.84, validation perplexity 934.20
[Epoch 

In [12]:
def eval1(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context)
    for i in range(0, 1, args_bptt):
        data, target = get_batch(data_source, i)
        print(data,target)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return [output, target, total_L / ntotal]

In [14]:
out,tar,p = eval1(test_data)
#corpus.to_word(out)
#print(out,tar)
print(p)



[[ 744   94 3095   12 7097  197  443   90   12 7246 7263 1951   38   49
  4666  736 7458  633    6  219  371 3455   52   56 1053  515  529    2
  2100 4482   12   95]
 [   6   12 7043  910   12 1485   26   48  346 1814   34  119   12  818
    26 4797  378  139 3553  773  139 5678 7598  207 5934  387   52 4461
  5878 7773 3512   12]
 [6980  214   12  664 1913 7141  415   40 2461 1815 5839 1053    0 7361
   450  314    6    6 1766    8   77   19 3235   26   12   12   91    8
    52 7774 7800  127]
 [3426  498  214  119  103   12  120 7199   26  186   12  378 2605   34
   103   12  478 7482 7365 3130 1977 5028   12   52   77  160 1208    6
  1656   52   26  520]
 [  67  249  249 1209  344  214  214   34 1458 7246   46   52   26  124
  2481  888 7180 7483   12 7531  219 1751 7599 1766 1053  780 7696 2972
  6883 7775    6  788]]
<NDArray 5x32 @cpu(0)> 
[   6   12 7043  910   12 1485   26   48  346 1814   34  119   12  818
   26 4797  378  139 3553  773  139 5678 7598  207 5934  387   52 44

In [16]:
test_data.shape

(215, 32)