In [9]:
import numpy as np
import pandas as pd
import wget
import os.path

In [10]:
AZ_ACC = "amazonsentimenik"
AZ_CONTAINER = "textclassificationdatasets"
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
print("Alphabet %d characters: " % len(ALPHABET), ALPHABET)
FEATURE_LEN = 1014
BATCH_SIZE = 128
EMBED_SIZE = 16

Alphabet 69 characters:  ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ',', ';', '.', '!', '?', ':', "'", '"', '/', '\\', '|', '_', '@', '#', '$', '%', '^', '&', '*', '~', '`', '+', ' ', '=', '<', '>', '(', ')', '[', ']', '{', '}']


In [11]:
def download_file(url):
    
    # Create file-name
    local_filename = url.split('/')[-1]
    
    if os.path.isfile(local_filename):
        print("The file %s already exist in the current directory\n" % local_filename)
    else:
        # Download
        print("downloading ...\n")
        wget.download(url)
        print('saved data\n')


def load_data_frame(infile, batch_size=128, shuffle=True):

    # Get data from windows blob
    download_file('https://%s.blob.core.windows.net/%s/%s' % (AZ_ACC, AZ_CONTAINER, infile))
                  
    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'])
           
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    # store class as nparray
    df.sentiment -= 1
    y_split = np.asarray(df.sentiment, dtype='int')
    #print(Y_split[:30])
    # drop columns
    df.drop(['text', 'summary', 'sentiment'], axis=1, inplace=True)
    
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)

    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET)), columns=ALPHABET)
    # Yield mini-batch amount of character vectors
    for ti, tx in enumerate(df.rev):
        if ti % batch_size == 0:
            # output
            if ti > 0:
                yield X_split, y_split[ti-batch_size:ti]
            X_split = np.zeros([batch_size, FEATURE_LEN, len(ALPHABET)], dtype='int')
            
        chars = list(tx)
        for ci, ch in enumerate(chars):
            if ch in ALPHABET:
                X_split[ti%batch_size][ci] = np.array(character_hash[ch])

def example():
    count = 0
    for minibatch in load_data_frame('amazon_review_polarity_test.csv', batch_size=5, shuffle=True):
        count += 1
        print(minibatch[-1])
        if count == 6:
            break

In [12]:
example()

The file amazon_review_polarity_test.csv already exist in the current directory

[1 1 0 1 1]
[0 0 0 1 0]
[1 0 0 1 0]
[0 1 1 1 1]
[0 0 1 1 0]
[0 1 0 1 0]


In [None]:
from collections import namedtuple
import mxnet as mx

CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])

def create_cnn_model(ctx,
                     sentence_size, 
                     batch_size,
                     vocab_size,
                     num_label,
                     filter_list,
                     num_filter,
                     dropout,
                     initializer=mx.initializer.Uniform(0.1)):

    """ 
    Create cnn_model with optional dropout and embedding
    """
    
    input_x = mx.sym.Variable('data')
    input_y = mx.sym.Variable('softmax_label')
    
                                   
    # create convolutions and max pooling
    pooled_outputs = []
    
    for i, filter_size in enumerate(filter_list):
        
        convi = mx.sym.Convolution(data=input_x,
                                   kernel=(filter_size, vocab_size),
                                   num_filter=num_filter)
        
        relui = mx.sym.Activation(data=convi, 
                                  act_type='relu')
        
        pooli = mx.sym.Pooling(data=relui,
                               pool_type='max', 
                               kernel=(sentence_size - filter_size + 1, 1),
                               stride=(1,1))
        
        pooled_outputs.append(pooli)
 
    # combine all pooled outputs
    total_filters = num_filter * len(filter_list)
    concat = mx.sym.Concat(*pooled_outputs, dim=1)
    h_pool = mx.sym.Reshape(data=concat, target_shape=(batch_size, total_filters))

    # dropout layer
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)

    # fully connected
    cls_weight = mx.sym.Variable('cls_weight')
    cls_bias = mx.sym.Variable('cls_bias')

    fc = mx.sym.FullyConnected(data=h_drop,
                               weight=cls_weight,
                               bias=cls_bias,
                               num_hidden=num_label)

    # softmax output
    cnn = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')

    # get arguments
    arg_names = cnn.list_arguments()

    # shape
    input_shapes = {}
    input_shapes['data'] = (batch_size, sentence_size, vocab_size)

    arg_shape, out_shape, aux_shape = cnn.infer_shape(**input_shapes)
    arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
    args_grad = {}
    
    for shape, name in zip(arg_shape, arg_names):
        if name in ['softmax_label', 'data']: # input, output
            continue
        args_grad[name] = mx.nd.zeros(shape, ctx)

    cnn_exec = cnn.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')

    param_blocks = []
    arg_dict = dict(zip(arg_names, cnn_exec.arg_arrays))
    
    for i, name in enumerate(arg_names):
        if name in ['softmax_label', 'data']: # input, output
            continue
        initializer(name, arg_dict[name])

        param_blocks.append( (i, arg_dict[name], args_grad[name], name) )

    out_dict = dict(zip(cnn.list_outputs(), cnn_exec.outputs))

    data = cnn_exec.arg_dict['data']
    label = cnn_exec.arg_dict['softmax_label']

    return CNNModel(cnn_exec=cnn_exec, symbol=cnn, data=data, label=label, param_blocks=param_blocks)
    
    
def train_cnn(model,
              fun_data,
              optimiser='rmsprop',
              learning_rate = 0.0005,
              epochs=200):
    
    m = model
    opt = mx.optimizer.create(optimiser)
    opt.lr = learning_rate
    
    updater = mx.optimizer.get_updater(opt)
    
    for epoch in range(epochs):
        
        num_correct = 0
        num_total = 0
        
        for batchX, batchY in fun_data:

            print(m.data[:].shape) # (128, 1014)
            print(m.label[:].shape) # (128,)
            
            print(batchX.shape) # (128, 1014, 69)
            print(batchY.shape) # (128,)
            
            m.data[:] = batchX
            m.label[:] = batchY
            
            # forward
            m.cnn_exec.forward(is_train_True)
            
            # backward
            m.cnn_exec.backward()
            
            # evaluate on training
            num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
            num_total += len(batchY)

        # end of training loop
        train_acc = num_correct * 100 / float(num_total)
        print("Iter [%d], Training Accuracy: %.3f" % (epoch, train_acc) )

                
if __name__ == '__main__':
    
    #1. Create model
    cnn_model = create_cnn_model(mx.cpu(),
                                 sentence_size=FEATURE_LEN,
                                 batch_size=BATCH_SIZE,
                                 vocab_size=len(ALPHABET),
                                 num_label=2,
                                 filter_list=[3, 4, 5],
                                 num_filter=100,
                                 dropout=0.5)
                                 
    #2. Train model       
    train_cnn(model=cnn_model,
              fun_data=load_data_frame('amazon_review_polarity_test.csv', BATCH_SIZE))   