In [1]:
import numpy as np
import pandas as pd
import wget
import os.path

In [9]:
AZ_ACC = "amazonsentimenik"
AZ_CONTAINER = "textclassificationdatasets"
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
print("Alphabet %d characters: " % len(ALPHABET), ALPHABET)
FEATURE_LEN = 1014
BATCH_SIZE = 128
EMBED_SIZE = 16
NUM_FILTERS = 256
NUM_EPOCHS = 10

Alphabet 69 characters:  ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', ',', ';', '.', '!', '?', ':', "'", '"', '/', '\\', '|', '_', '@', '#', '$', '%', '^', '&', '*', '~', '`', '+', ' ', '=', '<', '>', '(', ')', '[', ']', '{', '}']


In [28]:
def download_file(url):
    
    # Create file-name
    local_filename = url.split('/')[-1]
    
    if os.path.isfile(local_filename):
        print("The file %s already exist in the current directory\n" % local_filename)
    else:
        # Download
        print("downloading ...\n")
        wget.download(url)
        print('saved data\n')


def load_data_frame(infile, batch_size=128, shuffle=True):

    # Get data from windows blob
    download_file('https://%s.blob.core.windows.net/%s/%s' % (AZ_ACC, AZ_CONTAINER, infile))
                  
    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'])
           
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    # store class as nparray
    df.sentiment -= 1
    y_split = np.asarray(df.sentiment, dtype='int')
    #print(Y_split[:30])
    # drop columns
    df.drop(['text', 'summary', 'sentiment'], axis=1, inplace=True)
    
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)

    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET)), columns=ALPHABET)
    # Yield mini-batch amount of character vectors
    for ti, tx in enumerate(df.rev):
        if ti % batch_size == 0:
            # output
            if ti > 0:
                yield X_split, y_split[ti-batch_size:ti]
            X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='int')
            
        chars = list(tx)
        for ci, ch in enumerate(chars):
            if ch in ALPHABET:
                X_split[ti%batch_size][0][ci] = np.array(character_hash[ch])

def example():
    count = 0
    for minibatch in load_data_frame('amazon_review_polarity_test.csv', batch_size=5, shuffle=True):
        count += 1
        print(minibatch[-1])
        if count == 6:
            break

In [14]:
example()

The file amazon_review_polarity_test.csv already exist in the current directory

[1 1 0 1 1]
[0 0 0 1 0]
[1 0 0 1 0]
[0 1 1 1 1]
[0 0 1 1 0]
[0 1 0 1 0]


In [33]:
import mxnet as mx

def create_crepe():
    
    """    
    Number of features = 70, input feature length = 1014
    
    2 Dropout modules inserted between 3 fully-connected layers (0.5)
    
    Number of output units for last layer = num_classes
    For polarity test = 2
    """
    
    input_x = mx.sym.Variable('data')

    #1. alphabet x 1014
    conv1 = mx.symbol.Convolution(
        data=input_x, kernel=(7,7), num_filter=256)   
    relu1 = mx.symbol.Activation(
        data=conv1, act_type="relu")
    pool1 = mx.symbol.Pooling(
        data=relu1, pool_type="max", kernel=(3,3), stride=(1,1))

    #2. 336 x 256
    conv2 = mx.symbol.Convolution(
        data=input_x, kernel=(7,7), num_filter=256)   
    relu2 = mx.symbol.Activation(
        data=conv2, act_type="relu")
    pool2 = mx.symbol.Pooling(
        data=relu2, pool_type="max", kernel=(3,3), stride=(1,1))
    
    #3. 110 x 256
    conv3 = mx.symbol.Convolution(
        data=input_x, kernel=(3,3), num_filter=256)   
    relu3 = mx.symbol.Activation(
        data=conv3, act_type="relu")  
    
    #4. 108 x 256
    conv4 = mx.symbol.Convolution(
        data=input_x, kernel=(3,3), num_filter=256)   
    relu4 = mx.symbol.Activation(
        data=conv4, act_type="relu")  
        
    #5. 106 x 256
    conv5 = mx.symbol.Convolution(
        data=input_x, kernel=(3,3), num_filter=256)   
    relu5 = mx.symbol.Activation(
        data=conv5, act_type="relu")      
        
    #6. 104 x 256
    conv6 = mx.symbol.Convolution(
        data=input_x, kernel=(3,3), num_filter=256)   
    relu6 = mx.symbol.Activation(
        data=conv6, act_type="relu")      
    pool6 = mx.symbol.Pooling(
        data=relu6, pool_type="max", kernel=(3,3), stride=(1,1))    
    
    # 34 x 256
    flatten = mx.symbol.Flatten(data=pool6)
    
    #7.  8704
    fc1 = mx.symbol.FullyConnected(
        data=flatten, num_hidden=1024) 
    act_fc1 = mx.symbol.Activation(
        data=fc1, act_type="relu")
    drop1 = mx.sym.Dropout(act_fc1, p=0.5) 
    
    #8. 1024
    fc2 = mx.symbol.FullyConnected(
        data=drop1, num_hidden=1024) 
    act_fc2 = mx.symbol.Activation(
        data=fc2, act_type="relu")
    drop2 = mx.sym.Dropout(act_fc2, p=0.5) 
    
    #9. 1024
    # https://github.com/zhangxiangxiao/Crepe/blob/master/train/config.lua
    # has this as 14 classes?
    fc3 = mx.symbol.FullyConnected(
        data=drop2, num_hidden=2) 

    crepe = mx.symbol.SoftmaxOutput(
        data=fc3, name="softmax")
    
    return crepe    
    
# create the NN
ctx = mx.cpu()
cnn = create_crepe()

m = mx.model.FeedForward(
    ctx = ctx,
    symbol = cnn, 
    num_epoch = 10,
    learning_rate = 0.01,
    momentum = 0.9, 
    wd = 0.00001
    )

# train NN
for epoch in range(10):
    
    num_correct = 0
    num_total = 0
        
    for batchX, batchY in load_data_frame('amazon_review_polarity_test.csv', BATCH_SIZE*10):
        
        train_iter = mx.io.NDArrayIter(batchX, batchY, batch_size = BATCH_SIZE, shuffle=True)
        m.fit(X=train_iter)
        
        # evaluate on training
        num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
        num_total += len(batchY)

    # end of training loop
    train_acc = num_correct * 100 / float(num_total)
    print("Iter [%d], Training Accuracy: %.3f" % (epoch, train_acc) )

The file amazon_review_polarity_test.csv already exist in the current directory



MXNetError: [19:20:34] d:\chhong\mxnet\src\storage\./cpu_device_storage.h:44: Check  notnull: ptr 