In [1]:
"""
SUMMARY:
Amazon pos/neg sentiment classification

TBD
"""

'\nSUMMARY:\nAmazon pos/neg sentiment classification\n\nAccuracy: 0.94\nTime per Epoch: 9550 seconds = 220 rev/s\nTotal time: 9550*10 = 1592 min = 26.5 hours\nTrain size = 2,097,152\nTest size = 233,016\n\nDETAILS:\nAttempt to replicate crepe model using MXNET:\nhttps://github.com/zhangxiangxiao/Crepe\n\nThis uses an efficient numpy array (dtype=bool)\nto hold all data in RAM. \n\nRun on one GPU (Tesla K80) with batch=128\nPeak RAM: 142GB, and training cut to: 2,097,152 (from 3.6M)\n'

In [2]:
%matplotlib inline
import numpy as np
import pickle
import pandas as pd
import mxnet as mx
import wget
import time
import os.path
import math
import matplotlib.pyplot as plt
import logging

In [3]:
AZ_ACC = "amazonsentimenik"
AZ_CONTAINER = "textclassificationdatasets"

ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
FEATURE_LEN = 1014
BATCH_SIZE = 128
NUM_FILTERS = 256
DATA_SHAPE = (BATCH_SIZE, 1, FEATURE_LEN, len(ALPHABET))

ctx = mx.cpu()
EPOCHS = 10
SD = 0.05  # std for gaussian distribution
NOUTPUT = 2  # good or bad
INITY = mx.init.Normal(sigma=SD)
LR = 0.01
MOMENTUM = 0.9
WDECAY = 0.00001

In [4]:
# logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='crepe_amazon.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [5]:
def download_file(url):
    # Create file-name
    local_filename = url.split('/')[-1]
    if os.path.isfile(local_filename):
        pass
        # print("The file %s already exist in the current directory\n" % local_filename)
    else:
        # Download
        print("downloading ...\n")
        wget.download(url)
        print('\nsaved data')

In [None]:
def load_data_frame(infile, shuffle = False):
    print("processing data frame: %s" % infile)
    # Get data from windows blob
    download_file('https://%s.blob.core.windows.net/%s/%s' % (AZ_ACC, AZ_CONTAINER, infile))
    
    # 3.6 mill is too much, use 2 mill (keep same ratio)
    if "test" in infile:
        maxrows = int(2097152/9)  # 16,384 batches
    elif "train" in infile:
        maxrows = int(2097152)

    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'],
                     nrows=maxrows)
    # Shuffle
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
        
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    
    # store class as nparray
    df.sentiment -= 1
    y_split = np.asarray(df.sentiment, dtype='bool')
    # drop columns
    df.drop(['text', 'summary', 'sentiment'], axis=1, inplace=True)

    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET), dtype='bool'), columns=ALPHABET)
    print("finished processing data frame: %s" % infile)
    print("data contains %d obs" % df.shape[0])
    batch_size = df.shape[0]
    # Create encoding
    X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='bool')
    # Main loop
    for ti, tx in enumerate(df.rev):
        if (ti+1) % (100*1000) == 0:
            print("Processed: ", ti+1)
        chars = list(tx)
        for ci, ch in enumerate(chars):
            if ch in ALPHABET:
                X_split[ti % batch_size][0][ci] = np.array(character_hash[ch], dtype='bool')
                
    # Return as a DataBatch
    #return DataBatch(data=[mx.nd.array(X_split)],
    #                 label=[mx.nd.array(y_split[ti + 1 - batch_size:ti + 1])])
    return X_split, y_split

In [None]:
def create_vdcnn():
    """
    Replicating: ...
    """  
    
    vocab_size = 69
    embedding_size = 16
    kernel = (3, embedding_size)
    stride = (2, embedding_size)
    num_filters1 = 64
    num_filters2 = num_filters1 * 2
    num_filters3 = num_filters1 * 3
    num_filters4 = num_filters1 * 4

    input_x = mx.sym.Variable('data')  # placeholder for input
    input_y = mx.sym.Variable('softmax_label')  # placeholder for output  

    # Lookup Table 16
    embed_layer = mx.symbol.Embedding(
        data = input_x, input_dim = vocab_size, output_dim = embedding_size, name = 'word_embedding')
    conv_input = mx.symbol.Reshape(
        data = embed_layer, shape = (BATCH_SIZE, 1, FEATURE_LEN, embedding_size))
        
    # Temp Conv
    conv0 = mx.symbol.Convolution(
        data=conv_input, num_filter=num_filters1, kernel=kernel)

    # CONVOLUTION_BLOCK (1 of 4) -> 64 FILTERS
    conv11 = mx.symbol.Convolution(
        data=conv0, kernel=kernel, num_filter=num_filters1)
    norm11 = mx.symbol.BatchNorm(
        data=conv11)
    act11 = mx.symbol.Activation(
        data=norm11, act_type='relu')
    conv12 = mx.symbol.Convolution(
        data=act11, kernel=kernel, num_filter=num_filters1)
    norm12 = mx.symbol.BatchNorm(
        data=conv12)
    act12 = mx.symbol.Activation(
        data=norm12, act_type='relu')
    conv21 = mx.symbol.Convolution(
        data=act12, kernel=kernel, num_filter=num_filters1)
    norm21 = mx.symbol.BatchNorm(
        data=conv21)
    act21 = mx.symbol.Activation(
        data=norm21, act_type='relu')
    conv22 = mx.symbol.Convolution(
        data=act21, kernel=kernel, num_filter=num_filters1)
    norm22 = mx.symbol.BatchNorm(
        data=conv22)
    act22 = mx.symbol.Activation(
        data=norm22, act_type='relu')
    pool1 = mx.symbol.Pooling(
        data=act22, pool_type='max', kernel=kernel, stride=stride)

    # CONVOLUTION_BLOCK (2 of 4) -> 128 FILTERS
    conv31 = mx.symbol.Convolution(
        data=pool1, kernel=kernel, num_filter=num_filters2)
    norm31 = mx.symbol.BatchNorm(
        data=conv31)
    act31 = mx.symbol.Activation(
        data=norm31, act_type='relu')
    conv32 = mx.symbol.Convolution(
        data=act31, kernel=kernel, num_filter=num_filters2)
    norm32 = mx.symbol.BatchNorm(
        data=conv32)
    act32 = mx.symbol.Activation(
        data=norm32, act_type='relu')
    conv41 = mx.symbol.Convolution(
        data=act32, kernel=kernel, num_filter=num_filters2)
    norm41 = mx.symbol.BatchNorm(
        data=conv41)
    act41 = mx.symbol.Activation(
        data=norm41, act_type='relu')
    conv42 = mx.symbol.Convolution(
        data=act41, kernel=kernel, num_filter=num_filters2)
    norm42 = mx.symbol.BatchNorm(
        data=conv42)
    act42 = mx.symbol.Activation(
        data=norm42, act_type='relu')
    pool2 = mx.symbol.Pooling(
        data=act42, pool_type='max', kernel=kernel, stride=stride)

     # CONVOLUTION_BLOCK (3 of 4) -> 256 FILTERS
    conv51 = mx.symbol.Convolution(
        data=pool2, kernel=kernel, num_filter=num_filters3)
    norm51 = mx.symbol.BatchNorm(
        data=conv51)
    act51 = mx.symbol.Activation(
        data=norm51, act_type='relu')
    conv52 = mx.symbol.Convolution(
        data=act51, kernel=kernel, num_filter=num_filters3)
    norm52 = mx.symbol.BatchNorm(
        data=conv52)
    act52 = mx.symbol.Activation(
        data=norm52, act_type='relu')
    conv61 = mx.symbol.Convolution(
        data=act52, kernel=kernel, num_filter=num_filters3)
    norm61 = mx.symbol.BatchNorm(
        data=conv61)
    act61 = mx.symbol.Activation(
        data=norm61, act_type='relu')
    conv62 = mx.symbol.Convolution(
        data=act61, kernel=kernel, num_filter=num_filters3)
    norm62 = mx.symbol.BatchNorm(
        data=conv62)
    act62 = mx.symbol.Activation(
        data=norm62, act_type='relu')
    pool3 = mx.symbol.Pooling(
        data=act62, pool_type='max', kernel=kernel, stride=stride)   

     # CONVOLUTION_BLOCK (4 of 4) -> 512 FILTERS
    conv71 = mx.symbol.Convolution(
        data=pool3, kernel=kernel, num_filter=num_filters4)
    norm71 = mx.symbol.BatchNorm(
        data=conv71)
    act71 = mx.symbol.Activation(
        data=norm71, act_type='relu')
    conv72 = mx.symbol.Convolution(
        data=act71, kernel=kernel, num_filter=num_filters4)
    norm72 = mx.symbol.BatchNorm(
        data=conv72)
    act72 = mx.symbol.Activation(
        data=norm72, act_type='relu')
    conv81 = mx.symbol.Convolution(
        data=act72, kernel=kernel, num_filter=num_filters4)
    norm81 = mx.symbol.BatchNorm(
        data=conv81)
    act81 = mx.symbol.Activation(
        data=norm81, act_type='relu')
    conv82 = mx.symbol.Convolution(
        data=act81, kernel=kernel, num_filter=num_filters4)
    norm82 = mx.symbol.BatchNorm(
        data=conv82)
    act82 = mx.symbol.Activation(
        data=norm82, act_type='relu')
    pool4 = mx.symbol.Pooling(
        data=act82, pool_type='max', kernel=kernel, stride=stride) 

    # Flatten (dimensions * feature length * filters)
    flatten = mx.symbol.Flatten(data=pool4)

    # First fully connected
    fc1 = mx.symbol.FullyConnected(
        data=flatten, num_hidden=2048) 
    act_fc1 = mx.symbol.Activation(
        data=fc1, act_type='relu')
    # Second fully connected
    fc2 = mx.symbol.FullyConnected(
        data=act_fc1, num_hidden=2048)
    act_fc2 = mx.symbol.Activation(
        data=fc2, act_type='relu')
    # Third fully connected
    fc3 = mx.symbol.FullyConnected(
        data=act_fc2, num_hidden=NOUTPUT)
    net = mx.symbol.SoftmaxOutput(
        data=fc3, label=input_y, name="softmax")
    return net

In [None]:
# Visualise symbol (for crepe)
cnn = create_vdcnn()

a = mx.viz.plot_network(cnn)
a.render('Crepe Model')
a

In [None]:
train_x, train_y = load_data_frame('amazon_review_polarity_train.csv')

processing data frame: amazon_review_polarity_train.csv
finished processing data frame: amazon_review_polarity_train.csv
data contains 2097152 obs
Processed:  100000
Processed:  200000


In [None]:
print(train_x.shape)
print(train_y.shape)

In [None]:
train_iter = mx.io.NDArrayIter(train_x, train_y, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
del train_x
del train_y

In [None]:
model = mx.model.FeedForward(
    ctx = ctx,
    symbol = cnn, 
    num_epoch = EPOCHS,  # number of training rounds
    learning_rate = LR,  # learning rate
    momentum = MOMENTUM,   # momentum for sgd
    wd = WDECAY,  # weight decay for reg
    initializer = INITY  # init with sd of 0.05
)

In [None]:
tic = time.time()
model.fit(
    X = train_iter,
    eval_metric=['accuracy'],
    batch_end_callback=mx.callback.Speedometer(100*BATCH_SIZE),
    epoch_end_callback=mx.callback.do_checkpoint("vdcnn_checkp_") 
)

print("Finished training in %.0f seconds" % (time.time() - tic))

In [None]:
del train_iter

In [None]:
"""
# GPU broke after completing 7th epoch
# Re-load checkpoint
# If training breaks (happens on GPU), we can train further like so:

# Load trained model:
pretrained_model = mx.model.FeedForward.load("crepe_checkp_v2_", 7)  

model = mx.model.FeedForward(
    ctx = ctx,
    symbol=pretrained_model.symbol,
    arg_params=pretrained_model.arg_params,
    aux_params=pretrained_model.aux_params,
    num_epoch=11, begin_epoch=7
)

# Train remaining epochs
tic = time.time()
model.fit(
    X = train_iter,
    eval_metric=['accuracy'],
    batch_end_callback=mx.callback.Speedometer(100*BATCH_SIZE),
    epoch_end_callback=mx.callback.do_checkpoint("crepe_checkp_v2_") 
)

print("Finished training in %.0f seconds" % (time.time() - tic))
"""

In [None]:
# Load test data
test_x, test_y = load_data_frame('amazon_review_polarity_test.csv')
test_iter = mx.io.NDArrayIter(test_x, test_y, batch_size=BATCH_SIZE, shuffle=False)

In [21]:
# Predict
pred = np.argsort(model.predict(X = test_iter))[:,-1]

# Save Results
np.savetxt('crepe_predict_sentiment_amazon.csv', np.c_[pred, test_y], delimiter=',', fmt='%d')

In [22]:
# Accuracy
acc = sum(pred==test_y.astype('int'))/float(len(test_y))
logger.info(acc)
acc  # 0.94

0.94166495004634876

## Log Extract

```
2016-08-24 21:11:28,407 - root - INFO - Start training with [gpu(0)]
2016-08-24 21:12:40,015 - root - INFO - Epoch[0] Batch [50]	Speed: 11477.97 samples/sec	Train-accuracy=0.502031
2016-08-24 21:13:15,326 - root - INFO - Epoch[0] Batch [100]	Speed: 18131.85 samples/sec	Train-accuracy=0.514531
2016-08-24 21:13:46,977 - root - INFO - Epoch[0] Batch [150]	Speed: 20220.53 samples/sec	Train-accuracy=0.515938
2016-08-24 21:14:17,447 - root - INFO - Epoch[0] Batch [200]	Speed: 21004.96 samples/sec	Train-accuracy=0.522344
2016-08-24 21:14:48,170 - root - INFO - Epoch[0] Batch [250]	Speed: 20831.30 samples/sec	Train-accuracy=0.540156
2016-08-24 21:15:19,698 - root - INFO - Epoch[0] Batch [300]	Speed: 20298.77 samples/sec	Train-accuracy=0.524062
2016-08-24 21:15:50,674 - root - INFO - Epoch[0] Batch [350]	Speed: 20661.82 samples/sec	Train-accuracy=0.522344
2016-08-24 21:16:21,581 - root - INFO - Epoch[0] Batch [400]	Speed: 20718.01 samples/sec	Train-accuracy=0.526406
2016-08-24 21:16:53,575 - root - INFO - Epoch[0] Batch [450]	Speed: 20003.13 samples/sec	Train-accuracy=0.522969
2016-08-24 21:17:24,561 - root - INFO - Epoch[0] Batch [500]	Speed: 20655.16 samples/sec	Train-accuracy=0.529375
2016-08-24 21:17:55,624 - root - INFO - Epoch[0] Batch [550]	Speed: 20603.29 samples/sec	Train-accuracy=0.538125
2016-08-24 21:18:26,703 - root - INFO - Epoch[0] Batch [600]	Speed: 20592.68 samples/sec	Train-accuracy=0.540312
2016-08-24 21:18:58,474 - root - INFO - Epoch[0] Batch [650]	Speed: 20143.52 samples/sec	Train-accuracy=0.541406
2016-08-24 21:19:30,115 - root - INFO - Epoch[0] Batch [700]	Speed: 20226.92 samples/sec	Train-accuracy=0.544063
2016-08-24 21:20:00,844 - root - INFO - Epoch[0] Batch [750]	Speed: 20838.08 samples/sec	Train-accuracy=0.544219
2016-08-24 21:20:30,993 - root - INFO - Epoch[0] Batch [800]	Speed: 21239.17 samples/sec	Train-accuracy=0.544375
2016-08-24 21:21:02,349 - root - INFO - Epoch[0] Batch [850]	Speed: 20419.88 samples/sec	Train-accuracy=0.573125
2016-08-24 21:21:33,382 - root - INFO - Epoch[0] Batch [900]	Speed: 20623.21 samples/sec	Train-accuracy=0.595625
2016-08-24 21:22:03,338 - root - INFO - Epoch[0] Batch [950]	Speed: 21365.38 samples/sec	Train-accuracy=0.521875
2016-08-24 21:22:36,104 - root - INFO - Epoch[0] Batch [1000]	Speed: 19541.98 samples/sec	Train-accuracy=0.525156
...
2016-08-26 20:47:59,068 - root - INFO - Epoch[10] Batch [15650]	Speed: 22745.04 samples/sec	Train-accuracy=0.973437
2016-08-26 20:48:27,661 - root - INFO - Epoch[10] Batch [15700]	Speed: 22383.10 samples/sec	Train-accuracy=0.973437
2016-08-26 20:48:56,799 - root - INFO - Epoch[10] Batch [15750]	Speed: 21964.45 samples/sec	Train-accuracy=0.969375
2016-08-26 20:49:25,234 - root - INFO - Epoch[10] Batch [15800]	Speed: 22506.68 samples/sec	Train-accuracy=0.968594
2016-08-26 20:49:54,374 - root - INFO - Epoch[10] Batch [15850]	Speed: 21963.69 samples/sec	Train-accuracy=0.973594
2016-08-26 20:50:22,418 - root - INFO - Epoch[10] Batch [15900]	Speed: 22820.47 samples/sec	Train-accuracy=0.969531
2016-08-26 20:50:50,575 - root - INFO - Epoch[10] Batch [15950]	Speed: 22729.69 samples/sec	Train-accuracy=0.972969
2016-08-26 20:51:19,290 - root - INFO - Epoch[10] Batch [16000]	Speed: 22288.00 samples/sec	Train-accuracy=0.970000
2016-08-26 20:51:48,007 - root - INFO - Epoch[10] Batch [16050]	Speed: 22286.45 samples/sec	Train-accuracy=0.972812
2016-08-26 20:52:15,960 - root - INFO - Epoch[10] Batch [16100]	Speed: 22896.39 samples/sec	Train-accuracy=0.971875
2016-08-26 20:52:43,770 - root - INFO - Epoch[10] Batch [16150]	Speed: 23012.48 samples/sec	Train-accuracy=0.970938
2016-08-26 20:53:13,394 - root - INFO - Epoch[10] Batch [16200]	Speed: 21615.78 samples/sec	Train-accuracy=0.967656
2016-08-26 20:53:43,127 - root - INFO - Epoch[10] Batch [16250]	Speed: 21524.90 samples/sec	Train-accuracy=0.970313
2016-08-26 20:54:11,734 - root - INFO - Epoch[10] Batch [16300]	Speed: 22371.36 samples/sec	Train-accuracy=0.967812
2016-08-26 20:54:40,546 - root - INFO - Epoch[10] Batch [16350]	Speed: 22213.74 samples/sec	Train-accuracy=0.971250
2016-08-26 20:55:00,513 - root - INFO - Epoch[10] Resetting Data Iterator
2016-08-26 20:55:00,513 - root - INFO - Epoch[10] Time cost=9474.580
2016-08-26 21:16:02,765 - root - INFO - 0.941664950046
```