In [1]:
%matplotlib inline
import numpy as np
import pickle
import pandas as pd
import mxnet as mx
import wget
import time
import os.path
import math
import matplotlib.pyplot as plt
import logging

In [2]:
ctx = [mx.gpu(0), mx.gpu(1), mx.gpu(2), mx.gpu(3)]
ctx = mx.gpu(0)
AZ_ACC = "amazonsentimenik"
AZ_CONTAINER = "textclassificationdatasets"
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
FEATURE_LEN = 1014
BATCH_SIZE = 128 
NUM_FILTERS = 256
EPOCHS = 10000
SD = 0.05  # std for gaussian distribution
NOUTPUT = 2  # good or bad
DATA_SHAPE = (BATCH_SIZE, 1, FEATURE_LEN, len(ALPHABET))

In [3]:
# logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='crepe_inram_onegpu.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [4]:
def download_file(url):
    # Create file-name
    local_filename = url.split('/')[-1]
    if os.path.isfile(local_filename):
        pass
        # print("The file %s already exist in the current directory\n" % local_filename)
    else:
        # Download
        print("downloading ...\n")
        wget.download(url)
        print('\nsaved data')

In [5]:
def load_data_frame(infile, shuffle = False):
    print("processing data frame: %s" % infile)
    # Get data from windows blob
    download_file('https://%s.blob.core.windows.net/%s/%s' % (AZ_ACC, AZ_CONTAINER, infile))
    
    # 3.6 mill is too much, use 2 mill (keep same ratio)
    if "test" in infile:
        maxrows = int(2097152/9)  # 16,384 batches
    elif "train" in infile:
        maxrows = int(2097152)

    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'],
                     nrows=maxrows)
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    # store class as nparray
    df.sentiment -= 1
    y_split = np.asarray(df.sentiment, dtype='bool')
    # drop columns
    df.drop(['text', 'summary', 'sentiment'], axis=1, inplace=True)
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET), dtype='bool'), columns=ALPHABET)
    print("finished processing data frame: %s" % infile)
    print("data contains %d obs" % df.shape[0])
    batch_size = df.shape[0]
    # Create encoding
    X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='bool')
    # Main loop
    for ti, tx in enumerate(df.rev):
        if (ti+1) % (100*1000) == 0:
            print("Processed: ", ti+1)
        chars = list(tx)
        for ci, ch in enumerate(chars):
            if ch in ALPHABET:
                X_split[ti % batch_size][0][ci] = np.array(character_hash[ch], dtype='bool')
                
    # Return as a DataBatch
    #return DataBatch(data=[mx.nd.array(X_split)],
    #                 label=[mx.nd.array(y_split[ti + 1 - batch_size:ti + 1])])
    return X_split, y_split

In [None]:
def create_crepe():
    """
    Replicating: https://github.com/zhangxiangxiao/Crepe/blob/master/train/config.lua
    """
    input_x = mx.sym.Variable('data')  # placeholder for input
    input_y = mx.sym.Variable('softmax_label')  # placeholder for output
    # 1. alphabet x 1014
    conv1 = mx.symbol.Convolution(
        data=input_x, kernel=(7, 69), num_filter=NUM_FILTERS)
    relu1 = mx.symbol.Activation(
        data=conv1, act_type="relu")
    pool1 = mx.symbol.Pooling(
        data=relu1, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 2. 336 x 256
    conv2 = mx.symbol.Convolution(
        data=pool1, kernel=(7, 1), num_filter=NUM_FILTERS)
    relu2 = mx.symbol.Activation(
        data=conv2, act_type="relu")
    pool2 = mx.symbol.Pooling(
        data=relu2, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 3. 110 x 256
    conv3 = mx.symbol.Convolution(
        data=pool2, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu3 = mx.symbol.Activation(
        data=conv3, act_type="relu")
    # 4. 108 x 256
    conv4 = mx.symbol.Convolution(
        data=relu3, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu4 = mx.symbol.Activation(
        data=conv4, act_type="relu")
    # 5. 106 x 256
    conv5 = mx.symbol.Convolution(
        data=relu4, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu5 = mx.symbol.Activation(
        data=conv5, act_type="relu")
    # 6. 104 x 256
    conv6 = mx.symbol.Convolution(
        data=relu5, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu6 = mx.symbol.Activation(
        data=conv6, act_type="relu")
    pool6 = mx.symbol.Pooling(
        data=relu6, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 34 x 256
    flatten = mx.symbol.Flatten(data=pool6)
    # 7.  8704
    fc1 = mx.symbol.FullyConnected(
        data=flatten, num_hidden=1024)
    act_fc1 = mx.symbol.Activation(
        data=fc1, act_type="relu")
    drop1 = mx.sym.Dropout(act_fc1, p=0.5)
    # 8. 1024
    fc2 = mx.symbol.FullyConnected(
        data=drop1, num_hidden=1024)
    act_fc2 = mx.symbol.Activation(
        data=fc2, act_type="relu")
    drop2 = mx.sym.Dropout(act_fc2, p=0.5)
    # 9. 1024
    fc3 = mx.symbol.FullyConnected(
        data=drop2, num_hidden=NOUTPUT)
    crepe = mx.symbol.SoftmaxOutput(
        data=fc3, label=input_y, name="softmax")
    return crepe

In [None]:
train_x, train_y = load_data_frame('amazon_review_polarity_train.csv')

processing data frame: amazon_review_polarity_train.csv
finished processing data frame: amazon_review_polarity_train.csv
data contains 2097152 obs


In [None]:
print(train_x.shape)
print(train_y.shape)

In [None]:
train_iter = mx.io.NDArrayIter(train_x, train_y, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
del train_x
del train_y
# 147 GB, 69% with 2,097,000 mill obs (3.6 will be too much)

In [None]:
model = mx.model.FeedForward(
    ctx = ctx,
    symbol = create_crepe(), 
    num_epoch = EPOCHS,  # number of training rounds
    learning_rate = 0.01,  # learning rate
    momentum = 0.9,   # momentum for sgd
    wd = 0.00001,  # weight decay for reg
    initializer = mx.init.Normal(sigma=SD)  # init with sd of 0.05
)

In [None]:
tic = time.time()
model.fit(
    X = train_iter,
    eval_metric=['accuracy'],
    batch_end_callback=mx.callback.Speedometer(100*BATCH_SIZE),
    epoch_end_callback=mx.callback.do_checkpoint("crepe_checkp_") 
)

print("Finished training in %.0f seconds" % (time.time() - tic))

In [None]:
"""
2016-08-23 05:34:21,667 - root - INFO - Auto-select kvstore type = local_allreduce_cpu
2016-08-23 05:34:21,822 - root - INFO - Start training with [gpu(0), gpu(1), gpu(2), gpu(3)]
2016-08-23 05:38:04,459 - root - INFO - Epoch[0] Batch [50]	Speed: 3951.01 samples/sec	Train-accuracy=0.501406
2016-08-23 05:40:48,198 - root - INFO - Epoch[0] Batch [100]	Speed: 3908.64 samples/sec	Train-accuracy=0.492031
2016-08-23 05:43:31,387 - root - INFO - Epoch[0] Batch [150]	Speed: 3922.22 samples/sec	Train-accuracy=0.497812
2016-08-23 05:46:17,117 - root - INFO - Epoch[0] Batch [200]	Speed: 3861.70 samples/sec	Train-accuracy=0.499219
2016-08-23 05:49:03,273 - root - INFO - Epoch[0] Batch [250]	Speed: 3852.15 samples/sec	Train-accuracy=0.505313
2016-08-23 05:51:49,802 - root - INFO - Epoch[0] Batch [300]	Speed: 3843.20 samples/sec	Train-accuracy=0.501719
2016-08-23 05:54:35,424 - root - INFO - Epoch[0] Batch [350]	Speed: 3864.22 samples/sec	Train-accuracy=0.499375
2016-08-23 05:57:22,983 - root - INFO - Epoch[0] Batch [400]	Speed: 3819.53 samples/sec	Train-accuracy=0.506250
2016-08-23 06:00:07,595 - root - INFO - Epoch[0] Batch [450]	Speed: 3887.95 samples/sec	Train-accuracy=0.502031
2016-08-23 06:02:52,483 - root - INFO - Epoch[0] Batch [500]	Speed: 3881.40 samples/sec	Train-accuracy=0.511094
2016-08-23 06:05:36,200 - root - INFO - Epoch[0] Batch [550]	Speed: 3909.21 samples/sec	Train-accuracy=0.506406
2016-08-23 06:08:22,447 - root - INFO - Epoch[0] Batch [600]	Speed: 3850.76 samples/sec	Train-accuracy=0.517031
2016-08-23 06:11:07,180 - root - INFO - Epoch[0] Batch [650]	Speed: 3885.48 samples/sec	Train-accuracy=0.498906
2016-08-23 06:13:54,270 - root - INFO - Epoch[0] Batch [700]	Speed: 3830.25 samples/sec	Train-accuracy=0.510000
2016-08-23 06:16:38,315 - root - INFO - Epoch[0] Batch [750]	Speed: 3901.75 samples/sec	Train-accuracy=0.503750
2016-08-23 06:19:22,877 - root - INFO - Epoch[0] Batch [800]	Speed: 3889.14 samples/sec	Train-accuracy=0.505000
2016-08-23 06:22:10,806 - root - INFO - Epoch[0] Batch [850]	Speed: 3811.11 samples/sec	Train-accuracy=0.488906
2016-08-23 06:24:57,601 - root - INFO - Epoch[0] Batch [900]	Speed: 3837.05 samples/sec	Train-accuracy=0.502188
2016-08-23 06:27:41,450 - root - INFO - Epoch[0] Batch [950]	Speed: 3906.04 samples/sec	Train-accuracy=0.504062
2016-08-23 06:30:27,716 - root - INFO - Epoch[0] Batch [1000]	Speed: 3849.28 samples/sec	Train-accuracy=0.509062
2016-08-23 06:33:12,042 - root - INFO - Epoch[0] Batch [1050]	Speed: 3895.03 samples/sec	Train-accuracy=0.497500
2016-08-23 06:35:58,604 - root - INFO - Epoch[0] Batch [1100]	Speed: 3842.78 samples/sec	Train-accuracy=0.492500
2016-08-23 06:38:42,322 - root - INFO - Epoch[0] Batch [1150]	Speed: 3909.16 samples/sec	Train-accuracy=0.510469
2016-08-23 06:41:30,394 - root - INFO - Epoch[0] Batch [1200]	Speed: 3807.89 samples/sec	Train-accuracy=0.492031
2016-08-23 06:44:14,674 - root - INFO - Epoch[0] Batch [1250]	Speed: 3896.14 samples/sec	Train-accuracy=0.494375
2016-08-23 06:47:02,858 - root - INFO - Epoch[0] Batch [1300]	Speed: 3805.36 samples/sec	Train-accuracy=0.502344
2016-08-23 06:49:45,404 - root - INFO - Epoch[0] Batch [1350]	Speed: 3937.35 samples/sec	Train-accuracy=0.512188
2016-08-23 06:52:33,546 - root - INFO - Epoch[0] Batch [1400]	Speed: 3806.33 samples/sec	Train-accuracy=0.508594
2016-08-23 06:55:17,378 - root - INFO - Epoch[0] Batch [1450]	Speed: 3906.42 samples/sec	Train-accuracy=0.497812
2016-08-23 06:58:00,380 - root - INFO - Epoch[0] Batch [1500]	Speed: 3926.36 samples/sec	Train-accuracy=0.495625
2016-08-23 07:00:49,207 - root - INFO - Epoch[0] Batch [1550]	Speed: 3790.84 samples/sec	Train-accuracy=0.495625
2016-08-23 07:03:31,723 - root - INFO - Epoch[0] Batch [1600]	Speed: 3938.07 samples/sec	Train-accuracy=0.492344
2016-08-23 07:06:17,380 - root - INFO - Epoch[0] Batch [1650]	Speed: 3863.80 samples/sec	Train-accuracy=0.504219
2016-08-23 07:09:06,976 - root - INFO - Epoch[0] Batch [1700]	Speed: 3773.67 samples/sec	Train-accuracy=0.489375
2016-08-23 07:11:52,362 - root - INFO - Epoch[0] Batch [1750]	Speed: 3870.06 samples/sec	Train-accuracy=0.504531
2016-08-23 07:14:37,066 - root - INFO - Epoch[0] Batch [1800]	Speed: 3886.14 samples/sec	Train-accuracy=0.495312
2016-08-23 07:17:25,444 - root - INFO - Epoch[0] Batch [1850]	Speed: 3800.99 samples/sec	Train-accuracy=0.497656
2016-08-23 07:20:12,398 - root - INFO - Epoch[0] Batch [1900]	Speed: 3833.39 samples/sec	Train-accuracy=0.497656
2016-08-23 07:22:58,779 - root - INFO - Epoch[0] Batch [1950]	Speed: 3846.59 samples/sec	Train-accuracy=0.500156
2016-08-23 07:25:45,132 - root - INFO - Epoch[0] Batch [2000]	Speed: 3847.22 samples/sec	Train-accuracy=0.511250
2016-08-23 07:28:28,523 - root - INFO - Epoch[0] Batch [2050]	Speed: 3917.34 samples/sec	Train-accuracy=0.492031
2016-08-23 07:31:14,592 - root - INFO - Epoch[0] Batch [2100]	Speed: 3854.21 samples/sec	Train-accuracy=0.507656
2016-08-23 07:33:56,562 - root - INFO - Epoch[0] Batch [2150]	Speed: 3951.35 samples/sec	Train-accuracy=0.487031
2016-08-23 07:36:42,016 - root - INFO - Epoch[0] Batch [2200]	Speed: 3868.12 samples/sec	Train-accuracy=0.507188
2016-08-23 07:39:24,408 - root - INFO - Epoch[0] Batch [2250]	Speed: 3941.08 samples/sec	Train-accuracy=0.502500
2016-08-23 07:42:07,263 - root - INFO - Epoch[0] Batch [2300]	Speed: 3929.88 samples/sec	Train-accuracy=0.496094
2016-08-23 07:44:55,990 - root - INFO - Epoch[0] Batch [2350]	Speed: 3793.49 samples/sec	Train-accuracy=0.504062
2016-08-23 07:47:42,301 - root - INFO - Epoch[0] Batch [2400]	Speed: 3848.21 samples/sec	Train-accuracy=0.496719
2016-08-23 07:50:29,348 - root - INFO - Epoch[0] Batch [2450]	Speed: 3831.23 samples/sec	Train-accuracy=0.491563
2016-08-23 07:53:11,974 - root - INFO - Epoch[0] Batch [2500]	Speed: 3935.41 samples/sec	Train-accuracy=0.498125
2016-08-23 07:55:58,289 - root - INFO - Epoch[0] Batch [2550]	Speed: 3848.12 samples/sec	Train-accuracy=0.500313
2016-08-23 07:58:43,556 - root - INFO - Epoch[0] Batch [2600]	Speed: 3872.87 samples/sec	Train-accuracy=0.520625
2016-08-23 08:01:29,214 - root - INFO - Epoch[0] Batch [2650]	Speed: 3863.38 samples/sec	Train-accuracy=0.497969
2016-08-23 08:04:14,497 - root - INFO - Epoch[0] Batch [2700]	Speed: 3873.60 samples/sec	Train-accuracy=0.502656
2016-08-23 08:07:01,671 - root - INFO - Epoch[0] Batch [2750]	Speed: 3828.71 samples/sec	Train-accuracy=0.500938
2016-08-23 08:09:46,490 - root - INFO - Epoch[0] Batch [2800]	Speed: 3883.07 samples/sec	Train-accuracy=0.489063
2016-08-23 08:12:33,032 - root - INFO - Epoch[0] Batch [2850]	Speed: 3842.87 samples/sec	Train-accuracy=0.500313
2016-08-23 08:15:17,720 - root - INFO - Epoch[0] Batch [2900]	Speed: 3886.47 samples/sec	Train-accuracy=0.504531
2016-08-23 08:18:03,858 - root - INFO - Epoch[0] Batch [2950]	Speed: 3852.24 samples/sec	Train-accuracy=0.498281
2016-08-23 08:20:49,171 - root - INFO - Epoch[0] Batch [3000]	Speed: 3871.42 samples/sec	Train-accuracy=0.507656
2016-08-23 08:23:37,118 - root - INFO - Epoch[0] Batch [3050]	Speed: 3810.75 samples/sec	Train-accuracy=0.503437
2016-08-23 08:26:23,233 - root - INFO - Epoch[0] Batch [3100]	Speed: 3852.73 samples/sec	Train-accuracy=0.495625
2016-08-23 08:29:08,282 - root - INFO - Epoch[0] Batch [3150]	Speed: 3877.64 samples/sec	Train-accuracy=0.505313
2016-08-23 08:31:52,980 - root - INFO - Epoch[0] Batch [3200]	Speed: 3885.92 samples/sec	Train-accuracy=0.513750
2016-08-23 08:34:39,924 - root - INFO - Epoch[0] Batch [3250]	Speed: 3833.94 samples/sec	Train-accuracy=0.506719
2016-08-23 08:37:28,635 - root - INFO - Epoch[0] Batch [3300]	Speed: 3793.45 samples/sec	Train-accuracy=0.498906
2016-08-23 08:40:12,437 - root - INFO - Epoch[0] Batch [3350]	Speed: 3907.54 samples/sec	Train-accuracy=0.496094
2016-08-23 08:42:58,313 - root - INFO - Epoch[0] Batch [3400]	Speed: 3858.68 samples/sec	Train-accuracy=0.496719
...
"""