In [1]:
%matplotlib inline
import numpy as np
import pickle
import pandas as pd
import mxnet as mx
import wget
import time
import os.path
import math
import matplotlib.pyplot as plt
import logging

In [2]:
ctx = mx.gpu(0)
AZ_ACC = "amazonsentimenik"
AZ_CONTAINER = "textclassificationdatasets"
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
FEATURE_LEN = 1014
BATCH_SIZE = 128 
NUM_FILTERS = 256
EPOCHS = 10000
SD = 0.05  # std for gaussian distribution
NOUTPUT = 2  # good or bad
DATA_SHAPE = (BATCH_SIZE, 1, FEATURE_LEN, len(ALPHABET))

In [3]:
# logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='crepe_inram_onegpu.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [4]:
def download_file(url):
    # Create file-name
    local_filename = url.split('/')[-1]
    if os.path.isfile(local_filename):
        pass
        # print("The file %s already exist in the current directory\n" % local_filename)
    else:
        # Download
        print("downloading ...\n")
        wget.download(url)
        print('\nsaved data')

In [5]:
def load_data_frame(infile, shuffle = False):
    print("processing data frame: %s" % infile)
    # Get data from windows blob
    download_file('https://%s.blob.core.windows.net/%s/%s' % (AZ_ACC, AZ_CONTAINER, infile))
    
    # 3.6 mill is too much, use 2 mill (keep same ratio)
    if "test" in infile:
        maxrows = int(2097152/9)  # 16,384 batches
    elif "train" in infile:
        maxrows = int(2097152)

    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'],
                     nrows=maxrows)
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    # store class as nparray
    df.sentiment -= 1
    y_split = np.asarray(df.sentiment, dtype='bool')
    # drop columns
    df.drop(['text', 'summary', 'sentiment'], axis=1, inplace=True)
    if shuffle:
        df = df.sample(frac=1).reset_index(drop=True)
    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET), dtype='bool'), columns=ALPHABET)
    print("finished processing data frame: %s" % infile)
    print("data contains %d obs" % df.shape[0])
    batch_size = df.shape[0]
    # Create encoding
    X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='bool')
    # Main loop
    for ti, tx in enumerate(df.rev):
        if (ti+1) % (100*1000) == 0:
            print("Processed: ", ti+1)
        chars = list(tx)
        for ci, ch in enumerate(chars):
            if ch in ALPHABET:
                X_split[ti % batch_size][0][ci] = np.array(character_hash[ch], dtype='bool')
                
    # Return as a DataBatch
    #return DataBatch(data=[mx.nd.array(X_split)],
    #                 label=[mx.nd.array(y_split[ti + 1 - batch_size:ti + 1])])
    return X_split, y_split

In [6]:
def create_crepe():
    """
    Replicating: https://github.com/zhangxiangxiao/Crepe/blob/master/train/config.lua
    """
    input_x = mx.sym.Variable('data')  # placeholder for input
    input_y = mx.sym.Variable('softmax_label')  # placeholder for output
    # 1. alphabet x 1014
    conv1 = mx.symbol.Convolution(
        data=input_x, kernel=(7, 69), num_filter=NUM_FILTERS)
    relu1 = mx.symbol.Activation(
        data=conv1, act_type="relu")
    pool1 = mx.symbol.Pooling(
        data=relu1, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 2. 336 x 256
    conv2 = mx.symbol.Convolution(
        data=pool1, kernel=(7, 1), num_filter=NUM_FILTERS)
    relu2 = mx.symbol.Activation(
        data=conv2, act_type="relu")
    pool2 = mx.symbol.Pooling(
        data=relu2, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 3. 110 x 256
    conv3 = mx.symbol.Convolution(
        data=pool2, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu3 = mx.symbol.Activation(
        data=conv3, act_type="relu")
    # 4. 108 x 256
    conv4 = mx.symbol.Convolution(
        data=relu3, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu4 = mx.symbol.Activation(
        data=conv4, act_type="relu")
    # 5. 106 x 256
    conv5 = mx.symbol.Convolution(
        data=relu4, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu5 = mx.symbol.Activation(
        data=conv5, act_type="relu")
    # 6. 104 x 256
    conv6 = mx.symbol.Convolution(
        data=relu5, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu6 = mx.symbol.Activation(
        data=conv6, act_type="relu")
    pool6 = mx.symbol.Pooling(
        data=relu6, pool_type="max", kernel=(3, 1), stride=(1, 1))
    # 34 x 256
    flatten = mx.symbol.Flatten(data=pool6)
    # 7.  8704
    fc1 = mx.symbol.FullyConnected(
        data=flatten, num_hidden=1024)
    act_fc1 = mx.symbol.Activation(
        data=fc1, act_type="relu")
    drop1 = mx.sym.Dropout(act_fc1, p=0.5)
    # 8. 1024
    fc2 = mx.symbol.FullyConnected(
        data=drop1, num_hidden=1024)
    act_fc2 = mx.symbol.Activation(
        data=fc2, act_type="relu")
    drop2 = mx.sym.Dropout(act_fc2, p=0.5)
    # 9. 1024
    fc3 = mx.symbol.FullyConnected(
        data=drop2, num_hidden=NOUTPUT)
    crepe = mx.symbol.SoftmaxOutput(
        data=fc3, label=input_y, name="softmax")
    return crepe

In [7]:
train_x, train_y = load_data_frame('amazon_review_polarity_train.csv')

processing data frame: amazon_review_polarity_train.csv
finished processing data frame: amazon_review_polarity_train.csv
data contains 2097152 obs
('Processed: ', 100000)
('Processed: ', 200000)
('Processed: ', 300000)
('Processed: ', 400000)
('Processed: ', 500000)
('Processed: ', 600000)
('Processed: ', 700000)
('Processed: ', 800000)
('Processed: ', 900000)
('Processed: ', 1000000)
('Processed: ', 1100000)
('Processed: ', 1200000)
('Processed: ', 1300000)
('Processed: ', 1400000)
('Processed: ', 1500000)
('Processed: ', 1600000)
('Processed: ', 1700000)
('Processed: ', 1800000)
('Processed: ', 1900000)
('Processed: ', 2000000)


In [8]:
print(train_x.shape)
print(train_y.shape)

(2097152L, 1L, 1014L, 69L)
(2097152L,)


In [9]:
train_iter = mx.io.NDArrayIter(train_x, train_y, batch_size=BATCH_SIZE, shuffle=True)

In [10]:
del train_x
del train_y
# 147 GB, 69% with 2,097,000 mill obs (3.6 will be too much)

In [None]:
model = mx.model.FeedForward(
    ctx = ctx,
    symbol = create_crepe(), 
    num_epoch = EPOCHS,  # number of training rounds
    learning_rate = 0.01,  # learning rate
    momentum = 0.9,   # momentum for sgd
    wd = 0.00001,  # weight decay for reg
    initializer = mx.init.Normal(sigma=SD)  # init with sd of 0.05
)

In [None]:
tic = time.time()
model.fit(
    X = train_iter,
    eval_metric=['accuracy'],
    batch_end_callback=mx.callback.Speedometer(100*BATCH_SIZE),
    epoch_end_callback=mx.callback.do_checkpoint("crepe_checkp_") 
)

print("Finished training in %.0f seconds" % (time.time() - tic))

*Log*

Estimated epoch time = 580 min per epoch

`
2016-08-23 11:44:46,536 - root - INFO - Start training with [gpu(0)]
2016-08-23 11:47:02,052 - root - INFO - Epoch[0] Batch [50]	Speed: 5799.10 samples/sec	Train-accuracy=0.492344
2016-08-23 11:48:50,002 - root - INFO - Epoch[0] Batch [100]	Speed: 5929.55 samples/sec	Train-accuracy=0.498594
2016-08-23 11:50:34,694 - root - INFO - Epoch[0] Batch [150]	Speed: 6113.11 samples/sec	Train-accuracy=0.509375
2016-08-23 11:52:22,282 - root - INFO - Epoch[0] Batch [200]	Speed: 5948.62 samples/sec	Train-accuracy=0.496250
2016-08-23 11:54:09,532 - root - INFO - Epoch[0] Batch [250]	Speed: 5967.37 samples/sec	Train-accuracy=0.500313
2016-08-23 11:55:53,025 - root - INFO - Epoch[0] Batch [300]	Speed: 6184.05 samples/sec	Train-accuracy=0.505000
2016-08-23 11:57:40,069 - root - INFO - Epoch[0] Batch [350]	Speed: 5978.85 samples/sec	Train-accuracy=0.511563
2016-08-23 11:59:25,526 - root - INFO - Epoch[0] Batch [400]	Speed: 6068.77 samples/sec	Train-accuracy=0.500625
2016-08-23 12:01:12,180 - root - INFO - Epoch[0] Batch [450]	Speed: 6000.77 samples/sec	Train-accuracy=0.502656
2016-08-23 12:02:56,980 - root - INFO - Epoch[0] Batch [500]	Speed: 6106.81 samples/sec	Train-accuracy=0.497500
2016-08-23 12:04:42,397 - root - INFO - Epoch[0] Batch [550]	Speed: 6071.18 samples/sec	Train-accuracy=0.504844
2016-08-23 12:06:27,793 - root - INFO - Epoch[0] Batch [600]	Speed: 6072.28 samples/sec	Train-accuracy=0.485781
2016-08-23 12:08:13,898 - root - INFO - Epoch[0] Batch [650]	Speed: 6031.82 samples/sec	Train-accuracy=0.507188
2016-08-23 12:09:58,694 - root - INFO - Epoch[0] Batch [700]	Speed: 6107.05 samples/sec	Train-accuracy=0.499063
2016-08-23 12:11:40,805 - root - INFO - Epoch[0] Batch [750]	Speed: 6268.00 samples/sec	Train-accuracy=0.491875
2016-08-23 12:13:25,710 - root - INFO - Epoch[0] Batch [800]	Speed: 6100.70 samples/sec	Train-accuracy=0.498594
2016-08-23 12:15:10,071 - root - INFO - Epoch[0] Batch [850]	Speed: 6132.62 samples/sec	Train-accuracy=0.494531
2016-08-23 12:16:53,038 - root - INFO - Epoch[0] Batch [900]	Speed: 6216.55 samples/sec	Train-accuracy=0.497031
2016-08-23 12:18:35,365 - root - INFO - Epoch[0] Batch [950]	Speed: 6254.46 samples/sec	Train-accuracy=0.507031
2016-08-23 12:20:19,480 - root - INFO - Epoch[0] Batch [1000]	Speed: 6146.99 samples/sec	Train-accuracy=0.499063
2016-08-23 12:22:04,961 - root - INFO - Epoch[0] Batch [1050]	Speed: 6067.73 samples/sec	Train-accuracy=0.501094
2016-08-23 12:23:51,604 - root - INFO - Epoch[0] Batch [1100]	Speed: 6001.67 samples/sec	Train-accuracy=0.502500
2016-08-23 12:25:35,513 - root - INFO - Epoch[0] Batch [1150]	Speed: 6159.24 samples/sec	Train-accuracy=0.497188
2016-08-23 12:27:19,124 - root - INFO - Epoch[0] Batch [1200]	Speed: 6177.90 samples/sec	Train-accuracy=0.510781
2016-08-23 12:29:04,947 - root - INFO - Epoch[0] Batch [1250]	Speed: 6047.78 samples/sec	Train-accuracy=0.502344
2016-08-23 12:30:48,331 - root - INFO - Epoch[0] Batch [1300]	Speed: 6191.47 samples/sec	Train-accuracy=0.498750
2016-08-23 12:32:33,285 - root - INFO - Epoch[0] Batch [1350]	Speed: 6100.87 samples/sec	Train-accuracy=0.498125
2016-08-23 12:34:18,167 - root - INFO - Epoch[0] Batch [1400]	Speed: 6102.10 samples/sec	Train-accuracy=0.506563
2016-08-23 12:36:05,661 - root - INFO - Epoch[0] Batch [1450]	Speed: 5954.65 samples/sec	Train-accuracy=0.504375
2016-08-23 12:37:49,828 - root - INFO - Epoch[0] Batch [1500]	Speed: 6144.22 samples/sec	Train-accuracy=0.497812
2016-08-23 12:39:36,111 - root - INFO - Epoch[0] Batch [1550]	Speed: 6021.60 samples/sec	Train-accuracy=0.487031
2016-08-23 12:41:20,530 - root - INFO - Epoch[0] Batch [1600]	Speed: 6130.09 samples/sec	Train-accuracy=0.486250
2016-08-23 12:43:04,519 - root - INFO - Epoch[0] Batch [1650]	Speed: 6154.62 samples/sec	Train-accuracy=0.507500
2016-08-23 12:44:50,089 - root - INFO - Epoch[0] Batch [1700]	Speed: 6062.27 samples/sec	Train-accuracy=0.499063
2016-08-23 12:46:36,177 - root - INFO - Epoch[0] Batch [1750]	Speed: 6032.78 samples/sec	Train-accuracy=0.498437
2016-08-23 12:48:20,635 - root - INFO - Epoch[0] Batch [1800]	Speed: 6126.81 samples/sec	Train-accuracy=0.502188
2016-08-23 12:50:06,719 - root - INFO - Epoch[0] Batch [1850]	Speed: 6033.01 samples/sec	Train-accuracy=0.501250
2016-08-23 12:51:52,374 - root - INFO - Epoch[0] Batch [1900]	Speed: 6057.68 samples/sec	Train-accuracy=0.491250
2016-08-23 12:53:37,819 - root - INFO - Epoch[0] Batch [1950]	Speed: 6069.46 samples/sec	Train-accuracy=0.503594
2016-08-23 12:55:23,374 - root - INFO - Epoch[0] Batch [2000]	Speed: 6063.25 samples/sec	Train-accuracy=0.509375
2016-08-23 12:57:08,145 - root - INFO - Epoch[0] Batch [2050]	Speed: 6108.50 samples/sec	Train-accuracy=0.494688
2016-08-23 12:58:54,782 - root - INFO - Epoch[0] Batch [2100]	Speed: 6001.73 samples/sec	Train-accuracy=0.496875
2016-08-23 13:00:41,371 - root - INFO - Epoch[0] Batch [2150]	Speed: 6004.32 samples/sec	Train-accuracy=0.502188
2016-08-23 13:02:27,224 - root - INFO - Epoch[0] Batch [2200]	Speed: 6046.12 samples/sec	Train-accuracy=0.503906
2016-08-23 13:04:12,967 - root - INFO - Epoch[0] Batch [2250]	Speed: 6052.41 samples/sec	Train-accuracy=0.497031
2016-08-23 13:05:57,377 - root - INFO - Epoch[0] Batch [2300]	Speed: 6129.74 samples/sec	Train-accuracy=0.498594
2016-08-23 13:07:41,209 - root - INFO - Epoch[0] Batch [2350]	Speed: 6163.80 samples/sec	Train-accuracy=0.497188
2016-08-23 13:09:29,085 - root - INFO - Epoch[0] Batch [2400]	Speed: 5933.62 samples/sec	Train-accuracy=0.505313
`