#### Imports

In [1]:
import numpy as np
import pandas as pd
import gc
DATA_PATH='../data/'

In [2]:
# Load the padded training frame. Slow cell: the pickle is roughly 16 GB.
df = pd.read_pickle(DATA_PATH+'padded_train.pickle')

In [3]:
# Row count divided by 206209 (presumably the number of distinct users -- TODO confirm).
# A whole-number result indicates the row count is an exact multiple of that figure.
df.shape[0]/206209

68.0

#### Processing starts here

Sanity check: count the distinct `user_id` values (the first element of each index entry).

In [None]:
# Number of distinct values in the first position of the index entries.
len({key[0] for key in df.index})

Get latest order for each `user_id`

In [4]:
# Take the literal last row of each user's block of rows.
# `GroupBy.last()` is not used because it returns the last *non-null* value of
# every column: NaN padding in a user's final order would be silently filled
# with product ids from that user's earlier orders. `tail(1)` keeps the row intact.
labels = df.groupby('user_id').tail(1)
# `tail` keeps the original index, so re-key by `user_id` to get one row per
# user indexed by user id (assumes `user_id` is a named index level, as it is
# not among the columns). Rows stay in the order they appear in `df`.
labels = labels.set_index(labels.index.get_level_values('user_id'))

In [None]:
# Expect one row per `user_id` group.
labels.shape

In [6]:
# Cache the per-user label frame on disk.
labels.to_pickle(DATA_PATH+"training_labels_fresh.pickle")

In [25]:
# Reload the cached label frame (the file written by the `to_pickle` cell).
labels = pd.read_pickle(DATA_PATH+"training_labels_fresh.pickle")

Drop columns that we don't want to predict or don't need to link back to other dataframes

In [26]:
# Drop the order-metadata columns from the label frame.
# Each column is listed exactly once, and the result is rebound instead of
# mutating the frame in place.
labels = labels.drop(
    ['eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'],
    axis=1,
)

Sanity check columns

In [27]:
# Show the column labels that remain after the drop.
labels.columns

Index(['order_id',          0,          1,          2,          3,          4,
                5,          6,          7,          8,
       ...
              135,        136,        137,        138,        139,        140,
              141,        142,        143,        144],
      dtype='object', length=146)

In [5]:
# Persist the label frame as a .npy array under the `final/` sub-directory.
np.save(DATA_PATH+"final/sequence_labels.npy",labels)

Prepare for numpy conversion

In [None]:
# Remove the `eval_set` column in place before the frame is written out as an array.
df.drop('eval_set', axis=1, inplace=True)

In [7]:
# Write the frame out as a .npy array so it can be memory-mapped in a later cell.
np.save(DATA_PATH+'train_sequences0.npy',df)

In [3]:
# Memory-map the saved array in read/write mode.
# NOTE: from here on `df` is a NumPy memmap, not a pandas DataFrame.
df = np.load(DATA_PATH+'train_sequences0.npy', mmap_mode='r+')

Remove last order for each `user_id`.  We need this to avoid redundancy between features array and labels array.

In [4]:
# Row indices 67, 135, 203, ... i.e. the final row of every consecutive 68-row block.
last_rows = np.arange(67, df.shape[0], 68)
# np.delete returns a new array with those rows removed.
df = np.delete(df, last_rows, axis=0)

In [5]:
# Save the array with the last row of each 68-row block removed.
np.save(DATA_PATH+'train_sequences.npy', df)

Sanity checks

In [2]:
# Memory-map the trimmed array. Mode 'r+' means in-place edits to `nump`
# are written back to the .npy file.
nump = np.load(DATA_PATH+'train_sequences.npy',mmap_mode='r+')

In [23]:
# NOTE(review): `.iloc` requires `df` to be the padded pandas DataFrame. An
# earlier cell rebinds `df` to a NumPy array, so on a top-to-bottom run this
# cell needs the DataFrame to be reloaded first.
df.iloc[67,:]

order_id                  1.1879e+06
eval_set                       train
order_number                      11
order_dow                          4
order_hour_of_day                  8
days_since_prior_order            14
0                                196
1                              25133
2                              38928
3                              26405
4                              39657
5                              10258
6                              13032
7                              26088
8                              27845
9                              49235
10                             46149
11                               NaN
12                               NaN
13                               NaN
14                               NaN
15                               NaN
16                               NaN
17                               NaN
18                               NaN
19                               NaN
20                               NaN
2

In [24]:
# Row 66 of the trimmed array, for comparison with the DataFrame row shown above.
# NOTE(review): the recorded output has the same order_id / order_number as the
# `df.iloc[67]` output, although row 67 of each 68-row block is meant to have
# been removed -- confirm the recorded outputs are not from a stale file.
nump[66]

memmap([  1.18789900e+06,   1.10000000e+01,   4.00000000e+00,
          8.00000000e+00,   1.40000000e+01,   1.96000000e+02,
          2.51330000e+04,   3.89280000e+04,   2.64050000e+04,
          3.96570000e+04,   1.02580000e+04,   1.30320000e+04,
          2.60880000e+04,   2.78450000e+04,   4.92350000e+04,
          4.61490000e+04,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
                     nan,              nan,              nan,
        

Useful for remembering which column is which in features array.

In [28]:
# NOTE(review): needs `df` to be the padded DataFrame (with `eval_set` still present),
# not the NumPy array that `df` is rebound to in an earlier cell.
df.columns # Take out 'eval_set' when considering which numpy columns are which

Index([              'order_id',               'eval_set',
                 'order_number',              'order_dow',
            'order_hour_of_day', 'days_since_prior_order',
                              0,                        1,
                              2,                        3,
       ...
                            135,                      136,
                            137,                      138,
                            139,                      140,
                            141,                      142,
                            143,                      144],
      dtype='object', length=151)

Fill NAs, stack, and normalize.

In [3]:
# Replace NaN with 0 in place (copy=False). `nump` was opened with
# mmap_mode='r+', so this also rewrites the values in train_sequences.npy.
np.nan_to_num(nump,copy=False)

memmap([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        ..., 
        [  1.85473600e+06,   1.10000000e+01,   4.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  6.26363000e+05,   1.20000000e+01,   1.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  2.97766000e+06,   1.30000000e+01,   1.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [4]:
# Cut the rows into 206209 equal, consecutive chunks (np.split raises if the
# row count is not an exact multiple). `nump` becomes a list of arrays.
nump = np.split(nump,206209)

In [5]:
# Stack the chunks along a new axis 1: result is (rows per chunk, number of chunks, columns).
nump = np.stack(nump,axis=1)

In [6]:
# Confirm the stacked layout.
nump.shape

(67, 206209, 150)

In [7]:
# Bounds for min-max scaling, reduced over axis 1 and kept broadcastable.
maxer = nump.max(axis=1, keepdims=True)
minner = nump.min(axis=1, keepdims=True)
# Leave the first two columns unscaled: (x - 0) / (1 - 0) == x.
maxer[:,:,:2] = 1
minner[:,:,:2] = 0
# Guard zero-range slices *before* the division that uses these bounds.
# Where max == min every value equals min, so (x - min) / 1 == 0: the same
# value that 0/0 -> NaN -> nan_to_num yields, without the divide warning.
flat = maxer == minner
maxer[flat] = minner[flat] + 1

In [8]:
# Min-max scale with the bounds computed above; both broadcast over axis 1.
span = maxer - minner
new = (nump - minner) / span

  if __name__ == '__main__':


In [9]:
# Reset zero-range bounds to the identity scaling (max=1, min=0).
# The mask is evaluated once, before either array is modified: recomputing
# `maxer==minner` after `maxer` has been overwritten selects different positions.
# NOTE: this runs after `new` has been computed, so it does not change `new`.
flat = maxer == minner
maxer[flat] = 1
minner[flat] = 0

In [10]:
# Replace any NaN left by the division (e.g. 0/0) with 0, in place.
np.nan_to_num(new, copy=False)

array([[[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        ..., 
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]],

       [[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+0

In [11]:
# Persist the normalized array.
np.save(DATA_PATH+'train_normalized_sequences.npy', new)

Finally, create a splitting mask for reproducible training/validation split.  Note we're using a traditional random split instead of splitting on the sequences, mainly because of the variable lengths of the order sequences.

In [2]:
# Memory-map (read-only) the normalized features and the label array.
features = np.load(DATA_PATH+'train_normalized_sequences.npy', mmap_mode='r')
# The label array is written to `final/sequence_labels.npy` earlier in this
# notebook, so it is loaded from that same location.
labels = np.load(DATA_PATH+'final/sequence_labels.npy',mmap_mode='r')

In [3]:
# The size of features' axis 1 should equal the number of label rows.
print(features.shape, labels.shape)

(67, 206209, 150) (206209, 146)


In [4]:
# Reuse the saved boolean mask so the train/validation split is identical on
# every run. Only when no mask file exists yet is a new mask drawn (seeded,
# True with probability 0.8) and saved; an existing mask is never overwritten.
MASK_PATH = '../splitting_mask.npy'
try:
    msk = np.load(MASK_PATH)
except FileNotFoundError:
    msk = np.random.RandomState(0).rand(len(labels)) < 0.8
    np.save(MASK_PATH, msk)

In [5]:
# Mask-selected entries form the training set; the complement is validation.
# Features are masked along axis 1, labels along axis 0.
val_msk = ~msk
X_train, X_val = features[:, msk, :], features[:, val_msk, :]
y_train, y_val = labels[msk], labels[val_msk]

In [6]:
# Print feature/label shapes for the training pair, then the validation pair.
for X_part, y_part in ((X_train, y_train), (X_val, y_val)):
    print(X_part.shape, y_part.shape)

(67, 164792, 150) (164792, 146)
(67, 41417, 150) (41417, 146)


In [7]:
#np.save('../splitting_mask.npy',msk)
# Write the four split arrays under DATA_PATH, in the same order as before.
final_outputs = (
    ('final/training.npy', X_train),
    ('final/training_labels.npy', y_train),
    ('final/validation.npy', X_val),
    ('final/validation_labels.npy', y_val),
)
for rel_path, arr in final_outputs:
    np.save(DATA_PATH+rel_path, arr)