In [12]:
%matplotlib inline

import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

import keras as ks
from keras import backend as K
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping, ModelCheckpoint


from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from pandas_summary import DataFrameSummary

from tqdm import tqdm


In [2]:
data_path = 'input/'
train = pd.read_csv(data_path +'train.csv', parse_dates=['listing_at'])
test = pd.read_csv(data_path +'test.csv', parse_dates=['listing_at'])

In [33]:
DataFrameSummary(train).summary()

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
count,700,700,700,700,,,700,,
mean,5529.53,1.49857,1092.12,1118.31,,,9.40714,,
std,2590.82,1.35902,201.949,298.301,,,7.8042,,
min,1041,0,646,645,,,0,,
25%,3309.25,0,948.75,948,,,4,,
50%,5556.5,1,1059,1058,,,7,,
75%,7617.5,2,1188.25,1193.75,,,12,,
max,9990,4,1874,3181,,,59,,
counts,700,700,700,700,700,700,700,700,700
uniques,700,5,453,446,10,3,41,700,188


In [34]:
DataFrameSummary(test).summary()

Unnamed: 0,item_id,category_class,sold_price,price,area_name,condition,size,listing_at,item_tag_hash
count,300,0,300,300,,,300,,
mean,5455.23,,1099.69,1104.9,,,9.34333,,
std,2638.17,,214.623,235.455,,,7.8786,,
min,1006,,685,684,,,0,,
25%,3090.75,,962.75,961.75,,,4,,
50%,5389,,1051,1050.5,,,7,,
75%,7683.75,,1198,1196.25,,,13,,
max,9981,,1700,2570,,,40,,
counts,300,0,300,300,300,300,300,300,300
uniques,300,0,245,243,10,3,33,300,146


In [52]:
num_cols = ['item_id', 'sold_price', 'price', 'size']
cat_cols = ['area_name', 'condition', 'item_tag_hash']
dt_cols = ['listing_at']
y_cols = ['category_class']

def create_y_and_merged(train, test, y_cols):
    y = train[y_cols]

    merged = pd.concat([train, test], axis=0)
    merged.drop(y_cols, axis=1, inplace=True)
    print(merged.shape, y.shape[0])
    return y, merged

In [80]:
def preprocess(df_original, num_cols, cat_cols, dt_cols, if_cat_to_dummy=True, if_cat_to_int=False):
    new_dt_cat_cols, df  = prepr_dt(df_original, dt_cols)
    num_ar = prepr_num(df, num_cols)
    new_cat_cols = cat_cols + new_dt_cat_cols
    if if_cat_to_dummy:
        cat_ar = prepr_cat(df, new_cat_cols)
    else:
        if if_cat_to_int:
            cat_ar = cat_to_int(df, new_cat_cols)
        else:
            cat_ar = df[new_cat_cols].values
#     print('num_cols, new_cat_cols:', num_cols, new_cat_cols)
#     print('df.shape, num_ar.shape, cat_ar.shape:', df.shape, num_ar.shape, cat_ar.shape)
    return np.concatenate([num_ar, cat_ar], axis=1)

def prepr_num(df, num_cols):
    if num_cols == None: return None
    scaler = StandardScaler()
    return scaler.fit_transform(df[num_cols])    

def prepr_cat(df, cat_cols):
    if cat_cols == None: return None
    return pd.get_dummies(df[cat_cols]).values

def cat_to_int(df, cat_cols):
    return df[cat_cols].apply(lambda x: pd.factorize(x)[0])

def prepr_dt(df, dt_cols, drop_original_dt_cols=True):
    if cat_cols == None: return None
    '''
    Creates addition columns from datetime column.
    :param df: pandas.DataFrame
    :param dt_col_name: name of the datetime column
    :return: pandas.DataFrame with added columns.
    '''
    new_dt_cat_cols = []
    for col in dt_cols:
        day_col = col+'_day'
        hour_col = col+'_hour'
        df[day_col]  = df[col].apply(lambda x: x.day).astype(int)
        df[hour_col]  = df[col].apply(lambda x: x.hour).astype(int)
        new_dt_cat_cols += [day_col, hour_col]
    if drop_original_dt_cols: df.drop(columns=dt_cols, axis=1, inplace=True)
    return new_dt_cat_cols, df

In [81]:
# test cell:

_, merged = create_y_and_merged(train, test, y_cols)
print(merged.shape)
# print(merged.head())

# new_dt_cat_cols, temp = prepr_dt(merged, dt_cols, drop_original_dt_cols=True)

# temp = prepr_num(merged, num_cols)

# temp = prepr_cat(merged, cat_cols)

# temp = cat_to_int(merged, cat_cols)

temp = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=False, if_cat_to_int=False)
# temp = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=False, if_cat_to_int=True)
# temp = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=True, if_cat_to_int=False)

print(new_dt_cat_cols, temp.shape, temp[:2])
del temp, merged

(1000, 8) 700
(1000, 8)
['listing_at_day', 'listing_at_hour'] (1000, 9) [[0.9446912770873567 0.33846602141784227 0.16994873782256065
  -0.3054181543952137 'fff' 'Fair'
  '3ca192bd7558780793444f73366c58d60c9d7775' 1 16]
 [-1.405164687165467 -0.43468711775747204 -0.3928544014108358
  -0.8170063527121546 'fff' 'Fair'
  'fbaacb960902382e4f6c96f2d8f225c24eecadb4' 2 14]]


In [79]:
def save_arr(arr, name):
    nm = 'input/' + name + '.csv'
    df = pd.DataFrame(arr)
    df.to_csv(nm, header=None, index=None)
    print(f'Saved "{nm}" with shape: {arr.shape}')

y, merged = create_y_and_merged(train, test, y_cols)
save_arr(y.astype(int), 'y')
    
arr = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=False, if_cat_to_int=False)
save_arr(arr, 'merged_with_cat_as_str')

_, merged = create_y_and_merged(train, test, y_cols)
arr = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=False, if_cat_to_int=True)
save_arr(arr, 'merged_with_cat_as_int')

_, merged = create_y_and_merged(train, test, y_cols)
arr = preprocess(merged, num_cols, cat_cols, dt_cols, if_cat_to_dummy=True, if_cat_to_int=False)
save_arr(arr, 'merged_with_cat_as_dummy')


(1000, 8) 700
Saved "input/y.csv" with shape: (700, 1)
num_cols, new_cat_cols: ['item_id', 'sold_price', 'price', 'size'] ['area_name', 'condition', 'item_tag_hash', 'listing_at_day', 'listing_at_hour']
df.shape, num_ar.shape, cat_ar.shape: (1000, 9) (1000, 4) (1000, 5)
Saved "input/merged_with_cat_as_str.csv" with shape: (1000, 9)
(1000, 8) 700
num_cols, new_cat_cols: ['item_id', 'sold_price', 'price', 'size'] ['area_name', 'condition', 'item_tag_hash', 'listing_at_day', 'listing_at_hour']
df.shape, num_ar.shape, cat_ar.shape: (1000, 9) (1000, 4) (1000, 5)
Saved "input/merged_with_cat_as_int.csv" with shape: (1000, 9)
(1000, 8) 700
num_cols, new_cat_cols: ['item_id', 'sold_price', 'price', 'size'] ['area_name', 'condition', 'item_tag_hash', 'listing_at_day', 'listing_at_hour']
df.shape, num_ar.shape, cat_ar.shape: (1000, 9) (1000, 4) (1000, 215)
Saved "input/merged_with_cat_as_dummy.csv" with shape: (1000, 219)


# Feature Engineering with NN
We will use the Autoencoder NN to generate the latent features. We will use these latent features as the additional features for training and prediction.

In [4]:
def load_arr(name):
    nm = 'input/' + name + '.csv'
    arr = pd.read_csv(nm, header=None).values
    print(f'Loaded "{nm}" with shape: {arr.shape}')
    return arr

y = load_arr('y')
    
merged_with_cat_as_dummy = load_arr('merged_with_cat_as_dummy')

merged_with_cat_as_int = load_arr('merged_with_cat_as_int')

merged_with_cat_as_str = load_arr('merged_with_cat_as_str')

Loaded "input/y.csv" with shape: (700, 1)
Loaded "input/merged_with_cat_as_dummy.csv" with shape: (1000, 219)
Loaded "input/merged_with_cat_as_int.csv" with shape: (1000, 9)
Loaded "input/merged_with_cat_as_str.csv" with shape: (1000, 9)


In [7]:
# convert y to dummies !!! use train and test all compound because we do not need y for creating the latent features.
x_full = merged_with_cat_as_dummy[:]
print([x.shape for x in [x_full]])


[(1000, 219)]


In [9]:
def create_ED_model(inp_shape, dropout=0.25):
    inp = ks.Input(shape=(inp_shape,), dtype='float32')
    out = ks.layers.Dense(128, activation='relu')(inp)
    out = ks.layers.BatchNormalization()(out)
    out = ks.layers.Dropout(dropout)(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.BatchNormalization()(out)
    out = ks.layers.Dropout(dropout)(out)
    out = ks.layers.Dense(32, activation='relu')(out)
    out = ks.layers.BatchNormalization()(out)
    out = ks.layers.Dropout(dropout)(out)
    out = ks.layers.Dense(64, activation='relu')(out)
    out = ks.layers.BatchNormalization()(out)
    out = ks.layers.Dropout(dropout)(out)
    out = ks.layers.Dense(128, activation='relu')(out)
    out = ks.layers.BatchNormalization()(out)
    out = ks.layers.Dropout(dropout)(out)
    out = ks.layers.Dense(inp_shape, activation='relu')(out)

    model = ks.Model(inp, out)
    model.compile(loss='mean_squared_error', optimizer=ks.optimizers.Adam(lr=3e-3))
    #print(model.summary())
    return model

def train_ED_model(model, x_train, batch_size = 32, epochs = 1000):
    earlystopper = EarlyStopping(patience=int(epochs/10), verbose=1)
    checkpointer = ModelCheckpoint('models/EncoderDecoder.model', verbose=1, save_best_only=True)
    results = model.fit(x=x_train, y=x_train, validation_split=0.2, batch_size=batch_size, epochs=epochs, verbose=0,
                        callbacks=[earlystopper, checkpointer])
    return model

model = create_ED_model(x_full.shape[1])
model = train_ED_model(model, x_full, batch_size = 32, epochs = 1000)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 219)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               28160     
_________________________________________________________________
batch_normalization_11 (Batc (None, 128)               512       
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                8256      
_________________________________________________________________
batch_normalization_12 (Batc (None, 64)                256       
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
__________

Epoch 00311: val_loss did not improve

Epoch 00312: val_loss did not improve

Epoch 00313: val_loss did not improve

Epoch 00314: val_loss did not improve

Epoch 00315: val_loss did not improve

Epoch 00316: val_loss did not improve

Epoch 00317: val_loss did not improve

Epoch 00318: val_loss did not improve

Epoch 00319: val_loss did not improve

Epoch 00320: val_loss did not improve

Epoch 00321: val_loss did not improve

Epoch 00322: val_loss did not improve

Epoch 00323: val_loss did not improve

Epoch 00324: val_loss did not improve

Epoch 00325: val_loss did not improve

Epoch 00326: val_loss did not improve

Epoch 00327: val_loss did not improve

Epoch 00328: val_loss did not improve

Epoch 00329: val_loss did not improve

Epoch 00330: val_loss did not improve

Epoch 00331: val_loss did not improve

Epoch 00332: val_loss did not improve

Epoch 00333: val_loss did not improve

Epoch 00334: val_loss did not improve

Epoch 00335: val_loss did not improve

Epoch 00336: val_loss did

In [15]:
def get_layer_output(layer_num, layer_outs):
    assert layer_num >= 0 and layer_num < len(layer_outs)
    layer_out = layer_outs[layer_num][0]
    cols = ['ly'+str(layer_num)+'_'+str(col) for col in range(layer_out.shape[1])]
    return pd.DataFrame(data=layer_out, columns=cols)

def get_ED_outputs(ed_model, x_train):
    # print('ED_model number of layers (-1):', ed_model_layers_num)  # Input layer does not count
    #print(ed_model.summary())

    ed_model.layers.pop() # not interested in the last layer
    #print(ed_model.summary())

    features = ed_model.predict(x_train)
    #print('features.shape:',  features.shape)

    inp = ed_model.input  # input placeholder
    outputs = [layer.output for layer in ed_model.layers]  # all layer outputs
    functors = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs]
    layer_outs = [func([x_train, 1.]) for func in functors]
    return [get_layer_output(l, layer_outs) for l in range(1, len(layer_outs))]

layer_outs_dfs = get_ED_outputs(model, x_full)
_ = [print(i, ly.shape) for i, ly in enumerate(layer_outs_dfs)]


0 (1000, 128)
1 (1000, 128)
2 (1000, 128)
3 (1000, 64)
4 (1000, 64)
5 (1000, 64)
6 (1000, 32)
7 (1000, 32)
8 (1000, 32)
9 (1000, 64)
10 (1000, 64)
11 (1000, 64)


In [19]:
# we need the 6th layer output as the latent features
layer_outs_dfs[6].to_csv('input/ED.layer_6.csv', index=False)