# Embedding

## Loading data

In [1]:
import pandas as pd

In [2]:
# Load the training and test tables from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
# Number of distinct values (missing values included) in every column whose
# name starts with 'cat', in column order.
cat_size = [
    train[col].nunique(dropna=False)
    for col in train.columns
    if col.startswith('cat')
]
cat_size

[2, 15, 19, 13, 20, 84, 16, 51, 61, 19, 299, 2, 2, 2, 2, 4, 4, 4, 4]

In [4]:
# Names of the columns whose name starts with 'cat', in column order.
# Iterating a DataFrame yields its column labels.
cat_cols = [name for name in train if name.startswith('cat')]
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [5]:
# Names of the columns whose name starts with 'cont', in column order.
# Iterating a DataFrame yields its column labels.
cont_cols = [name for name in train if name.startswith('cont')]
cont_cols

['cont0',
 'cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10']

## Define the model

In [6]:
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, Reshape, concatenate, Dense, Lambda
from tensorflow.keras.optimizers import Adam

In [7]:
# Embedding-size rule of thumb from fast.ai, where m is the number of
# distinct values of a categorical column:
#embedding_size = min(50, (m+1) / 2)

In [8]:
def build_model():
    """Build and compile the embedding-based MLP used as the classifier.

    The model takes one flat input of width
    ``len(cat_cols) + len(cont_cols)``: the first ``len(cat_cols)`` features
    are the categorical columns and the rest are the continuous columns.
    Each categorical column with more than two distinct values (per
    ``cat_size``) gets its own ``Embedding`` layer whose width follows the
    fast.ai rule of thumb ``min(50, (size + 1) // 2)``; columns with at most
    two distinct values are fed in directly as a single feature. Everything
    is concatenated and passed through two ReLU dense layers (200 and 100
    units) and a 2-unit softmax.

    NOTE(review): the embeddings assume every categorical feature already
    holds integer codes in ``[0, size)`` -- confirm against the transform
    that runs before this model.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy loss, Adam
        optimizer with default settings).
    """
    n_cat = len(cat_cols)
    combi_input = Input((n_cat + len(cont_cols),))

    # Split the flat input back into one slice per categorical column and a
    # single block holding all continuous columns.
    cat_input = [combi_input[:, i] for i in range(n_cat)]
    cont_input = combi_input[:, n_cat:]

    cat_embedding = []
    for i, size in enumerate(cat_size):
        if size <= 2:
            # Columns with at most two values skip the embedding and enter
            # the network as one raw feature.
            cat = Reshape(target_shape=(1,))(cat_input[i])
        else:
            # Floor division keeps the width an integer, which Embedding
            # requires for its output dimension.
            embedding_size = min(50, (size + 1) // 2)
            cat = Embedding(
                size, embedding_size, name=f'cat{i}_embedding'
            )(cat_input[i])
        cat_embedding.append(cat)

    output = concatenate([cont_input] + cat_embedding)
    output = Dense(200, activation='relu')(output)
    output = Dense(100, activation='relu')(output)
    output = Dense(2, activation='softmax')(output)

    model = Model(inputs=combi_input, outputs=output)
    # NOTE(review): categorical cross-entropy expects one-hot targets; this
    # relies on the caller (or the scikit-learn wrapper) encoding the labels
    # that way -- confirm.
    model.compile(loss='categorical_crossentropy', optimizer=Adam())

    return model

In [9]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [10]:
from sklearn.pipeline import Pipeline
from categorical_transform import IntegerCategoricalTransform
# Two-stage pipeline: the categorical transform over `cat_cols`, followed by
# the Keras model from `build_model` trained for a single epoch.
p = Pipeline(
    steps=[
        ('cat_trans', IntegerCategoricalTransform(cat_cols)),
        ('mlp', KerasClassifier(build_model, epochs=1)),
    ]
)

In [11]:
# Labels come from the 'target' column; the features are every remaining
# column except 'target' and 'id'.
y_train = train['target']
x_train = train.drop(['target', 'id'], axis=1)
p.fit(x_train, y_train)



Pipeline(steps=[('cat_trans',
                 IntegerCategoricalTransform(cat_cols=['cat0', 'cat1', 'cat2',
                                                       'cat3', 'cat4', 'cat5',
                                                       'cat6', 'cat7', 'cat8',
                                                       'cat9', 'cat10', 'cat11',
                                                       'cat12', 'cat13',
                                                       'cat14', 'cat15',
                                                       'cat16', 'cat17',
                                                       'cat18'])),
                ('mlp',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f5d421284f0>)])

In [None]:
# Run the fitted pipeline on the training features; as the cell's last
# expression, the predictions are displayed as the cell output.
p.predict(x_train)

In [None]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the whole pipeline scored with ROC AUC; the
# scores on the training folds are returned as well.
scores = cross_validate(
    p,
    X=x_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
    return_train_score=True,
)
scores