# Prepare Data for Keras

There are 3 types of data:
- **Nominal category** - a group of objects that can be collectively grouped on the basis of a particular characteristic - a qualitative property
- **Ordinal data** - a categorical data type where the variables have naturally ordered categories and the distances between the categories are not known
- **Numerical data**


In [1]:
import pandas as pd
import numpy as np
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Execute the helper notebook in this kernel. testimon_fraud() is not defined
# in this notebook, so it presumably comes from utils.ipynb - TODO confirm.
%run utils.ipynb
# Load the dataset used by every cell below (they read its 'type', 'isFraud',
# 'nameOrig', 'nameDest' and numeric balance columns).
# NOTE(review): assumed to return a pandas DataFrame - confirm in utils.ipynb.
df = testimon_fraud()

In [2]:
# Example 1 - one-hot encode the categorical 'type' column.
# Prefix each label first so the generated indicator columns are
# self-describing (Type_<label>).
type_dummies = df['type'].astype(str).radd('Type_')
print(type_dummies.head(5))
# One indicator column per distinct prefixed label.
dummies = pd.get_dummies(type_dummies)
print(dummies.head())

0    Type_TRANSFER
1    Type_CASH_OUT
2    Type_CASH_OUT
3    Type_TRANSFER
4    Type_TRANSFER
Name: type, dtype: object
   Type_CASH_OUT  Type_TRANSFER
0              0              1
1              1              0
2              1              0
3              0              1
4              0              1


In [3]:
# Example 2 - represent each category by an integer code.
# Converting to the pandas 'category' dtype assigns one code per category.
types = df['type'].astype('category')
print(types.head())
# .cat.codes exposes the integer code behind each row's category.
print(types.cat.codes.head())

0    TRANSFER
1    CASH_OUT
2    CASH_OUT
3    TRANSFER
4    TRANSFER
Name: type, dtype: category
Categories (2, object): [CASH_OUT, TRANSFER]
0    1
1    0
2    0
3    1
4    1
dtype: int8


## Build a Keras model with an embedding for the categorical variable

In [4]:
from keras.layers import *
from keras.models import *

# Embedding branch for the categorical 'type' variable.

# The embedding needs one row per distinct value of the column.
num_types = len(pd.unique(df['type']))
type_embedding_dim = 3 # arbitrary choice

# One value per sample goes in; the embedding looks up a dense vector for it.
type_in = Input(shape=(1,))
type_embedding = Embedding(
    input_dim=num_types,
    output_dim=type_embedding_dim,
    input_length=1,
)(type_in)
# Reshape to a flat vector of length type_embedding_dim per sample.
type_out = Reshape(target_shape=(type_embedding_dim,))(type_embedding)

type_model = Model(type_in, type_out)

# Collect each branch's input and output tensors; later cells extend these
# lists and merge the branches into one model.
inputs = [type_in]
outputs = [type_out]

print(inputs)

Using TensorFlow backend.


[<tf.Tensor 'input_1:0' shape=(?, 1) dtype=float32>]


In [5]:
# Second branch: a single dense layer over all other (non-categorical) variables.

other_cols = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
# Store the account-name columns as categoricals. They are not listed in
# other_cols, so they are not fed to this branch.
df['nameOrig'] = df.nameOrig.astype('category')
df['nameDest'] = df.nameDest.astype('category')
print(other_cols)
num_rest = len(other_cols)

# One input feature per numeric column.
rest_in = Input(shape=(num_rest,))
rest_out = Dense(16)(rest_in)

rest_model = Model(rest_in, rest_out)

# Rebuild the branch lists explicitly instead of appending to them. Appending
# made this cell non-idempotent: re-running it added rest_in/rest_out a second
# time, which breaks the merged model built from these lists.
# Order matters: the categorical branch first, then this numeric branch.
inputs = [type_in, rest_in]
outputs = [type_out, rest_out]

['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']


In [6]:
# Build the model: merge both branches, add one hidden layer, and finish with
# a single sigmoid unit for the binary target.
concatenated = Concatenate()(outputs)
x = Dense(16)(concatenated)
x = Activation('sigmoid')(x)
# Feed the hidden activations into the output unit. The previous version
# applied Dense(1) to `concatenated`, so the hidden layer above was built
# and then silently discarded.
x = Dense(1)(x)
model_out = Activation('sigmoid')(x)

merged_model = Model(inputs, model_out)
merged_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [12]:
# Train the model.
# Integer category codes feed the embedding branch; the numeric columns feed
# the dense branch. The arrays are passed in the same order as the model's
# input list (categorical first, then numeric).
types = df['type'].astype('category').cat.codes
rest = df[other_cols]
target = df['isFraud']
history = merged_model.fit(
    x=[types.values, rest.values],
    y=target.values,
    batch_size=128,
    epochs=1,
)


Epoch 1/1
