Our new strategy is to turn categorical data into embeddings and merge into our neural network. Based on the random forest features, we will drop unnecessary feature

Categorical Data that needs to be turned into embeddings: Breed, Sterilized, Vaccinated, Maturity Size, Type, Fur Length, State, Dewormed, Gender, Color, Health, Name

We will merge Bree1 and Breed2

In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import train_test_split
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical


from keras.layers import Dense, Input, Flatten, Reshape
#from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, Concatenate
from keras.models import Model, Sequential

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import optimizers
from keras import regularizers
from keras.utils.layer_utils import print_summary
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import CSVLogger

Using TensorFlow backend.


In [2]:
pet_data = pd.read_csv("./all/train.csv", sep=",")

In [3]:
pet_data = pd.read_csv("./all/train.csv", sep=",")
breed_labels = pd.read_csv('./all/breed_labels.csv')
state_labels = pd.read_csv('./all/state_labels.csv')
color_labels = pd.read_csv('./all/color_labels.csv')

readable_pet_data = pet_data
readable_pet_data['Breed1'] = readable_pet_data['Breed1'].map(breed_labels.set_index('BreedID')['BreedName'])
readable_pet_data['Breed2'] = readable_pet_data['Breed2'].map(breed_labels.set_index('BreedID')['BreedName'])

readable_pet_data['State'] = readable_pet_data['State'].map(state_labels.set_index('StateID')['StateName'])

readable_pet_data['Color1'] = readable_pet_data['Color1'].map(color_labels.set_index('ColorID')['ColorName'])
readable_pet_data['Color2'] = readable_pet_data['Color2'].map(color_labels.set_index('ColorID')['ColorName'])
readable_pet_data['Color3'] = readable_pet_data['Color3'].map(color_labels.set_index('ColorID')['ColorName'])

genderDict = {1:'Male', 2:'Female', 3:'Mixed'}
typeDict = {1:'Dog', 2:'Cat'}
maturityDict = {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large', 0: 'Not Specified'}
healthDict = {1:'Healthy', 2:'Minor Injury', 3:'Serious Injury', 0:'Not Specified'}
furDict = {1: 'Short', 2: 'Medium', 3: 'Long', 0: 'Not Specified'}

readable_pet_data['Gender'] = readable_pet_data['Gender'].map(genderDict)
readable_pet_data['Type'] = readable_pet_data['Type'].map(typeDict)
readable_pet_data['MaturitySize'] = readable_pet_data['MaturitySize'].map(maturityDict)
readable_pet_data['Health'] = readable_pet_data['Health'].map(healthDict)
readable_pet_data['FurLength'] = readable_pet_data['FurLength'].map(furDict)

In [4]:
categorical_vars = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
        'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
        'Sterilized', 'Health', 'State']
numerical_vars = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']

In [5]:
readable_pet_data.axes

[RangeIndex(start=0, stop=14993, step=1),
 Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
        'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
        'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
        'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
       dtype='object')]

In [6]:
embeddings = []
inputs = []
for categorical_var in categorical_vars :
    i = Input(shape=(1,))
    model = Sequential()
    no_of_unique_cat  = pet_data[categorical_var].nunique()
    embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
    embedding_size = int(embedding_size)
    vocab  = no_of_unique_cat+1
    embedding = Embedding(vocab ,embedding_size, input_length = 1 )(i)
    embedding = Reshape(target_shape=(embedding_size,))(embedding)
    embeddings.append( embedding )
    inputs.append(i)

In [7]:
input_numeric = Input(shape=(5,))
embedding_numeric = Dense(16)(input_numeric) 
inputs.append(input_numeric)
embeddings.append(embedding_numeric)

In [8]:
Concatenate()(embeddings)

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 148) dtype=float32>

In [9]:
x = Concatenate()(embeddings)
x = Dense(80, activation='relu')(x)
x = Dropout(.35)(x)
x = Dense(20, activation='relu')(x)
x = Dropout(.15)(x)
x = Dense(10, activation='relu')(x)
x = Dropout(.15)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs, output)

model.compile(loss='binary_crossentropy', optimizer='adam')

In [10]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [11]:
p = pet_data[categorical_vars + numerical_vars]

In [12]:
def preproc(X_train, X_val, X_test):

    input_list_train = []
    input_list_val = []
    input_list_test = []
    
    #the cols to be embedded: rescaling to range [0, # values)
    for c in embed_cols:
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)
     
    #the rest of the columns
    #other_cols = [c for c in X_train.columns if (not c in embed_cols)]
    #input_list_train.append(X_train[other_cols].values)
    #input_list_val.append(X_val[other_cols].values)
    #input_list_test.append(X_test[other_cols].values)
    
    return input_list_train, input_list_val, input_list_test

In [13]:
embed_cols = categorical_vars
#embed_cols = categorical_vars + numerical_vars
p1 = preproc(p, p, p)

TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
p1

In [None]:
model.fit(p1[0], pet_data.AdoptionSpeed, epochs=10)

In [None]:
pet_data.AdoptionSpeed.ndim()

In [None]:
p1[0]