In [1]:
#libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.layers import Dense, Input, Flatten, Reshape
#from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, Concatenate
from keras.models import Model, Sequential
from keras.utils import to_categorical

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import optimizers
from keras import regularizers
from keras.utils.layer_utils import print_summary
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import CSVLogger

def kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

Using TensorFlow backend.


In [2]:
breeds = pd.read_csv('./all/breed_labels.csv')
colors = pd.read_csv('./all/color_labels.csv')
states = pd.read_csv('./all/state_labels.csv')

data = pd.read_csv('./all/train.csv')

all_data = data

In [3]:
data['Breed1'] = data['Breed1'].map(breeds.set_index('BreedID')['BreedName'])
data['Breed2'] = data['Breed2'].map(breeds.set_index('BreedID')['BreedName'])

data['State'] = data['State'].map(states.set_index('StateID')['StateName'])

data['Color1'] = data['Color1'].map(colors.set_index('ColorID')['ColorName'])
data['Color2'] = data['Color2'].map(colors.set_index('ColorID')['ColorName'])
data['Color3'] = data['Color3'].map(colors.set_index('ColorID')['ColorName'])

genderDict = {1:'Male', 2:'Female', 3:'Mixed'}
typeDict = {1:'Dog', 2:'Cat'}
maturityDict = {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large', 0: 'Not Specified'}
healthDict = {1:'Healthy', 2:'Minor Injury', 3:'Serious Injury', 0:'Not Specified'}
furDict = {1: 'Short', 2: 'Medium', 3: 'Long', 0: 'Not Specified'}

data['Gender'] = data['Gender'].map(genderDict)
data['Type'] = data['Type'].map(typeDict)
data['MaturitySize'] = data['MaturitySize'].map(maturityDict)
data['Health'] = data['Health'].map(healthDict)
data['FurLength'] = data['FurLength'].map(furDict)

In [4]:
data_label = data.AdoptionSpeed
#We drop name because it creates a huge embedding vector and we know that name is not very useful anyway
data = data.drop(['AdoptionSpeed', 'Name'], axis=1)

In [5]:
#Add Name as a binary label


In [6]:
data

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt
0,Cat,3,Tabby,,Male,Black,White,,Small,Short,...,2,Healthy,1,100,Selangor,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0
1,Cat,1,Domestic Medium Hair,,Male,Black,Brown,,Medium,Medium,...,3,Healthy,1,0,Kuala Lumpur,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0
2,Dog,1,Mixed Breed,,Male,Brown,White,,Medium,Medium,...,2,Healthy,1,0,Selangor,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0
3,Dog,4,Mixed Breed,,Female,Black,Brown,,Medium,Short,...,2,Healthy,1,150,Kuala Lumpur,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0
4,Dog,1,Mixed Breed,,Male,Black,,,Medium,Short,...,2,Healthy,1,0,Selangor,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0
5,Cat,3,Domestic Short Hair,,Female,Cream,Gray,,Medium,Short,...,2,Healthy,1,0,Selangor,22fe332bf9c924d4718005891c63fbed,0,This is a stray kitten that came to my house. ...,d24c30b4b,2.0
6,Cat,12,Domestic Long Hair,Domestic Long Hair,Male,Black,,,Medium,Long,...,3,Healthy,1,300,Selangor,1e0b5a458b5b77f5af581d57ebf570b3,0,anyone within the area of ipoh or taiping who ...,1caa6fcdb,3.0
7,Dog,0,Mixed Breed,,Female,Black,Brown,White,Medium,Short,...,2,Healthy,6,0,Selangor,1fba5f6e5480946254590d48f9c5198d,0,Siu Pak just give birth on 13/6/10 to 6puppies...,97aa9eeac,9.0
8,Cat,2,Domestic Medium Hair,,Female,Gray,,,Medium,Medium,...,2,Healthy,1,0,Selangor,d8af7afece71334473575c9f70daf00d,0,"healthy and active, feisty kitten found in nei...",c06d167ca,6.0
9,Cat,12,Domestic Medium Hair,,Female,Black,White,,Medium,Medium,...,3,Healthy,1,0,Selangor,1f3f36e4b18e94855b3e88af0852fdc4,0,"Very manja and gentle stray cat found, we woul...",7a0942d61,2.0


In [7]:
train, test, train_label, test_label = train_test_split(data, data_label, test_size=0.33, random_state=9)

#Turn labels into n dimensional vectors for loss calculation
train_label = to_categorical(train_label, num_classes=None)
test_label = to_categorical(test_label, num_classes=None)

In [8]:
train.drop('Description', axis=1).head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Dewormed,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,PetID,PhotoAmt
7236,Cat,1,Domestic Medium Hair,,Female,Black,Brown,Gray,Medium,Medium,...,2,2,Healthy,1,0,Johor,7cb867bc9aabe81e7e26780a93bb57ea,0,c15c93964,3.0
1782,Dog,2,Labrador Retriever,Mixed Breed,Male,Gray,White,,Medium,Medium,...,2,2,Healthy,1,0,Kuala Lumpur,88da1210e021a5cf43480b074778f3bc,0,91f5bb21f,1.0
4359,Dog,2,Mixed Breed,Mixed Breed,Male,Black,Brown,,Medium,Medium,...,1,2,Healthy,1,0,Selangor,b53c34474d9e24574bcec6a3d3306a0d,0,6202e7d4d,5.0
13445,Cat,19,Domestic Short Hair,Domestic Short Hair,Male,Yellow,,,Medium,Short,...,1,1,Healthy,1,0,Selangor,13733222f015ec6a0017c3c0527738ff,0,4b932ab38,5.0
11184,Cat,4,Domestic Short Hair,,Male,Brown,White,,Small,Short,...,1,2,Healthy,1,0,Kuala Lumpur,c00756f2bdd8fa88fc9f07a8309f7d5d,0,e9292cee2,5.0


In [9]:
categorical_vars = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
        'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
        'Sterilized', 'Health', 'State']
numerical_vars = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']

In [10]:
#Creating a Embedding model for categorical variables using the fast.ai approach
def createEmbeddingsModel(data, categorical_vars, numerical_vars):
    embeddings = []
    inputs = []
    for categorical_var in categorical_vars :
        i = Input(shape=(1,))
        model = Sequential()
        no_of_unique_cat  = data[categorical_var].nunique()
        embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
        embedding_size = int(embedding_size)
        vocab  = no_of_unique_cat+1
        embedding = Embedding(vocab ,embedding_size, input_length = 1 )(i)
        embedding = Reshape(target_shape=(embedding_size,))(embedding)
        embeddings.append( embedding )
        inputs.append(i)
        
    input_numeric = Input(shape=(len(numerical_vars),))
    embedding_numeric = Dense(16)(input_numeric) 
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    
    x = Concatenate()(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    output = Dense(5, activation='sigmoid')(x)

    model = Model(inputs, output)
    model.compile(metrics=['accuracy'], loss='categorical_crossentropy', optimizer='adam')
    return model

In [11]:
model = createEmbeddingsModel(train, categorical_vars, numerical_vars)

In [12]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [13]:
def preproc(X_train, X_test, embed_cols, num_cols):

    input_list_train = []
    input_list_test = []
    m= MinMaxScaler()
        
    #the cols to be embedded: rescaling to range [0, # values)
    for c in embed_cols:
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        m.fit(X_train[c].map(val_map).values.reshape(-1, 1))
        input_list_train.append(m.transform(X_train[c].map(val_map).values.reshape(-1, 1)))
        
        m.fit(X_test[c].map(val_map).fillna(0).values.reshape(-1, 1))
        input_list_test.append(m.transform(X_test[c].map(val_map).fillna(0).values.reshape(-1, 1)))
        
    #the numerical columns
    m.fit(X_train[num_cols].values)
    input_list_train.append(m.transform(X_train[num_cols].values))
    
    m.fit(X_test[num_cols].values)
    input_list_test.append(m.transform(X_test[num_cols].values))
    
    return input_list_train, input_list_test

In [14]:
X_train, X_test = preproc(train, test, categorical_vars, numerical_vars)



TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
hist = model.fit(X_train, train_label, batch_size=64 ,epochs=50, validation_split=0.1, shuffle=True)

In [None]:
plt.plot(hist.history['acc'])


In [None]:
model.evaluate(X_test, test_label)