In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score as kappa_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.preprocessing import StandardScaler
kappa_scorer = make_scorer(kappa_score)

import os
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
train_df = pd.read_csv("../input/all/train.csv")
test_df = pd.read_csv("../input/all/test/test.csv")

In [3]:
train_df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [4]:
cat_cols = ['Type','Age','Breed1','Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 
          'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized','Health', 'Quantity','State','VideoAmt','PhotoAmt']

In [5]:
num_cols = ['Fee']

In [6]:
text_cols = ['Description']

## Handling categorical columns

In [7]:
embed_sizes = [len(train_df[col].unique()) + 1 for col in cat_cols]

In [8]:
print(embed_sizes)

[3, 107, 177, 136, 4, 8, 8, 7, 5, 4, 4, 4, 4, 4, 20, 15, 10, 32]


## Handling numerical columns

In [9]:
print('scaling num_cols')
for col in num_cols:
    print('scaling {}'.format(col))
    col_mean = train_df[col].mean()
    train_df[col].fillna(col_mean, inplace=True)
    test_df[col].fillna(col_mean, inplace=True)
    scaler = StandardScaler()
    train_df[col] = scaler.fit_transform(train_df[col].values.reshape(-1, 1))
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

scaling num_cols
scaling Fee




## Handling text columns

In [10]:
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [11]:
print('getting embeddings')
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in tqdm(open('../input/all/wiki-news-300d-1M-subword.vec')))


933it [00:00, 9325.05it/s]

getting embeddings


1000000it [01:25, 11662.49it/s]


In [12]:
num_words = 20000
maxlen = 80
embed_size = 300

In [13]:
train_df['Description'] = train_df['Description'].astype(str).fillna('no text')
test_df['Description'] = test_df['Description'].astype(str).fillna('no text')

In [14]:
print("   Fitting tokenizer...")
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_df['Description'].values.tolist())

   Fitting tokenizer...


In [15]:
train_df['Description'] = tokenizer.texts_to_sequences(train_df['Description'])
test_df['Description'] = tokenizer.texts_to_sequences(test_df['Description'])

In [16]:
word_index = tokenizer.word_index
nb_words = min(num_words, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= num_words: continue
    try:
        embedding_vector = embeddings_index[word]
    except KeyError:
        embedding_vector = None
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


In [17]:
def get_input_features(df):
    X = {'description':pad_sequences(df['Description'], maxlen=maxlen)}
    X['numerical'] = np.array(df[num_cols])
    for cat in cat_cols:
        X[cat] = np.array(df[cat])
    return X

## Define NN Model

In [18]:
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dropout, BatchNormalization,LSTM, CuDNNLSTM, SpatialDropout1D
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPool1D
from keras.models import Model
from keras.optimizers import  Adam

categorical_inputs = []
for cat in cat_cols:
    categorical_inputs.append(Input(shape=[1], name=cat))

categorical_embeddings = []
for i, cat in enumerate(cat_cols):
    categorical_embeddings.append(
        Embedding(embed_sizes[i], 10)(categorical_inputs[i]))

categorical_logits = Concatenate()([Flatten()(cat_emb) for cat_emb in categorical_embeddings])
categorical_logits = Dense(256, activation = 'relu')(categorical_logits)


numerical_inputs = Input(shape=[len(num_cols)], name='numerical')
numerical_logits = numerical_inputs
numerical_logits = BatchNormalization()(numerical_logits)
numerical_logits = Dense(128, activation = 'relu')(numerical_logits)

text_inp = Input(shape=[maxlen], name='description')
text_embed = Embedding(nb_words, embed_size, weights=[embedding_matrix], trainable=False)(text_inp)
text_logits = SpatialDropout1D(0.2)(text_embed)
text_logits = Bidirectional(LSTM(64, return_sequences=True))(text_logits)
avg_pool = GlobalAveragePooling1D()(text_logits)
max_pool = GlobalMaxPool1D()(text_logits)
text_logits = Concatenate()([avg_pool, max_pool])

x = Concatenate()([categorical_logits, text_logits, numerical_logits])
x = BatchNormalization()(x)

x = Dense(128, activation = 'relu')(x)
x = Dropout(0.3)(x)
out = Dense(1, activation = 'sigmoid')(x)

model = Model(inputs=[text_inp] + categorical_inputs + [numerical_inputs],outputs=out)
model.compile(optimizer=Adam(lr = 0.0001), loss = 'mse')

In [19]:
from sklearn.model_selection import train_test_split

#for i, l in enumerate(tr_df['AdoptionSpeed'].values):
#    y_train[i,l] = 1
#for i, l in enumerate(val_df['AdoptionSpeed'].values):
#    y_valid[i,l] = 1
tr_df, val_df = train_test_split(train_df, test_size = 0.2, random_state = 23)

In [20]:
# from keras.utils.np_utils import to_categorical

In [21]:
tr_df['AdoptionSpeed'].values.shape

(11994,)

In [22]:
y_train = tr_df['AdoptionSpeed'].values / 4
y_valid = val_df['AdoptionSpeed'].values / 4

In [23]:
y_train = np_utils.to_categorical(tr_df['AdoptionSpeed'], num_classes=5)
y_valid = np_utils.to_categorical(val_df['AdoptionSpeed'], num_classes=5)
# y_test = np_utils.to_categorical(test_df['AdoptionSpeed'], num_classes=5)

NameError: name 'np_utils' is not defined

In [None]:
# X_train = get_input_features(tr_df)
# X_valid = get_input_features(val_df)
# X_test = get_input_features(test_df)
test_df.head()

In [None]:

import glob
import cv2
import os
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import Input
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers.core import Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import concatenate
import numpy as np
import argparse
import locale
from keras.utils import np_utils
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1337)  # for reproducibility
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop

In [None]:
continuous = ['Type','Age','Breed1','Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 
          'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized','Health', 'Quantity','State','VideoAmt','PhotoAmt']
# zipcodes = tr_df[uncontinuous].value_counts().keys().tolist()
# counts = tr_df[uncontinuous].value_counts().tolist()

# 	# loop over each of the unique zip codes and their corresponding
# 	# count
# for (zipcode, count) in zip(zipcodes, counts):
# 		# the zip code counts for our housing dataset is *extremely*
# 		# unbalanced (some only having 1 or 2 houses per zip code)
# 		# so let's sanitize our data by removing any houses with less
# 		# than 25 houses per zip code
# 	if count < 25:
# 		idxs = tr_df[tr_df[uncontinuous] == zipcode].index
# 		tr_df.drop(idxs, inplace=True)
        
# zipBinarizer = LabelBinarizer().fit(tr_df[uncontinuous])
# trainCategorical = zipBinarizer.transform(tr_df[uncontinuous])

cs = MinMaxScaler()
trainContinuous = cs.fit_transform(tr_df[continuous])
trainContinuous2 = cs.fit_transform(val_df[continuous])
trainContinuous3 = cs.fit_transform(test_df[continuous])

In [None]:
x_train = np.hstack([trainContinuous])
x_valid = np.hstack([trainContinuous2])
x_test = np.hstack([trainContinuous3])
model = Sequential([
    Dense(32, input_dim=18),
    Activation('relu'),
    Dense(5),
    Activation('sigmoid'),
])
rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
# We add metrics to get more results you want to see
#categorical_crossentropy--mse
model.compile(optimizer=rmsprop,
              loss='mse',
              metrics=['accuracy'])
model.fit(x_train, y_train,validation_data = (x_valid,y_valid),epochs=260, batch_size=10)

In [None]:
# print('\nTesting ------------')
# # Evaluate the model with the metrics we defined earlier
# loss, accuracy = model.evaluate(x_test, y_test)

# print('test loss: ', loss)
# print('test accuracy: ', accuracy)


In [None]:
#y_pred2 = np.argmax(y_pred,axis = 1)
#y_pred2.shape