In [1]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
import math
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Merge, Reshape
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

In [3]:
# Read only the first 500 rows of the CSV so the notebook stays fast.
data = pd.read_csv('~/AHS-ML-Project/data/22_AHS_COMB_Clean.csv', nrows=500)
# Indexing a frame with a boolean frame keeps the shape and replaces every
# cell where the mask is False with NaN; no rows or columns are dropped here.
# NOTE(review): np.isfinite presumably requires all columns to be numeric -- confirm.
data = data[np.isfinite(data)]
data = data.astype(float)

In [4]:
# Columns excluded from the feature set; they are dropped from `data` in the
# following cell. Every label appears exactly once (the original list repeated
# the member/parent/birth/marriage entries a second time).
col_to_be_removed = [
    'state',
    'Unnamed: 0',
    'psu_id',
    'house_no',
    'house_hold_no',
    'member_identity',
    'father_serial_no',
    'mother_serial_no',
    'date_of_birth',
    'month_of_birth',
    'year_of_birth',
    'date_of_marriage',
    'month_of_marriage',
    'year_of_marriage',
    'building_no',
    'no_of_dwelling_rooms',
    'rural_1',
    'rural_2',
    'stratum_code',
    'relation_to_head',
    'isheadchanged',
    'year'
]

In [5]:
# errors='ignore' lets the drop succeed even when some of the listed labels
# are not present in the loaded frame.
data = data.drop(col_to_be_removed, axis=1, errors='ignore')

In [6]:
# (rows, columns) of the frame at this point.
data.shape

(500, 58)

In [30]:
# Preview the first rows; NaN cells are values that were missing or masked out.
data.head()

Unnamed: 0,district,rural,sex,usual_residance,age,religion,social_group_code,marital_status,currently_attending_school,reason_for_not_attending_school,...,is_water_pump,cart,land_possessed,residancial_status,iscoveredbyhealthscheme,healthscheme_1,healthscheme_2,housestatus,householdstatus,as_binned
0,3.0,1.0,1.0,1.0,30.0,1.0,3.0,1.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
1,3.0,1.0,1.0,1.0,49.0,1.0,3.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
2,3.0,1.0,2.0,1.0,47.0,1.0,3.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
3,3.0,1.0,1.0,1.0,25.0,1.0,3.0,1.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,
4,3.0,1.0,1.0,1.0,27.0,1.0,3.0,3.0,,,...,2.0,4.0,6.0,1.0,2.0,,,1.0,1.0,


In [34]:
# Distinct values found in the `district` column of the loaded sample.
data['district'].unique()

array([  3.,  10.])

In [7]:
# Column labels of the frame, in frame order.
data_col_list = list(data)

# Map each column label to the number of distinct values Series.unique()
# reports for it. Keys are seeded first, then filled in column by column.
data_col_unique_dict = dict.fromkeys(data_col_list)
for col in data_col_list:
    data_col_unique_dict[col] = len(data[col].unique())
    
def get_col_unique_dict(data):
    """Summarise the columns of a frame.

    Args:
        data: a pandas DataFrame (anything where ``list(data)`` yields the
            column labels and ``data[label].unique()`` is available).

    Returns:
        A ``(labels, counts)`` tuple: ``labels`` is the list of column labels
        in frame order and ``counts`` maps each label to the number of
        distinct values returned by ``Series.unique()`` for that column.
    """
    labels = list(data)
    counts = {label: len(data[label].unique()) for label in labels}
    return (labels, counts)

In [12]:
import numpy as numpy

In [8]:
def split_features(X):
    """Split a feature matrix into a list of single-column arrays.

    Args:
        X: array whose ``shape[1]`` gives the number of features.

    Returns:
        A list with ``X.shape[1]`` entries; entry ``k`` is ``X[..., [k]]``,
        i.e. feature ``k`` with the last axis kept at length 1.
    """
    n_features = X.shape[1]
    return [X[..., [k]] for k in range(n_features)]

In [34]:
class Model(object):
    """Base class that scores a regressor by mean absolute relative error.

    Subclasses must provide ``guess(X)`` returning one prediction per sample.

    NOTE(review): this class name shadows ``keras.models.Model`` imported in
    the first cell; it is kept because other cells subclass it by this name.
    """

    def evaluate(self, X_val, y_val):
        """Return the mean of ``|y - guess(X)| / y`` over the validation set.

        Args:
            X_val: validation features, passed unchanged to ``self.guess``.
            y_val: validation targets; may be 1-D or a column vector of
                shape ``(n, 1)``. All targets must be strictly positive.

        Returns:
            The mean absolute relative error as a float.

        Raises:
            ValueError: if ``y_val`` is empty.
            AssertionError: if any target is not strictly positive (or is NaN).
        """
        # Flatten targets and predictions to 1-D. Without this, a (n, 1)
        # target minus a (n,) prediction broadcasts to an (n, n) matrix and
        # the returned error is wrong.
        y_true = np.asarray(y_val, dtype=float).ravel()
        if y_true.size == 0:
            raise ValueError("y_val must contain at least one target")
        # A NaN minimum compares False here, so NaN targets are rejected too.
        assert np.min(y_true) > 0, "relative error needs strictly positive targets"

        y_pred = np.asarray(self.guess(X_val), dtype=float).ravel()
        relative_err = np.absolute((y_true - y_pred) / y_true)
        return np.sum(relative_err) / len(y_true)

In [50]:
# Boolean-Series mask: keep only the rows whose `diagnosed_for` value is finite
# (rows where it is NaN or infinite are removed).
data = data[np.isfinite(data['diagnosed_for'])]

In [82]:
# Target: selected with a list of labels, so Y stays a one-column DataFrame (2-D).
Y = data[['diagnosed_for']]
# Features: every column except the target; `data` itself is left untouched.
X = data.drop(['diagnosed_for'], inplace=False, axis=1, errors='ignore')

In [83]:
# Capture the column labels and per-column distinct-value counts while X is
# still a DataFrame; the labels are lost once X becomes a plain array below.
X_col_list, X_col_unique_dict = get_col_unique_dict(X)

# Convert both target and features to NumPy arrays.
Y = np.array(Y.astype(float))
X = np.array(X)

In [84]:
# Sorted distinct target labels present in the sample.
np.unique(Y)

array([  0.,   6.,  11.,  17.,  19.,  21.,  99.])

In [85]:
# Y is 2-D (one column), not a flat vector.
Y.shape

(385, 1)

In [86]:
# Number of samples left after filtering on the target column.
len(Y)

385

In [60]:
def replace_labes(label_data):
    """Remap raw labels to zero-based class indices, in place.

    Known labels are translated through a fixed table; every label that is
    not in the table becomes ``0.0``. The input sequence is modified element
    by element and the same object is returned.

    Args:
        label_data: mutable, integer-indexable sequence of hashable scalar
            labels (e.g. a list or a 1-D array).

    Returns:
        ``label_data`` itself, after remapping.

    NOTE(review): label 1.0 and all unknown labels both end up as 0.0, and
    21.0 and 99.0 both end up as 6.0 -- confirm these collisions are intended.
    """
    # Raw label -> class index (already shifted to start at zero).
    zero_based = {
        1.0: 0.0, 2.0: 1.0, 3.0: 2.0, 7.0: 3.0, 9.0: 4.0,
        19.0: 5.0, 21.0: 6.0, 99.0: 6.0,
    }
    for pos in range(len(label_data)):
        label_data[pos] = zero_based.get(label_data[pos], 0.0)
    return label_data

In [87]:
# Indexing the last axis with a one-element list keeps that axis (length 1);
# this is the same slicing split_features applies to every column.
X[..., [0]].shape

(385, 1)

In [88]:
# Positional hold-out split without shuffling: the first 300 rows are used
# for training and everything after them for testing.
X_train = X[:300]
X_test = X[300:]

Y_train = Y[:300]
Y_test = Y[300:]

In [89]:
# Sanity check: features and targets should have the same row count per split.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

((300, 57), (300, 1))
((85, 57), (85, 1))


In [90]:
# Both lengths should match the number of feature columns in X.
print(len(X_col_list))
print(len(X_col_unique_dict))

57
57


In [91]:
class NN_with_EntityEmbedding(Model):
    """Feed-forward regressor with one learned embedding per input column.

    Each feature column goes through its own ``Embedding`` layer; the
    embeddings are concatenated and fed to two dense ReLU layers (1000 and
    500 units) and a single sigmoid output. Targets are trained as
    ``log(y) / max_log_y`` and predictions are mapped back with
    ``exp(pred * max_log_y)``.

    Constructing an instance builds the network and trains it immediately.

    NOTE(review): ``Embedding(in_size, ...)`` is sized by the number of
    distinct values in a column, so it presumably expects each column to be
    encoded as integers in ``[0, in_size)``. The raw values shown earlier in
    this notebook (e.g. district codes 3 and 10 with two distinct values) do
    not look encoded that way -- confirm before relying on the results.

    NOTE(review): the log scaling and the inherited ``evaluate`` both assume
    strictly positive targets; a target of 0 gives ``log(0) = -inf``.
    """

    def __init__(self, X_train, y_train, X_val, y_val, X_col_list, X_col_unique_dict):
        """Build and train the network.

        Args:
            X_train, y_train: training features and targets.
            X_val, y_val: validation features and targets.
            X_col_list: column labels, in the column order of the feature matrices.
            X_col_unique_dict: column label -> number of distinct values.
        """
        self.X_col_list = X_col_list
        self.X_col_unique_dict = X_col_unique_dict

        self.nb_epoch = 10
        # Only takes effect if passed through `callbacks` in fit(), which is
        # currently commented out.
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        # Largest log-target over both splits; the scale used for training targets.
        self.max_log_y = max(np.max(np.log(y_train)), np.max(np.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        """Split the feature matrix into the per-column input list the network takes."""
        return split_features(X)

    @staticmethod
    def _embedding_size(in_size):
        """Return the embedding width used for a column with ``in_size`` distinct values."""
        if in_size == 1:
            return 1
        if in_size < 10:
            return in_size - 1
        return int(in_size * 0.7)

    def __build_keras_model(self):
        """Assemble and compile the network, one embedding branch per column."""
        models = []

        for col in self.X_col_list:
            model_col = Sequential()
            # Read the cardinality from the instance, not from a notebook
            # global, so the class works with whatever dict it was given.
            in_size = self.X_col_unique_dict[col]
            out_size = self._embedding_size(in_size)

            model_col.add(Embedding(in_size, out_size, input_length=1))
            # Drop the length-1 sequence axis so branches can be concatenated.
            model_col.add(Reshape(target_shape=(out_size,)))
            models.append(model_col)

        self.model = Sequential()
        self.model.add(Merge(models, mode='concat'))
        self.model.add(Dense(1000, init='uniform'))
        self.model.add(Activation('relu'))
        self.model.add(Dense(500, init='uniform'))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        """Scale raw targets for training: ``log(val) / max_log_y``."""
        return np.log(val) / self.max_log_y

    def _val_for_pred(self, val):
        """Invert ``_val_for_fit``: ``exp(val * max_log_y)``."""
        return np.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        """Train for ``self.nb_epoch`` epochs and print the validation error."""
        self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train),
                       validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)),
                       nb_epoch=self.nb_epoch, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        """Predict targets on the original scale as a flat 1-D array."""
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [92]:
# Inspect the per-column distinct-value counts; these size the embedding layers.
X_col_unique_dict

{'age': 63,
 'alcohol': 5,
 'as_binned': 1,
 'cart': 3,
 'chew': 5,
 'cooking_fuel': 5,
 'currently_attending_school': 3,
 'diagnosis_source': 7,
 'disability_status': 5,
 'district': 2,
 'drinking_water_source': 4,
 'healthscheme_1': 5,
 'healthscheme_2': 1,
 'highest_qualification': 10,
 'house_status': 2,
 'house_structure': 4,
 'household_have_electricity': 3,
 'householdstatus': 2,
 'housestatus': 1,
 'illness_type': 6,
 'injury_treatment_type': 1,
 'is_bicycle': 3,
 'is_car': 2,
 'is_computer': 2,
 'is_radio': 3,
 'is_refrigerator': 3,
 'is_scooter': 3,
 'is_sewing_machine': 3,
 'is_telephone': 4,
 'is_television': 3,
 'is_toilet_shared': 2,
 'is_tractor': 2,
 'is_washing_machine': 2,
 'is_water_filter': 2,
 'is_water_pump': 3,
 'iscoveredbyhealthscheme': 3,
 'kitchen_availability': 3,
 'land_possessed': 6,
 'lighting_source': 3,
 'marital_status': 4,
 'occupation_status': 15,
 'owner_status': 3,
 'reason_for_not_attending_school': 3,
 'regular_treatment': 4,
 'regular_treatment_

In [93]:
# Builds the network and trains it right away (the constructor calls fit()).
# NOTE(review): the recorded run ends in an AssertionError. This is likely the
# positivity assert in Model.evaluate, since the target labels include 0 -- confirm.
model = NN_with_EntityEmbedding(X_train, Y_train, X_test, Y_test, X_col_list, X_col_unique_dict)



Train on 300 samples, validate on 85 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AssertionError: 