# Neural Net model 1

- V0: HyperOpt NN with one layer

Scores:
- V0: 

In [31]:

import os
import random
import time
import sys
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#import sys
#import pymongo
#import bson.objectid
#pymongo.objectid = bson.objectid
#sys.modules["pymongo.objectid"] = bson.objectid

#os.environ['KERAS_BACKEND']='tensorflow'
#os.environ['THEANO_FLAGS'] = 'device=cpu'
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD
from keras.callbacks import Callback
import keras

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss

from ml_toolbox.kaggle import KaggleResult

%matplotlib inline


In [32]:
# Directory containing the raw competition CSVs (gender_age_train.csv / gender_age_test.csv).
dir_in = 'data_ori'
# Directory containing the pre-computed feature files read by open_feature_file.
dir_feat = 'data'
# Sub-directory name handed to KaggleResult when the submission object is created.
sub_dir = 'model_1_nn'

In [33]:

# Random-state constant; no other code visible in this notebook references it.
rs = 123
# Seed the global NumPy and Python RNGs (batch_generator shuffles with np.random.shuffle).
fixed_seed_num = 1234
np.random.seed(fixed_seed_num)
random.seed(fixed_seed_num) # the code shown here only draws from np.random; kept as a precaution

# Base names of the feature sets that are stacked column-wise into Xtrain / Xtest.
# None of them ends in 'csv', so open_feature_file loads each one from
# '<name>_<samples>.pickle' inside dir_feat.
feature_files = ['features_brand_model_bag',
                 'features_brand_bag',
                 'features_appid_installed',
                 'features_label_app_installed']

# Function to read feature file
def open_feature_file(fname, samples='train'):
    """Load one feature set for the train or test devices.

    Parameters
    ----------
    fname : str
        Name of the feature file inside ``dir_feat``. A name ending in ``'csv'``
        is read as a CSV and left-joined onto the device list by ``device_id``;
        any other name is treated as the base name of a pickle file called
        ``'<fname>_<samples>.pickle'``.
    samples : str
        ``'train'`` aligns CSV rows with ``gatrain``; any other value aligns them
        with ``gatest``. For pickle files the value is inserted into the file name.

    Returns
    -------
    scipy.sparse.csr_matrix for CSV input (one row per device, missing values
    filled with 0); for pickle input, whatever object the file contains.
    """
    if fname[-3:] == 'csv':
        devices = gatrain if samples == 'train' else gatest
        features = pd.read_csv(os.path.join(dir_feat, fname))
        X = devices[['device_id']].merge(features, on='device_id', how='left')

        X.drop('device_id', axis=1, inplace=True)
        X.fillna(0, inplace=True)

        for c in X.columns:
            if X[c].max() > 1:
                # Scale this column on its own. The previous code passed the whole
                # frame to fit_transform and tried to store the 2-D result in a
                # single column.
                # NOTE(review): a fresh scaler is fit on whichever frame is loaded,
                # so train and test columns are standardised with different
                # statistics -- confirm this is intended.
                X[c] = StandardScaler().fit_transform(X[[c]].values.astype(float)).ravel()

        return csr_matrix(X.values)
    else:
        # Assume it is a pickle file.
        # NOTE: pickle.load executes arbitrary code; only use with trusted feature files.
        with open(os.path.join(dir_feat, '{}_{}.pickle'.format(fname, samples)), 'rb') as f:
            return pickle.load(f)

## Load train/test data

In [34]:
# Device tables for the labelled (train) and unlabelled (test) sets.
train_csv = os.path.join(dir_in, 'gender_age_train.csv')
test_csv = os.path.join(dir_in, 'gender_age_test.csv')
gatrain = pd.read_csv(train_csv)
gatest = pd.read_csv(test_csv)

# Map the `group` labels to integer class ids and keep the fitted encoder
# so its classes_ can label the prediction columns later.
targetencoder = LabelEncoder()
y = targetencoder.fit_transform(gatrain['group'])
nclasses = len(targetencoder.classes_)

In [35]:
# Presumably one weight per feature file (same length as feature_files);
# not applied anywhere in the code visible here -- TODO confirm or remove.
fw = [1, 1, 1, 1]
# Stack every feature set column-wise; rows follow gatrain / gatest order.
Xtrain = hstack([open_feature_file(f) for f in feature_files], format='csr')
Xtest = hstack([open_feature_file(f, 'test') for f in feature_files], format='csr')

print('Train on {} features'.format(Xtrain.shape[1]))
# Report the test matrix's own width (the train width used to be printed twice).
print('Test on {} features'.format(Xtest.shape[1]))


Train on 21527 features
Test on 21527 features


In [36]:
# With selection (15xxx features): 2.27427
# Without selection (21527 features): 2.27427
# Fit the selector (default settings) on the training matrix only, then apply
# the fitted selector to both matrices so their columns stay aligned.
selector = VarianceThreshold()
Xtrain = selector.fit_transform(Xtrain)
Xtest = selector.transform(Xtest)

## Test Keras model

In [37]:
# Convert the integer class ids in `y` to the categorical target matrix used
# with the categorical_crossentropy loss when the model is compiled.
dummy_y = np_utils.to_categorical(y)

In [38]:
# Hold out 10% of the training rows for validation, stratified on the integer labels.
X_train, X_val, y_train, y_val = train_test_split(
    Xtrain,
    dummy_y,
    test_size=0.1,
    stratify=y,
    random_state=42)

In [39]:
def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense training batches from a sparse feature matrix.

    Parameters
    ----------
    X : sparse matrix supporting row indexing and ``.toarray()``.
    y : array indexed with the same row indices as ``X``.
    batch_size : number of rows per batch (coerced to int).
    shuffle : if true, the row order is shuffled with ``np.random.shuffle``
        before the first pass and again after every full pass.

    Yields
    ------
    (X_batch, y_batch, sample_weight) where ``X_batch`` is a dense array and
    ``sample_weight`` is an array of ones with one entry per row in the batch.
    The last batch of a pass may be smaller than ``batch_size``.
    """
    #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    batch_size = int(batch_size)
    n_samples = X.shape[0]
    # float() forces true division: under Python 2 `n / batch_size` truncates,
    # which made the ceil a no-op and silently dropped the final partial batch.
    number_of_batches = int(np.ceil(n_samples / float(batch_size)))
    counter = 0
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch, np.ones(X_batch.shape[0])
        # End of a pass: start over (with a new order when shuffling).
        if counter >= number_of_batches:
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense prediction batches from a sparse matrix, in row order.

    Parameters
    ----------
    X : sparse matrix supporting row indexing and ``.toarray()``.
    batch_size : number of rows per batch (coerced to int).
    shuffle : accepted so the signature mirrors ``batch_generator``; it is
        ignored, rows are always yielded in their original order.

    Yields
    ------
    Dense array holding the next ``batch_size`` rows of ``X`` (the last batch of
    a pass may be smaller). After the last row the generator restarts at row 0.
    """
    batch_size = int(batch_size)
    n_samples = X.shape[0]
    # Number of batches in one pass. The previous expression
    # (n / ceil(n / batch_size)) never equalled the counter, so the generator
    # never wrapped and produced empty batches once the data was exhausted.
    number_of_batches = int(np.ceil(n_samples / float(batch_size)))
    counter = 0
    sample_index = np.arange(n_samples)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch#, np.ones(X_batch.shape[0])
        if counter >= number_of_batches:
            counter = 0


In [40]:
# Single-hidden-layer configuration (layers 2 and 3 switched off). It used to be
# assigned to `params` and immediately overwritten; it is kept under its own
# name for reference.
params_single_layer = {'optimizer': 'adam',
                       'batch_size': 32,
                       'n_epoch': 4,
                       'layer_1': {'on1': True,
                                   'units1': 150,
                                   'activation1': 'relu',
                                   'dropout1': 0.6},
                       'layer_2': {'on2': False,
                                   'units2': 150,
                                   'activation2': 'tanh',
                                   'dropout2': 0.2},
                       'layer_3': {'on3': False,
                                   'units3': 12,
                                   'activation3': 'sigmoid',
                                   'dropout3': 0.2}
                       }

# Two-hidden-layer configuration, according to:
# https://www.kaggle.com/agavranis/talkingdata-mobile-user-demographics/bag-of-apps-keras-11-08-16-no-val/code
params_two_layer = {'optimizer': 'adadelta',
                    'batch_size': 400,
                    'n_epoch': 16,
                    'layer_1': {'on1': True,
                                'units1': 150,
                                'activation1': 'relu',
                                'dropout1': 0.4},
                    'layer_2': {'on2': True,
                                'units2': 50,
                                'activation2': 'relu',
                                'dropout2': 0.2},
                    'layer_3': {'on3': False,
                                'units3': 12,
                                'activation3': 'sigmoid',
                                'dropout3': 0.2}
                    }

# Configuration actually read when the model is built and fitted.
params = params_two_layer

In [41]:
# create model
print ('Model with following parameters: %s' % (params))

model = Sequential()
# First hidden layer: the only layer that has to be told the input width.
model.add(Dense(params['layer_1']['units1'],
                input_dim=X_train.shape[1],
                init='normal',
                activation=params['layer_1']['activation1']))

model.add(Dropout(params['layer_1']['dropout1']))

if params['layer_2']['on2']:
    # No input_dim here: this layer is fed by the previous layer's output, not
    # by the raw features, so the feature count passed before was misleading.
    model.add(Dense(params['layer_2']['units2'],
                    init='normal',
                    activation=params['layer_2']['activation2']))
    #https://www.kaggle.com/poiss0nriot/talkingdata-mobile-user-demographics/bag-of-apps-keras-11-08-16-no-val/run/328610
    #model.add(PReLU())
    model.add(Dropout(params['layer_2']['dropout2']))

if params['layer_3']['on3']:
    model.add(Dense(params['layer_3']['units3'],
                    init='normal',
                    activation=params['layer_3']['activation3']))

    model.add(Dropout(params['layer_3']['dropout3']))

# Output layer: one softmax unit per column of the categorical target matrix
# (previously hard-coded to 12).
model.add(Dense(y_train.shape[1], init='normal', activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer=params['optimizer'], metrics=['accuracy'])  #logloss


# The sparse matrices are densified in full for fitting; int() guards against
# float-valued epoch / batch-size settings.
fit=model.fit(X_train.todense(), y_train, nb_epoch = int(params['n_epoch']), batch_size=int(params['batch_size']),
                         validation_data=(X_val.todense(), y_val), verbose=2)

# Class probabilities for the hold-out rows.
scores_val = model.predict_proba(X_val.todense(), batch_size = 128, verbose = 0)


Model with following parameters: {'optimizer': 'adadelta', 'layer_1': {'units1': 150, 'activation1': 'relu', 'on1': True, 'dropout1': 0.4}, 'layer_3': {'activation3': 'sigmoid', 'on3': False, 'units3': 12, 'dropout3': 0.2}, 'layer_2': {'on2': True, 'activation2': 'relu', 'units2': 50, 'dropout2': 0.2}, 'batch_size': 400, 'n_epoch': 16}
Train on 67179 samples, validate on 7466 samples
Epoch 1/16
24s - loss: 2.4320 - acc: 0.1353 - val_loss: 2.3829 - val_acc: 0.1501
Epoch 2/16
21s - loss: 2.3611 - acc: 0.1614 - val_loss: 2.3163 - val_acc: 0.1789
Epoch 3/16
18s - loss: 2.3197 - acc: 0.1752 - val_loss: 2.2920 - val_acc: 0.1855
Epoch 4/16
18s - loss: 2.2997 - acc: 0.1815 - val_loss: 2.2792 - val_acc: 0.1887
Epoch 5/16
19s - loss: 2.2877 - acc: 0.1866 - val_loss: 2.2713 - val_acc: 0.1937
Epoch 6/16
19s - loss: 2.2773 - acc: 0.1921 - val_loss: 2.2656 - val_acc: 0.2001
Epoch 7/16
20s - loss: 2.2677 - acc: 0.1956 - val_loss: 2.2612 - val_acc: 0.2033
Epoch 8/16
20s - loss: 2.2596 - acc: 0.1984 - 

In [42]:
# Score the hold-out rows batch by batch and report their log loss.
scores_val = model.predict_generator(
    generator=batch_generatorp(X_val, 32, False),
    val_samples=X_val.shape[0])

# cv_score is reused when the KaggleResult object is created.
cv_score = log_loss(y_val, scores_val)

print('logloss val {}'.format(cv_score))

logloss val 2.24847219608


## Create test score

In [43]:
# Predict the test rows batch by batch, then label the result:
# one row per test device_id, one column per encoded class.
pred_test = model.predict_generator(
    generator=batch_generatorp(Xtest, 32, False),
    val_samples=Xtest.shape[0])
pred = pd.DataFrame(
    pred_test,
    index=gatest['device_id'],
    columns=targetencoder.classes_)

In [44]:
# Wrap the test predictions, their device ids and the validation log loss
# in a KaggleResult for this model's sub-directory.
kag = KaggleResult(
    pred.values,
    pred.index.values,
    cv_score=cv_score,
    description='NN Model 1',
    subdir=sub_dir)

In [45]:
# Upload only if kag.validate() returns a truthy value.
if kag.validate():
    kag.upload()
# Function-call form of print, matching the other cells; for a single argument
# it prints the same text under Python 2 and also parses under Python 3.
print(kag.lb_score)

2.2496
