## Ideas moving forward:
1. Create more robust NN model
    a. Ideas:
        https://www.kaggle.com/javapocalypse/breast-cancer-classification-in-keras-using-ann
        http://parneetk.github.io/blog/neural-networks-in-keras/
2. Find a way for the NN to use ROC AUC as the objective it optimizes
3. Would filling in missing values differently make a significant difference?
    a. One idea: cluster the rows first, then fill each missing (NaN) value with the column mean/median of the row's cluster

Since these models use both the training and test sets for preprocessing, this could be considered leakage.
For now, we accept this leakage because we want the model to be as accurate as possible for competition purposes.
Once a solid model is built, we will try preprocessing without the test set, rerun, and go from there.

In [1]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
color = sns.color_palette()
sns.set()
from IPython.display import display

import random
import os
import gc
# print(os.listdir("../input"))

In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

Using TensorFlow backend.


In [3]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, Imputer
from sklearn.model_selection import train_test_split

In [54]:
# Locate the competition CSVs once instead of toggling commented-out blocks:
# Kaggle kernels expose them under ../input, local runs keep them in data/.
DATA_DIR = '../input' if os.path.isfile('../input/application_train.csv') else 'data'

pos = pd.read_csv(os.path.join(DATA_DIR, 'POS_CASH_balance.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'application_test.csv'))
train = pd.read_csv(os.path.join(DATA_DIR, 'application_train.csv'))
bureau = pd.read_csv(os.path.join(DATA_DIR, 'bureau.csv'))
bureau_balance = pd.read_csv(os.path.join(DATA_DIR, 'bureau_balance.csv'))
cc_bal = pd.read_csv(os.path.join(DATA_DIR, 'credit_card_balance.csv'))
inst = pd.read_csv(os.path.join(DATA_DIR, 'installments_payments.csv'))
prev_app = pd.read_csv(os.path.join(DATA_DIR, 'previous_application.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [5]:
# Make sure automatic garbage collection is on; the aggregation cells below
# delete large frames and call gc.collect() to release them.
gc.enable()

## Preprocessing

What we are doing here is basic: for each auxiliary table we one-hot encode the categorical columns with get_dummies, count the past records per ID, and then average the rows so that there is one row per SK_ID_CURR.
This is simple, but it could be effective feature engineering.

In [6]:
# Names of the per-SK_ID_CURR aggregate tables; each preprocessing cell
# below appends the name of the table it builds.
pp_cols = []

In [7]:
# Collapse bureau_balance to one row per SK_ID_BUREAU: one-hot encode the
# categoricals, attach the per-bureau row count, then take column means.
print('Bureau_balance preprocess')
bureau_balance = pd.get_dummies(bureau_balance)

print('Counting bureaus')
# Number of non-null MONTHS_BALANCE entries recorded for each bureau id.
bb_counts = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].count()
bureau_balance['bb_count'] = bureau_balance['SK_ID_BUREAU'].map(bb_counts)

print('averaging bureau bal')
# reset_index turns SK_ID_BUREAU back into a column so it can be a merge key.
avg_bureau_bal = bureau_balance.groupby('SK_ID_BUREAU').mean().reset_index()

# The raw table is no longer needed; release it before the next stage.
del bureau_balance, bb_counts
gc.collect()

Bureau_balance preprocess
Counting bureaus
averaging bureau bal


83

In [8]:
# Collapse bureau (joined with the bureau_balance averages) to one row per
# SK_ID_CURR.
print('Bureau preprocess, includes merging bureau_balance preprocess')
bureau = pd.get_dummies(bureau)

bureau_full = bureau.merge(avg_bureau_bal, how='left', on='SK_ID_BUREAU')

print('Counting buro per SK_ID_CURR')
nb_bureau_per_curr = bureau_full.groupby('SK_ID_CURR')['SK_ID_BUREAU'].count()
# The id column is overwritten with the per-applicant record count, so its
# mean below is simply that count.
bureau_full['SK_ID_BUREAU'] = bureau_full['SK_ID_CURR'].map(nb_bureau_per_curr)

print('Averaging bureau')
avg_bureau = (
    bureau_full
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('avg_bureau_')   # prefix feature columns only; the key keeps its name
    .reset_index()
)
pp_cols.append('avg_bureau')

del bureau, bureau_full, nb_bureau_per_curr
gc.collect()

Bureau preprocess, includes merging bureau_balance preprocess
Counting buro per SK_ID_CURR
Averaging bureau


125

In [9]:
# Collapse previous_application to one row per SK_ID_CURR.
print('Prev_app preprocess')
prev_app = pd.get_dummies(prev_app)

print('Counting number of prevs')
nb_prevs = prev_app.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
# Replace the id with the number of previous records for the applicant, so
# the mean taken below carries that count as a feature.
prev_app['SK_ID_PREV'] = prev_app['SK_ID_CURR'].map(nb_prevs)

print('Averaging prev')
avg_prev = (
    prev_app
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('avg_prev_')
    .reset_index()
)
pp_cols.append('avg_prev')

del prev_app, nb_prevs
gc.collect()

Prev_app preprocess
Counting number of prevs
Averaging prev


269

In [10]:
# Collapse POS_CASH_balance to one row per SK_ID_CURR.
print('Pos preprocess')
pos = pd.get_dummies(pos)

print('Counting number of prevs')
nb_prevs = pos.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
# Replace the id with the per-applicant record count before averaging.
pos['SK_ID_PREV'] = pos['SK_ID_CURR'].map(nb_prevs)

print('Averaging pos')
avg_pos = (
    pos
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('avg_pos_')
    .reset_index()
)
pp_cols.append('avg_pos')

del pos, nb_prevs
gc.collect()

Pos preprocess
Counting number of prevs
Averaging pos


102

In [11]:
# Collapse credit_card_balance to one row per SK_ID_CURR.
print('Cc_bal preprocess')
cc_bal = pd.get_dummies(cc_bal)

print('COunting number of prevs')
nb_prevs = cc_bal.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
# Replace the id with the per-applicant record count before averaging.
cc_bal['SK_ID_PREV'] = cc_bal['SK_ID_CURR'].map(nb_prevs)

print('Averaging cc_bal')
avg_cc_bal = (
    cc_bal
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('avg_cc_bal_')
    .reset_index()
)
pp_cols.append('avg_cc_bal')

del cc_bal, nb_prevs
gc.collect()

Cc_bal preprocess
COunting number of prevs
Averaging cc_bal


100

In [12]:
# Collapse installments_payments to one row per SK_ID_CURR; no get_dummies
# step is applied to this table.
print('Inst preprocess')
print('No categorical cols')

print('Counting number of prevs')
nb_prevs = inst.groupby('SK_ID_CURR')['SK_ID_PREV'].count()
# Replace the id with the per-applicant record count before averaging.
inst['SK_ID_PREV'] = inst['SK_ID_CURR'].map(nb_prevs)

print('Averaging inst')
avg_inst = (
    inst
    .groupby('SK_ID_CURR')
    .mean()
    .add_prefix('avg_inst_')
    .reset_index()
)
pp_cols.append('avg_inst')

del inst, nb_prevs
gc.collect()

Inst preprocess
No categorical cols
Counting number of prevs
Averaging inst


58

In [13]:
# Tag every row with its origin so train and test can be stacked for joint
# preprocessing and separated again afterwards.
for frame, from_test in ((test, 1), (train, 0)):
    frame['is_test'] = from_test
    frame['is_train'] = 1 - from_test

In [14]:
# Separate the label from the training features.
y_train = train['TARGET']
X_train = train.drop('TARGET', axis=1)
X_train.shape

(307511, 123)

In [15]:
# X_test is another name for the same `test` frame (no copy is made).
X_test = test
X_test.shape

(48744, 123)

In [16]:
# Stack train and test rows, then one-hot encode the stacked frame so both
# parts end up with the same dummy columns.
stacked = pd.concat([X_train, X_test], axis=0)
data = pd.get_dummies(stacked)
del stacked

In [17]:
# Show which aggregate tables were built by the preprocessing cells.
pp_cols

['avg_bureau', 'avg_prev', 'avg_pos', 'avg_cc_bal', 'avg_inst']

In [18]:
# Left-join every per-applicant aggregate onto the application rows; rows
# with no history in a table get NaN for that table's columns.
for agg_table in (avg_bureau, avg_prev, avg_pos, avg_cc_bal, avg_inst):
    data = data.merge(agg_table, how='left', on='SK_ID_CURR')
# Drop the loop variable so it does not keep the last aggregate alive.
del agg_table
data.shape

(356255, 506)

In [19]:
# The aggregates now live inside `data`; drop the standalone copies and
# collect garbage to release their memory.
del avg_bureau, avg_prev, avg_pos, avg_cc_bal, avg_inst
gc.collect()

279

In [20]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain at
    least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with 'Missing Values' (count) and '% of Total Values'
        (share of rows, rounded to one decimal), sorted by percentage in
        descending order.
    """
    # Count the missing values once and derive the percentage from the count.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Assemble the two series side by side under their final column names.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Filter on the count rather than on the percentage: for a frame with
    # zero rows the percentage is 0/0 = NaN, and `NaN != 0` would wrongly
    # report every column as having missing values.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [21]:
# Display the per-column missing-value summary for the merged frame.
missing_values_table(data)

Your selected dataframe has 506 columns.
There are 320 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
avg_prev_RATE_INTEREST_PRIMARY,350534,98.4
avg_prev_RATE_INTEREST_PRIVILEGED,350534,98.4
avg_cc_bal_AMT_PAYMENT_CURRENT,284135,79.8
avg_cc_bal_CNT_DRAWINGS_ATM_CURRENT,284061,79.7
avg_cc_bal_AMT_DRAWINGS_ATM_CURRENT,284061,79.7
avg_cc_bal_AMT_DRAWINGS_POS_CURRENT,284061,79.7
avg_cc_bal_CNT_DRAWINGS_POS_CURRENT,284061,79.7
avg_cc_bal_CNT_DRAWINGS_OTHER_CURRENT,284061,79.7
avg_cc_bal_AMT_DRAWINGS_OTHER_CURRENT,284061,79.7
avg_cc_bal_AMT_PAYMENT_TOTAL_CURRENT,252697,70.9


In [22]:
# Collect the names of the columns whose missing share exceeds 60%.
missing_df = missing_values_table(data)
too_sparse = missing_df['% of Total Values'] > 60
missing_columns = missing_df.loc[too_sparse].index.tolist()
print('We will remove %d columns.' % len(missing_columns))

Your selected dataframe has 506 columns.
There are 320 columns that have missing values.
We will remove 57 columns.


In [23]:
# Remove the columns selected above from the feature frame.
data = data.drop(missing_columns, axis=1)

In [24]:
# Keep a view of just the columns that still contain at least one NaN.
missing_data = data.loc[:, data.isnull().any()]

In [30]:
# Fill every remaining NaN with its column mean. `data` still holds train and
# test rows together, so the means are computed over both sets.
data = data.fillna(data.mean())

In [47]:
# Bookkeeping columns that must not be fed to the model.
ignore_features = ['SK_ID_CURR','is_train', 'is_test']
relevant_features = [
    col for col in data.columns
    if col not in ignore_features
]

# Split the jointly preprocessed frame back into its train and test parts.
X_train = data.loc[data['is_train'] == 1, relevant_features]
test_tf = data.loc[data['is_test'] == 1, relevant_features]

In [48]:
# Sanity check: row and feature counts of the training matrix.
X_train.shape

(307511, 446)

In [49]:
# Sanity check: the test matrix should have the same number of columns as X_train.
test_tf.shape

(48744, 446)

In [50]:
# Peek at the first few test rows after preprocessing.
test_tf.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,avg_pos_NAME_CONTRACT_STATUS_Returned to the store,avg_pos_NAME_CONTRACT_STATUS_Signed,avg_pos_NAME_CONTRACT_STATUS_XNA,avg_inst_SK_ID_PREV,avg_inst_NUM_INSTALMENT_VERSION,avg_inst_NUM_INSTALMENT_NUMBER,avg_inst_DAYS_INSTALMENT,avg_inst_DAYS_ENTRY_PAYMENT,avg_inst_AMT_INSTALMENT,avg_inst_AMT_PAYMENT
307511,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,-5170.0,-812,...,0.0,0.0,0.0,7.0,1.142857,2.714286,-2187.714286,-2195.0,5885.132143,5885.132143
307512,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,-1623,...,0.0,0.090909,0.0,9.0,1.111111,5.0,-586.0,-609.555556,6240.205,6240.205
307513,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,-2175.0,-3503,...,0.0,0.027778,0.0,155.0,0.277419,43.729032,-1352.929032,-1358.109677,10897.898516,9740.235774
307514,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,-2000.0,-4208,...,0.0,0.0,0.0,113.0,0.460177,30.504425,-855.548673,-858.548673,4979.282257,4356.731549
307515,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,-4000.0,-4262,...,0.0,0.0,0.0,12.0,1.0,6.5,-622.0,-634.25,11100.3375,11100.3375


In [200]:
#x_train, x_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.20)

In [51]:
# Standardise the features: the scaler is fitted on the training rows only
# and the same transformation is then applied to the test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
test_tf = sc.transform(test_tf)

In [52]:
# Number of feature columns; used as the input size of the first Dense layer.
input_dim = X_train.shape[1]

## Building ANN model
Running this in Kaggle, so ignore these errors; they're from me stopping the Jupyter kernel

In [53]:
# Small fully connected binary classifier: two 6-unit ReLU hidden layers
# followed by a single sigmoid output unit.
clf = Sequential([
    Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=input_dim),
    Dense(units=6, kernel_initializer='uniform', activation='relu'),
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

In [204]:
# Plain SGD with a learning rate of 0.1; the network is trained on binary
# cross-entropy and reports accuracy during training.
sgd = SGD(lr=0.1)
clf.compile(optimizer = sgd, loss = 'binary_crossentropy', metrics = ['accuracy'])

In [207]:
# Train for up to 100 epochs in mini-batches of 8, holding out 20% of the
# training rows for validation. EarlyStopping(patience=2) ends training after
# two epochs without improvement in its monitored quantity
# (NOTE(review): presumably Keras' default, val_loss - confirm).
earlystopping = EarlyStopping(patience=2)
clf.fit(X_train, y_train, batch_size=8, epochs=100, callbacks=[earlystopping], validation_split=0.2)

Train on 246008 samples, validate on 61503 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
 38520/246008 [===>..........................] - ETA: 46s - loss: 0.2535 - acc: 0.9178

KeyboardInterrupt: 

In [214]:
# Ad-hoc check of the raw predictions. This needs `test_tf` from the
# preprocessing cells; the recorded NameError shows it was run in a kernel
# where that variable had not been defined.
clf.predict(test_tf)

NameError: name 'test_tf' is not defined

In [56]:
# Score the test applications and write the submission file.
test_preds = clf.predict(test_tf)

# Fail loudly if the predictions do not line up one-to-one with the rows of
# `test`, rather than writing a silently misaligned submission.
assert len(test_preds) == len(test), 'prediction count does not match test rows'

pred_df = pd.DataFrame()
pred_df['SK_ID_CURR'] = test['SK_ID_CURR']
# The model ends in a single output unit, so predict() is expected to return
# a column vector; flatten it so a plain 1-D array is assigned to the column.
pred_df['TARGET'] = test_preds.ravel()
pred_df.to_csv('NN_baseline.csv', index=False)
pred_df.head(20)

KeyboardInterrupt: 

In [57]:
#running on Kaggle, will merge later