## Pre-work on Data

### Import Module

In [0]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd

### Load File

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [0]:
# change the directory of the data
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebook/Data/ivas_business_hackathon_final_2018.csv')
#month_cols = [col for col in data.columns if 'month' in col]

#data[month_cols] /= 12
data.drop(columns=['issuer'], inplace = True)

In [0]:
train_data = data[data.Data_Set == 'train'].drop(columns=['acct_card_no','Data_Set'])
test_data = data[data.Data_Set == 'test'].drop(columns=['acct_card_no','Data_Set'])

train_labels = train_data.Dormant_Flag
test_labels = test_data.Dormant_Flag

In [0]:
feature_column = [col for col in train_data.columns if 'Dormant_Flag' not in col]
train_data = train_data[feature_column]
test_data = test_data[feature_column]

mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

### Feature Selection Method

#### Feature Importance

In [34]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

# create a base classifier used to evaluate a subset of attributes
model = ExtraTreesClassifier()
model.fit(train_data, train_labels)
# display the relative importance of each attribute
#print(model.feature_importances_)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [35]:
feature_etc = [feature_column[i] for i in sorted(range(len(model.feature_importances_)), key=lambda k: model.feature_importances_[k], reverse=True)[0:10]]
feature_etc

['B_LAST_6MNTHS_TOTAL_TXNS',
 'active_months',
 'B_LAST_3MNTHS_TOTAL_TXNS',
 'RATIO_DI_TOTAL_TXNS_L1_L6',
 'vintage_months',
 'B_D_LST_9MNTHS_TOTAL_TXNS',
 'd_active_months',
 'RATIO_D_TOTAL_TXNS_L1_L6',
 'B_D_LST_6MNTHS_TOTAL_TXNS',
 'B_PRV_6MNTHS_TOTAL_AMT']

#### Univariate Selection (must not admit negative value feature)

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

test = SelectKBest(score_func=chi2, k=10)
fit = test.fit(train_data, train_labels)
# summarize scores
np.set_printoptions(precision=3)
#print(fit.scores_)
features = fit.transform(train_data)
# summarize selected features
#print(features[0:5,:])

In [0]:
feature_us = [feature_column[i] for i in sorted(range(len(fit.scores_)), key=lambda k: fit.scores_[k], reverse=True)[0:10]]
feature_us

In [0]:
feature_etc_us = list(set(feature_etc).union(set(feature_us)))
feature_etc_us

#### Recursive Feature Elimination from Feature Importance and Univariate Selection

In [46]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
rfe = RFE(model, 7) # 7 final parameter
fit = rfe.fit(train_data[feature_etc], train_labels)
print("Num Features: {}".format(fit.n_features_))
print("Selected Features: {}".format(fit.support_))
print("Feature Ranking: {}".format(fit.ranking_))

Num Features: 7
Selected Features: [ True  True  True False  True  True  True False  True False]
Feature Ranking: [1 1 1 3 1 1 1 2 1 4]


In [47]:
from itertools import compress
feature_etc_us_ref = list(compress(feature_etc_us, fit.support_.tolist()))
feature_etc_us_ref

['B_D_LST_1MNTH_TOTAL_TXNS',
 'RATIO_DI_TOTAL_AMT_L1_L6',
 'RATIO_DI_TOTAL_TXNS_L1_L3',
 'B_LAST_12MNTHS_TOTAL_TXNS',
 'B_LAST_3MNTHS_TOTAL_TXNS',
 'vintage_months',
 'RATIO_DI_TOTAL_AMT_L6_P6']

## Create Layer

In [0]:
model = keras.Sequential([
    keras.layers.Dense(128, activation=tf.nn.relu, input_shape=(train_data.shape[1],)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.softmax)
])

In [0]:
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

## Train Model

In [99]:
model.fit(train_data, train_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb8f1e689e8>

In [100]:
test_loss, test_acc = model.evaluate(test_data, test_labels)

print('Test accuracy:', test_acc)

Test accuracy: 0.7390921172265565


In [0]:
predictions = model.predict(test_data)

In [102]:
k = 201
print('Prob = {}'.format(predictions[k]))
print('Prediction result is {}'.format(np.argmax(predictions[k])))
print('Original result is {}'.format(test_labels.iloc[k]))



Prob = [0.706 0.294]
Prediction result is 0
Original result is 0
