In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix
import xgboost as xgb
import warnings; warnings.filterwarnings('ignore')

### Data preparation

In [2]:
# Data source:
# https://www.kaggle.com/mlg-ulb/creditcardfraud/downloads/creditcardfraud.zip/3
df = pd.read_csv('creditcardfraud.zip')
print(df['Class'].value_counts())

# Fixed seed so the shuffles below (and therefore the train/validation split
# and every score computed from it) are reproducible after a kernel restart.
SEED = 0
N_LEGIT_TRAIN = 10000   # legitimate rows (Class == 0) used for training
N_FRAUD_TRAIN = 400     # fraud rows (Class == 1) used for training
N_LEGIT_VALID = 1000    # legitimate rows used for validation

# Shuffle each class on its own, then carve disjoint train/validation slices.
lgl = df[df['Class'] == 0].sample(frac=1, random_state=SEED)
frd = df[df['Class'] == 1].sample(frac=1, random_state=SEED)
lgt = lgl[:N_LEGIT_TRAIN]
frt = frd[:N_FRAUD_TRAIN]
lgv = lgl[N_LEGIT_TRAIN:N_LEGIT_TRAIN + N_LEGIT_VALID]
frv = frd[N_FRAUD_TRAIN:]  # every remaining fraud row goes to validation

# Mix the two classes together so neither set is ordered by label.
train = pd.concat([lgt, frt]).sample(frac=1, random_state=SEED)
vald = pd.concat([lgv, frv]).sample(frac=1, random_state=SEED)

# Features are every column except the label; the label is the 'Class' column.
Xt = train.drop(columns='Class')
yt = train['Class'].values
Xv = vald.drop(columns='Class')
yv = vald['Class'].values

0    284315
1       492
Name: Class, dtype: int64


### Sklearn classifiers

In [21]:
# Alternative scikit-learn models are kept commented out for quick comparison;
# exactly one `clf = ...` line should be active.
#clf = MLPClassifier([30])
clf = RandomForestClassifier(n_jobs=-1)
#clf = GradientBoostingClassifier()
clf.fit(Xt, yt)
y_pred = clf.predict(Xv)
print('acc: ',accuracy_score(yv, y_pred), '\tmcc: ', matthews_corrcoef(yv, y_pred))
print('Confusion matrix:')
# `labels` is keyword-only in current scikit-learn; passing it positionally
# raises a TypeError. labels=[1, 0] lists the fraud class first, and the
# transpose makes rows the predicted class (PP/PN) and columns the actual
# class (CP/CN).
pd.DataFrame(confusion_matrix(yv, y_pred, labels=[1, 0]).T, index=['PP','PN'], columns=['CP','CN'])

acc:  0.9880952380952381 	mcc:  0.9206154520651885
Confusion matrix:


Unnamed: 0,CP,CN
PP,80,1
PN,12,999


### XGBoost classifiers

In [3]:
# Wrap the splits in DMatrix objects: labelled ones for training/monitoring,
# an unlabelled one for prediction.
dtrain = xgb.DMatrix(Xt, label=yt)
dval = xgb.DMatrix(Xv, label=yv)
dtest = xgb.DMatrix(Xv)
param = {'nthread':8, 'seed':0, 'objective':'binary:logistic', 'max_depth':15, 'eta':0.1, 'booster':'gbtree'}
param['eval_metric'] = ['auc','aucpr']
watchlist = [(dval, 'eval'), (dtrain, 'train')]
num_round = 200
# Filled in by xgb.train with the per-round metric history of each watchlist entry.
evals_result = {}
clf = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=None, evals_result=evals_result, verbose_eval=0)
# If early stopping is enabled above, predict with the best iteration instead:
#clf.predict(dtest, ntree_limit=clf.best_ntree_limit)
#clf.predict(dtest, ntree_limit=num_round)

# binary:logistic outputs probabilities; threshold at 0.5 to get class labels.
y_pred = np.where(clf.predict(dtest) < .5, 0, 1)
print('acc: ',accuracy_score(yv, y_pred), '\tmcc: ', matthews_corrcoef(yv, y_pred))
print('Train: ','auc->',evals_result['train']['auc'][-1], '\taucpr->', evals_result['train']['aucpr'][-1])
print('Eval:' , 'auc->', evals_result['eval']['auc'][-1], '\taucpr->', evals_result['eval']['aucpr'][-1])
print('Confusion matrix:')
# `labels` is keyword-only in current scikit-learn; passing it positionally
# raises a TypeError. Rows are predicted class (PP/PN), columns actual class (CP/CN).
pd.DataFrame(confusion_matrix(yv, y_pred, labels=[1, 0]).T, index=['PP','PN'], columns=['CP','CN'])

acc:  0.9844322344322345 	mcc:  0.8951900606298095
Train:  auc-> 1.0 	aucpr-> 1.0
Eval: auc-> 0.979804 	aucpr-> 0.932696
Confusion matrix:


Unnamed: 0,CP,CN
PP,76,1
PN,16,999


#### Save XGBoost model

In [11]:
# Persist the trained booster, then dump the validation features as
# line-delimited JSON records (one object per row).
clf.save_model('model.xgb')
#Xv.to_csv('test.csv', header=None, index=False)
Xv.to_json('test.json', orient='records', lines=True)
# Show the feature order of the dumped records as the cell output.
column_order = '","'.join(Xv.columns)
"Columns order:", column_order

('Columns order:',
 'Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount')

### TensorFlow basic estimator

In [30]:
import tensorflow as tf
# Aliases for the splits built in the data preparation cell.
x_train, y_train, x_test, y_test = Xt, yt, Xv, yv

# One hidden ReLU layer followed by a single sigmoid unit (binary output).
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(10, activation='relu', input_dim=x_train.shape[1]),
#     tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy')#, metrics=['accuracy']

model.fit(x_train, y_train, epochs=100, verbose=0, batch_size=128)
#model.evaluate(x_test, y_test)
#model.save('teste.mod')
#model2 = tf.keras.models.load_model('teste.mod')
# `Sequential.predict_classes` was removed from tf.keras; for a single sigmoid
# output it was equivalent to thresholding the predicted probability at 0.5.
y_pred = (model.predict(x_test) > 0.5).astype('int32').ravel()
print('acc: ', accuracy_score(y_test, y_pred), '\tmcc: ', matthews_corrcoef(y_test, y_pred))
print('Confusion matrix:\n')
# `labels` is keyword-only in current scikit-learn; passing it positionally
# raises a TypeError. Rows are predicted class (PP/PN), columns actual class (CP/CN).
pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[1, 0]).T, index=['PP','PN'], columns=['CP','CN'])

acc:  0.9853479853479854 	mcc:  0.9016536382569664
Confusion matrix:



Unnamed: 0,CP,CN
PP,78,2
PN,14,998


### TensorFlow boosted tree estimator
https://www.tensorflow.org/tutorials/estimators/boosted_trees

In [32]:
# One float32 numeric feature column per column of the training frame.
fc = tf.feature_column
NUMERIC_COLUMNS = x_train.columns
NUM_EXAMPLES = len(y_train)
feature_columns = [
    fc.numeric_column(feature_name, dtype=tf.float32)
    for feature_name in NUMERIC_COLUMNS
]

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Return a zero-argument function that builds a tf.data pipeline over (X, y).

    X is converted with dict(X) and paired with y via from_tensor_slices.
    The pipeline optionally shuffles, repeats `n_epochs` times (None repeats
    indefinitely) and batches; both the shuffle buffer and the batch size are
    the module-level NUM_EXAMPLES.
    """
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
        # Shuffling is skipped for evaluation so predictions keep row order.
        ds = ds.shuffle(NUM_EXAMPLES) if shuffle else ds
        # Repeat first, then emit batches of NUM_EXAMPLES rows.
        return ds.repeat(n_epochs).batch(NUM_EXAMPLES)
    return input_fn

# NOTE: the triple-quoted string below is a bare string literal used to
# disable code, so nothing inside it runs. It keeps an alternative training
# path based on tf.estimator.BoostedTreesClassifier with est.train/est.evaluate.
'''
# Since data fits into memory, use entire dataset per layer. It will be faster.
# Above one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)

# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
est.train(train_input_fn, max_steps=100)

# Eval.
results = est.evaluate(eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])
'''

def make_inmemory_train_input_fn(X, y):
    """Return a zero-argument function yielding the whole dataset at once.

    The returned callable produces a `(features, labels)` pair where
    `features` is `dict(X)` and `labels` is `y` itself, with no batching or
    shuffling applied.
    """
    def input_fn():
        features = dict(X)
        return features, y
    return input_fn

# Training feeds the full dataset in one shot; evaluation makes a single
# unshuffled pass so predictions line up with y_test row by row.
train_input_fn = make_inmemory_train_input_fn(x_train, y_train)
eval_input_fn = make_input_fn(x_test, y_test, shuffle=False, n_epochs=1)
# NOTE(review): tf.contrib only exists in TensorFlow 1.x, so this cell needs a 1.x runtime.
est = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(train_input_fn,feature_columns)

# Each prediction is a dict; 'class_ids' holds the predicted label as a
# one-element array.
pred_dicts = list(est.predict(eval_input_fn))
y_pred = [x['class_ids'][0] for x in pred_dicts]
print('acc: ', accuracy_score(y_test, y_pred), '\tmcc: ', matthews_corrcoef(y_test, y_pred))
print('Confusion matrix:\n')
# `labels` is keyword-only in current scikit-learn; passing it positionally
# raises a TypeError. Rows are predicted class (PP/PN), columns actual class (CP/CN).
pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[1, 0]).T, index=['PP','PN'], columns=['CP','CN'])

W0723 20:17:02.725621 139660223960896 estimator.py:1811] Using temporary folder as model directory: /tmp/tmpd6c4gnf7
W0723 20:17:03.756095 139660223960896 meta_graph.py:449] Issue encountered when serializing resources.
'_Resource' object has no attribute 'name'
W0723 20:17:05.402199 139660223960896 meta_graph.py:449] Issue encountered when serializing resources.
'_Resource' object has no attribute 'name'
W0723 20:17:05.585092 139660223960896 meta_graph.py:449] Issue encountered when serializing resources.
'_Resource' object has no attribute 'name'
W0723 20:17:06.962818 139660223960896 basic_session_run_hooks.py:724] It seems that global step (tf.train.get_global_step) has not been increased. Current value (could be stable): 0 vs previous value: 0. You could increase the global step by passing tf.train.get_global_step() to Optimizer.apply_gradients or Optimizer.minimize.
W0723 20:17:14.411398 139660223960896 meta_graph.py:449] Issue encountered when serializing resources.
'_Resource' o

acc:  0.9908424908424909 	mcc:  0.9394038281288066
Confusion matrix:



Unnamed: 0,CP,CN
PP,82,0
PN,10,1000
