# CNN Model 1

## The approach:

### Without time feature
### Pass 1x29 vectors into a convolutional layer, with kernel size 29, with some D number of filters
### Add extra conv and dense layer to the model to see the effect

## First run: single conv layer single dense layer:

In [65]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import numpy
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import keras

data = pd.read_csv("creditcard.csv")

# Normalise and reshape the Amount column, so it's values lie between -1 and 1
from sklearn.preprocessing import StandardScaler
data['norm_Amount'] = StandardScaler().fit_transform(data['Amount'].reshape(-1,1))

# Drop the old Amount column and also the Time column as we don't want to include this at this stage
data = data.drop(['Time', 'Amount'], axis=1)

# Assign variables x and y corresponding to row data and it's class value
X = data.ix[:, data.columns != 'Class']
y = data.ix[:, data.columns == 'Class']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [127]:
import keras.backend as K
from sklearn.metrics import f1_score, precision_score, recall_score
def f1_score_custom(y_true, y_pred):
    """
    f1 score

    :param y_true:
    :param y_pred:
    :return:
    """
    tp_3d = K.concatenate(
        [
            K.cast(y_true, 'bool'),
            K.cast(K.round(y_pred), 'bool'),
            K.cast(K.ones_like(y_pred), 'bool')
        ], axis=1
    )

    fp_3d = K.concatenate(
        [
            K.cast(K.abs(y_true - K.ones_like(y_true)), 'bool'),
            K.cast(K.round(y_pred), 'bool'),
            K.cast(K.ones_like(y_pred), 'bool')
        ], axis=1
    )

    fn_3d = K.concatenate(
        [
            K.cast(y_true, 'bool'),
            K.cast(K.abs(K.round(y_pred) - K.ones_like(y_pred)), 'bool'),
            K.cast(K.ones_like(y_pred), 'bool')
        ], axis=1
    )

    tp = K.sum(K.cast(K.all(tp_3d, axis=1), 'int32'))
    fp = K.sum(K.cast(K.all(fp_3d, axis=1), 'int32'))
    fn = K.sum(K.cast(K.all(fn_3d, axis=1), 'int32'))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * ((precision * recall) / (precision + recall))


class Metrics(keras.callbacks.Callback):
    
    def on_train_begin(self, logs={}):
        self.precision = []
        self.recall = []
        self.f1s = []

    def on_epoch_end(self, epoch, logs={}):
        score = np.asarray(self.model.predict(self.validation_data[0]))
        predict = np.round(np.asarray(self.model.predict(self.validation_data[0])))
        targ = self.validation_data[1]
        
        val_f1_score = f1_score(targ, predict, average=None)
        val_precision_score = precision_score(targ, predict,average=None)
        val_recall_score = recall_score(targ, predict, average=None)
        
        self.precision.append(val_precision_score)
        self.recall.append(val_recall_score)
        self.f1s.append(val_f1_score)
        
        print ' — val_f1: {} — val_precision: {} — val_recall {}'.format(val_f1_score, val_precision_score, val_recall_score)
                
        return

In [128]:
# Function to create model
def create_model():
    # create model
    seed(2017)
    conv = Sequential()
    conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))
    conv.add(Flatten())
    conv.add(Dense(300, activation = 'relu'))
    conv.add(Dense(2, activation = 'softmax'))

    sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    
    # Compile model
    conv.compile(loss='categorical_crossentropy', optimizer=sgd)
    return conv
 

In [138]:
# Setting up dataframe table properties
log_cols=["Name", "F1 Score", "Precision", "Recall", "Training Time"]
log = pd.DataFrame(columns=log_cols)

In [139]:
def smote_data(x_data, y_data):
    from collections import Counter
    from imblearn.over_sampling import SMOTE
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(x_data, y_data)
    print('Resampling the data with SMOTE. . .')
    print('Resampled training dataset shape {}'.format(Counter(y_res)))

    return X_res, y_res

In [140]:
def custom_cross_val(X, y, model, n):
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    import datetime
    from sklearn.metrics import precision_recall_fscore_support
    
    print 'Cross validating... \n'
    skfolds = StratifiedKFold(n_splits=n, random_state=42)
    
    precision = []
    recall = []
    f1score = []
    elapsed_times = []
    cv = 0
    
    for train_index, test_index in skfolds.split(X, y):
        cv=cv+1
        print len(train_index)
        clone_clf = clone(model)
        X_train_folds = X.iloc[train_index]
        y_train_folds = y.iloc[train_index]
        X_test_fold = X.iloc[test_index]
        y_test_fold = y.iloc[test_index]
        
        print len(y_train_folds[y_train_folds['Class']==1])
        X_res, y_res = smote_data(X_train_folds, y_train_folds )
        
        
        
        print X_res.shape, type(X_res)
        print y_res.shape

        X_train = X_res.reshape(X_res.shape[0], 29, 1)
        Y_train = y_res.reshape(y_res.shape[0], 1)
        X_test = X_test_fold.values.reshape(X_test_fold.values.shape[0], 29, 1)
        Y_test = y_test_fold.values.reshape(y_test_fold.values.shape[0], 1)

        Y_test = keras.utils.to_categorical(Y_test)
        Y_train = keras.utils.to_categorical(Y_train)
        print Y_test.shape
        print Y_train.shape

        
        start = datetime.datetime.now()
        
        print('Fitting the model... CV[{}]'.format(cv))
        clone_clf.fit(X_train, Y_train)
        
        end = datetime.datetime.now()
        elapsed = end - start
        elapsed_times.append(elapsed)
        
        y_pred = clone_clf.predict(X_test)
        
        # Set cut off point for class boundaries
        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0
        
        prfs = precision_recall_fscore_support(Y_test, y_pred, pos_label=1, average='binary')
        
        precision.append(prfs[0])
        recall.append(prfs[1])
        f1score.append(prfs[2])  
    
    average_timedelta = sum(elapsed_times, datetime.timedelta(0)) / len(elapsed_times)
    entry = [name, np.mean(f1score), np.mean(precision), np.mean(recall), average_timedelta]
    print('Mean scores: ', entry )
    return entry

In [149]:
def custom_cross_val_2(X, y, create_model, n):
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    import datetime
    from sklearn.metrics import precision_recall_fscore_support
    
    print 'Cross validating... \n'
    skfolds = StratifiedKFold(n_splits=n, random_state=42)
    
    precision = []
    recall = []
    f1score = []
    elapsed_times = []
    cv = 0
    
    for train_index, test_index in skfolds.split(X, y):
        cv=cv+1
        print len(train_index)
        clone_clf = create_model()
        X_train_folds = X.iloc[train_index]
        y_train_folds = y.iloc[train_index]
        X_test_fold = X.iloc[test_index]
        y_test_fold = y.iloc[test_index]
        
        print len(y_train_folds[y_train_folds['Class']==1])
        X_res, y_res = smote_data(X_train_folds, y_train_folds )
        
        
        
        print X_res.shape, type(X_res)
        print y_res.shape

        X_train = X_res.reshape(X_res.shape[0], 29, 1)
        Y_train = y_res.reshape(y_res.shape[0], 1)
        X_test = X_test_fold.values.reshape(X_test_fold.values.shape[0], 29, 1)
        Y_test = y_test_fold.values.reshape(y_test_fold.values.shape[0], 1)

        Y_test = keras.utils.to_categorical(Y_test)
        Y_train = keras.utils.to_categorical(Y_train)
        print Y_test.shape
        print Y_train.shape
        
        # metrics = Metrics()
        
        start = datetime.datetime.now()
        
        print('Fitting the model... CV[{}]'.format(cv))
        # clone_clf.fit(X_train, Y_train, callbacks=[metrics], validation_data=(X_test, Y_test))
        
        clone_clf.fit(X_train, Y_train, batch_size = 500, epochs = 25, verbose =1)
        end = datetime.datetime.now()
        elapsed = end - start
        elapsed_times.append(elapsed)
        
        y_pred = clone_clf.predict(X_test)
        
        # Set cut off point for class boundaries
        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0
        
        prfs = precision_recall_fscore_support(Y_test, y_pred, labels=[0])
        
        precision.append(prfs[0][1])
        recall.append(prfs[1][1])
        f1score.append(prfs[2][1])  
    
    average_timedelta = sum(elapsed_times, datetime.timedelta(0)) / len(elapsed_times)
    entry = ['CNN Model 1', np.mean(f1score), np.mean(precision), np.mean(recall), average_timedelta]
    print('Mean scores: ', entry )
    return entry

In [150]:
results = custom_cross_val_2(X, y, create_model, 3)

log_entry = pd.DataFrame([results], columns=log_cols)
log = log.append(log_entry)

# Replace table index by the Classifier column
log.set_index('Name', inplace=True)
print 'Cross validation training results: '
print log 


Cross validating... 

189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[1]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[2]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoc

In [151]:
results2 = custom_cross_val_2(X, y, create_model, 3)
print results2

Cross validating... 

189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[1]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[2]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoc

In [153]:
log_entry = pd.DataFrame([results2], columns=log_cols)
log = log.append(log_entry)

# Replace table index by the Classifier column
log.set_index('Name', inplace=True)
print 'Cross validation training results: '
print log 

Cross validation training results: 
             F1 Score  Precision    Recall   Training Time
Name                                                      
NaN          0.001150   0.000576  0.333333 00:03:06.678778
NaN          0.697258   0.626006  0.823171 00:00:43.629459
NaN          0.699859   0.632257  0.821138 00:01:48.112916
CNN Model 1  0.682613   0.623450  0.796748 00:01:48.029946
CNN Model 1  0.682613   0.623450  0.796748 00:01:48.029946


In [154]:
print log

             F1 Score  Precision    Recall   Training Time
Name                                                      
NaN          0.001150   0.000576  0.333333 00:03:06.678778
NaN          0.697258   0.626006  0.823171 00:00:43.629459
NaN          0.699859   0.632257  0.821138 00:01:48.112916
CNN Model 1  0.682613   0.623450  0.796748 00:01:48.029946
CNN Model 1  0.682613   0.623450  0.796748 00:01:48.029946


In [155]:
# Function to create model, required for KerasClassifier
def create_model_2():
    # create model
    seed(2017)
    conv = Sequential()
    conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))
    conv.add(Conv1D(256, 1, activation='relu'))
    conv.add(Flatten())

    conv.add(Dense(300, activation = 'relu'))
    conv.add(Dense(100, activation = 'relu'))
    conv.add(Dense(2, activation = 'softmax'))

    sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
    
    # Compile model
    conv.compile(loss='categorical_crossentropy', optimizer=sgd)
    return conv
 

In [156]:
def custom_cross_val_2(X, y, create_model, n):
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    import datetime
    from sklearn.metrics import precision_recall_fscore_support
    
    print 'Cross validating... \n'
    skfolds = StratifiedKFold(n_splits=n, random_state=42)
    
    precision = []
    recall = []
    f1score = []
    elapsed_times = []
    cv = 0
    
    for train_index, test_index in skfolds.split(X, y):
        cv=cv+1
        print len(train_index)
        clone_clf = create_model()
        X_train_folds = X.iloc[train_index]
        y_train_folds = y.iloc[train_index]
        X_test_fold = X.iloc[test_index]
        y_test_fold = y.iloc[test_index]
        
        print len(y_train_folds[y_train_folds['Class']==1])
        X_res, y_res = smote_data(X_train_folds, y_train_folds )
        
        
        
        print X_res.shape, type(X_res)
        print y_res.shape

        X_train = X_res.reshape(X_res.shape[0], 29, 1)
        Y_train = y_res.reshape(y_res.shape[0], 1)
        X_test = X_test_fold.values.reshape(X_test_fold.values.shape[0], 29, 1)
        Y_test = y_test_fold.values.reshape(y_test_fold.values.shape[0], 1)

        Y_test = keras.utils.to_categorical(Y_test)
        Y_train = keras.utils.to_categorical(Y_train)
        print Y_test.shape
        print Y_train.shape
        
        # metrics = Metrics()
        
        start = datetime.datetime.now()
        
        print('Fitting the model... CV[{}]'.format(cv))
        # clone_clf.fit(X_train, Y_train, callbacks=[metrics], validation_data=(X_test, Y_test))
        
        clone_clf.fit(X_train, Y_train, batch_size = 500, epochs = 25, verbose =1)
        end = datetime.datetime.now()
        elapsed = end - start
        elapsed_times.append(elapsed)
        
        y_pred = clone_clf.predict(X_test)
        
        # Set cut off point for class boundaries
        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0
        
        prfs = precision_recall_fscore_support(Y_test, y_pred, labels=[0])
        
        precision.append(prfs[0][1])
        recall.append(prfs[1][1])
        f1score.append(prfs[2][1])  
    
    average_timedelta = sum(elapsed_times, datetime.timedelta(0)) / len(elapsed_times)
    entry = ['CNN Model 1.2', np.mean(f1score), np.mean(precision), np.mean(recall), average_timedelta]
    print('Mean scores: ', entry )
    return entry

In [157]:
results1_2 = custom_cross_val_2(X, y, create_model_2, 3)
print results1_2

Cross validating... 

189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[1]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
189871
328
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 189543, 1: 189543})
(379086, 29) <type 'numpy.ndarray'>
(379086,)
(94936, 2)
(379086, 2)
Fitting the model... CV[2]
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoc

In [158]:
# Setting up dataframe table properties
log_cols=["Name", "F1 Score", "Precision", "Recall", "Training Time"]
log = pd.DataFrame(columns=log_cols)

In [159]:
log_entry = pd.DataFrame([results1_2], columns=log_cols)
log = log.append(log_entry)

# Replace table index by the Classifier column
log.set_index('Name', inplace=True)
print 'Cross validation training results: '
print log 

Cross validation training results: 
               F1 Score  Precision    Recall   Training Time
Name                                                        
CNN Model 1.2  0.747466   0.705142  0.806911 00:03:56.400232


In [144]:
from numpy.random import seed
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale
# from keras_diagram import ascii
from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import keras

data = pd.read_csv("creditcard.csv")

# Normalise and reshape the Amount column, so it's values lie between -1 and 1
from sklearn.preprocessing import StandardScaler
data['norm_Amount'] = StandardScaler().fit_transform(data['Amount'].reshape(-1,1))

# Drop the old Amount column and also the Time column as we don't want to include this at this stage
data = data.drop(['Time', 'Amount'], axis=1)

# Assign variables x and y corresponding to row data and it's class value
X = data.ix[:, data.columns != 'Class']
y = data.ix[:, data.columns == 'Class']

def generate_train_test_sample(x_data, y_data): 
    ''' 1) Generate new, random train-test split
        2) Random smote oversample the train data, keeping test data unseen
        3) Use this new train-test split to fit and test model
    '''

    X_train, X_test, y_train, y_test = train_test_split(x_data,y_data,test_size = 0.3)

    from collections import Counter
    from imblearn.over_sampling import SMOTE
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X_train, y_train)
    print('Resampling the data with SMOTE. . .')
    print('Resampled training dataset shape {}'.format(Counter(y_res)))

    return X_res, y_res, X_test, y_test

########################################################################

X_res, y_res, X_test, y_test = generate_train_test_sample(X, y)

print X_res.shape, type(X_res)
print y_res.shape

X_train = X_res.reshape(X_res.shape[0], 29, 1)
Y_train = y_res.reshape(y_res.shape[0], 1)
X_test = X_test.values.reshape(X_test.values.shape[0], 29, 1)
Y_test = y_test.values.reshape(y_test.values.shape[0], 1)

Y_test = keras.utils.to_categorical(Y_test)
Y_train = keras.utils.to_categorical(Y_train)
print Y_test.shape
print Y_train.shape


seed(2017)
conv = Sequential()
conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))

conv.add(Flatten())

conv.add(Dense(300, activation = 'relu'))
conv.add(Dense(2, activation = 'softmax'))

sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
conv.compile(loss = 'categorical_crossentropy', optimizer = sgd, metrics = ['accuracy'])
conv.fit(X_train, Y_train, batch_size = 500, epochs = 10, verbose = 1)
score = conv.evaluate(X_test, Y_test, batch_size=500)

y_pred = conv.predict(X_test)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199015, 1: 199015})
(398030, 29) <type 'numpy.ndarray'>
(398030,)
(85443, 2)
(398030, 2)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [145]:
y_pred.shape

(85443, 2)

In [146]:
cutt_off_tr = 0.5
y_pred[np.where(y_pred>=cutt_off_tr)] = 1
y_pred[np.where(y_pred<cutt_off_tr)]  = 0

prfs0 = precision_recall_fscore_support(Y_test, y_pred, labels=[0])

print 'Classification Report: \n'
print '[Precision, Recall, F1, Support]'
print '='*100
print '0:      {}         {}      {}      {}   '.format(prfs0[0][0], prfs0[1][0], prfs0[2][0], prfs0[3][0])
print '1:      {}         {}      {}      {}   '.format(prfs0[0][1], prfs0[1][1], prfs0[2][1], prfs0[3][1])
print '='*100

print 'F1 Score, Fraud Class = {}'.format(prfs0[2][1])

Classification Report: 

[Precision, Recall, F1, Support]
0:      0.999788863735         0.999237983587      0.999513347757      85300   
1:      0.657894736842         0.874125874126      0.750750750751      143   
F1 Score, Fraud Class = 0.750750750751


In [18]:
print conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_6 (Conv1D)            (None, 1, 256)            7680      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1, 256)            65792     
_________________________________________________________________
flatten_2 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 300)               77100     
_________________________________________________________________
dense_4 (Dense)              (None, 100)               30100     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 180,874
Trainable params: 180,874
Non-trainable params: 0
_________________________________________________________________
None

### Evaluation of first run

We can see that this very simple model, with just a single convolution layer piped into a simple dense network, already gives comparable F1 to our top two baseline classifiers:

CNNv1.1:                 0.776316

RandomForestClassifier:  0.846437   
MLPClassifier:           0.750672 

## Second run, with added conv and dense layer 

In [17]:
########################################################################
# CNNv1.2

X_res, y_res, X_test, y_test = generate_train_test_sample(X, y)

print X_res.shape, type(X_res)
print y_res.shape

X_train = X_res.reshape(X_res.shape[0], 29, 1)
Y_train = y_res.reshape(y_res.shape[0], 1)
X_test = X_test.values.reshape(X_test.values.shape[0], 29, 1)
Y_test = y_test.values.reshape(y_test.values.shape[0], 1)

Y_test = keras.utils.to_categorical(Y_test)
Y_train = keras.utils.to_categorical(Y_train)
print Y_test.shape
print Y_train.shape


seed(2017)
conv = Sequential()
conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))
conv.add(Conv1D(256, 1, activation='relu'))
conv.add(Flatten())

conv.add(Dense(300, activation = 'relu'))
conv.add(Dense(100, activation = 'relu'))
conv.add(Dense(2, activation = 'softmax'))

sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
conv.compile(loss = 'categorical_crossentropy', optimizer = sgd, metrics = ['accuracy'])
conv.fit(X_train, Y_train, batch_size = 500, epochs = 50, verbose = 1)
score = conv.evaluate(X_test, Y_test, batch_size=500)

y_pred = conv.predict(X_test)

Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199022, 1: 199022})
(398044, 29) <type 'numpy.ndarray'>
(398044,)
(85443, 2)
(398044, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
y_pred[np.where(y_pred>=cutt_off_tr)] = 1
y_pred[np.where(y_pred<cutt_off_tr)]  = 0

prfs0 = precision_recall_fscore_support(Y_test, y_pred, labels=[0])

print 'Classification Report: \n'
print '[Precision, Recall, F1, Support]'
print '='*100
print '0:      {}         {}      {}      {}   '.format(prfs0[0][0], prfs0[1][0], prfs0[2][0], prfs0[3][0])
print '1:      {}         {}      {}      {}   '.format(prfs0[0][1], prfs0[1][1], prfs0[2][1], prfs0[3][1])
print '='*100

print 'F1 Score, Fraud Class = {}'.format(prfs0[2][1])

Classification Report: 

[Precision, Recall, F1, Support]
0:      0.999718584009         0.999601374087      0.999659975612      85293   
1:      0.7875         0.84      0.812903225806      150   
F1 Score, Fraud Class = 0.812903225806


### Evaluation of second run

This seems very promising, already matching our best baseline classifier, with our first CNN approach.

#### However, we should at least average runs to get confidence in results

## CNNv1 averaged

In [44]:
def average_run():
    reports = []
    for i in range(3):
        X_res, y_res, X_test, y_test = generate_train_test_sample(X, y)

        print X_res.shape, type(X_res)
        print y_res.shape

        X_train = X_res.reshape(X_res.shape[0], 29, 1)
        Y_train = y_res.reshape(y_res.shape[0], 1)
        X_test = X_test.values.reshape(X_test.values.shape[0], 29, 1)
        Y_test = y_test.values.reshape(y_test.values.shape[0], 1)

        Y_test = keras.utils.to_categorical(Y_test)
        Y_train = keras.utils.to_categorical(Y_train)
        print Y_test.shape
        print Y_train.shape


        seed(2017)
        conv = Sequential()
        conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))
        conv.add(Flatten())
        conv.add(Dense(300, activation = 'relu'))
        conv.add(Dense(2, activation = 'softmax'))

        sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
        conv.compile(loss = 'categorical_crossentropy', optimizer = sgd, metrics = ['accuracy'])
        conv.fit(X_train, Y_train, batch_size = 500, epochs = 50, verbose = 1, callbacks=[keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.0001, patience=5, verbose=1, mode='auto')])
        score = conv.evaluate(X_test, Y_test, batch_size=500)

        y_pred = conv.predict(X_test)

        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0

        prfs0 = precision_recall_fscore_support(Y_test, y_pred, labels=[0])
        reports.append(prfs0)
    
    return reports

In [45]:
reports = average_run()

Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199015, 1: 199015})
(398030, 29) <type 'numpy.ndarray'>
(398030,)
(85443, 2)
(398030, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199032, 1: 199032})
(398064, 29) <type 'numpy.ndarray'>
(398064,)
(85443, 2)
(398064, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199021, 1: 199021})
(398042, 29) <type 'numpy.ndarray'>
(398042,)
(85443, 2)
(398042, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Ep

In [46]:
results = [reports[0][2][1],reports[1][2][1],reports[1][2][1]]
avg_f1 = np.mean(results)
print 'F1 AVG = {}'.format(avg_f1)

F1 AVG = 0.771745237875


## CNNv1.2 averaged

In [41]:
def average_run():
    reports = []
    for i in range(3):
        X_res, y_res, X_test, y_test = generate_train_test_sample(X, y)

        print X_res.shape, type(X_res)
        print y_res.shape

        X_train = X_res.reshape(X_res.shape[0], 29, 1)
        Y_train = y_res.reshape(y_res.shape[0], 1)
        X_test = X_test.values.reshape(X_test.values.shape[0], 29, 1)
        Y_test = y_test.values.reshape(y_test.values.shape[0], 1)

        Y_test = keras.utils.to_categorical(Y_test)
        Y_train = keras.utils.to_categorical(Y_train)
        print Y_test.shape
        print Y_train.shape


        seed(2017)
        conv = Sequential()
        conv.add(Conv1D(256, 29, input_shape=(29, 1), activation='relu'))
        conv.add(Conv1D(256, 1, activation='relu'))
        conv.add(Flatten())

        conv.add(Dense(300, activation = 'relu'))
        conv.add(Dense(100, activation = 'relu'))
        conv.add(Dense(2, activation = 'softmax'))

        sgd = SGD(lr = 0.1, momentum = 0.9, decay = 0, nesterov = False)
        conv.compile(loss = 'categorical_crossentropy', optimizer = sgd, metrics = ['accuracy'])
        conv.fit(X_train, Y_train, batch_size = 500, epochs = 50, verbose = 1, callbacks=[keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.0001, patience=5, verbose=0, mode='auto')])
        score = conv.evaluate(X_test, Y_test, batch_size=500)

        y_pred = conv.predict(X_test)

        cutt_off_tr = 0.5
        y_pred[np.where(y_pred>=cutt_off_tr)] = 1
        y_pred[np.where(y_pred<cutt_off_tr)]  = 0

        prfs0 = precision_recall_fscore_support(Y_test, y_pred, labels=[0])
        reports.append(prfs0)
    
    return reports


In [42]:
reports = average_run()


Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199028, 1: 199028})
(398056, 29) <type 'numpy.ndarray'>
(398056,)
(85443, 2)
(398056, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199033, 1: 199033})
(398066, 29) <type 'numpy.ndarray'>
(398066,)
(85443, 2)
(398066, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199026, 1: 199026})
(398052, 29) <type 'numpy.ndarray'>
(398052,)
(85443, 2)
(398052, 2)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

In [43]:
results = [reports[0][2][1],reports[1][2][1],reports[1][2][1]]
avg_f1 = np.mean(results)
print 'F1 AVG = {}'.format(avg_f1)

F1 AVG = 0.810230099502


# CNNv1 : F1 AVG = 0.771745237875
# CNNv1.2: F1 AVG = 0.810230099502

In [49]:
X_res, y_res, X_test, y_test = generate_train_test_sample(X, y)

print X_res.shape, type(X_res)
print y_res.shape

Resampling the data with SMOTE. . .
Resampled training dataset shape Counter({0: 199012, 1: 199012})
(398024, 29) <type 'numpy.ndarray'>
(398024,)


In [52]:
X_res.size


11542696

In [53]:
X_train = X_res.reshape(X_res.shape[0], 29, 1)

In [56]:
X_res

array([[ 2.15010044,  0.15115908, -2.19244029, ..., -0.07457871,
        -0.07238514, -0.34995096],
       [-0.68092548, -0.26758453, -1.95069919, ..., -0.53863108,
        -0.09602917,  0.72397497],
       [ 2.02016107, -0.19543356, -1.97548432, ..., -0.11447656,
        -0.07892874, -0.19350585],
       ..., 
       [-3.51735006,  1.72241489, -2.70740266, ...,  0.91308367,
        -0.34269234,  0.95741203],
       [-3.69408913,  0.19000357, -3.66340915, ...,  1.0578821 ,
         0.04396139,  0.96994245],
       [-2.01612965,  1.5548113 , -2.76132834, ..., -0.29483718,
         0.66595781, -0.33692074]])

In [None]:
x_train_test = []