In [195]:
#import the necessary libraries to perform a Sequential Neural Network using Keras
import pandas as pd
import numpy
import arff
import sklearn
from keras.models import Sequential
from keras.layers import Dense

In [13]:
#Load arff file
dataset = arff.load(open('credit_fraud2.arff','r'))

In [62]:
#Convert to numpy array
data = numpy.array(dataset['data'])

In [165]:
#convert to pandas dataframe
DF = pd.DataFrame(data)

#Recode variable status of checking account to an ordinal variable
#between 0 and 1
DF[0].unique()
di0 ={'no checking':0.25, '<0':0.50, '0<=X<200':0.75,  '>=200':1.00}

#Recode variable Credit History to an ordinal variable between 0 and 1
DF[2].unique()
di2 ={'no credits/all paid':0.20, 'all paid':0.40, 'existing paid':0.60, 
      'delayed previously':0.80, 'critical/other existing credit':1.00}

#Convert purpose to a binary variable
DF[3].unique()
DF3=pd.get_dummies(DF[3])

#Recode Savings Accounts/Bonds to an ordinal variable between 0 and 1
DF[5].unique()
di5 = {'no known savings':0.2,'<100':0.4,'100<=X<500':0.6, 
       '500<=X<1000':0.8,'>=1000':1.0}

#Recode Employment Years Status to an ordian variable between 0 and 1
DF[6].unique()
di6 = {'unemployed':0.2, '<1':0.4, '1<=X<4':0.6, '4<=X<7':0.8, 
       '>=7':1.0}

#Convert Personal status to an ordinal variable
DF8=pd.get_dummies(DF[8])

#Convert Application to a binary variable
DF9=pd.get_dummies(DF[9])

#Convert property to a binary variable
DF11=pd.get_dummies(DF[11])

#Convert installment plans to a binary variable
DF[13].unique()
DF13=pd.get_dummies(DF[13])

#Convert Housing to a binary variable
DF[14].unique()
DF14=pd.get_dummies(DF[14])

#Convert Job type to a binary variable
DF[16].unique()
DF16=pd.get_dummies(DF[16])

#Convert have telephone, Foreign worker, Credit 
#to a binary variable this variable is being 
#recoded to limit the number of columns since 
#there is only two #outcomes
DF[18].unique()
d18 = {'yes':1, 'none':0}

#Foreign Worker Y/N
DF[19].unique()
d19 = {'yes':1, 'no':0}

#Credit Good Bad
DF[20].unique()
d20 = {'good':1, 'bad':0}

#Use the dictionaries to recode each of the columns so that they
#can be passed to the neural network
DF = DF.replace({0:di0,2:di2,5:di5, 6:di6, 18:d18, 19:d19, 20:d20})

#Rename Column names to the appropriate name
DF_Names = DF.rename(columns={0:'Checking_Account',1:'Duration_In_Month',
                              2:'Credit_History',3:'Purpose',4:'Credit_Amount', 
                              5:'Savings',6:'Work_Tenure', 
                              7:'Installment_Rate_%_Income', 
                              8:'Sex/Maritial_Status',
                              9:'Application', 10:'Present_Residence_Since',
                              11:'Property', 12:'Age',
                              13:'Other_Installment_Plans', 14:'Housing', 
                              15:'Number_of_Existing_Credits_at_Bank',
                              16:'Job',17:'Number_of_People_Maintenance',
                              18:'Telephone', 19:'Foreign_Worker', 20:'Credit_Result'})



#Separate Variables into their separate types
Ordinal_Variables = DF_Names[[0,2,5,6,18,19,20]]
Binary_Variables = pd.concat([DF3,DF8,DF9,DF11,DF13,DF14,DF16], axis=1)
Continous_Variables = DF_Names[[1,4,7,10,12,15,17]]

#Normalize Continous Variables 
from sklearn import preprocessing
Normalize = pd.DataFrame(preprocessing.normalize(Continous_Variables))

#Combine back into one Dataframe
DF_NN = pd.concat([Binary_Variables, Normalize, Ordinal_Variables], axis=1)

#Rename Normalized Columns
DF_NN = DF_NN.rename(columns={0:'Duration_In_Month', 1:'Credit_Amount', 2:'Installment_Rate_%_Income',
                             3:'Present_Residence_Since', 4:'Age', 5:'Number_of_Existing_Credits_at_Bank',
                             6:'Number_of_People_Maintenance'})









business                              float64
domestic appliance                    float64
education                             float64
furniture/equipment                   float64
new car                               float64
other                                 float64
radio/tv                              float64
repairs                               float64
retraining                            float64
used car                              float64
female div/dep/mar                    float64
male div/sep                          float64
male mar/wid                          float64
male single                           float64
co applicant                          float64
guarantor                             float64
none                                  float64
car                                   float64
life insurance                        float64
no known property                     float64
real estate                           float64
bank                              

In [203]:
#Breakdown data into training and testing set
DF_ARRAY = DF_NN.values
sample = numpy.random.rand(len(DF_ARRAY)) < 0.8

train = DF_ARRAY[sample]
test = DF_ARRAY[~sample]

#Split into dependent and independendant variables
X = train[:,0:44]
Y = train[:,44]

[ 1.  1.  0.  1.  1.  1.  0.  0.  0.  1.  0.  1.  0.  1.  1.  0.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.
  1.  1.  1.  1.  1.  0.  0.  1.  1.  0.  1.  0.  0.  1.  1.  1.  0.  1.
  1.  1.  1.  1.  0.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.
  0.  1.  1.  0.  1.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  0.  0.
  1.  0.  1.  1.  1.  0.  1.  1.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.
  0.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  0.  1.  1.  0.  0.  1.  0.  1.  0.  1.  1.  1.  0.  0.
  0.  1.  0.  1.  0.  1.  0.  1.  0.  0.  1.  0.  0.  1.  0.  1.  0.  1.
  1.  0.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  0.  0.  1.  0.  1.  1.  1.  0.  0.  0.  1.  1.  1.  0.  1.
  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  0.  1.  1.  1.  1.  0.
  1.  0.  1.  1.  1.  1.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  0.  1.  0.  1.  1.  1.  0.  1.  1.  1.  1.  0.  1

In [204]:
#Use Sequential model from keras
model = Sequential()
#input layer has 44 nodes to match input data
#use relu activation layer
model.add(Dense(44,input_dim=44,activation='relu'))
#run model through a 100 node hidden layer 
#and relu activation layer as this
#will reduce the liklihood of vanishing gradient
#and sparsity
model.add(Dense(100,activation='relu'))
#run the output from the hidden layer
#through sigmoid activation layer because
#this will force the result between 0 and 1
#the two possible outcomes
model.add(Dense(1,activation='sigmoid'))

(None, 44)


In [205]:
#create model
#use binary crossentropy to calculate loss as the outcome is 2 defined classes
#use the adam optimizer because it is computationally efficient and requires
#a small amount of memory
#use accuracy as a metric as we are mostly concerened with classes where
#they default
model.compile(loss='binary_crossentropy',  optimizer='adam', metrics=['accuracy'])
model.fit(X,Y,epochs=150, batch_size=10)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x1170ffd0>

In [206]:
#Break test set into dependent and independant variables
X_Test = test[:,0:44]
Y_Test = test[:,44]

#evaluate model using test set data
scores = model.evaluate(X_Test,Y_Test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


#In my case the model shows that on average it is correct 72% of the time in predicting fraud 

 32/194 [===>..........................] - ETA: 1s
acc: 72.16%
