-
Notifications
You must be signed in to change notification settings - Fork 0
/
Code and description
242 lines (199 loc) · 8.72 KB
/
Code and description
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
###########################################################################
###############CREDIT CARD FRAUD ANALYSIS IN R#############################
###############################
#1.Introduction
###############################
#The dataset that I have analysed contains transactions made by credit cards in September 2013
#by European cardholders. This is an open-source dataset.
#Source: http://mlg.ulb.ac.be
#To start, load the libraries necessary to analyse the data
library(unbalanced) # contains SMOTE method(oversamples by using bootstrapping and k-nearest neighbor
#to synthetically create additional observations) to generate synthetic examples of the
#minority class in an unbalanced-class data set.
library(readr) #for reading rectangular data like csv
library(magrittr) #for chaining commands
library(caret) #for classification and regression training
library(dplyr) #for manipulating data set
library(e1071) #Functions for latent class analysis, short time Fourier transform,
#fuzzy clustering, support vector machines, shortest path computation...
library(rattle) #Graphical User Interface for Data Mining
library(ROCR) #for visualizing the Performance of Scoring Classifiers. ROC graphs,
#sensitivity/specificity curves, lift charts, and precision/recall plots
library(randomForest) #for Classification and regression
library(rpart.plot) # for plotting rpart trees
library(pROC) #for roc function
library(stringi) #for Character String Processing Facilities
# Load the data ----
# The CSV is approximately 144 MB, so reading it takes a few seconds.
# The location defaults to the original author's path; set the CREDITCARD_CSV
# environment variable to point at the file on any other machine.
credit_path <- Sys.getenv(
  "CREDITCARD_CSV",
  unset = "C:/Users/Maddy/Desktop/Kaggle/Creditcard/creditcard.csv"
)
# Fail early with an actionable message rather than read.csv()'s generic
# "cannot open file" error.
if (!file.exists(credit_path)) {
  stop(
    "Credit card data not found at '", credit_path, "'. ",
    "Set the CREDITCARD_CSV environment variable to the CSV location.",
    call. = FALSE
  )
}
credit <- read.csv(credit_path)
# Inspect the structure of the data (column names, types, first values).
str(credit)
#'data.frame': 284807 obs. of 31 variables:
# Time : num 0 0 1 1 2 2 4 7 7 9 ...
# V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
# V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
# V3 : num 2.536 0.166 1.773 1.793 1.549 .
# V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
# V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
# V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
# V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
# V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
# V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
# V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
# V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
# V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
# V13 : num -0.991 0.489 0.717 0.508 1.346 ...
# V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
# V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
# V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
# V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
# V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
# V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
# V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
# V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
# V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
# V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
# V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
# V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
# V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
# V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
# V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
# Amount: num 149.62 2.69 378.66 123.5 69.99 ...
# Class : int 0 0 0 0 0 0 0 0 0 0 ...
#Explanation of the variables:
#1)Time: time elapsed between the first transaction in the dataset and the current transaction
#2)V1...V28: principal components obtained through PCA (Principal Component Analysis, a
#dimensionality-reduction technique used to summarise the data in fewer dimensions)
#3)Amount: transaction amount
#4)Class: response variable (1 = fraud, 0 = legal)
#Now that the data is loaded, we should check whether it is balanced or not
# Class balance: proportion of legal (0) vs fraud (1) transactions.
prop.table(table(credit$Class))
# 0 1
#0.998272514 0.001727486
# The output shows the data is heavily imbalanced (about 0.17% fraud).
# To rebalance it we use ubSMOTE() (Synthetic Minority Over-sampling
# TEchnique) from the unbalanced package. Run ?ubSMOTE in an interactive
# session for its documentation; the help lookup is kept out of the script so
# that non-interactive runs are not cluttered with help output.

# SMOTE draws random samples, so fix the seed to make this step (and the
# random steps that follow it in the script) reproducible.
set.seed(2013)

# NOTE(review): resampling is done on the full data set, before any train/test
# split, so synthetic fraud cases can be derived from rows that later land in
# a test set. Consider splitting first and applying SMOTE to the training part
# only.
balance <- ubSMOTE(
  X = credit[, names(credit) != "Class"], # every column except the response
  Y = as.factor(credit$Class),
  perc.over = 200, perc.under = 800, verbose = TRUE
)
balancedf <- cbind(balance$X, Class = balance$Y)
prop.table(table(balancedf$Class))
# 0 1
#0.8421053 0.1578947
# balancedf is now far less imbalanced (about 84% legal / 16% fraud), although
# it is still not an even split.
####################################################
#2.Visualization
####################################################
#We can plot the data and get more clarity
# Write class-conditional density plots of every feature to a PDF, four
# features per page (2 x 2 panels).
#
# The pages are built from the feature names rather than from arithmetic on a
# column index: the previous index-based loop requested columns 31 and 32 on
# its last pass (the Class column and a non-existent column), which stopped
# the script with "undefined columns selected" and left the device open.
feature_names <- setdiff(names(balancedf), "Class")
feature_pages <- split(feature_names, ceiling(seq_along(feature_names) / 4))

# The device writes PDF content, so the file gets a .pdf extension.
pdf("credit.pdf")
tryCatch(
  {
    for (page_cols in feature_pages) {
      # Same panel order as before: the 3rd/4th feature of the page first,
      # then the 1st/2nd. Also works for a final page with fewer than four.
      panel_cols <- c(tail(page_cols, -2), head(page_cols, 2))
      # lattice objects are only drawn inside a loop when printed explicitly.
      print(
        featurePlot(
          x = balancedf[, panel_cols, drop = FALSE],
          y = balancedf$Class, plot = "density",
          adjust = 1.5, pch = "|", layout = c(2, 2), auto.key = TRUE
        )
      )
    }
  },
  # Always close the device, even if a plot fails, so the file is flushed and
  # later plots do not end up in it.
  finally = dev.off()
)
#The density plots written above are used for the comparison below.
#As we can see from the plots, columns V15, V20, V22, V24, V25 and V26 have little impact on our target.
#Hence, we will exclude these columns and split the data 80%/20% into training and testing sets.
#3.Pre-processing the data
# Drop the six components judged to have little impact on the target, then
# randomly assign each remaining row to the training (~80%) or test (~20%)
# set. Columns are removed by name so the selection stays correct even if the
# column order of balancedf changes.
low_impact <- c("V15", "V20", "V22", "V24", "V25", "V26")
newdata <- balancedf[, !(names(balancedf) %in% low_impact)]

# One draw per row: 1 = training, 2 = test. The vector is not called `sample`
# so that it does not mask the name of the base function it is created with.
split_id <- sample(2, nrow(newdata), replace = TRUE, prob = c(0.8, 0.2))
train <- newdata[split_id == 1, ]
test <- newdata[split_id == 2, ]
#4.Modelling
#Applying the randomforest algorithm to our balanced data
# Fit a random forest classifier (300 trees) for Class on the training split.
# Run ?randomForest in an interactive session for the full argument list; the
# help lookup is kept out of the script itself.
balancedf.rf <- randomForest(
  Class ~ .,
  data = train,
  ntree = 300,
  importance = TRUE, # compute variable importance, used further below
  do.trace = TRUE    # verbose progress output while the forest is grown
)
# Confusion matrix stored on the fitted forest.
balancedf.rf$confusion
# 0 1 class.error
#0 6204 10 0.001609269
#1 117 1086 0.097256858
# Error rates as a function of the number of trees.
plot(balancedf.rf)
#
#Variable importance plot is also a useful tool and can be plotted using varImpPlot function.
# Top 5 variables are selected and plotted based on Model Accuracy and Gini value.
# Plot the five most important variables of the fitted forest.
# TRUE is spelled out: the shorthand T is an ordinary variable that can be
# reassigned, which would silently change these arguments.
varImpPlot(
  balancedf.rf,
  sort = TRUE,
  main = "Variable Importance",
  n.var = 5
)
# type = 2 selects the node-impurity measure (the MeanDecreaseGini column
# that is sorted on below).
var.imp <- data.frame(importance(balancedf.rf, type = 2))
var.imp$Variables <- row.names(var.imp)
# Rank the variables from most to least important and show the top rows.
Imp <- var.imp[order(var.imp$MeanDecreaseGini, decreasing = TRUE), ]
head(Imp)
# MeanDecreaseGini Variables
#V14 396.8361 V14
#V10 266.3485 V10
#V17 210.3685 V17
#V11 208.0458 V11
#V12 195.9173 V12
#V3 106.6907 V3
#Based on Random Forest variable importance, the variables could be selected for any other predictive modelling
#techniques or machine learning.
# Score the held-out test split with the random forest, then tabulate the
# predictions against the true labels, treating fraud ("1") as the positive
# class.
v_pred <- predict(object = balancedf.rf, newdata = test)
confusionMatrix(
  data = v_pred,
  reference = test$Class,
  positive = "1"
)
#Confusion Matrix and Statistics
#Reference
#Prediction 0 1
#0 1656 23
#1 2 250
#Accuracy : 0.9871
#95% CI : (0.9809, 0.9916)
#No Information Rate : 0.8586
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.9449
#Mcnemar's Test P-Value : 6.334e-05
# Sensitivity : 0.9158
# Specificity : 0.9988
# Pos Pred Value : 0.9921
# Neg Pred Value : 0.9863
# Prevalence : 0.1414
# Detection Rate : 0.1295
# Detection Prevalence : 0.1305
# Balanced Accuracy : 0.9573
# 'Positive' Class : 1
# Decision tree (rpart) ----
# Split BEFORE fitting: 80% of the rows (stratified on Class) train the tree
# and the remaining 20% are held out for evaluation. Fitting on all of
# balancedf and then scoring a subset of it, as was done previously, evaluates
# the tree on rows it was trained on and overstates its performance.
traindata <- createDataPartition(balancedf$Class, p = 0.8, list = FALSE)
train80 <- balancedf[traindata, ]
train20 <- balancedf[-traindata, ] # despite the name, this is the held-out 20%

# Shallow tree (depth <= 4) so the fitted rules stay readable when plotted.
ctrl <- rpart.control(maxdepth = 4)
balancedf.rp <- rpart(
  Class ~ .,
  data = train80,
  control = ctrl
)
fancyRpartPlot(balancedf.rp)

# Predict class labels for the held-out rows; the response column is dropped
# by name rather than by position.
class_pred <- predict(
  balancedf.rp,
  train20[, names(train20) != "Class"],
  type = "class"
)
# Precision / recall / F1 with fraud ("1") as the positive class.
confusionMatrix(
  class_pred, train20$Class,
  positive = "1", dnn = c("predictions", "actual"),
  mode = "prec_recall"
)
#Confusion Matrix and Statistics
#actual
#predictions 0 1
#0 1567 42
#1 7 253
#Accuracy : 0.9738
#95% CI : (0.9655, 0.9805)
#No Information Rate : 0.8422
#P-Value [Acc > NIR] : < 2.2e-16
#Kappa : 0.8964
#Mcnemar's Test P-Value : 1.191e-06
# Precision : 0.9731
# Recall : 0.8576
# F1 : 0.9117
# Prevalence : 0.1578
# Detection Rate : 0.1354
#Detection Prevalence : 0.1391
# Balanced Accuracy : 0.9266
# 'Positive' Class : 1
###AUC graph
# Class probabilities from the rpart tree for the rows in train20 (response
# column dropped by name).
prob_predictions <- predict(
  balancedf.rp,
  train20[, names(train20) != "Class"],
  type = "prob"
)
# ROC curve built from the predicted probability of the fraud class, i.e. the
# second probability column.
df_roc <- roc(train20$Class, prob_predictions[, 2])
# Readable title: a separator and a rounded AUC instead of the label glued
# directly onto the full-precision number.
plot(df_roc, main = paste0("AUC = ", round(as.numeric(df_roc$auc), 4)))