In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [2]:
# Load the transactions from "creditcard.csv" (path is relative to the
# notebook's working directory) into a DataFrame.
credit_card_data = pd.read_csv("creditcard.csv")

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# 0 -> Normal transaction 
# 1 -> Fraudulent transaction 

In [4]:
# Count how many rows fall into each Class label (0 = normal, 1 = fraudulent).
credit_card_data.Class.value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

#### The dataset is highly imbalanced: normal transactions (284,315) vastly outnumber fraudulent ones (492), so we need to balance it. First, split the data into separate legitimate and fraudulent DataFrames.

In [6]:
# Split the rows by label: Class == 0 goes to `legit`, Class == 1 goes to `fraud`.
legit = credit_card_data.loc[credit_card_data["Class"] == 0]
fraud = credit_card_data.loc[credit_card_data["Class"] == 1]

In [None]:
# Inspect the Amount column for each class

In [7]:
# Summary statistics of the transaction amount for legitimate rows.
legit["Amount"].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [8]:
# Summary statistics of the transaction amount for fraudulent rows.
fraud["Amount"].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [None]:
# Compare the mean of every feature between the two transaction classes

In [9]:
# Per-class mean of every column in the full dataset.
credit_card_data.groupby(by="Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [11]:
# Show (rows, columns) of the legit and fraud subsets side by side.
print(f"{legit.shape} {fraud.shape}")

(284315, 31) (492, 31)


### Under-sampling

####  Under-sampling means randomly selecting as many normal transactions as there are fraudulent transactions (492), so that both classes are equally represented.

In [None]:
# Build a sample dataset containing equal numbers of normal and fraudulent
# transactions

In [10]:
# Under-sample the majority class: draw as many legit rows as there are fraud
# rows. The size is derived from `fraud` instead of a hard-coded 492 so it stays
# correct if the data changes, and the seed is fixed so that a fresh
# Restart & Run All reproduces exactly the same sample (and downstream results).
legit_sample = legit.sample(n=len(fraud), random_state=42)

In [12]:
# Preview the first five rows of the randomly drawn legit sample.
legit_sample.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
59124,48705.0,-2.501014,2.056529,0.111806,0.917107,-1.291148,0.237563,-0.948522,1.473037,0.114912,...,0.240081,0.638807,-0.282669,-0.400284,0.143762,-0.189104,-0.62636,-0.242373,34.22,0
16982,28339.0,0.173288,-1.551776,2.123017,-1.128553,0.726355,0.021535,2.100135,-3.67328,2.651181,...,-0.236715,1.702132,-0.45116,-0.063151,-1.345813,-0.708136,-4.320909,-2.32189,64.99,0
276415,167076.0,-0.314308,0.953298,0.279334,-0.578192,0.864094,-0.379366,0.938413,-1.227382,0.053206,...,0.493556,-0.822526,0.040042,-0.734516,-0.713558,0.196171,-0.12699,-0.039736,9.99,0
229671,146012.0,2.073199,0.03006,-1.102317,0.370593,0.032135,-1.101662,0.261151,-0.392062,0.374541,...,-0.285408,-0.630577,0.307778,-0.078342,-0.239193,0.201145,-0.063316,-0.058889,1.98,0
60313,49270.0,-0.806933,1.200183,1.294502,-0.052346,-0.494019,-0.766606,0.187475,0.541338,-0.714582,...,-0.130608,-0.521359,0.052394,0.493623,-0.255024,0.046297,0.131719,0.048093,1.98,0


In [None]:
# Concatenate the two DataFrames

In [14]:
# Stack the sampled legit rows on top of all fraud rows.
# pd.concat appends row-wise by default (axis=0); axis=1 would join column-wise.
dataset = pd.concat([legit_sample, fraud])

In [17]:
# (rows, columns) of the combined frame built from legit_sample and fraud.
dataset.shape

(984, 31)

In [19]:
# Compare the per-class means on the new dataset. The legit (Class 0) means stay
# close to those of the full dataset, which suggests the random sample is
# representative; the fraud (Class 1) means are identical because every fraud row is kept.

In [18]:
# Per-class mean of every column in the balanced dataset.
dataset.groupby(by="Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95387.278455,0.05997,-0.081982,-0.004384,-0.038118,0.100274,-0.165602,0.09651,-0.059909,0.031463,...,0.019672,-0.024321,0.052261,0.033725,0.016341,0.007003,0.010372,-0.022329,0.001163,87.964228
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321
