In [46]:
# Preprocessing Libraries
import pandas as pd
import numpy as np
from datetime import datetime, date

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Read the training and test transaction CSVs from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('fraudTrain.csv', 'fraudTest.csv'))
# Keep both frames in one list so later cells can transform them together.
combine = [train_df, test_df]

In [3]:
# Column names, non-null counts and dtypes of the raw training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [4]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [5]:
def age(born, today=None):
    """Return the age in whole years for a date-of-birth string.

    Parameters
    ----------
    born : str
        Date of birth formatted as ``YYYY-MM-DD``.
    today : datetime.date, optional
        Reference date the age is measured at. Defaults to the current
        date; pass a fixed date to make the derived feature reproducible
        regardless of when the notebook is executed.

    Returns
    -------
    int
        Number of completed years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``%Y-%m-%d`` format.
    """
    born = datetime.strptime(born, "%Y-%m-%d").date()
    if today is None:
        today = date.today()
    # Subtract one year when the birthday has not yet occurred in the
    # reference year (the tuple comparison yields a bool, i.e. 0 or 1).
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
  
# Derive an integer 'age' column from the date-of-birth string in both frames.
for frame in (train_df, test_df):
    frame['age'] = frame['dob'].apply(age)

# The raw 'dob' string is no longer needed once 'age' exists.
train_df, test_df = (frame.drop(columns='dob') for frame in (train_df, test_df))

# drop() returned new frames, so the list has to be rebuilt to point at them.
combine = [train_df, test_df]

In [6]:
# List the column names of the training frame at this point.
print(train_df.columns.values)

['Unnamed: 0' 'trans_date_trans_time' 'cc_num' 'merchant' 'category' 'amt'
 'first' 'last' 'gender' 'street' 'city' 'state' 'zip' 'lat' 'long'
 'city_pop' 'job' 'trans_num' 'unix_time' 'merch_lat' 'merch_long'
 'is_fraud' 'age']


In [7]:
# Mean of is_fraud (the fraud rate) for each age, highest first.
(
    train_df
    .groupby('age', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,age,is_fraud
78,97,1.000000
66,85,0.030047
71,90,0.014511
46,65,0.013756
72,91,0.013494
...,...,...
30,49,0.003338
26,45,0.003332
28,47,0.003246
20,39,0.003188


In [8]:
# Mean of is_fraud (the fraud rate) for each gender value, highest first.
(
    train_df
    .groupby('gender', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,gender,is_fraud
1,M,0.006426
0,F,0.005262


In [9]:
# Encode gender as an integer: 'F' -> 1, 'M' -> 0.
# Any other value would map to NaN and make the int64 cast raise.
gender_codes = {'F': 1, 'M': 0}
for frame in combine:
    frame['gender'] = frame['gender'].map(gender_codes).astype(np.int64)

In [10]:
# Fraud rate per gender code, highest first (sanity check of the encoding).
(
    train_df
    .groupby('gender', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)


Unnamed: 0,gender,is_fraud
0,0,0.006426
1,1,0.005262


In [11]:
# Parse the transaction timestamp strings into datetime64 values.
train_df['trans_date_trans_time'] = pd.to_datetime(train_df['trans_date_trans_time'])
test_df['trans_date_trans_time'] = pd.to_datetime(test_df['trans_date_trans_time'])

# Hour of day (0-23) taken straight from the vectorised .dt accessor.
# This yields integers directly instead of formatting every timestamp to a
# two-character string that has to be cast back to a number afterwards.
train_df['hour'] = train_df['trans_date_trans_time'].dt.hour
test_df['hour'] = test_df['trans_date_trans_time'].dt.hour
combine = [train_df, test_df]

In [12]:
# Day of week as a one-character string; strftime's %w counts from Sunday = 0.
for frame in (train_df, test_df):
    frame['weekday'] = frame['trans_date_trans_time'].dt.strftime('%w')
combine = [train_df, test_df]

In [13]:
# Fraud rate for each weekday value, highest first.
(
    train_df
    .groupby('weekday', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,weekday,is_fraud
5,5,0.007086
4,4,0.006844
3,3,0.006554
6,6,0.006106
2,2,0.005835
0,0,0.004853
1,1,0.004648


In [14]:
# Store the weekday column as 64-bit integers in both frames.
for frame in (train_df, test_df):
    frame['weekday'] = frame['weekday'].to_numpy().astype(np.int64)

In [15]:
# Fraud rate for each hour of the day, highest first.
(
    train_df
    .groupby('hour', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,hour,is_fraud
22,22,0.028829
23,23,0.028374
1,1,0.015349
0,0,0.01494
2,2,0.014652
3,3,0.014239
5,5,0.001423
7,7,0.001327
14,14,0.001325
19,19,0.001236


In [16]:
# Store the hour column as 64-bit integers in both frames.
for frame in (train_df, test_df):
    frame['hour'] = frame['hour'].to_numpy().astype(np.int64)

In [17]:
# Fraud rate for each merchant, highest first.
(
    train_df
    .groupby('merchant', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,merchant,is_fraud
337,fraud_Kozey-Boehm,0.025723
245,"fraud_Herman, Treutel and Dickens",0.025385
304,fraud_Kerluke-Abshire,0.022307
79,fraud_Brown PLC,0.022109
200,fraud_Goyette Inc,0.021616
...,...,...
465,fraud_Osinski Inc,0.000000
261,"fraud_Hodkiewicz, Prohaska and Paucek",0.000000
568,"fraud_Schroeder, Wolff and Hermiston",0.000000
211,fraud_Gulgowski LLC,0.000000


In [18]:
# Fraud rate for each state, highest first.
(
    train_df
    .groupby('state', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,state,is_fraud
8,DE,1.0
39,RI,0.027273
0,AK,0.016981
33,NV,0.008382
5,CO,0.008141
37,OR,0.008012
42,TN,0.007975
29,NE,0.007448
21,ME,0.00721
30,NH,0.007127


In [20]:
# Distinct state codes, in order of first appearance in the training data.
train_df['state'].unique()

array(['NC', 'WA', 'ID', 'MT', 'VA', 'PA', 'KS', 'TN', 'IA', 'WV', 'FL',
       'CA', 'NM', 'NJ', 'OK', 'IN', 'MA', 'TX', 'WI', 'MI', 'WY', 'HI',
       'NE', 'OR', 'LA', 'DC', 'KY', 'NY', 'MS', 'UT', 'AL', 'AR', 'MD',
       'GA', 'ME', 'AZ', 'MN', 'OH', 'CO', 'VT', 'MO', 'SC', 'NV', 'IL',
       'NH', 'SD', 'AK', 'ND', 'CT', 'RI', 'DE'], dtype=object)

In [21]:
# State abbreviations in the order they will be numbered (position == code).
state_order = [
    'NC', 'WA', 'ID', 'MT', 'VA', 'PA', 'KS', 'TN', 'IA', 'WV', 'FL',
    'CA', 'NM', 'NJ', 'OK', 'IN', 'MA', 'TX', 'WI', 'MI', 'WY', 'HI',
    'NE', 'OR', 'LA', 'DC', 'KY', 'NY', 'MS', 'UT', 'AL', 'AR', 'MD',
    'GA', 'ME', 'AZ', 'MN', 'OH', 'CO', 'VT', 'MO', 'SC', 'NV', 'IL',
    'NH', 'SD', 'AK', 'ND', 'CT', 'RI', 'DE',
]
state_codes = {abbr: code for code, abbr in enumerate(state_order)}

# Replace each abbreviation with its integer code; values missing from the
# table become NaN.
for frame in combine:
    frame['state'] = frame['state'].map(state_codes)

In [22]:
# Store the state codes as 64-bit integers (raises if any code is NaN).
for frame in (train_df, test_df):
    frame['state'] = frame['state'].to_numpy().astype(np.int64)

In [23]:
# Fraud rate for each spending category, highest first.
(
    train_df
    .groupby('category', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,category,is_fraud
11,shopping_net,0.017561
8,misc_net,0.014458
4,grocery_pos,0.014098
12,shopping_pos,0.007225
2,gas_transport,0.004694
9,misc_pos,0.003139
3,grocery_net,0.002948
13,travel,0.002864
0,entertainment,0.002478
10,personal_care,0.002424


In [24]:
# Distinct category labels, in order of first appearance in the training data.
train_df['category'].unique()

array(['misc_net', 'grocery_pos', 'entertainment', 'gas_transport',
       'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos',
       'food_dining', 'personal_care', 'health_fitness', 'travel',
       'kids_pets', 'home'], dtype=object)

In [25]:
# Category labels in the order they will be numbered (position == code).
category_order = [
    'misc_net', 'grocery_pos', 'entertainment', 'gas_transport',
    'misc_pos', 'grocery_net', 'shopping_net', 'shopping_pos',
    'food_dining', 'personal_care', 'health_fitness', 'travel',
    'kids_pets', 'home',
]
category_codes = {label: code for code, label in enumerate(category_order)}

# Replace each label with its integer code; labels missing from the table
# become NaN.
for frame in combine:
    frame['category'] = frame['category'].map(category_codes)

In [26]:
# Store the category codes as 64-bit integers (raises if any code is NaN).
for frame in (train_df, test_df):
    frame['category'] = frame['category'].to_numpy().astype(np.int64)

In [27]:
# Fraud rate per category code, highest first (sanity check of the encoding).
(
    train_df
    .groupby('category', as_index=False)['is_fraud']
    .mean()
    .sort_values('is_fraud', ascending=False)
)

Unnamed: 0,category,is_fraud
6,6,0.017561
0,0,0.014458
1,1,0.014098
7,7,0.007225
3,3,0.004694
4,4,0.003139
5,5,0.002948
11,11,0.002864
2,2,0.002478
9,9,0.002424


In [28]:
# Columns that are not used as model inputs in the cells below.
unused_columns = [
    'trans_date_trans_time', 'merchant', 'cc_num', 'amt',
    'first', 'last', 'street', 'city', 'zip',
    'lat', 'long', 'city_pop', 'job', 'trans_num',
    'unix_time', 'merch_lat', 'merch_long',
]

# Remove them from each frame in a single drop() call.
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

# drop() returned new frames, so rebuild the list to point at them.
combine = [train_df, test_df]


In [29]:
# Confirm which columns remain and that all of them are integer-typed.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 8 columns):
 #   Column      Non-Null Count    Dtype
---  ------      --------------    -----
 0   Unnamed: 0  1296675 non-null  int64
 1   category    1296675 non-null  int64
 2   gender      1296675 non-null  int64
 3   state       1296675 non-null  int64
 4   is_fraud    1296675 non-null  int64
 5   age         1296675 non-null  int64
 6   hour        1296675 non-null  int64
 7   weekday     1296675 non-null  int64
dtypes: int64(8)
memory usage: 79.1 MB


In [30]:
# Summary statistics of the remaining (now all-numeric) columns.
train_df.describe()

Unnamed: 0.1,Unnamed: 0,category,gender,state,is_fraud,age,hour,weekday
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,6.527678,0.5474487,21.87392,0.005788652,48.95281,12.80486,2.717872
std,374318.0,4.10492,0.4977437,12.9811,0.07586269,17.38734,6.817824,2.132074
min,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0
25%,324168.5,3.0,0.0,11.0,0.0,36.0,7.0,1.0
50%,648337.0,7.0,1.0,20.0,0.0,47.0,14.0,2.0
75%,972505.5,10.0,1.0,32.0,0.0,60.0,19.0,5.0
max,1296674.0,13.0,1.0,50.0,1.0,98.0,23.0,6.0


In [31]:
# 'Unnamed: 0' looks like a saved row counter (0..N-1 in describe()); it is
# removed so it is not fed to the models.
train_df = train_df.drop(columns=['Unnamed: 0'])

In [32]:
# Keep the held-out labels before removing them from the feature frame, so
# predictions on the test set can still be compared against the truth.
Y_test = test_df['is_fraud'].copy()
test_df = test_df.drop(columns='is_fraud')


combine = [train_df, test_df]

In [33]:
# Target vector and feature matrix for training.
Y_train = train_df["is_fraud"]
X_train = train_df.drop(columns="is_fraud")
# Test features: everything in test_df except the 'Unnamed: 0' column.
X_test = test_df.drop(columns="Unnamed: 0").copy()
# Show the shapes so the train/test feature counts can be compared.
X_train.shape, Y_train.shape, X_test.shape

((1296675, 6), (1296675,), (555719, 6))

In [34]:
# Display the prepared test frame (pandas elides the middle rows).
test_df

Unnamed: 0.1,Unnamed: 0,category,gender,state,age,hour,weekday
0,0,9,0,41,55,12,0
1,1,9,1,29,33,12,0
2,2,10,1,27,52,12,0
3,3,4,0,10,35,12,0
4,4,11,0,19,67,12,0
...,...,...,...,...,...,...,...
555714,555714,10,0,40,57,23,4
555715,555715,12,0,17,23,23,4
555716,555716,12,1,1,41,23,4
555717,555717,11,0,2,57,23,4


In [35]:
# Logistic Regression

# fit() returns the estimator itself, so construction and training chain.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
# NOTE: this is accuracy on the rows the model was trained on. describe()
# reports a mean is_fraud of about 0.0058, so always predicting "not fraud"
# would also reach roughly 99.42%; accuracy alone says little here.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

99.42

In [37]:
# Pair every fitted coefficient with the feature it belongs to. The names
# come from X_train (the frame the model was fitted on), so they stay
# aligned with logreg.coef_ even if the column order of train_df changes.
coeff_df = pd.DataFrame({'Feature': list(X_train.columns)})
# The values are logistic-regression coefficients; the column keeps its
# historical "Correlation" label.
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
4,hour,0.098484
5,weekday,0.058183
3,age,0.016817
2,state,0.002328
0,category,-0.219318
1,gender,-0.22352


In [43]:
# Gaussian Naive Bayes

# fit() returns the estimator itself, so construction and training chain.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
# Accuracy on the training data itself, expressed as a percentage.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian

99.42

In [44]:
# Perceptron

# fit() returns the estimator itself, so construction and training chain.
perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
# Accuracy on the training data itself, expressed as a percentage.
acc_perceptron = round(100 * perceptron.score(X_train, Y_train), 2)
acc_perceptron

99.42

In [45]:
# Stochastic Gradient Descent

# SGDClassifier shuffles the training data between epochs; fixing
# random_state makes the fitted model and its score repeatable across runs.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy on the training data itself, expressed as a percentage.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

99.42

In [41]:
# K Nearest Neighbor

# Classify by majority vote among the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
# Accuracy on the training data itself, expressed as a percentage.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

99.63

In [38]:
# Decision Tree

# The tree permutes the candidate features at every split, so ties between
# equally good splits can resolve differently from run to run; fixing
# random_state makes the fitted tree repeatable.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy on the training data itself, expressed as a percentage.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

99.79

In [39]:
# Random Forest

# random_state pins the bootstrap samples and per-split feature draws so the
# 40-tree forest, and therefore its score, is reproducible.
random_forest = RandomForestClassifier(n_estimators=40, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Score the training set a single time: every score() call re-predicts all
# training rows, so a second, discarded call only doubles the cost.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

99.79