In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import tensorflow as tf

In [2]:
# Read the raw transaction records; the path is resolved relative to the
# notebook's working directory.
fraud = pd.read_csv(filepath_or_buffer='credit_card_fraud.csv')
# Preview the first 20 rows as a quick sanity check of the parsed columns.
fraud.head(n=20)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:44,"Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0
1,2019-01-01 00:00:51,Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0
2,2019-01-01 00:07:27,Kiehn Inc,grocery_pos,96.29,Grenada,CA,41.6125,-122.5258,589,Systems analyst,1945-12-21,413636e759663f264aae1819a4d4f231,41.65752,-122.230347,0
3,2019-01-01 00:09:03,Beier-Hyatt,shopping_pos,7.77,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,1967-08-30,8a6293af5ed278dea14448ded2685fea,32.863258,-106.520205,0
4,2019-01-01 00:21:32,Bruen-Yost,misc_pos,6.85,Freedom,WY,43.0172,-111.0292,471,"Education officer, museum",1967-08-02,f3c43d336e92a44fc2fb67058d5949e3,43.753735,-111.454923,0
5,2019-01-01 00:22:06,Kunze Inc,grocery_pos,90.22,Honokaa,HI,20.0827,-155.488,4878,Physiotherapist,1966-12-03,95826e3caa9e0b905294c6dae985aec1,19.560013,-156.045889,0
6,2019-01-01 00:22:18,"Nitzsche, Kessler and Wol",shopping_pos,4.02,Valentine,NE,42.8062,-100.6215,4005,Network engineer,1945-03-15,20490f3f0966ce74b4aaba8dc2c4ed52,42.47559,-101.265846,0
7,2019-01-01 00:22:36,"Kihn, Abernathy and Douglas",shopping_net,3.66,Westfir,OR,43.7575,-122.481,597,Forensic psychologist,1961-05-19,870c92b288a974a2faf1f24b05c27e33,44.278191,-121.815161,0
8,2019-01-01 00:31:51,Ledner-Pfannerstill,gas_transport,102.13,Thompson,UT,38.9999,-109.615,46,"Surveyor, minerals",1987-04-23,47238da5b40d126c8abea40a857c7809,39.807313,-109.348294,0
9,2019-01-01 00:34:10,Stracke-Lemke,grocery_pos,83.07,Conway,WA,48.34,-122.3456,85,"Research officer, political party",1984-09-01,9b7a0619dcc5c572dc134f2827ed5a6b,48.682111,-122.719904,0


In [3]:
# Summary statistics for the numeric columns only (quartiles spelled out
# explicitly). Because is_fraud is a 0/1 flag, its mean is the fraud rate.
fraud.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
count,339607.0,339607.0,339607.0,339607.0,339607.0,339607.0,339607.0
mean,70.577984,39.718991,-110.622605,107140.9,39.718853,-110.622383,0.005247
std,161.675242,5.094961,12.65137,293029.9,5.130894,12.663998,0.072248
min,1.0,20.0271,-165.6723,46.0,19.027422,-166.671575,0.0
25%,9.6,36.7154,-120.0936,471.0,36.817194,-119.823755,0.0
50%,46.46,39.6171,-111.0985,1645.0,39.586209,-111.036443,0.0
75%,83.35,41.71,-100.6215,35439.0,42.193072,-100.353096,0.0
max,28948.9,66.6933,-89.6287,2383912.0,67.510267,-88.629203,1.0


In [4]:
# Per-column dtype and non-null count, to spot missing values and the
# string (object) columns that will need encoding.
fraud.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  339607 non-null  object 
 1   merchant               339607 non-null  object 
 2   category               339607 non-null  object 
 3   amt                    339607 non-null  float64
 4   city                   339607 non-null  object 
 5   state                  339607 non-null  object 
 6   lat                    339607 non-null  float64
 7   long                   339607 non-null  float64
 8   city_pop               339607 non-null  int64  
 9   job                    339607 non-null  object 
 10  dob                    339607 non-null  object 
 11  trans_num              339607 non-null  object 
 12  merch_lat              339607 non-null  float64
 13  merch_long             339607 non-null  float64
 14  is_fraud               339607 non-null  int64  

In [5]:
# Number of distinct (non-null) values per column, i.e. the cardinality each
# categorical column would contribute if it were one-hot encoded.
fraud.nunique(axis=0, dropna=True)

trans_date_trans_time    338504
merchant                    693
category                     14
amt                       32112
city                        176
state                        13
lat                         183
long                        183
city_pop                    174
job                         163
dob                         187
trans_num                339607
merch_lat                335608
merch_long               337263
is_fraud                      2
dtype: int64

In [6]:
# True if any row is an exact copy of an earlier row (all columns compared).
# NOTE(review): trans_num is unique for every row, so a full-row comparison
# cannot find duplicates; excluding it may give a more meaningful check — confirm.
fraud.duplicated(subset=None, keep='first').any()

False

In [7]:
# Remove the transaction id and the raw timestamp string; `fraud` itself is
# left untouched because drop() returns a new frame.
# NOTE(review): presumably dropped because both are (nearly) unique per row and
# would explode under one-hot encoding — confirm with the author.
fraud2 = fraud.drop(columns=['trans_num', 'trans_date_trans_time'])
# Verify which columns remain and their dtypes.
fraud2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 13 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   merchant    339607 non-null  object 
 1   category    339607 non-null  object 
 2   amt         339607 non-null  float64
 3   city        339607 non-null  object 
 4   state       339607 non-null  object 
 5   lat         339607 non-null  float64
 6   long        339607 non-null  float64
 7   city_pop    339607 non-null  int64  
 8   job         339607 non-null  object 
 9   dob         339607 non-null  object 
 10  merch_lat   339607 non-null  float64
 11  merch_long  339607 non-null  float64
 12  is_fraud    339607 non-null  int64  
dtypes: float64(5), int64(2), object(6)
memory usage: 33.7+ MB


In [8]:
# One-hot encode the listed string columns. drop_first=True keeps k-1 indicator
# columns for a feature with k levels (the first level becomes the baseline).
# NOTE(review): 'dob' is treated as a category here, producing one indicator per
# distinct birth date; a derived numeric age may be a better fit — confirm.
fraud_dummy = pd.get_dummies(
    data=fraud2,
    columns=['merchant', 'category', 'city', 'state', 'job', 'dob'],
    drop_first=True,
)

In [15]:
# Flag every column that contains at least one missing value.
# The count must be compared against 0: with `> 1`, a column holding exactly
# one missing value would be reported as clean.
fraud_dummy.isnull().sum() > 0

amt               False
lat               False
long              False
city_pop          False
merch_lat         False
                  ...  
dob_1999-05-31    False
dob_1999-09-29    False
dob_1999-11-30    False
dob_2000-08-16    False
dob_2001-07-26    False
Length: 1247, dtype: bool