In [1]:
import numpy as np
import pandas as pd

In [18]:
# Read the pre-split training and test transaction files in one pass.
df_train, df_test = map(
    pd.read_csv,
    ("datasets/fraudTrain.csv", "datasets/fraudTest.csv"),
)

# Show (rows, columns) for each split.
df_train.shape, df_test.shape

((1296675, 23), (555719, 23))

In [19]:
# List the column labels of the training frame as loaded, before any cleanup.
df_train.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [20]:
# Remove the "Unnamed: 0" column from both splits; it is not among the model
# features selected later. NOTE(review): it looks like a row index saved into
# the CSV — confirm against the data source.
#
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df_train = df_train.drop(columns="Unnamed: 0", errors="ignore")
df_test = df_test.drop(columns="Unnamed: 0", errors="ignore")

# Display the remaining column labels.
df_train.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [21]:
def _add_time_parts(frame):
    """Parse the transaction timestamp column of `frame` in place and add
    hour / day-of-month / month columns derived from it."""
    stamps = pd.to_datetime(frame["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S")
    frame["trans_date_trans_time"] = stamps
    frame['transaction_hour'] = stamps.dt.hour
    frame['transaction_day'] = stamps.dt.day
    frame['transaction_month'] = stamps.dt.month


# Apply the identical transformation to both splits.
_add_time_parts(df_train)
_add_time_parts(df_test)

In [22]:
# Keep both splits in one list. It is not referenced again in the cells
# visible here; it may be used elsewhere in the notebook.
dataframes = [df_train, df_test]

# Preview the first rows of the training frame, including the new time columns.
df_train.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,transaction_hour,transaction_day,transaction_month
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,0,1,1
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,0,1,1
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,0,1,1
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,0,1,1
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,0,1,1


In [23]:
# Numeric columns used as model inputs: amount, population, the derived time
# parts, and the cardholder / merchant coordinates.
features = [
    'amt',
    'city_pop',
    'transaction_hour',
    'transaction_day',
    'transaction_month',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
]

# Inputs come from the feature columns; the label is the is_fraud column.
X_train, X_test = df_train[features], df_test[features]
y_train, y_test = df_train['is_fraud'], df_test['is_fraud']

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer

In [26]:
# Preprocessing objects: mean-fill for missing values, then standardisation.
imputer, scaler = SimpleImputer(strategy='mean'), StandardScaler()

# Each transformer is fitted on the training split only (left element runs
# first) and then applied, with those fitted statistics, to the test split.
X_train_imputed, X_test_imputed = imputer.fit_transform(X_train), imputer.transform(X_test)
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train_imputed), scaler.transform(X_test_imputed)

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, depth capped at 5, and a fixed seed so the
# fit is repeatable.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# NOTE(review): the model is fitted on the raw X_train frame; the
# X_train_imputed / X_train_scaled arrays built in the preprocessing cell are
# not used here. Confirm whether that is intentional.
model.fit(X_train, y_train)

In [30]:
# Predict labels for the test split, passing the same raw feature frame layout
# the model was fitted on.
y_pred = model.predict(X_test)

# Sanity check: number of predictions and the first five labels.
y_pred.shape, y_pred[:5]

((555719,), array([0, 0, 0, 0, 0]))

## Evaluation

In [31]:
from sklearn.metrics import accuracy_score

# Overall fraction of test rows predicted correctly.
# Caveat: per the confusion matrix recorded in this notebook, 553,574 of the
# 555,719 test rows are non-fraud, so predicting 0 everywhere would already
# score about 0.996. Read this number together with precision and recall.
score = accuracy_score(y_test, y_pred)
print(score)

0.9970308735170113


In [32]:
# Evaluate fraud detected
#
# Ignore the true negatives (actual 0 and predicted 0) and measure, over the
# remaining rows, the share that was predicted correctly. For 0/1 labels this
# is TP / (TP + FP + FN) for the fraud class.
#
# Work on positional NumPy arrays: indexing the y_test Series with an integer
# is a label lookup, which only matches row position while the frame keeps its
# default 0..n-1 index. Vectorised masks also avoid a Python-level loop over
# every test row.
actual = np.asarray(y_test)
predicted = np.asarray(y_pred)

# Rows where fraud was either present or predicted.
relevant = (actual != 0) | (predicted != 0)

total = int(relevant.sum())
accurate = int((actual[relevant] == predicted[relevant]).sum())

# With no fraud present and none predicted there is nothing to score; report
# 0.0 instead of raising ZeroDivisionError.
score = accurate / total if total else 0.0
print(score)

0.2819843342036554


In [33]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Confusion matrix: rows are the actual class, columns the predicted class.
cnf_report = confusion_matrix(y_test, y_pred)

# Precision and recall for the positive (fraud) class, computed the same way.
precision, recall = (
    metric(y_test, y_pred) for metric in (precision_score, recall_score)
)

cnf_report, precision, recall

(array([[553421,    153],
        [  1497,    648]]),
 0.8089887640449438,
 0.3020979020979021)