# Pre-processing Work and Model - "Credit Card Fraud Detection"¶

#### Objective: 

## Import

In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

## Load the Cleaned Data

In [2]:
transactions = pd.read_csv('data/cleaned_transactions.csv')

In [3]:
print(transactions.columns)

Index(['credit_card', 'date', 'transaction_dollar_amount', 'Long', 'Lat',
       'day_of_week', 'hour_of_day', 'transactions_last_hour'],
      dtype='object')


## Convert 'date' and extract components

In [4]:
transactions['date'] = pd.to_datetime(transactions['date'])

transactions['year'] = transactions['date'].dt.year
transactions['month'] = transactions['date'].dt.month
transactions['day'] = transactions['date'].dt.day
transactions['hour'] = transactions['date'].dt.hour
transactions['minute'] = transactions['date'].dt.minute

transactions.drop(['date'], axis=1, inplace=True)

## Pre-processing

In [5]:
start_time = time.time()

In [6]:
transactions = pd.get_dummies(transactions)

In [7]:
scaler = StandardScaler()
features_to_scale = ['transaction_dollar_amount']
transactions[features_to_scale] = scaler.fit_transform(transactions[features_to_scale])

In [8]:
np.random.seed(42)
transactions['fraud_label'] = np.random.choice([0, 1], size=(len(transactions),), p=[0.95, 0.05])

In [9]:
X = transactions.drop('fraud_label', axis=1)
y = transactions['fraud_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
elapsed_time = time.time() - start_time
print(f"The preprocessing block took {elapsed_time} seconds to run.")

The preprocessing block took 0.08675193786621094 seconds to run.


In [11]:
print("Number of features in X_train:", X_train.shape[1])

Number of features in X_train: 18


## Model Development

In [12]:
start_time = time.time()

In [13]:
print("Number of features in X_train:", X_train.shape[1])
print("Number of features in X_test:", X_test.shape[1])

Number of features in X_train: 18
Number of features in X_test: 18


In [14]:
model = Sequential([
    Input(shape=(18,)),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [15]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 354us/step - accuracy: 0.8930 - loss: 8314212581376.0000 - val_accuracy: 0.9511 - val_loss: 2221261455360.0000
Epoch 2/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 337us/step - accuracy: 0.9046 - loss: 1087539773440.0000 - val_accuracy: 0.9511 - val_loss: 298977460224.0000
Epoch 3/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 342us/step - accuracy: 0.9038 - loss: 932244160512.0000 - val_accuracy: 0.9511 - val_loss: 1029683019776.0000
Epoch 4/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 337us/step - accuracy: 0.9048 - loss: 760718688256.0000 - val_accuracy: 0.0489 - val_loss: 24278323200.0000
Epoch 5/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 335us/step - accuracy: 0.9038 - loss: 627086000128.0000 - val_accuracy: 0.9511 - val_loss: 313999261696.0000
Epoch 6/20
[1m7365/7365[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x16c132190>

In [17]:
elapsed_time = time.time() - start_time
print(f"The model training block took {elapsed_time} seconds to run.")

The model training block took 50.93340301513672 seconds to run.


In [18]:
preds = model.predict(X_test)
preds = np.round(preds).astype(int)

[1m1842/1842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203us/step


In [19]:
print("Accuracy:", accuracy_score(y_test, preds))

Accuracy: 0.9511354764248616
