In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import glob

In [2]:
# Defining the directory containing the .pkl files
directory = r"C:\Users\Kruti Agrawal\Desktop\Projects\fraud_detection\fraud_detection\dataset\data"

# Loading all .pkl files and concatenating them into a single DataFrame
all_files = glob.glob(os.path.join(directory, "*.pkl"))

# Creating an empty DataFrame to hold all data
df_list = [pd.read_pickle(file) for file in all_files]
df = pd.concat(df_list, ignore_index=True)

# Checking the loaded data
print(df.head())

   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31         596        3156      57.16   
1               1 2018-04-01 00:02:10        4961        3412      81.51   
2               2 2018-04-01 00:07:56           2        1365     146.00   
3               3 2018-04-01 00:09:29        4128        8737      64.49   
4               4 2018-04-01 00:10:34         927        9906      50.99   

  TX_TIME_SECONDS TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0              31            0         0                  0  
1             130            0         0                  0  
2             476            0         0                  0  
3             569            0         0                  0  
4             634            0         0                  0  


In [3]:
# Data Preprocessing
# Converting TX_DATETIME to datetime type
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
df['day'] = df['TX_DATETIME'].dt.day
df['hour'] = df['TX_DATETIME'].dt.hour

In [4]:
# Feature engineering based on defined fraud scenarios
df['is_high_amount'] = df['TX_AMOUNT'] > 220
df['is_fraud_terminal'] = df.groupby('TERMINAL_ID')['TX_FRAUD'].transform('sum') > 0
df['is_high_spend_customer'] = df.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform(lambda x: x > x.mean() * 5)

In [5]:
# Defining features and target
features = ['TX_AMOUNT', 'is_high_amount', 'is_fraud_terminal', 'is_high_spend_customer', 'day', 'hour']
X = df[features]
y = df['TX_FRAUD']

In [6]:
# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [7]:
# Defining the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# Training the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 860us/step - accuracy: 0.9914 - loss: 0.0464 - val_accuracy: 0.9937 - val_loss: 0.0330
Epoch 2/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 836us/step - accuracy: 0.9937 - loss: 0.0336 - val_accuracy: 0.9937 - val_loss: 0.0330
Epoch 3/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 833us/step - accuracy: 0.9938 - loss: 0.0327 - val_accuracy: 0.9937 - val_loss: 0.0331
Epoch 4/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 832us/step - accuracy: 0.9938 - loss: 0.0328 - val_accuracy: 0.9937 - val_loss: 0.0331
Epoch 5/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 844us/step - accuracy: 0.9937 - loss: 0.0332 - val_accuracy: 0.9937 - val_loss: 0.0332
Epoch 6/10
[1m30698/30698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 871us/step - accuracy: 0.9937 - loss: 0.0330 - val_accuracy: 0.9937 - val

In [10]:
# Evaluating the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy}')

[1m16446/16446[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 565us/step - accuracy: 0.9939 - loss: 0.0321
Test Accuracy: 0.9938697814941406


In [11]:
model.save("fraud_detection_model.h5")

