In [None]:

# Mount the user's Google Drive into the Colab filesystem so that files stored
# on Drive can be read through ordinary paths under /content/drive.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

import os

# Location of the training CSV on the mounted Drive; edit this to match where
# the file lives in your own Google Drive.
file_path = '/content/drive/My Drive/Resources/fraudTrain.csv'

# Fail fast with an explicit error when the CSV is missing, instead of
# surfacing the problem later inside the loading cell.
csv_is_present = os.path.exists(file_path)
if not csv_is_present:
    raise FileNotFoundError(f"File not found: {file_path}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [None]:
# Load the raw transactions from the path held in `file_path` (the same path
# whose existence is checked earlier in the notebook). Reusing the variable
# keeps the existence check and the actual load pointing at one file; a
# duplicated literal here would silently diverge when the path is edited.
df = pd.read_csv(file_path)

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [None]:
# Replace the text values in `category` and `gender` with integer codes.
# Each encoder is kept in its own variable so the code -> label mapping stays
# available after the column has been overwritten.
if 'category' in df.columns:
    le_category = LabelEncoder().fit(df['category'])
    df['category'] = le_category.transform(df['category'])

if 'gender' in df.columns:
    le_gender = LabelEncoder().fit(df['gender'])
    df['gender'] = le_gender.transform(df['gender'])

In [None]:
# Columns used by the model, plus the `is_fraud` ground-truth label that is
# split off again before scaling.
relevant_columns = [
    'category',
    'amt',
    'lat',
    'long',
    'gender',
    'city_pop',
    'is_fraud',
]
df_filtered = df.loc[:, relevant_columns]

In [None]:
# Separate the model inputs from the label, then rescale every input column
# with a min-max scaler.
# NOTE(review): the scaler is fit on every row of df_filtered, i.e. before any
# train/validation split and with fraud rows included — confirm this is intended.
features = df_filtered.drop(columns='is_fraud')
scaler = MinMaxScaler().fit(features)
normalized_features = scaler.transform(features)

In [None]:
# Wrap the scaled array back into a DataFrame with the original column names.
# The source index is carried over explicitly: later cells assign and select by
# index label, which would misalign rows (or introduce NaN labels) if `features`
# ever had a non-default index, e.g. after filtering rows upstream.
df_normalized = pd.DataFrame(
    normalized_features,
    columns=features.columns,
    index=features.index,
)

In [None]:
# Re-attach the ground-truth label next to the scaled features
# (values are matched to rows by index label).
df_normalized = df_normalized.assign(is_fraud=df_filtered['is_fraud'])

In [None]:
# Partition the scaled rows by label and strip the label column, leaving
# feature-only frames: legitimate transactions and fraudulent ones.
normal_data = df_normalized.loc[df_normalized['is_fraud'].eq(0)].drop(columns='is_fraud')
fraud_data = df_normalized.loc[df_normalized['is_fraud'].eq(1)].drop(columns='is_fraud')

In [None]:
# Hold out 20% of the legitimate transactions for validation; the fixed
# random_state keeps the split identical across runs.
train_data, val_data = train_test_split(
    normal_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of input features fed to the autoencoder (one per training column).
input_dim = train_data.shape[1]

# Size of the hidden (encoded) layer. It is kept strictly narrower than the
# input so the network has to compress: a hidden layer wider than the input
# (previously a fixed 10 units) has no bottleneck and can reconstruct every
# row almost equally well, which makes reconstruction error a weak fraud
# signal. Half the input width, with a floor of one unit for tiny inputs.
encoding_dim = max(1, input_dim // 2)

In [None]:
# Single-hidden-layer autoencoder graph:
#   input -> ReLU encoding of width `encoding_dim` -> sigmoid reconstruction.
# The sigmoid output is bounded to [0, 1], matching the min-max scaled inputs.
input_layer = Input(shape=(input_dim,))
encoded = Dense(units=encoding_dim, activation='relu')(input_layer)
decoded = Dense(units=input_dim, activation='sigmoid')(encoded)

In [None]:
# Assemble the end-to-end model (input -> reconstruction) and train it to
# minimise the mean squared reconstruction error with the Adam optimizer.
autoencoder = Model(input_layer, decoded)
autoencoder.compile(loss='mse', optimizer='adam')

In [None]:
# Stop training once the validation loss has not improved for a few epochs and
# roll back to the best weights seen. In the recorded run the validation loss
# bottoms out at epoch 4 and is higher at epochs 6-7, while each epoch takes
# over a minute, so a fixed 50-epoch schedule mostly wastes time.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the autoencoder to reproduce legitimate transactions only: the input
# is also the target. `epochs` is now an upper bound, not a fixed count.
# The History object is kept so the loss curves can be inspected afterwards.
history = autoencoder.fit(
    train_data,
    train_data,
    epochs=50,
    batch_size=32,
    shuffle=True,
    validation_data=(val_data, val_data),
    callbacks=[early_stopping],
)

Epoch 1/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 2ms/step - loss: 0.0116 - val_loss: 4.0877e-04
Epoch 2/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 2ms/step - loss: 4.0961e-04 - val_loss: 4.0722e-04
Epoch 3/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 3ms/step - loss: 4.0718e-04 - val_loss: 3.9145e-04
Epoch 4/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 3ms/step - loss: 3.9339e-04 - val_loss: 3.9048e-04
Epoch 5/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 2ms/step - loss: 3.9196e-04 - val_loss: 3.9086e-04
Epoch 6/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 2ms/step - loss: 3.9062e-04 - val_loss: 3.9726e-04
Epoch 7/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2ms/step - loss: 3.8967e-04 - val_loss: 3.9729e-04
Epoch 8/50
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7ba2486c3c70>

In [None]:
# Select exactly the columns the model was built on, by name. Dropping only
# `is_fraud` is not safe to re-run: once the `anomaly` column has been added to
# df_normalized, it would be passed to the model as an extra input and the
# prediction would fail on a shape mismatch.
model_inputs = df_normalized[features.columns]

# Reconstruct every transaction (legitimate and fraudulent alike).
reconstructions = autoencoder.predict(model_inputs)

# Per-row mean squared error between reconstruction and input; the result is
# indexed like df_normalized.
reconstruction_error = np.mean(np.square(reconstructions - model_inputs), axis=1)

[1m40522/40522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step


In [None]:
# Cut-off for flagging anomalies: the 95th percentile of the reconstruction
# error over all rows, so the highest-error 5% end up above it.
threshold = np.percentile(reconstruction_error, q=95)

In [None]:
# Mark a row as anomalous when its reconstruction error exceeds the cut-off.
df_normalized = df_normalized.assign(anomaly=reconstruction_error > threshold)

In [None]:
# Score the anomaly flags against the ground-truth fraud labels
# (per-class precision, recall and F1).
print(
    classification_report(
        y_true=df_normalized['is_fraud'],
        y_pred=df_normalized['anomaly'],
    )
)

              precision    recall  f1-score   support

           0       0.99      0.95      0.97   1289169
           1       0.01      0.07      0.01      7506

    accuracy                           0.95   1296675
   macro avg       0.50      0.51      0.49   1296675
weighted avg       0.99      0.95      0.97   1296675



In [None]:
# Row counts for every (true label, predicted flag) combination — effectively
# the confusion matrix in long form.
print(df_normalized.value_counts(subset=['is_fraud', 'anomaly']))

is_fraud  anomaly
0         False      1224873
          True         64296
1         False         6968
          True           538
Name: count, dtype: int64
