- This data is too large, i resized the data into 100k rows only for portfolio purpose

In [1]:
%%time    

import pandas as pd

train = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv', nrows=100000)
test = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv', nrows=100000)

CPU times: user 5.84 s, sys: 912 ms, total: 6.76 s
Wall time: 9.1 s


In [2]:
train_data = train.fillna(0)
test_data = test.fillna(0)

In [3]:
# Define a function to cap outliers based on the 1st and 99th percentiles
def cap_outliers(df, columns):
    for col in columns:
        if df[col].dtype in ['float64', 'int64']:
            lower_limit = df[col].quantile(0.01)
            upper_limit = df[col].quantile(0.99)
            df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
    return df

# Apply the function to both train_data and test_data
numeric_columns_train = train_data.select_dtypes(include=['float64', 'int64']).columns
numeric_columns_test = test_data.select_dtypes(include=['float64', 'int64']).columns

train_data = cap_outliers(train_data, numeric_columns_train)
test_data = cap_outliers(test_data, numeric_columns_test)

train_data, test_data

(       TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
 0         2987999.99        0      112153.97          68.500         W  13926   
 1         2987999.99        0      112153.97          29.000         W   2755   
 2         2987999.99        0      112153.97          59.000         W   4663   
 3         2987999.99        0      112153.97          50.000         W  18132   
 4         2987999.99        0      112153.97          50.000         H   4497   
 ...              ...      ...            ...             ...       ...    ...   
 99995     3085999.01        0     1990764.03          55.385         C   2256   
 99996     3085999.01        0     1990764.03         117.000         W   2518   
 99997     3085999.01        0     1990764.03          50.000         S   2748   
 99998     3085999.01        0     1990764.03         100.000         H  16075   
 99999     3085999.01        0     1990764.03         107.950         W   4436   
 
        card2 

In [4]:
from sklearn.preprocessing import LabelEncoder

# Apply label encoding to categorical columns in train_data
label_encoders = {}

# Convert object columns to strings in train_data and test_data
for column in train_data.select_dtypes(include=['object']).columns:
    train_data[column] = train_data[column].astype(str)

for column in test_data.select_dtypes(include=['object']).columns:
    test_data[column] = test_data[column].astype(str)

# Apply label encoding for unseen labels
label_encoders = {}
for column in train_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

for column in test_data.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        le = label_encoders[column]
        test_data[column] = test_data[column].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
    else:
        le = LabelEncoder()
        test_data[column] = le.fit_transform(test_data[column])
        label_encoders[column] = le

In [5]:
# Calculate correlations between 'isFraud' and other numeric columns
correlations = train_data.corr()['isFraud'].sort_values(ascending=False)
correlations

isFraud    1.000000
V232       0.170946
V233       0.164732
V218       0.159838
V231       0.156044
             ...   
V161            NaN
V162            NaN
V163            NaN
V269            NaN
V305            NaN
Name: isFraud, Length: 394, dtype: float64

In [6]:
feature_columns = correlations[correlations > 0].index.tolist()

In [7]:
features = train_data[feature_columns]
target = train_data['isFraud']

In [8]:
from sklearn.model_selection import train_test_split

# Perform train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, stratify=target, random_state=42)

In [9]:
%%time

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Initialize the scaler
scaler = StandardScaler()

# Define K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize arrays to store all predictions and true labels
all_predictions = []
all_true_labels = []

# Perform K-Fold cross-validation
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Standardize the features
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(X_test_fold)
    
    # Define the neural network model architecture
    def build_model(input_dim):
        model = Sequential()
        model.add(Dense(64, input_dim=input_dim, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # Build and train the model
    model = build_model(input_dim=X_train_scaled.shape[1])
    
    # Early stopping callback
    early_stop = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
    
    # Train the model
    model.fit(X_train_scaled, y_train_fold, epochs=5, batch_size=32, validation_data=(X_test_scaled, y_test_fold), callbacks=[early_stop], verbose=1)
    
    # Predict probabilities on the test set
    y_pred_proba = model.predict(X_test_scaled).flatten()
    
    # Store the predictions and true labels
    all_predictions.extend(y_pred_proba)
    all_true_labels.extend(y_test_fold.values)

# Convert to numpy arrays
y_pred_proba_all = np.array(all_predictions)
y_true_all = np.array(all_true_labels)

# Calculate the ROC AUC score
roc_auc = roc_auc_score(y_true_all, y_pred_proba_all)
roc_auc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9790 - loss: 0.0626 - val_accuracy: 0.9997 - val_loss: 6.8625e-04
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0025 - val_accuracy: 0.9994 - val_loss: 0.0023
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9846 - loss: 0.0586 - val_accuracy: 0.9997 - val_loss: 9.1093e-04
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9988 - loss: 0.0056 - val_accuracy: 1.0000 - val_loss: 2.5922e-05
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0014 - val_accuracy: 0.9995 - val_loss: 0.0012
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9822 - loss: 0.0565 - val_accuracy: 0.9992 - val_loss: 0.0022
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9988 - loss: 0.0040 - val_accuracy: 1.0000 - val_loss: 1.1327e-05
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9991 - loss: 0.0041 - val_accuracy: 1.0000 - val_loss: 3.7535e-06
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0017 - val_accuracy: 0.9999 - val_loss: 1.3382e-04
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9851 - loss: 0.0517 - val_accuracy: 0.9994 - val_loss: 0.0028
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9983 - loss: 0.0053 - val_accuracy: 1.0000 - val_loss: 1.1275e-05
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0019 - val_accuracy: 1.0000 - val_loss: 2.9748e-06
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0013 - val_accuracy: 0.9997 - val_loss: 0.0011
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9826 - loss: 0.0603 - val_accuracy: 0.9999 - val_loss: 3.9854e-04
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0026 - val_accuracy: 1.0000 - val_loss: 9.3265e-05
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9998 - loss: 0.0012 - val_accuracy: 0.9999 - val_loss: 3.3866e-04
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
CPU times: user 1min 31s, sys: 14.5 s, total: 1min 46s
Wall time: 1min 20s


0.9999999572603178

In [10]:
# Standardize the test data features 
X_test_scaled = scaler.transform(X_test)

In [11]:
# Predict probabilities on the test set
y_pred_proba_test = model.predict(X_test_scaled)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [12]:
# Calculate the percentage of transactions predicted as fraudulent
fraudulent_percentage = (y_pred_proba_test.sum() / len(y_pred_proba_test)) * 100
fraudulent_percentage

2.572697265625

- The percentage of transactions predicted as fraudulent is 2.57%

In [13]:
import pickle

# Save the trained model into a pickle file
with open('ieee_cis_fraud_detection_model.pkl', 'wb') as file:
    pickle.dump(model, file)