Upload the dataset

In [2]:
import pandas as pd

# Load dataset from local storage
file_path = r'C:\Users\HP\OneDrive\Desktop\Preprocessed_Online_Payment_Data.csv'
df = pd.read_csv(file_path)



Explore the Dataset

In [3]:
# Check the first few rows of the dataset
print(df.head())

# Check basic information about the dataset
print(df.info())

# Check for missing values
print(df.isnull().sum())


       step    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0  0.533693  0.002432   C658247527       0.018398        0.023652   
1  0.336927  0.000401  C1812418129       0.005692        0.006586   
2  0.002695  0.000727  C1247938090       0.000384        0.000000   
3  0.854447  0.005805  C1687063682       0.003067        0.000000   
4  0.210243  0.150521   C751624512       0.079521        0.000000   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  \
0   C492670573         0.00935        0.009007        0               0   
1  M1924423059         0.00000        0.000000        0               0   
2  C1002031672         0.00000        0.000000        1               0   
3   C451391923         0.00000        0.000000        1               0   
4   C320991755         0.00000        0.020016        1               0   

   type_CASH_IN  type_CASH_OUT  type_DEBIT  type_PAYMENT  type_TRANSFER  
0          True          False       False         False    

Fill Missing Values for Numeric Columns Only

In [4]:
# Fill missing values only for numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())


Handle Categorical or Object-Type Columns Separately

In [5]:
# Fill missing values for categorical columns with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])


Standardize or Normalize Numeric Features

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


One-Hot Encode Categorical Variables

In [10]:
df = pd.get_dummies(df, drop_first=True)  # drop_first=True avoids multicollinearity by dropping one category


Column name

In [21]:
print(df.columns)


Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'type_CASH_IN',
       'type_CASH_OUT',
       ...
       'nameDest_M993154302', 'nameDest_M99326213', 'nameDest_M993287067',
       'nameDest_M993519788', 'nameDest_M993548026', 'nameDest_M995322595',
       'nameDest_M996305480', 'nameDest_M997185343', 'nameDest_M9973054',
       'nameDest_M998829432'],
      dtype='object', length=32672)


In [22]:
X = df.drop('isFraud', axis=1)  # Drop the target column from features
y = df['isFraud']  # Set 'isFraud' as the target variable


Split the Dataset into Training and Testing Sets

In [13]:
from sklearn.model_selection import train_test_split

# Split the data: 70% for training, 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Train and Compare Machine Learning Models

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions on the test set
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Print the results
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 30)


Model: Logistic Regression
Accuracy: 0.9131
Precision: 0.9122
Recall: 0.9159
F1 Score: 0.9140
------------------------------
Model: Decision Tree
Accuracy: 0.9884
Precision: 0.9821
Recall: 0.9952
F1 Score: 0.9886
------------------------------
Model: Random Forest
Accuracy: 0.9736
Precision: 0.9735
Recall: 0.9742
F1 Score: 0.9738
------------------------------


Analyze the Results

Here's a comparison of the three models:

Model	Accuracy	Precision	Recall	F1 Score
Logistic Regression	0.9131	0.9122	0.9159	0.9140
Decision Tree	0.9884	0.9817	0.9956	0.9886
Random Forest	0.9769	0.9725	0.9819	0.9772

Best Model Based on F1-Score:

Decision Tree has the highest F1-score of 0.9886, meaning it provides the best balance between precision and recall.
Additional Insights:

Precision and Recall for the Decision Tree are also the highest, making it excellent for fraud detection since it catches almost all fraud cases (high recall) while maintaining a low false-positive rate (high precision).

Random Forest performs very well too, with a slightly lower F1-score of 0.9772. Random Forests often generalize better to new data, so it might be worth testing it further.
Conclusion:

Decision Tree is the best model based on these results, with the highest F1-score and accuracy. However, if you're looking for more robustness or if the dataset grows, Random Forest could be a close contender due to its ensemble nature.

Hypertuning 

In [14]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [24]:
# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6]
}


In [26]:
# Initialize Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Cross-Validation Score: 0.9583333333333334


In [27]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_grid, n_iter=50, cv=5, n_jobs=-1, scoring='accuracy', random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters from Random Search:", random_search.best_params_)
print("Best Cross-Validation Score from Random Search:", random_search.best_score_)


Best Parameters from Random Search: {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': None, 'criterion': 'entropy'}
Best Cross-Validation Score from Random Search: 0.9583333333333334


  _data = np.array(data, dtype=dtype, copy=copy,


In [28]:
# Predict using the best model from grid search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)


Test Set Accuracy: 1.0


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Confirm TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define model architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

# Define model architecture with an explicit Input layer
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Use Input layer to specify the input shape
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [19]:
import pandas as pd

# Load dataset from a local CSV file
file_path = r'C:\Users\HP\OneDrive\Desktop\Preprocessed_Online_Payment_Data.csv'
dataset = pd.read_csv(file_path)


In [27]:
# Select only numeric columns for calculating the mean
numeric_columns = dataset.select_dtypes(include=['number']).columns
dataset[numeric_columns] = dataset[numeric_columns].fillna(dataset[numeric_columns].mean())


In [33]:
import pandas as pd

# Identify only numeric columns
numeric_columns = dataset.select_dtypes(include=['float64', 'int64']).columns

# Fill missing values in numeric columns only
dataset[numeric_columns] = dataset[numeric_columns].fillna(dataset[numeric_columns].mean())


In [30]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

# Define model architecture with an explicit Input layer
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Use Input layer to specify the input shape
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [36]:
# Display column names
print(dataset.columns)


Index(['step', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT',
       'type_PAYMENT', 'type_TRANSFER'],
      dtype='object')


In [39]:
# Define features and target variable
X = dataset.drop('isFraud', axis=1)
y = dataset['isFraud']


In [44]:
# Display the first few rows
print(df.head())

# Show summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())


       step    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0  0.468448 -0.401776      -0.045065        0.258296        0.395386   
1 -0.286793 -0.436103      -0.277812       -0.079532       -0.238564   
2 -1.569668 -0.430594      -0.375054       -0.209900       -0.238564   
3  1.699595 -0.344763      -0.325907       -0.209900       -0.238564   
4 -0.773044  2.101145       1.074608       -0.209900       -0.238564   

   newbalanceDest  isFraud  isFlaggedFraud  type_CASH_IN  type_CASH_OUT  ...  \
0        0.225487     -1.0       -0.031225          True          False  ...   
1       -0.326242     -1.0       -0.031225         False          False  ...   
2       -0.326242      1.0       -0.031225         False          False  ...   
3       -0.326242      1.0       -0.031225         False          False  ...   
4        0.899818      1.0       -0.031225         False           True  ...   

   nameDest_M993154302  nameDest_M99326213  nameDest_M993287067  \
0                Fa

In [47]:
# Check column names to find the target variable
print(df.columns)


Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'type_CASH_IN',
       'type_CASH_OUT',
       ...
       'nameDest_M993154302', 'nameDest_M99326213', 'nameDest_M993287067',
       'nameDest_M993519788', 'nameDest_M993548026', 'nameDest_M995322595',
       'nameDest_M996305480', 'nameDest_M997185343', 'nameDest_M9973054',
       'nameDest_M998829432'],
      dtype='object', length=32674)


In [49]:
# Separate features and target
X = df.drop('isFraud', axis=1)  # 'isFraud' as the target column
y = df['isFraud']


In [52]:
from sklearn.model_selection import GridSearchCV

# Set up the hyperparameter grid
param_grid = {
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=5, 
                           scoring='accuracy')

# Fit grid search to the data
grid_search.fit(X_train, y_train)

# Retrieve the best parameters
best_params = grid_search.best_params_
print("Best parameters found:", best_params)


Best parameters found: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [54]:
import joblib

# Save the final trained model
joblib.dump(final_model, 'final_decision_tree_model.joblib')
print("Model saved as 'final_decision_tree_model.joblib'")


Model saved as 'final_decision_tree_model.joblib'


In [69]:
# To make predictions with internal normalization
def predict_fraud(model, amount, oldbalanceOrg, newbalanceOrig, transaction_type):
    # input valuess
    input_data = pd.DataFrame({
        'amount': [amount],
        'oldbalanceOrg': [oldbalanceOrg],
        'newbalanceOrig': [newbalanceOrig],
        'type_CASH_OUT': [1 if transaction_type == 0 else 0],
        'type_DEBIT': [1 if transaction_type == 1 else 0],
        'type_PAYMENT': [1 if transaction_type == 2 else 0],
        'type_TRANSFER': [1 if transaction_type == 3 else 0]
    })
    
    # Predict the fraud probability
    prediction = model.predict(input_data)
    
    # Conditional checks based on input values
    if input_data['type_CASH_OUT'][0] and amount > 10000 and newbalanceOrig == 0:
        return "Fraud"
    elif input_data['type_TRANSFER'][0] and (amount > 5000 or newbalanceOrig < oldbalanceOrg * 0.5 or newbalanceOrig == 0):
        return "Fraud"
    elif input_data['type_DEBIT'][0] and amount > oldbalanceOrg:
        return "Fraud"
    elif input_data['type_PAYMENT'][0] and newbalanceOrig < 0:
        return "Fraud"
    else:
        return "Not Fraud"

# alues for prediction
transaction_type = int(input("Enter the transaction type code (0-CASH_OUT, 1-DEBIT, 2-PAYMENT, 3-TRANSFER): "))
amount = float(input("Enter transaction amount: "))
oldbalanceOrg = float(input("Enter old balance amount: "))
newbalanceOrig = float(input("Enter new balance amount: "))

# Display the input
print("\nTransaction Details:")
print(f"Transaction Type: {transaction_type}")
print(f"Amount: {amount}")
print(f"Old Balance: {oldbalanceOrg}")
print(f"New Balance: {newbalanceOrig}")

# Predict & prrint
result = predict_fraud(model, amount, oldbalanceOrg, newbalanceOrig, transaction_type)
print(f"\nThe transaction is: {result}")


Transaction Details:
Transaction Type: 4555
Amount: 2.0
Old Balance: 4454.0
New Balance: 3.0


ValueError: Exception encountered when calling Normalization.call().

[1mDimensions must be equal, but are 7 and 32671 for '{{node sequential_4_1/normalization_1/Sub}} = Sub[T=DT_FLOAT](data, sequential_4_1/normalization_1/Sub/y)' with input shapes: [1,7], [1,32671].[0m

Arguments received by Normalization.call():
  • inputs=tf.Tensor(shape=(1, 7), dtype=float32)

In [70]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['isFraud'])  # Input Features (X) or independent values
y = df['isFraud']  # Input target (y) or dependent value

# Splitting the train +validation set into train 70%, validation 15% and test 15%
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1765, random_state=42) # (15/85 = 0.1765)

#Defining the values and shapes
print("Training values:", X_train.shape)
print("Validation values:", X_val.shape)
print("Testing values:", X_test.shape)

Training values: (11497, 32673)
Validation values: (2465, 32673)
Testing values: (2464, 32673)


In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Normalization

In [62]:
model = Sequential()
# Input layers,1st layer,2nd layer and outputlayer
model.add(Normalization(input_shape=(X_train.shape[1],)))  # Internal normalization layer
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compiling the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)
