In [23]:
# importing the necessary libraries
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import joblib
import time

In [24]:
#Load the file
df = pd.read_csv("fraud_detection_dataset.csv")
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [26]:
#row and column info
df.shape

(6362620, 11)

In [27]:
#Checking for Null Values
df.isnull().sum()
# from the result no null value was found

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [28]:
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


In [29]:
#Exploratory Data Analysis

# Calculate value counts for the TYPE column
type_counts = df['type'].value_counts()
type_counts

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [30]:

is_flagged_count = df['isFlaggedFraud'].value_counts()
is_flagged_count

isFlaggedFraud
0    6362604
1         16
Name: count, dtype: int64

In [31]:
df["isFlaggedFraud"] = df["isFlaggedFraud"].map({
    0: "not flagged",
    1: "flagged"
    
})
is_flagged_count = df["isFlaggedFraud"].value_counts()
is_flagged_count

isFlaggedFraud
not flagged    6362604
flagged             16
Name: count, dtype: int64

In [32]:
# Assuming df is your DataFrame
flagged_fraud_df = df[df['isFlaggedFraud'] == "flagged"]

# Display the result
flagged_fraud_df.head(16)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2736446,212,TRANSFER,4953893.08,C728984460,4953893.08,4953893.08,C639921569,0.0,0.0,1,flagged
3247297,250,TRANSFER,1343002.08,C1100582606,1343002.08,1343002.08,C1147517658,0.0,0.0,1,flagged
3760288,279,TRANSFER,536624.41,C1035541766,536624.41,536624.41,C1100697970,0.0,0.0,1,flagged
5563713,387,TRANSFER,4892193.09,C908544136,4892193.09,4892193.09,C891140444,0.0,0.0,1,flagged
5996407,425,TRANSFER,10000000.0,C689608084,19585040.37,19585040.37,C1392803603,0.0,0.0,1,flagged
5996409,425,TRANSFER,9585040.37,C452586515,19585040.37,19585040.37,C1109166882,0.0,0.0,1,flagged
6168499,554,TRANSFER,3576297.1,C193696150,3576297.1,3576297.1,C484597480,0.0,0.0,1,flagged
6205439,586,TRANSFER,353874.22,C1684585475,353874.22,353874.22,C1770418982,0.0,0.0,1,flagged
6266413,617,TRANSFER,2542664.27,C786455622,2542664.27,2542664.27,C661958277,0.0,0.0,1,flagged
6281482,646,TRANSFER,10000000.0,C19004745,10399045.08,10399045.08,C1806199534,0.0,0.0,1,flagged


In [33]:
# Create a bar plot using plotly Express
bar_plot = px.bar(x =type_counts.index, y=type_counts.values, labels={'x':'Transaction Type', 'y':'Count'}, title='Transaction Type Distribution')
#Show the plot
bar_plot.show()

In [34]:
# Count the number of occurrences of each value in the 'isFraud' column
fraud_counts = df['isFraud'].value_counts()
fraud_counts

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [35]:
# Oversampling
#Undersampling

# However, to avoid bias we have to balance the dataset by under sampling the majority
df_majority = df[df['isFraud'] == 0]
df_minority = df[df['isFraud'] == 1]

df_majority_downSampled = resample(df_majority, 
                                   replace=False, 
                                   n_samples=len(df_minority), 
                                   random_state=42)

df_balanced = pd.concat([df_majority_downSampled, df_minority])
balanced_dataset = df_balanced['isFraud'].value_counts()
balanced_dataset.head()



isFraud
0    8213
1    8213
Name: count, dtype: int64

In [40]:
# Group by 'type' and 'isFraud', then count occurrences
balanced_dataset = df.groupby(['type','isFraud']).size().unstack(fill_value=0)

# Display the counts
balanced_dataset


isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,1399284,0
CASH_OUT,2233384,4116
DEBIT,41432,0
PAYMENT,2151495,0
TRANSFER,528812,4097


In [38]:
# Group by 'type' and 'isFlagged', then count occurrences
balanced_dataset = df.groupby(['type','isFlaggedFraud']).size().unstack(fill_value=0)

# Display the counts
balanced_dataset


isFlaggedFraud,flagged,not flagged
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,0,1399284
CASH_OUT,0,2237500
DEBIT,0,41432
PAYMENT,0,2151495
TRANSFER,16,532893


In [14]:
# Mapping isFraud that is false to "NO Fraud" and true to "Fraud"

df["isFraud"] = df["isFraud"].map({
    0: "No Fraud", 
    1: "Fraud"
})

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0


In [15]:
def train_and_save_model(df, model_filename='fraud_model.pkl', scaler_filename='scaler.pkl', encoder_filename='encoder.pkl'):
   
    # Separate features and target variable
    x = df[['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']]
    y = df['isFraud']
    
    # One-hot encode the 'type' column
    encoder = OneHotEncoder(sparse_output=False)
    type_encoded = encoder.fit_transform(x[['type']])
    
    # Create a DataFrame with the encoded type features
    type_encoded_df = pd.DataFrame(type_encoded, columns=encoder.get_feature_names_out(['type']))
    
    # Concatenate the one-hot encoded columns with the rest of the features
    x = pd.concat([x.drop('type', axis=1), type_encoded_df], axis=1)
    
    # Split the data into training and testing sets
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    
    # Train the Random Forest Classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(xtrain, ytrain)
    
    # Save the model, scaler, and encoder
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(encoder, encoder_filename)
    
    # Evaluate the model
    accuracy = model.score(xtest, ytest) * 100
    print(f"Model Accuracy: {accuracy:.2f}%")
    print(f"Model, scaler, and encoder saved to {model_filename}, {scaler_filename}, and {encoder_filename}")

# Example usage
train_and_save_model(df)


Model Accuracy: 99.98%
Model, scaler, and encoder saved to fraud_model.pkl, scaler.pkl, and encoder.pkl


In [41]:
def load_and_predict_fraud(test_features, model_filename='fraud_model.pkl', scaler_filename='scaler.pkl', encoder_filename='encoder.pkl'):
    # Load the model, scaler, and encoder
    model = joblib.load(model_filename)
    scaler = joblib.load(scaler_filename)
    encoder = joblib.load(encoder_filename)
    
    # Convert test features to a DataFrame with appropriate column names
    feature_names = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']
    test_features_df = pd.DataFrame([test_features], columns=feature_names)
    
    # One-hot encode the 'type' column
    type_encoded = encoder.transform(test_features_df[['type']])
    
    # Create a DataFrame with the encoded type features
    type_encoded_df = pd.DataFrame(type_encoded, columns=encoder.get_feature_names_out(['type']))
    
    # Concatenate the one-hot encoded columns with the rest of the features
    test_features_encoded = pd.concat([test_features_df.drop('type', axis=1), type_encoded_df], axis=1)
    
    # Scale the input features
    test_features_scaled = scaler.transform(test_features_encoded)
    
    # Make prediction
    start_time = time.time()
    prediction = model.predict(test_features_scaled)
    end_time = time.time()
    
    # the time taken
    # print(f"Prediction Time: {end_time - start_time:.4f} seconds")
    
    # Return "Fraud" or "No Fraud" based on prediction
    return "Fraud" if prediction[0] == 1 else "No Fraud"

# Example usage:
test_features_list = [
    ["TRANSFER", 10000000, 15000000, 5000000],
    ["TRANSFER", 4753893, 10000000, 6000000],
    ["TRANSFER", 4753893, 4753893, 0],
    ["TRANSFER", 4753893, 4753893, 4753893],
    ["TRANSFER", 4753893, 10000000, 5256107],
    ["TRANSFER", 3000000, 8000000, 5000000],
    ["CASH_OUT", 20000000, 25000000, 5000000],
    ["TRANSFER", 4753893, 9507786, 4753893],
    ["TRANSFER", 4753893, 4754893, 4753890],
    ["TRANSFER", 4753893, 4753893, 0],
    ["TRANSFER", 9999999, 10000000, 10000000]
]

for features in test_features_list:
    result = load_and_predict_fraud(features)
    print(f"Test Features: {features} -> Prediction: {result}")



Test Features: ['TRANSFER', 10000000, 15000000, 5000000] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 10000000, 6000000] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 4753893, 0] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 4753893, 4753893] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 10000000, 5256107] -> Prediction: No Fraud
Test Features: ['TRANSFER', 3000000, 8000000, 5000000] -> Prediction: No Fraud
Test Features: ['CASH_OUT', 20000000, 25000000, 5000000] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 9507786, 4753893] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 4754893, 4753890] -> Prediction: No Fraud
Test Features: ['TRANSFER', 4753893, 4753893, 0] -> Prediction: No Fraud
Test Features: ['TRANSFER', 9999999, 10000000, 10000000] -> Prediction: No Fraud
