In [43]:
# We will import the necessary libraries we will be needing to perform our EDA's below.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [45]:
# This first line will grab the fraud_data.csv file from the Data folder and we will display
# The first few rows of the data frame
df = pd.read_csv('../Data/fraud_data.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [46]:
# Describe will allow us to view the basic statistical items regarding our data
print(df.describe())

               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04   1.420800e+04    0.000000e+00   
75%    3.350000e+02  2.087215e+05   1.073152e+05    1.442584e+05   
max    7.430000e+02  9.244552e+07   5.958504e+07    4.958504e+07   

       oldbalanceDest  newbalanceDest       isFraud  isFlaggedFraud  
count    6.362620e+06    6.362620e+06  6.362620e+06    6.362620e+06  
mean     1.100702e+06    1.224996e+06  1.290820e-03    2.514687e-06  
std      3.399180e+06    3.674129e+06  3.590480e-02    1.585775e-03  
min      0.000000e+00    0.000000e+00  0.000000e+00    0.000000e+00  
25%      0.000000e+00    0.000000e+00

In [47]:
# In order to create a cleaner look, we will be creating an image folder called Images, all of our graphs will be saved here
if not os.path.exists('../Images'):
    os.makedirs('../Images')

In [48]:
# Before we begin performing EDA, we will be reviewing our data to see if there are any missing values. 
# We will use the `isnull()` function to check for missing values in our data and sum up the total number of missing values in each column.
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [49]:
# This will provide us with a histogram that will display the Distribution of Transaction Amounts 
plt.figure(figsize=(12, 6))
plt.hist(df['amount'], bins=50, color='orange', edgecolor='orange')
plt.title('Distribution of Transaction Amounts')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)
plt.savefig('../Images/01_Distribution_of_Transaction_Amounts.png')
plt.close()

In [50]:
# This will provide us with a count plot that will display the Transaction Types Distribution across the different Transaction Types.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='type', palette='pastel')
plt.title('Transaction Types Distribution')
plt.xlabel('Transaction Type')
plt.ylabel('Count')
plt.savefig('../Images/02_Transaction_Types_Distribution.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='type', palette='pastel')


In [51]:
# This will provide us with a count plot that will display count of fradulent counts, creating a Distribution of Fraudilent Transactions
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='isFraud', palette='pastel')
plt.title('Distribution of Fraudulent Transactions')
plt.xlabel('Is Fraud')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Not Fraud', 'Fraud'])
plt.savefig('../Images/03_Distribution_of_Fraudulent_Distribution.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='isFraud', palette='pastel')


In [52]:
# This will provide us with a box plot that will display the Transaction Amounts, comparing the amount of what is not fraud and what is fraud
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='isFraud', y='amount', palette='pastel')
plt.title('Fraud vs. Not Fraud')
plt.xlabel('Is Fraud')
plt.ylabel('Transaction Amount')
plt.xticks(ticks=[0, 1], labels=['Not Fraud', 'Fraud'])
plt.savefig('../Images/04_Fraud_vs_Not_Fraud.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x='isFraud', y='amount', palette='pastel')


In [53]:
# This will provide us with a scatter plot that will display the Old Balance vs the New Balance of Fraud Status of fraud cases
plt.figure(figsize=(12, 6))
sns.scatterplot(data=df, x='oldbalanceOrg', y='newbalanceOrig', hue='isFraud', alpha=0.6, palette='pastel')
plt.title('Old Balance vs. New Balance (Colored by Fraud Status)')
plt.xlabel('Old Balance')
plt.ylabel('New Balance')
plt.legend(title='Is Fraud', loc='upper left', labels=['Not Fraud', 'Fraud'])
plt.savefig('../Images/05_Old_Balance_vs_New_Balance.png')
plt.close()

In [54]:
# This will provide us with a Correlation Heatmap that will select only numeric columns
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Heatmap')
plt.savefig('../Images/06_Correlation_Heatmap.png')
plt.close()

In [55]:
# This will provide us with a Pairplot for these three features: oldBbalanceOrg, newbalanceoRIG and isFraud
selected_features = df[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'isFraud']]
sns.pairplot(selected_features, hue='isFraud', palette='pastel')
plt.title('Pairplot of Selected Features')
plt.savefig('../Images/07_Pairplot_Selected_Features.png')
plt.close()

In [56]:
# This will provide us with a time series analysis with inputs that are integars 
plt.figure(figsize=(12, 6))
df['step'] = df['step'].astype(int) 
fraud_counts = df[df['isFraud'] == 1].groupby('step').size()
total_counts = df.groupby('step').size()

plt.plot(total_counts.index, total_counts, label='Total Transactions', color='blue')
plt.plot(fraud_counts.index, fraud_counts, label='Fraudulent Transactions', color='red')
plt.title('Transactions Over Time')
plt.xlabel('Time Step')
plt.ylabel('Number of Transactions')
plt.legend()
plt.savefig('../Images/08_Transactions_Over_Time.png')
plt.close()

In [57]:
# This will provide us with the fraud detection by account
account_fraud_counts = df.groupby('nameOrig')['isFraud'].agg(['count', 'sum']).reset_index()
account_fraud_counts.columns = ['account', 'total_transactions', 'fraudulent_transactions']
account_fraud_counts['fraud_rate'] = account_fraud_counts['fraudulent_transactions'] / account_fraud_counts['total_transactions']

plt.figure(figsize=(12, 6))
sns.barplot(data=account_fraud_counts.sort_values(by='fraud_rate', ascending=False).head(20), 
            x='account', y='fraud_rate', palette='viridis')
plt.title('Top 20 Accounts by Fraud Rate')
plt.xlabel('Account')
plt.ylabel('Fraud Rate')
plt.xticks(rotation=90)
plt.savefig('../Images/09_Top_Accounts_by_Fraud_Rate.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=account_fraud_counts.sort_values(by='fraud_rate', ascending=False).head(20),
