In [44]:
import pandas as pd
import numpy as np

In [45]:
# We will load the data set and display the first few rows to display the column names once more
data = pd.read_csv('../Data/fraud_data.csv')
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [46]:
# We will begin by checking for any duplicates and removing them
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
data = data.drop_duplicates()

Number of duplicate rows: 0


In [47]:
# We will also check for null values and remove any rows with them
nulls = data.isnull().sum()
print("Null values in each column:")
print(nulls[nulls > 0])
data = data.dropna()

Null values in each column:
Series([], dtype: int64)


In [48]:
# We will go ahead and drop columns that do not seem relevant in helping us detect fraud, we will print the remaining column names
columns_to_drop = ['oldbalanceDest','newbalanceDest']
data = data.drop(columns=columns_to_drop)
print("These are the remaining columns:")
print(data.columns)

These are the remaining columns:
Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'isFraud', 'isFlaggedFraud'],
      dtype='object')


In [49]:
# We will then create a new data frame to save our cleaned data and reprint the columns
cleaned_data = data.copy()
fraud_cases = cleaned_data[cleaned_data['isFraud'] == 1]
fraud_cases.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,isFraud,isFlaggedFraud
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,1,0
251,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,1,0
252,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,1,0
680,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,1,0


In [50]:
# Identify cash outs or transfers
# Assuming cash outs are represented by negative amounts in the 'amount' column
cash_outs = cleaned_data[cleaned_data['amount'] < 0]

# Display the first few rows of cash outs
print("Cash outs detected:")
print(cash_outs.head())

Cash outs detected:
Empty DataFrame
Columns: [step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, isFraud, isFlaggedFraud]
Index: []


In [None]:
# Once all of the above actions are completed we will save the cleaned data into a csv file called clean_data.csv
cleaned_data.to_csv('../Data/clean_data.csv', index=False)
print("Data cleaning is complete, cleaned data has been saved to 'clean_data.csv'.")

Data cleaning is complete, cleaned data has been saved to 'clean_data.csv'.
