#  1️⃣ Setup

In [1]:
# ============================
# Data Cleaning & Logging Notebook
# ============================


import pandas as pd
import os

# Paths (relative to the notebook's working directory)
raw_data_path = "../Data/Raw/AIML Dataset.csv"
cleaned_data_path = "../Data/Cleaned/cleaned_data.csv"
log_folder = "../Data/Log/"

# Create the output folders if they don't exist.
# The cleaned-data folder is derived from cleaned_data_path so that the
# final df.to_csv(cleaned_data_path) cannot fail on a missing directory
# (previously a stray "cleaned" folder was created in the working directory).
os.makedirs(os.path.dirname(cleaned_data_path), exist_ok=True)
os.makedirs(log_folder, exist_ok=True)

# Load the raw data and show its shape plus the first rows
df = pd.read_csv(raw_data_path)
print(f"Original data shape: {df.shape}")
df.head()


Original data shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# 2️⃣ Inspect dataset info

In [2]:

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
df.info()
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04

# 3️⃣ Remove duplicates

In [3]:

# Collect every repeated row (all occurrences after the first) so it can be
# written to the log folder before being removed from the working frame.
duplicates = df.loc[df.duplicated()]
if not duplicates.empty:
    duplicates_log_path = os.path.join(log_folder, "deleted_duplicates.csv")
    duplicates.to_csv(duplicates_log_path, index=False)
    print(f"Logged {len(duplicates)} duplicate rows.")

# Keep only the first occurrence of each row
df = df.drop_duplicates()
print(f"Shape after removing duplicates: {df.shape}")


Shape after removing duplicates: (6362620, 11)


# 4️⃣ Handle missing values

In [4]:
# Rows containing at least one null value are logged for auditing,
# then dropped from the working frame.
missing_rows = df[df.isna().any(axis="columns")]
if not missing_rows.empty:
    missing_log_path = os.path.join(log_folder, "deleted_missing.csv")
    missing_rows.to_csv(missing_log_path, index=False)
    print(f"Logged {len(missing_rows)} rows with missing values.")

df = df.dropna()
print(f"Shape after removing missing values: {df.shape}")

Shape after removing missing values: (6362620, 11)


# 5️⃣ Filter invalid transactions (negative or zero amounts)

In [5]:
# A transaction is treated as invalid when its amount is zero or negative.
# Those rows are logged first; only strictly positive amounts are kept.
invalid_amounts = df[df['amount'].le(0)]
if not invalid_amounts.empty:
    invalid_log_path = os.path.join(log_folder, "deleted_invalid_amounts.csv")
    invalid_amounts.to_csv(invalid_log_path, index=False)
    print(f"Logged {len(invalid_amounts)} rows with invalid amounts.")

df = df[df['amount'].gt(0)]
print(f"Shape after removing invalid amounts: {df.shape}")

Logged 16 rows with invalid amounts.
Shape after removing invalid amounts: (6362604, 11)


# 6️⃣ Save cleaned data

In [6]:
# Persist the cleaned frame; index=False leaves the row index out of the CSV
df.to_csv(path_or_buf=cleaned_data_path, index=False)
print(f"Cleaned data saved to '{cleaned_data_path}'")

Cleaned data saved to '../Data/Cleaned/cleaned_data.csv'


# 7️⃣ Summary

In [7]:
# Final report. The log location is taken from log_folder so the message
# points at the directory the deleted-row CSVs were actually written to
# (the previous hard-coded 'log/' did not match it).
print("✅ Data cleaning completed successfully!")
print(f"Final dataset shape: {df.shape}")
print(f"Check the '{log_folder}' folder for deleted rows.")

✅ Data cleaning completed successfully!
Final dataset shape: (6362604, 11)
Check the 'log/' folder for deleted rows.
