## Manual Cleaning Script

VERY simple cleaning script; will be expanded in Phase 3

In [1]:
# time tracking - START
# Capture the wall-clock time at the top of the notebook; the final cell
# subtracts this value from its own timestamp to report the total runtime.
import time
start_time = time.time()

In [2]:
# package imports
import numpy as np
import pandas as pd

In [3]:
# import data
# Identifier-like columns are read as text. With pandas' default type inference
# they are parsed as numbers, which drops leading zeros (e.g. "02134" -> 2134)
# at load time, before any later cell gets a chance to cast them to strings.
# Keep this list in line with `id_cols` further down the notebook.
id_dtypes = {
    name: str
    for name in ('fdic_certificate_number', 'zip_code', 'county_fips_code', 'rssd_id')
}
df = pd.read_csv("../data/fdic/fdic_250.csv", dtype=id_dtypes)

In [4]:
# Column count of the freshly loaded frame, for comparison with the count after cleaning
print(f"cols before: {len(df.columns)}")

cols before: 121


In [5]:
# Drop every column that holds no data at all: empty strings are first turned
# into NaN so that they count as missing, then all-NaN columns are removed.
blanks_as_nan = df.replace("", np.nan)
df = blanks_as_nan.dropna(axis='columns', how='all')

In [6]:
# Column count once the completely empty columns have been removed
print(f"cols after: {len(df.columns)}")

cols after: 108


In [7]:
# date conversion: names of the columns that should be parsed as datetimes
date_cols = [
    'established_date',
    'last_updated',
    'effective_date',
    'end_effective_date',
    'report_date',
    'reporting_period_end_date',
    'run_date',
    'deposit_insurance_date',
]

In [8]:
# date conversion: parse each listed date column that is present in the frame;
# values that cannot be parsed become NaT because of errors='coerce'
present_date_cols = [name for name in date_cols if name in df.columns]
for name in present_date_cols:
    parsed = pd.to_datetime(df[name], errors='coerce')
    df[name] = parsed

In [9]:
# boolean handling: the candidates are the columns stored with the generic object dtype
object_frame = df.select_dtypes(include=['object'])
bools = object_frame.columns

In [10]:
# boolean handling; ensures all y/n and true/false cols are labeled as boolean
#
# A plain `.astype(bool)` is not safe on object columns: every non-empty string
# (including "False" and "N") and every NaN is truthy, so all of them would
# become True. Values are therefore mapped explicitly, and missing values are
# kept as <NA> by using pandas' nullable "boolean" dtype.
bool_tokens = {'true': True, 'false': False, 'y': True, 'n': False}

for col in bools:
    # Case/whitespace-insensitive text form of the column; originally missing
    # entries are restored to NaN so they are not mistaken for the text "nan".
    normalized = df[col].astype(str).str.strip().str.lower().where(df[col].notna())
    observed = normalized.dropna()

    # Decide from every non-null value (not only the first row), and skip
    # columns that are entirely empty or contain anything besides the tokens.
    if observed.empty or not observed.isin(list(bool_tokens)).all():
        continue

    df[col] = normalized.map(bool_tokens).astype('boolean')

In [11]:
# financial edits: names of the monetary columns handled in the next cell
financial_cols = [
    'total_assets',
    'total_deposits',
    'equity_capital',
    'net_income',
]

In [12]:
# financial edits: coerce each "money" column that is present to a numeric dtype
# and store 0 wherever the value is missing or cannot be parsed as a number
present_financial_cols = [name for name in financial_cols if name in df.columns]
for name in present_financial_cols:
    as_number = pd.to_numeric(df[name], errors='coerce')
    df[name] = as_number.fillna(0)

In [13]:
# id and key columns that are forced to strings so that leading zeros are not lost
id_cols = [
    'fdic_certificate_number',
    'zip_code',
    'county_fips_code',
    'rssd_id',
]

In [14]:
# Cast the identifier columns that are present to strings.
# - Only a trailing ".0" (left over when an id was parsed as a float) is
#   removed; a literal replace of '.0' would also strip it from the middle of a
#   value, e.g. "10.05" -> "105".
# - Missing values stay missing instead of turning into the literal text "nan".
# NOTE(review): if a column was already parsed as a number when the file was
# loaded, its leading zeros are gone by this point and cannot be restored here.
for col in id_cols:
    if col in df.columns:
        has_value = df[col].notna()
        df[col] = (
            df[col]
            .astype(str)
            .str.replace(r'\.0$', '', regex=True)
            .where(has_value)
        )

In [15]:
# Write the cleaned frame next to the raw file, without the index column
cleaned_csv_path = '../data/fdic/mt_cleaned_fdic.csv'
df.to_csv(cleaned_csv_path, index=False)

In [16]:
# time tracking - STOP
# Elapsed wall-clock time since `start_time` was captured in the first cell,
# printed in seconds with two decimals.
end_time = time.time()
duration = end_time - start_time
print(f"Execution time: {duration:.2f} seconds")

Execution time: 0.47 seconds
