In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
# into the working DataFrame used by every cell below.
df = pd.read_csv(filepath_or_buffer="../data/raw_dataset.csv")

In [None]:
# First look at the raw frame: the leading rows, then the per-column
# dtype / non-null overview, then the default summary statistics
# (left as the cell's final expression so the notebook renders it).
print(df.head(n=5))

df.info()
df.describe()

In [None]:
# Normalise the header row: trim surrounding whitespace from each column
# name, then title-case it. Later cells look up "Education" using this
# title-cased spelling.
df.columns = df.columns.str.strip().str.title()

In [None]:
# Report the frame's (rows, columns) shape before any cleaning is applied.
# A plain string is enough here: the label contains no placeholders.
print("Total values before cleaning: ", df.shape)

In [None]:
# Baseline summary of every column (numeric and non-numeric) before cleaning.
# The heading and the table are emitted on separate lines via sep="\n".
print(
    "\nSummary statistics BEFORE cleaning",
    df.describe(include="all"),
    sep="\n",
)

In [None]:
## Missing count
# Missing entries can appear both as real NaN and as "?" placeholder cells
# (optionally padded with whitespace). The cleaning cell further down turns
# those placeholders into NaN before dropping rows, so they are counted as
# missing here as well; otherwise this report would only see genuine NaN and
# under-state how many values the cleaning step removes.
# The replacement is done on a temporary copy: df itself is not modified.
missing_count = (
    df.replace(to_replace=r"^\s*\?\s*$", value=np.nan, regex=True)
    .isna()
    .sum()   # per-column missing counts
    .sum()   # grand total across all columns
)
print(f"Missing values: {missing_count}")

In [None]:
# Treat cells that hold only a "?" (with optional surrounding whitespace)
# as missing, then discard every row that has at least one missing value.
# Both steps modify df in place.
df.replace(to_replace=r"^\s*\?\s*$", value=np.nan, regex=True, inplace=True)
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Collapse grade-style education labels into a single category.
# NOTE(review): `allowed` is not referenced by any cell visible here; it is
# kept in case a later cell relies on it.
allowed = [
    "HS-grad",
    "HS-undergrad",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate",
    "Assoc-acdm",
]
# True for rows whose Education text contains a whole-word "<digits>th"
# token (matched case-insensitively); missing values count as no match.
mask = df["Education"].str.contains(pat=r"\b\d+th\b", case=False, na=False, regex=True)
# Keep every non-matching label as is; matching rows become "HS-undergrad".
df["Education"] = df["Education"].where(~mask, "HS-undergrad")

In [None]:
# Number of rows flagged by `mask`, i.e. rows whose Education label was
# rewritten to "HS-undergrad".
inconsistent_count = mask.sum(axis=0)
print(f"Inconsistent values: {inconsistent_count}")

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is
# not counted), report that number, then remove the repeats in place.
duplicate_count = df.duplicated(keep="first").sum()
print(f"Duplicates: {duplicate_count}")
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Write the cleaned frame next to the raw data, creating the target
# directory first if it does not exist yet. The row index is not written.
export_dir = Path("../data/")
export_dir.mkdir(exist_ok=True, parents=True)
output_path = export_dir.joinpath("cleaned_dataset.csv")
df.to_csv(path_or_buf=output_path, index=False)
print(f"Cleaned data saved to: {output_path}")

In [None]:
# Report the frame's (rows, columns) shape once cleaning has finished.
# A plain string is enough here: the label contains no placeholders.
print("Shape after cleaning: ", df.shape)

In [None]:
# Summary of every column (numeric and non-numeric) after cleaning, for
# comparison with the "BEFORE" summary printed earlier in the notebook.
# The heading and the table are emitted on separate lines via sep="\n".
print(
    "\nSummary statistics AFTER cleaning",
    df.describe(include="all"),
    sep="\n",
)