In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Loading Dataset
# Read the raw CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv(Path('../data/raw_dataset.csv'))

In [3]:
# Exploring Dataset
# Plain-text preview of the first five rows.
print(df.head(n=5))

# Column overview (dtypes, non-null counts), followed by summary statistics
# for the numeric columns. The bare expression on the last line is what the
# cell displays as its result.
df.info()
df.describe()

   Age          Workclass   FNLWGT   Education   Education-num  \
0   39          State-gov    77516   Bachelors              13   
1   50   Self-emp-not-inc    83311   Bachelors              13   
2   38            Private   215646     HS-grad               9   
3   53            Private   234721        11th               7   
4   28            Private   338409   Bachelors              13   

        Marital-Status          Occupation    Relationship    Race      Sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

    Capital-gain   Capital-loss   Hours-per-week  Native-country  income  
0           2174              0               40   Un

Unnamed: 0,Age,FNLWGT,Education-num,Capital-gain,Capital-loss,Hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
# Formatting data into data we can manipulate
# Headers: trim surrounding whitespace, then Title-Case them
# (e.g. "FNLWGT" -> "Fnlwgt", "Education-num" -> "Education-Num").
df.columns = df.columns.str.strip().str.title()

# The text values appear to carry the same leading whitespace as the raw
# headers (see the column alignment in the head() preview). Trim them too, so
# every label in a column is formatted the same way and un-padded labels
# assigned later (such as "HS-undergrad") match the existing ones exactly.
# Non-string cells (numbers, NaN) are passed through unchanged.
for text_column in df.select_dtypes(include=["object", "string"]).columns:
    df[text_column] = df[text_column].map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

In [5]:
# Dataframe shape before cleaned
# df.shape is a (rows, columns) tuple, so label it as a shape and interpolate it
# directly; the wording mirrors the "Shape after cleaning" report further down.
print(f"Shape before cleaning: {df.shape}")

Total values before cleaning:  (32561, 15)


In [6]:
# Summary before cleaned
# include="all" reports on every column, not just the numeric ones.
# A single print with a newline separator emits the heading (preceded by a
# blank line) and then the table, each on its own line.
print(
    "\nSummary statistics BEFORE cleaning",
    df.describe(include="all"),
    sep="\n",
)


Summary statistics BEFORE cleaning
                 Age Workclass        Fnlwgt Education  Education-Num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             Marital-Status       Occupation Relationship    Race    Sex  \
count      

In [7]:
## Missing count
# Null cells per column first, then the grand total across all columns.
# NOTE(review): isna() only detects real null cells (NaN/None); if the raw file
# encodes missing entries with a placeholder string, they are not counted
# here — confirm against the data source.
nulls_by_column = df.isna().sum()
missing_count = nulls_by_column.sum()
print(f"Missing values: {missing_count}")

Missing values: 0


In [8]:
# Dropping data with null values
# Keep only the rows that have no null in any column. `df` is rebound to the
# filtered frame rather than being modified through inplace=True.
df = df.dropna()

In [8]:
# Formatting vague data
# NOTE(review): `allowed` is not referenced by any of the cells visible here;
# it is kept in case a later cell relies on it.
allowed = [
    "HS-grad",
    "HS-undergrad",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate",
    "Assoc-acdm",
]

# Flag every Education label containing an ordinal grade token such as "9th"
# or "11th" (case-insensitive; null entries count as non-matches) ...
mask = df["Education"].str.contains(r"\b\d+th\b", case=False, na=False)

# ... and collapse all flagged labels into one bucket, leaving the rest as-is.
df["Education"] = df["Education"].mask(mask, "HS-undergrad")

In [9]:
# Inconsistent data
# `mask` was built before the Education labels were overwritten, so summing its
# True entries gives the number of rows that were relabelled.
inconsistent_count = mask.to_numpy().sum()
print(f"Inconsistent values: {inconsistent_count}")

Inconsistent values: 4202


In [10]:
# Dropping duplicate data
# Mark each row that exactly repeats an earlier row (the first occurrence is
# not marked), report how many there are, then keep only the unmarked rows.
is_repeat = df.duplicated()
duplicate_count = is_repeat.sum()
print(f"Duplicates: {duplicate_count}")
df = df.loc[~is_repeat]

Duplicates: 24


In [11]:
# Export cleaned data to a new csv file
# Resolve the target location, make sure the folder exists (no error if it
# already does), then write the frame without its index column.
export_dir = Path("../data/")
output_path = export_dir.joinpath("cleaned_dataset.csv")
export_dir.mkdir(exist_ok=True, parents=True)
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to: {output_path}")

Cleaned data saved to: ../data/cleaned_dataset.csv


In [12]:
# Dataframe shape after cleaned
# Interpolate the (rows, columns) tuple directly: the original f-string had no
# placeholder and relied on print's separator, which produced a double space.
print(f"Shape after cleaning: {df.shape}")

Shape after cleaning:  (32537, 15)


In [13]:
# Summary after cleaned
# Same all-column summary as before cleaning, for side-by-side comparison.
# A single print with a newline separator emits the heading (preceded by a
# blank line) and then the table, each on its own line.
print(
    "\nSummary statistics AFTER cleaning",
    df.describe(include="all"),
    sep="\n",
)


Summary statistics AFTER cleaning
                 Age Workclass        Fnlwgt Education  Education-Num  \
count   32537.000000     32537  3.253700e+04     32537   32537.000000   
unique           NaN         9           NaN        10            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22673           NaN     10494            NaN   
mean       38.585549       NaN  1.897808e+05       NaN      10.081815   
std        13.637984       NaN  1.055565e+05       NaN       2.571633   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.369930e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             Marital-Status       Occupation Relationship    Race    Sex  \
count       