In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
# Put the parent of the current working directory on the import path so that
# `from src.data_cleaning import ...` in a later cell can resolve.
# Only append when missing: re-running this cell must not pile up duplicate
# entries in sys.path.
repo_root = str(Path("..").resolve())
if repo_root not in sys.path:
    sys.path.append(repo_root)
# File locations, relative to the notebook's working directory.
DATA = Path("../data/insurance.csv")           # raw input, read just below
OUT = Path("../data/insurance_cleaned.csv")    # destination written by the cleaning cell

# Load the raw table and print a quick profile of it.
df = pd.read_csv(DATA)

print("Raw Shape:", df.shape)
print(df.head())
df.info()

# describe() transposed: one row per summarised column, capped at 15 rows.
numeric_summary = df.describe().T.head(15)
display(numeric_summary)

# duplicated() flags rows that repeat an earlier row; the sum is the count of such rows.
duplicate_count = df.duplicated().sum()
print("Duplicates:", duplicate_count)



Raw Shape: (1338, 7)
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


Duplicates: 1


In [2]:
from src.data_cleaning import DataCleaning


In [3]:
# Run the DataCleaning pipeline on the raw frame, then persist the result.
# The calls, their arguments and their order are exactly those of the original
# single chain; it is only split into named stages so each can be inspected.
# NOTE(review): if DataCleaning methods return `self`, the stage names below
# all alias the same object -- confirm against src/data_cleaning.py.
cleaner = DataCleaning(df)

# Stage 1: text normalisation, de-duplication, schema filtering.
tidied = (
    cleaner
    .canonicalize_text()                      # unify text + fix region variants
    .remove_duplicates()
    .validate_schema(drop_invalid=True)       # filter invalid ranges
)

# Stage 2: imputation. Numeric medians are computed per (sex, smoker, region) group.
imputed = (
    tidied
    .fill_categorical_mode()
    .fill_numerical_median(group_cols=["sex", "smoker", "region"])   # smarter impute
)

# Stage 3: IQR-based outlier handling on bmi and charges (factor 1.5, winsorize=True).
capped = imputed.handle_outliers_iqr(
    columns=["bmi", "charges"],
    iqr_factor=1.5,
    winsorize=True,
)
# Disabled step, kept for reference: .log_transform(["charges"])

# Stage 4: encoding. sex/smoker are label-encoded; region is one-hot with drop_first=True.
encoded = (
    capped
    .encode_label(["sex", "smoker"])          # stable binary mapping
    .encode_one_hot(["region"], drop_first=True)
)

# Stage 5: print the sanity check and the step log, then pull out the DataFrame.
clean_df = (
    encoded
    .validate_data()                          # quick sanity check print
    .show_log()                               # see the cleaning steps summary
    .get_cleaned_data()
)

print('Cleaned shape:', clean_df.shape)

# Make sure the destination folder exists before writing the CSV (no index column).
out_dir = OUT.parent
out_dir.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(OUT, index=False)




Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1337 non-null   int64  
 1   sex               1337 non-null   Int64  
 2   bmi               1337 non-null   float64
 3   children          1337 non-null   int64  
 4   smoker            1337 non-null   Int64  
 5   charges           1337 non-null   float64
 6   region_northwest  1337 non-null   bool   
 7   region_southeast  1337 non-null   bool   
 8   region_southwest  1337 non-null   bool   
dtypes: Int64(2), bool(3), float64(2), int64(2)
memory usage: 69.3 KB

First 5 Rows:
   age  sex     bmi  children  smoker      charges  region_northwest  \
0   19    0  27.900         0       1  16884.92400             False   
1   18    1  33.770         1       0   1725.55230             False   
2   28    1  33.000         3       0   4449.46200           