In [5]:
import pandas as pd
import joblib
from etl_model_utils import predict_column_transformation

# Load the trained column-transformation model from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model artifacts that come from a trusted source.
model = joblib.load("../models/column_transform_model.pkl")

def smart_etl_with_model(filepath, model, action_labels=None):
    """Load a CSV and apply the transformation the model predicts for each column.

    For every column, ``predict_column_transformation(column, model)`` is asked
    for an action, the ``column: action`` pair is printed, and the matching
    transformation is applied to a copy of the loaded frame. Recognised action
    names are ``'drop_column'``, ``'fill_zero'``, ``'fill_mean'`` (numeric
    columns only), ``'strip_string'`` and ``'cast_to_string'``. Any other
    string leaves the column untouched.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read with ``pandas.read_csv``.
    model : object
        Fitted model, passed through unchanged to
        ``predict_column_transformation``.
    action_labels : mapping, optional
        Maps raw model outputs (e.g. integer class ids) to the action names
        listed above. Outputs missing from the mapping are used as-is. When
        omitted, the raw model output is compared directly against the action
        names. NOTE(review): the correct id -> name mapping depends on how the
        model was trained and is not visible here — confirm before supplying.

    Returns
    -------
    pandas.DataFrame
        The transformed copy; the frame read from disk is not modified.

    Warns
    -----
    UserWarning
        If the (decoded) action is not a string. Such a value can never match
        an action name, so the column would otherwise be skipped silently.
    """
    import warnings

    df = pd.read_csv(filepath)
    transformed_df = df.copy()

    for col in df.columns:
        action = predict_column_transformation(df[col], model)
        if action_labels is not None:
            # Decode raw model output (e.g. a class id) into an action name.
            action = action_labels.get(action, action)
        print(f"{col}: {action}")

        if not isinstance(action, str):
            # A non-string label cannot equal any action name below; surface
            # it instead of silently leaving the column untransformed.
            warnings.warn(
                f"Column {col!r}: model returned non-string action {action!r}; "
                "no transformation applied. Pass `action_labels` to map model "
                "outputs to action names.",
                stacklevel=2,
            )
            continue

        if action == 'drop_column':
            # Reassign rather than mutate in place (no inplace=True).
            transformed_df = transformed_df.drop(columns=[col])
        elif action == 'fill_zero':
            transformed_df[col] = transformed_df[col].fillna(0)
        elif action == 'fill_mean':
            # A mean is only defined for numeric data; other columns are left as-is.
            if pd.api.types.is_numeric_dtype(df[col]):
                transformed_df[col] = transformed_df[col].fillna(df[col].mean())
        elif action == 'strip_string':
            values = transformed_df[col]
            stripped = values.astype(str).str.strip()
            # Keep missing entries missing: astype(str) alone would turn NaN
            # into the literal text "nan".
            transformed_df[col] = stripped.where(values.notna(), values)
        elif action == 'cast_to_string':
            # Explicit cast of every value; missing values become the text "nan".
            transformed_df[col] = transformed_df[col].astype(str)

    print("\n✅ Final transformed data:")
    print(transformed_df.head())
    return transformed_df

# Example usage: run the model-driven ETL on the raw employee CSV and keep the
# returned (transformed) frame in `df`.
df = smart_etl_with_model("../data/raw/Employee.csv", model)


Education: 0
JoiningYear: 1
City: 1
PaymentTier: 1
Age: 1
Gender: 1
EverBenched: 1
ExperienceInCurrentDomain: 1
LeaveOrNot: 1

✅ Final transformed data:
   Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0  Bachelors         2017  Bangalore            3   34    Male          No   
1  Bachelors         2013       Pune            1   28  Female          No   
2  Bachelors         2014  New Delhi            3   38  Female          No   
3    Masters         2016  Bangalore            3   27    Male          No   
4    Masters         2017       Pune            3   24    Male         Yes   

   ExperienceInCurrentDomain  LeaveOrNot  
0                          0           0  
1                          3           1  
2                          2           0  
3                          5           1  
4                          2           1  


In [9]:
# Inspect the column dtypes of `df`.
# NOTE(review): presumably still the frame returned by smart_etl_with_model;
# the cells run between In [5] and In [9] are not visible here — confirm.
df.dtypes

Education                    object
JoiningYear                   int64
City                         object
PaymentTier                   int64
Age                           int64
Gender                       object
EverBenched                  object
ExperienceInCurrentDomain     int64
LeaveOrNot                    int64
dtype: object