In [None]:
# 📓 01_feature_engineering.ipynb
# Extract per-column summary features from the raw CSV files; the output is labeled by hand and used to train the ML model

import pandas as pd
import os

# Input folder: every *.csv file found here gets profiled
data_dir: str = "../data/raw"
# Accumulator for the per-column feature dicts (one entry per column per file)
all_feature_rows: list = []

def extract_features_from_column(series: pd.Series) -> dict:
    """Summarize one DataFrame column as a flat dict of features.

    Args:
        series: The column to profile. Its ``name`` is recorded as-is.

    Returns:
        A dict with these keys, in this order:

        * ``column``: the series name.
        * ``dtype``: the dtype rendered as a string.
        * ``null_pct``: fraction of null entries (``0.0`` for an empty column).
        * ``unique_pct``: number of distinct non-null values divided by the
          row count (``0.0`` for an empty column).
        * ``avg_len``: mean string length of the non-null values for text
          columns (object or pandas string dtype), ``0`` otherwise.
        * ``label``: empty placeholder, to be filled manually later.
    """
    row_count = len(series)
    dtype = series.dtype

    # Text columns are either classic object dtype or pandas' dedicated
    # string dtype; a plain `dtype == 'object'` check misses the latter.
    is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    avg_len = 0
    if is_text:
        # Drop nulls before stringifying: astype(str) would otherwise turn
        # them into the literals "nan"/"None" and skew the average length.
        present = series.dropna()
        if len(present) > 0:
            avg_len = float(present.astype(str).str.len().mean())

    return {
        'column': series.name,
        'dtype': str(dtype),
        # Guard the empty column: mean() of nothing is NaN and the ratio
        # below would divide by zero.
        'null_pct': float(series.isnull().mean()) if row_count else 0.0,
        'unique_pct': series.nunique() / row_count if row_count else 0.0,
        'avg_len': avg_len,
        'label': ''  # To be filled manually later
    }

# Loop through all CSVs and extract column-wise features.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of column_features.csv stable from run to run.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".csv"):
        continue  # skip anything that is not a CSV file
    df = pd.read_csv(os.path.join(data_dir, file))
    for col in df.columns:
        features = extract_features_from_column(df[col])
        # Remember which file the column came from
        features['source_file'] = file
        all_feature_rows.append(features)

# One row per (file, column) pair, written without the DataFrame index
features_df = pd.DataFrame(all_feature_rows)
features_df.to_csv("../data/column_features.csv", index=False)
print("✅ Features extracted. Please label them in column_features.csv")
