In [1]:
# 📓 01_feature_engineering.ipynb
# Extract true features from columns for training the ML model

import pandas as pd
import os

# Directory with raw CSV files
data_dir = "../data/raw"
all_feature_rows = []

# --- New Smart Detection Function ---
def detect_true_dtype(series: pd.Series) -> str:
    series_non_null = series.dropna().astype(str)

    # Try Integer
    try:
        _ = series_non_null.astype(int)
        return 'integer'
    except:
        pass

    # Try Float
    try:
        _ = series_non_null.astype(float)
        return 'float'
    except:
        pass

    # Try DateTime
    try:
        _ = pd.to_datetime(series_non_null, errors='raise')
        return 'datetime'
    except:
        pass

    # Else treat as Text
    return 'text'

# --- Updated Feature Extraction ---
def extract_features_from_column(series: pd.Series):
    true_dtype = detect_true_dtype(series)

    return {
        'column': series.name,
        'dtype': true_dtype,  # 🚀 detected based on content, not pandas
        'null_pct': series.isnull().mean(),
        'unique_pct': series.nunique() / len(series),
        'avg_len': series.dropna().astype(str).str.len().mean(),
        'label': ''  # to be filled manually later
    }

# --- Loop through all CSVs and Extract Features ---
for file in os.listdir(data_dir):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(data_dir, file))
        for col in df.columns:
            features = extract_features_from_column(df[col])
            features['source_file'] = file
            all_feature_rows.append(features)

# --- Save extracted features to a CSV ---
features_df = pd.DataFrame(all_feature_rows)
features_df.to_csv("../data/column_features.csv", index=False)
print("✅ Features extracted with true data types. Please label them in column_features.csv")


✅ Features extracted with true data types. Please label them in column_features.csv


  _ = pd.to_datetime(series_non_null, errors='raise')
  _ = pd.to_datetime(series_non_null, errors='raise')
  _ = pd.to_datetime(series_non_null, errors='raise')
  _ = pd.to_datetime(series_non_null, errors='raise')
