In [61]:
# Standard imports
import pandas as pd
import numpy as np
from pathlib import Path
import astropy.units as u
from astropy.cosmology import Planck18
import warnings
from tqdm.auto import tqdm
import joblib

# Scikit-learn imports
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    PowerTransformer, QuantileTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

warnings.filterwarnings('ignore')

In [62]:
# Project layout: the notebook is expected to run from
# <project root>/notebooks/data_preprocessing, so the root is two levels above
# the current working directory.
PROJECT_ROOT = Path.cwd().parents[1]
PROCESSED_DATA_DIR = PROJECT_ROOT / 'data' / 'processed'      # input directory
TRANSFORMED_DATA_DIR = PROJECT_ROOT / 'data' / 'transformed'  # output directory
# Create the output directory up front so later writes cannot fail on a missing path.
TRANSFORMED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Seed NumPy's legacy global RNG so any np.random-based step is repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [63]:
class UnitConverter(BaseEstimator, TransformerMixin):
    """Custom transformer for astronomical unit conversions.

    Parameters
    ----------
    conversions : dict or None, default=None
        Maps a column name to a ``(from_unit, to_unit)`` pair of unit strings
        that are passed to ``astropy.units.Unit``. ``None`` (or an empty dict)
        means no column is converted.
    """

    def __init__(self, conversions=None):
        # Keep the argument exactly as passed. scikit-learn's get_params/clone
        # machinery expects __init__ to store parameters unmodified; the
        # None -> {} defaulting is done in transform() instead.
        self.conversions = conversions

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column converted.

        Columns listed in ``conversions`` but absent from ``X`` are skipped.
        A conversion that raises is reported via ``print`` and the column is
        left as it was, so one bad column does not abort the whole frame.
        """
        X = X.copy()
        conversions = self.conversions or {}
        for col, (from_unit, to_unit) in conversions.items():
            if col not in X.columns:
                continue
            try:
                # Attach the source unit, convert, then strip back to plain numbers.
                quantity = X[col].values * u.Unit(from_unit)
                X[col] = quantity.to(u.Unit(to_unit)).value
                print(f"Converted {col} from {from_unit} to {to_unit}")
            except Exception as e:
                # Deliberate best-effort: report and keep the original values.
                print(f"Error converting {col}: {e}")
        return X

In [64]:
def create_preprocessor(numeric_features, categorical_features):
    """Create preprocessing pipeline for numeric and categorical features.

    Parameters
    ----------
    numeric_features : list
        Columns that are median-imputed and then standardized.
    categorical_features : list
        Columns that are imputed with the most frequent value and then
        one-hot encoded (dense output, unknown categories ignored).

    Returns
    -------
    ColumnTransformer
        Unfitted transformer containing a ``'num'`` and/or ``'cat'`` branch.
        A branch is only added when its feature list is non-empty, so two
        empty lists no longer produce two placeholders with the same name
        (which ColumnTransformer rejects as duplicate names).
    """
    transformers = []

    if numeric_features:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('num', numeric_transformer, numeric_features))

    if categorical_features:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        transformers.append(('cat', categorical_transformer, categorical_features))

    # Columns not listed in either group are dropped (ColumnTransformer's
    # default remainder), exactly as before.
    return ColumnTransformer(transformers=transformers)

In [65]:
def transform_data(df, preprocessor, fit=False, preprocessor_path=None):
    """Apply transformations to the data and return them as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; its index is carried over to the result.
    preprocessor : object
        Object exposing ``transform`` (and ``fit_transform`` when ``fit=True``).
    fit : bool, default=False
        When True, fit the preprocessor on ``df`` and persist it with joblib.
    preprocessor_path : path-like or None, default=None
        Where the fitted preprocessor is written when ``fit=True``. ``None``
        keeps the previous behavior of writing
        ``TRANSFORMED_DATA_DIR / 'preprocessor.joblib'``.

    Returns
    -------
    pandas.DataFrame
        Transformed values, with columns named by
        ``preprocessor.get_feature_names_out()`` when that method exists and
        by ``df.columns`` otherwise.
    """
    if fit:
        transformed = preprocessor.fit_transform(df)
        # Save the preprocessor
        if preprocessor_path is None:
            preprocessor_path = TRANSFORMED_DATA_DIR / 'preprocessor.joblib'
        joblib.dump(preprocessor, preprocessor_path)
    else:
        transformed = preprocessor.transform(df)

    # A sparse result cannot be passed to the DataFrame constructor directly;
    # anything exposing toarray() is densified first.
    if hasattr(transformed, 'toarray'):
        transformed = transformed.toarray()

    # Get feature names
    if hasattr(preprocessor, 'get_feature_names_out'):
        feature_names = preprocessor.get_feature_names_out()
    else:
        # Only valid when the preprocessor keeps the column count unchanged.
        feature_names = df.columns.tolist()

    return pd.DataFrame(transformed, columns=feature_names, index=df.index)

In [None]:
def main():
    """Main transformation pipeline.

    For every ``*_cleaned.csv`` file in ``PROCESSED_DATA_DIR``: load it, apply
    the unit conversions, fit a fresh preprocessor on that file, and write the
    transformed frame to ``TRANSFORMED_DATA_DIR`` as parquet. The fitted
    preprocessor is also saved next to the parquet file under a per-file name.
    A failure on one file is printed and the loop moves on to the next file.
    """
    print("Starting data transformation...")

    # Define unit conversions (customize based on your data)
    unit_conversions = {
        # 'column_name': ('from_unit', 'to_unit')
        'distance': ('pc', 'kpc'),
        'radius': ('lyr', 'pc'),
        'mass': ('solMass', 'kg'),
        'luminosity': ('solLum', 'W')
    }

    # Load processed data; sorted so the processing order is the same on every run.
    processed_files = sorted(PROCESSED_DATA_DIR.glob('*_cleaned.csv'))
    if not processed_files:
        print("No processed data found. Run the cleaning notebook first.")
        return

    for filepath in tqdm(processed_files, desc="Processing files"):
        try:
            # Load data
            df = pd.read_csv(filepath)
            print(f"\nProcessing {filepath.name}...")
            print(f"Initial shape: {df.shape}")

            # Apply unit conversions
            converter = UnitConverter(unit_conversions)
            df = converter.fit_transform(df)

            # Identify feature types
            numeric_features = df.select_dtypes(include=np.number).columns.tolist()
            categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

            # Create and apply preprocessor
            preprocessor = create_preprocessor(numeric_features, categorical_features)
            df_transformed = transform_data(df, preprocessor, fit=True)

            # transform_data saves to one shared file by default, so each
            # iteration would overwrite the previous file's preprocessor.
            # Keep a copy named after the input file so every fit is kept.
            preprocessor_path = TRANSFORMED_DATA_DIR / f"{filepath.stem}_preprocessor.joblib"
            joblib.dump(preprocessor, preprocessor_path)

            # Save transformed data
            output_path = TRANSFORMED_DATA_DIR / f"{filepath.stem}_transformed.parquet"
            df_transformed.to_parquet(output_path)
            print(f"Saved transformed data to {output_path}")

        except Exception as e:
            # Best-effort batch run: report the failure and keep going.
            print(f"Error processing {filepath.name}: {e}")
            continue

if __name__ == "__main__":
    main()