In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
import astropy.units as u
from astropy.cosmology import Planck18
import warnings
from tqdm.auto import tqdm

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarning notices about API
# changes in the imported libraries - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [24]:
# The project root is taken to be the grandparent of the current working directory.
PROJECT_ROOT = Path.cwd().parents[1]
RAW_DATA_DIR = PROJECT_ROOT.joinpath('data', 'raw')
PROCESSED_DATA_DIR = PROJECT_ROOT.joinpath('data', 'processed')

# Make sure both data folders exist before anything reads from or writes to them.
for directory in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
def _read_fits_table(filepath):
    """Read a FITS table into a DataFrame, leaving out multidimensional columns.

    ``Table.to_pandas`` refuses tables that contain multidimensional columns,
    so only columns whose shape has at most one dimension are converted.

    Parameters
    ----------
    filepath : pathlib.Path
        FITS file to read (matched by ``*.fits`` or ``*.fits.gz``).

    Returns
    -------
    pandas.DataFrame
        The one-dimensional columns of the table.
    """
    # Local import: only needed when a FITS file is actually present.
    from astropy.table import Table

    table = Table.read(filepath)
    flat_names = [name for name in table.colnames if len(table[name].shape) <= 1]
    if len(flat_names) == len(table.colnames):
        return table.to_pandas()

    skipped = len(table.colnames) - len(flat_names)
    print(f"  Skipping {skipped} multidimensional column(s) in {filepath.name}")
    return table[flat_names].to_pandas()


def load_raw_data():
    """Load every CSV and FITS file found directly in ``RAW_DATA_DIR``.

    Files are matched with the patterns ``*.csv``, ``*.fits`` and ``*.fits.gz``.
    A file that fails to load is reported and skipped so the remaining files
    are still returned.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Loaded frames keyed by file stem. When a stem is already taken, the
        first unused ``<stem>_<n>`` key (n = 1, 2, ...) is used instead.
    """
    print("Loading raw data files...")
    data = {}

    file_formats = ['*.csv', '*.fits', '*.fits.gz']
    for fmt in file_formats:
        # sorted() keeps the load order, and therefore any collision suffixes,
        # independent of the filesystem's directory ordering.
        for filepath in sorted(RAW_DATA_DIR.glob(fmt)):
            try:
                # Case-insensitive check: glob matching ignores case on some platforms.
                if filepath.suffix.lower() == '.csv':
                    df = pd.read_csv(filepath)
                else:
                    df = _read_fits_table(filepath)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Error loading {filepath.name}: {e}")
                continue

            key = filepath.stem
            if key in data:
                # Probe for the first free suffix so an existing entry is never overwritten.
                suffix = 1
                while f"{key}_{suffix}" in data:
                    suffix += 1
                key = f"{key}_{suffix}"
            data[key] = df
            print(f"✓ Loaded {filepath.name} with shape {df.shape}")

    return data

In [26]:
def clean_column_names(df):
    """Standardize column names to lowercase with underscores.

    Names are converted to strings and lowercased, spaces become underscores,
    and every character other than ASCII letters, digits and underscores is
    removed (this covers parentheses, brackets, slashes, dots, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with cleaned column names.
    """
    df.columns = (
        df.columns
        # Non-string labels (e.g. integer headers) have no .str accessor.
        .astype(str)
        .str.lower()
        .str.replace(' ', '_', regex=False)
        # regex must be explicit: pandas >= 2.0 defaults to regex=False, which
        # would treat the character class as a literal string and strip nothing.
        .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
    )
    return df

In [27]:
def handle_missing_values(df, threshold=0.7):
    """Handle missing values based on data type and missingness threshold.

    Columns whose fraction of missing values exceeds ``threshold`` are dropped.
    Remaining numeric columns are filled with their median, and object/category
    columns with their most frequent value. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    threshold : float, default 0.7
        Maximum tolerated fraction of missing values per column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sparse columns removed and the gaps filled.
    """
    # Drop columns with too many missing values
    missing_cols = df.columns[df.isnull().mean() > threshold]
    if not missing_cols.empty:
        print(f"Dropping columns with >{threshold*100}% missing values: {list(missing_cols)}")
    # drop() returns a new frame even when nothing is dropped, so the fills
    # below never write into the caller's DataFrame.
    df = df.drop(columns=missing_cols)

    # For numeric columns, fill with median
    num_cols = df.select_dtypes(include=np.number).columns
    for col in num_cols:
        if not df[col].isnull().any():
            continue
        median_val = df[col].median()
        if pd.isna(median_val):
            # Column has no observed values at all (possible when threshold >= 1).
            print(f"Skipping {col}: no observed values to compute a median")
            continue
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing values in {col} with median: {median_val:.4f}")

    # For categorical columns, fill with mode
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # mode() is empty for an all-missing column; indexing it would raise.
            print(f"Skipping {col}: no observed values to compute a mode")
            continue
        mode_val = modes.iloc[0]
        df[col] = df[col].fillna(mode_val)
        print(f"Filled missing values in {col} with mode: {mode_val}")

    return df

In [28]:
def convert_units(df, column, from_unit, to_unit):
    """Re-express one column of ``df`` in a different unit via astropy.

    The column is overwritten in place with the converted values. When the
    column is absent the frame is returned untouched; a failed conversion is
    reported and leaves the column as it was.
    """
    if column not in df.columns:
        return df

    try:
        source = df[column].values * u.Unit(from_unit)
        df[column] = source.to(to_unit).value
        print(f"Converted {column} from {from_unit} to {to_unit}")
    except Exception as e:
        print(f"Error converting {column}: {e}")

    return df

In [29]:
def standardize_units(df):
    """Apply the unit-conversion plan to the columns present in ``df``.

    Each (column, source unit, target unit) entry is passed to
    ``convert_units`` when the column exists. As configured, every entry maps
    a unit to itself.
    """
    # Template plan - adapt to the data at hand: (column, from_unit, to_unit)
    conversion_plan = (
        ('distance', 'Mpc', 'Mpc'),
        ('redshift', '', ''),  # Dimensionless
        ('mass', 'solMass', 'solMass'),
        ('radius', 'solRad', 'solRad'),
        ('temperature', 'K', 'K'),
    )

    for column, source_unit, target_unit in conversion_plan:
        if column not in df.columns:
            continue
        df = convert_units(df, column, source_unit, target_unit)

    return df

In [30]:
def remove_duplicates(df, subset=None):
    """Return ``df`` without duplicate rows, reporting how many were dropped.

    ``subset`` optionally restricts the comparison to the given columns;
    with ``None`` whole rows are compared.
    """
    deduplicated = df.drop_duplicates(subset=subset)
    dropped_count = len(df) - len(deduplicated)
    if dropped_count > 0:
        print(f"Removed {dropped_count} duplicate rows")
    return deduplicated

In [31]:
def clean_data(data_dict):
    """Clean all datasets in the dictionary and save each one as CSV.

    Every frame is copied and then passed through column-name cleaning,
    missing-value handling, unit standardization and duplicate removal.
    The result is written to ``PROCESSED_DATA_DIR / "<name>_cleaned.csv"``.

    Parameters
    ----------
    data_dict : dict[str, pandas.DataFrame]
        Raw frames keyed by dataset name.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Cleaned frames under the same keys.
    """
    cleaned_data = {}

    for name, df in tqdm(data_dict.items(), desc="Cleaning datasets"):
        # Real newline so each dataset's log starts after a blank line.
        print(f"\nProcessing {name}...")

        # Make a copy to avoid modifying original
        df_clean = df.copy()

        # Apply cleaning steps
        df_clean = clean_column_names(df_clean)
        df_clean = handle_missing_values(df_clean)
        df_clean = standardize_units(df_clean)

        # Remove duplicates (use specific columns if known)
        # NOTE(review): missing ids/names have already been imputed by
        # handle_missing_values at this point, so rows that lacked an id/name
        # share the fill value and are dropped here - confirm this is intended.
        subset = None
        if 'id' in df_clean.columns:
            subset = ['id']
        elif 'name' in df_clean.columns:
            subset = ['name']
        df_clean = remove_duplicates(df_clean, subset=subset)

        # Store cleaned data
        cleaned_data[name] = df_clean

        # Save cleaned data
        output_path = PROCESSED_DATA_DIR / f"{name}_cleaned.csv"
        df_clean.to_csv(output_path, index=False)
        print(f"✓ Saved cleaned data to {output_path}")

    return cleaned_data

In [32]:
def main():
    """Run the data cleaning pipeline end to end.

    Loads the raw files, cleans every dataset (which also writes the cleaned
    CSVs), and prints a per-dataset summary of rows, columns and remaining
    missing values. Returns early when no raw data could be loaded.
    """
    print("Starting data cleaning pipeline...")

    # Load raw data
    raw_data = load_raw_data()

    if not raw_data:
        print("No data files found. Exiting.")
        return

    # Clean all datasets
    cleaned_data = clean_data(raw_data)

    # Generate summary report ("\n" is a real newline, giving a blank line before each section)
    print("\n=== Data Cleaning Summary ===")
    for name, df in cleaned_data.items():
        print(f"\n{name}:")
        print(f"  Rows: {len(df):,}")
        print(f"  Columns: {len(df.columns):,}")
        print(f"  Missing values: {df.isnull().sum().sum():,}")

    print("\nData cleaning complete! Cleaned data saved to:", PROCESSED_DATA_DIR)


# Run the pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Starting data cleaning pipeline...
Loading raw data files...
✓ Loaded act_cmb.csv with shape (4, 5)
✓ Loaded combined_cosmology.csv with shape (109309, 29)
✓ Loaded cosmological_parameters.csv with shape (8, 6)
✓ Loaded des_cosmology.csv with shape (3, 5)
✓ Loaded galaxy_distance_evolution.csv with shape (71900, 5)
✓ Loaded monte_carlo_times.csv with shape (10000, 7)
✓ Loaded nasa_exoplanets.csv with shape (5989, 13)
✓ Loaded planck_cosmology.csv with shape (6, 5)
✓ Loaded planck_cosmology_2018.csv with shape (6, 3)
✓ Loaded scale_factor_evolution.csv with shape (1604, 4)
✓ Loaded sh0es_hubble.csv with shape (1, 5)
✓ Loaded vacuum_decay_bubbles.csv with shape (25285, 7)
✓ Loaded 2dfgrs_galaxies.fits with shape (10000, 11)
Error loading sdss_dr16q_quasars.fits: Cannot convert a table with multidimensional columns to a pandas DataFrame. Offending columns are: ['Z_DLA', 'NHI_DLA', 'CONF_DLA', 'PLATE_DUPLICATE', 'MJD_DUPLICATE', 'FIBERID_DUPLICATE', 'SPECTRO_DUPLICATE', 'PSFFLUX', 'PSFFLUX

Cleaning datasets:   0%|          | 0/13 [00:00<?, ?it/s]

\nProcessing act_cmb...
✓ Saved cleaned data to d:\Data Science\Cosmic_Fate_Simulator\data\processed\act_cmb_cleaned.csv
\nProcessing combined_cosmology...
Dropping columns with >70.0% missing values: ['parameter', 'value', 'error', 'unit', 'source', 'hubble_constant_h0', 'omega_m', 'omega_lambda', 'w', 'omega_k', 'h0', 'time_to_end_gyr', 'parameter', 'value', 'error', 'scale_factor_a', 'hubble_parameter_h', 'bubble_id', 'bubble_center_x', 'bubble_center_y', 'bubble_radius', 'universe_fraction_remaining']
Filled missing values in galaxy_id with median: 511.0000
Filled missing values in initial_distance_mpc with median: 2613.0734
Filled missing values in time_gyr with median: 42.5000
Filled missing values in distance_mpc with median: 23757.2637
Filled missing values in simulation_id with median: 71.0000
Filled missing values in scenario with mode: Big Freeze
Removed 17574 duplicate rows
✓ Saved cleaned data to d:\Data Science\Cosmic_Fate_Simulator\data\processed\combined_cosmology_clean