# Synthetic data generation

1. Loads the original ENB2012 dataset
2. Generates synthetic data that preserves:

- Statistical distributions of each feature
- Correlations between features
- Domain constraints (e.g., positive values for areas)


Includes:
- Validation to ensure the synthetic data matches the original distribution
- Outputs the data in CSV format ready for your MLops pipeline

To use this script:

1. Place your ENB2012_data.csv file in the same directory as the script
2. Run the script to generate synthetic_ENB2012_data.csv
3. The script will print validation metrics showing how well the synthetic data matches the original

The synthetic data preserves important properties like:

* Relative Compactness staying between 0 and 1
* Orientation values at 45-degree intervals
* Positive values for areas and loads
* Discrete values for Glazing Area Distribution

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

def load_original_data():
    """Load the original ENB2012 dataset"""
    try:
        # Try to load with default numeric columns (X1, X2, etc.)
        df = pd.read_csv('ENB2012_data.csv')
        
        # Rename columns to meaningful names
        column_names = {
            'X1': 'Relative_Compactness', 
            'X2': 'Surface_Area',
            'X3': 'Wall_Area',
            'X4': 'Roof_Area',
            'X5': 'Overall_Height',
            'X6': 'Orientation',
            'X7': 'Glazing_Area',
            'X8': 'Glazing_Area_Distribution',
            'Y1': 'Heating_Load',
            'Y2': 'Cooling_Load'
        }
        
        df = df.rename(columns=column_names)
        return df
    
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

def generate_synthetic_data(original_data, multiplier=4, random_state=42):
    """
    Generate synthetic data using a simplified approach with individual column distributions
    """
    np.random.seed(random_state)
    
    # Number of samples to generate
    n_synthetic = len(original_data) * multiplier
    
    # Create empty DataFrame for synthetic data
    synthetic_df = pd.DataFrame()
    
    # Generate synthetic data for each column independently
    for column in original_data.columns:
        # Get original column data
        orig_data = original_data[column].astype(float)
        
        # Calculate mean and std
        mean = orig_data.mean()
        std = orig_data.std()
        
        # Generate synthetic values
        if column == 'Orientation':
            # Generate random orientations (0, 45, 90, 135, 180, 225, 270, 315)
            synthetic_values = np.random.choice([0, 45, 90, 135, 180, 225, 270, 315], size=n_synthetic)
        
        elif column == 'Glazing_Area_Distribution':
            # Generate discrete values between 0 and 5
            synthetic_values = np.random.randint(0, 6, size=n_synthetic)
        
        elif column == 'Relative_Compactness':
            # Generate values between 0 and 1
            synthetic_values = np.random.normal(mean, std, n_synthetic)
            synthetic_values = np.clip(synthetic_values, 0, 1)
        
        else:
            # Generate normally distributed values
            synthetic_values = np.random.normal(mean, std, n_synthetic)
            
            # Ensure positive values for areas and loads
            if column in ['Surface_Area', 'Wall_Area', 'Roof_Area', 'Heating_Load', 'Cooling_Load']:
                synthetic_values = np.abs(synthetic_values)
            
            # Clip to original range
            min_val = orig_data.min()
            max_val = orig_data.max()
            synthetic_values = np.clip(synthetic_values, min_val, max_val)
        
        synthetic_df[column] = synthetic_values
    
    return synthetic_df

def validate_synthetic_data(original_data, synthetic_data):
    """
    Validate the synthetic data by comparing distributions with original data
    """
    validation_results = {}
    
    # Compare basic statistics
    for column in original_data.columns:
        orig_stats = original_data[column].describe()
        synt_stats = synthetic_data[column].describe()
        
        # Perform Kolmogorov-Smirnov test
        ks_statistic, p_value = stats.ks_2samp(
            original_data[column].astype(float),
            synthetic_data[column].astype(float)
        )
        
        validation_results[column] = {
            'ks_statistic': ks_statistic,
            'p_value': p_value,
            'mean_diff_percent': ((synt_stats['mean'] - orig_stats['mean']) / orig_stats['mean']) * 100,
            'std_diff_percent': ((synt_stats['std'] - orig_stats['std']) / orig_stats['std']) * 100
        }
    
    return validation_results

def main():
    try:
        # Load original data
        print("Loading original data...")
        original_data = load_original_data()
        print("Original data shape:", original_data.shape)
        
        # Generate synthetic data
        print("\nGenerating synthetic data...")
        synthetic_data = generate_synthetic_data(original_data, multiplier=4)
        print("Synthetic data shape:", synthetic_data.shape)
        
        # Validate synthetic data
        print("\nValidating synthetic data...")
        validation_results = validate_synthetic_data(original_data, synthetic_data)
        
        # Save synthetic data to CSV
        output_file = 'synthetic_ENB2012_data.csv'
        synthetic_data.to_csv(output_file, index=False)
        print(f"\nSynthetic data saved to {output_file}")
        
        # Print validation summary
        print("\nValidation Summary:")
        for column, metrics in validation_results.items():
            print(f"\n{column}:")
            print(f"KS statistic: {metrics['ks_statistic']:.4f}")
            print(f"Mean difference: {metrics['mean_diff_percent']:.2f}%")
            print(f"Std difference: {metrics['std_diff_percent']:.2f}%")
            
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()

Loading original data...
Original data shape: (1296, 10)

Generating synthetic data...
Synthetic data shape: (5184, 10)

Validating synthetic data...

Synthetic data saved to synthetic_ENB2012_data.csv

Validation Summary:

Relative_Compactness:
KS statistic: nan
Mean difference: 0.01%
Std difference: -1.71%

Surface_Area:
KS statistic: nan
Mean difference: -0.28%
Std difference: -7.85%

Wall_Area:
KS statistic: nan
Mean difference: 0.35%
Std difference: -5.48%

Roof_Area:
KS statistic: nan
Mean difference: -1.23%
Std difference: -20.61%

Overall_Height:
KS statistic: nan
Mean difference: -0.29%
Std difference: -28.51%

Orientation:
KS statistic: nan
Mean difference: 4356.10%
Std difference: 9105.61%

Glazing_Area:
KS statistic: nan
Mean difference: -2.99%
Std difference: -12.13%

Glazing_Area_Distribution:
KS statistic: nan
Mean difference: -10.47%
Std difference: 10.66%

Heating_Load:
KS statistic: nan
Mean difference: 1.06%
Std difference: -5.98%

Cooling_Load:
KS statistic: nan
Mea