In [6]:
import numpy as np
import matplotlib.pyplot as plt

In [7]:
from DatasetBuilder import DatasetBuilder

In [8]:
import numpy as np

# How many synthetic rows to generate
n_samples = 100

# Create the builder with a fixed random_seed
builder = DatasetBuilder(n_samples=n_samples, random_seed=42)

# Columns of the synthetic Subaru table:
#   model, engine_displacement, horsepower, torque, fuel_efficiency, weight, price

# Category labels sampled for the 'model' column
subaru_models = ['Forester', 'Outback', 'Crosstrek', 'Impreza', 'WRX', 'Legacy']

# Each entry is (feature name, sampler taking the row count, keyword options for add_feature).
# The entries are listed in the exact order in which they are registered on the builder.
feature_specs = [
    # Primary feature: engine displacement, sampled uniformly between 1.6 and 3.6 (liters)
    ('engine_displacement',
     lambda n: np.random.uniform(1.6, 3.6, size=n),
     dict(coefficient=0.4, noise_level=0.1)),

    # Features tied to 'engine_displacement' through depends_on / correlation_factor
    ('horsepower',
     lambda n: np.random.randint(150, 400, size=n),
     dict(coefficient=0.3, depends_on='engine_displacement', correlation_factor=0.8)),
    ('torque',
     lambda n: np.random.randint(150, 350, size=n),
     dict(coefficient=0.2, depends_on='engine_displacement', correlation_factor=0.75)),
    # Negative correlation_factor: intended as an inverse relationship with displacement
    ('fuel_efficiency',
     lambda n: np.random.uniform(15, 35, size=n),
     dict(coefficient=-0.2, depends_on='engine_displacement', correlation_factor=-0.6)),

    # Features registered without depends_on
    ('model',
     lambda n: np.random.choice(subaru_models, size=n),
     dict(coefficient=5, noise_level=2)),
    ('weight',
     lambda n: np.random.randint(3000, 4500, size=n),
     dict(coefficient=0.15)),

    # 'price' is tied to 'horsepower' only (no dependency on 'model' is passed).
    # NOTE(review): 'price' is registered as a feature here and is also the target_name
    # given to build_target below -- confirm how DatasetBuilder handles the shared name.
    ('price',
     lambda n: np.random.randint(20000, 45000, size=n),
     dict(coefficient=0.25, depends_on='horsepower', correlation_factor=0.7)),
]

for feature_name, sampler, options in feature_specs:
    builder.add_feature(feature_name, sampler, **options)

# Build the 'price' target with the given bias, noise and target range
subaru_data = builder.build_target(
    target_name='price',
    bias=25000,
    noise_std=2000,
    target_range=(18000, 50000),
)
# Subaru dataset is now available as `subaru_data`

In [9]:
# Preview the top five rows of the generated dataset
subaru_data.head(n=5)

Unnamed: 0,engine_displacement,horsepower,torque,fuel_efficiency,model,weight,price
0,2.34908,44.279264,75.76181,38.395151,2.0,3825.0,28675.7316
1,3.501429,51.801143,71.376071,27.62147,5.0,4098.0,27250.658263
2,3.063988,78.45119,72.047991,33.888608,1.0,3412.0,29005.166744
3,2.797317,80.237854,52.597988,46.135067,4.0,4399.0,24715.721511
4,1.912037,41.72963,81.684028,45.922855,4.0,4443.0,29182.639156


In [10]:
# Write the generated dataset to a CSV file (row index omitted)
csv_file_path = 'subaru_stats.csv'
subaru_data.to_csv(path_or_buf=csv_file_path, index=False)

# Show the output location as the cell result
csv_file_path

'subaru_stats.csv'