In [2]:
import numpy as np
from sklearn.datasets import make_regression
import pandas as pd
from sklearn.model_selection import KFold

# Generate a synthetic regression dataset, tag each row with a
# cross-validation fold, and persist the result as CSV.
#
# Tunable constants:
# N_SAMPLES     - number of samples (rows)
# N_FEATURES    - number of features (columns)
# N_INFORMATIVE - number of features related to the target
#                 (the remaining features are random noise)
# N_SPLITS      - number of cross-validation folds written to `fold_id`
# SEED          - random seed shared by data generation and fold shuffling
N_SAMPLES = 10_000
N_FEATURES = 20
N_INFORMATIVE = 10
N_SPLITS = 4
SEED = 42
# NOTE(review): absolute, environment-specific path; the parent directory
# must already exist or `to_csv` will raise.
OUTPUT_PATH = '/home/jovyan/data/synthetic_regression_data.csv'

X, y = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=N_INFORMATIVE,
    random_state=SEED,  # for reproducibility
)

# Convert to DataFrame for better readability. Column names are derived from
# N_FEATURES so that changing the constant cannot desynchronise the names
# from the width of X.
feature_names = [f'feature_{i}' for i in range(1, N_FEATURES + 1)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Assign fold IDs (0 .. N_SPLITS - 1) to each row.
# KFold.split yields *positional* indices, so the IDs are collected in a
# positional numpy array rather than written through label-based `.loc`.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_ids = np.full(len(df), -1, dtype=int)  # -1 marks "not yet assigned"
for fold_id, (_, test_idx) in enumerate(kf.split(df)):
    fold_ids[test_idx] = fold_id

# Each row should appear in exactly one test fold; a leftover -1 means the
# split did not cover the whole frame.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
df['fold_id'] = fold_ids

# Save to CSV
df.to_csv(OUTPUT_PATH, index=False)

print(f"Synthetic dataset created with shape: {df.shape}")
print("Sample data:")
print(df.head())
print("Data summary:")
print(df.describe())

# Verify fold distribution
print("\nFold distribution:")
print(df['fold_id'].value_counts().sort_index())

Synthetic dataset created with shape: (10000, 22)
Sample data:
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -0.186761   0.478045  -1.154557   0.212808   0.268562   1.541141   
1  -0.635283   0.251530  -0.548343  -0.248227  -1.498527  -0.798143   
2   1.491810   1.130562   0.857294  -0.094590  -0.629634   1.161238   
3   0.034152   0.666313  -1.148794  -1.085825   0.679373  -0.081523   
4  -0.939496   1.234623  -0.747609  -0.331974   0.849726   0.653575   

   feature_7  feature_8  feature_9  feature_10  ...  feature_13  feature_14  \
0   2.217046   0.447667   0.040777   -1.702876  ...    0.307114    0.117864   
1   0.265727   0.631096  -0.732237   -1.339556  ...    0.991846   -0.769236   
2   0.376251  -0.286641  -0.587925    1.378734  ...    0.800813    1.370580   
3  -1.778588   1.235782   1.192508    0.462591  ...    0.436739   -1.087246   
4   0.200738  -0.494521  -2.249391   -1.093166  ...    0.251269   -0.634671   

   feature_15  feature_16  feature_