In [2]:
import numpy as np
from sklearn.datasets import make_regression
import pandas as pd
from sklearn.model_selection import KFold

# Build a synthetic regression problem with scikit-learn's make_regression.
N_SAMPLES = 100_000   # number of samples (rows)
N_FEATURES = 100      # number of features (columns)
N_INFORMATIVE = 10    # features related to the target; the rest are random noise
X, y = make_regression(
    n_samples=N_SAMPLES, n_features=N_FEATURES, n_informative=N_INFORMATIVE,
    random_state=42,  # fixed seed so reruns produce the same data
)

# Wrap the arrays in a labelled DataFrame: columns feature_1..feature_<N_FEATURES>
# followed by a final 'target' column.
feature_names = [f'feature_{idx + 1}' for idx in range(N_FEATURES)]
df = pd.DataFrame(X, columns=feature_names).assign(target=y)

# Assign every row to one of N_SPLITS cross-validation folds.
# KFold.split yields *positional* row indices, so the fold ids are written into
# a plain NumPy array by position and attached to the frame in one step. This
# avoids label-based `.loc` assignment, which is only correct while df keeps
# its default 0..n-1 index.
N_SPLITS = 4
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_ids = np.full(len(df), -1, dtype=int)  # -1 marks "not yet assigned"

# A row's fold id is the split in which it appears in the held-out (test) part.
for fold_id, (_, test_idx) in enumerate(kf.split(df)):
    fold_ids[test_idx] = fold_id

# Guard against any row being left with the -1 sentinel before storing the column.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
df['fold_id'] = fold_ids

# Persist the full frame as CSV; index=False keeps the row index out of the file.
output_csv = '/home/jovyan/data/synthetic_regression_data.csv'
df.to_csv(output_csv, index=False)

# Quick text report: overall shape, first rows and per-column summary statistics.
n_rows_cols = df.shape
print(f"Synthetic dataset created with shape: {n_rows_cols}")
print("Sample data:", df.head(), sep="\n")
print("Data summary:", df.describe(), sep="\n")

# Sanity check: number of rows per fold, ordered by fold id.
fold_counts = df['fold_id'].value_counts().sort_index()
print("\nFold distribution:", fold_counts, sep="\n")

Synthetic dataset created with shape: (100000, 102)
Sample data:
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -1.064999   1.062407  -0.441405  -0.046488   1.336421  -0.770404   
1   0.431288   0.329626   0.264387  -1.059184   0.232240   0.302600   
2   0.583541  -0.666314   0.442413  -0.540896  -0.943721   0.611448   
3  -0.649130  -0.138196  -0.175762   0.418758  -0.586164   0.783710   
4   0.629903  -0.855884  -0.156003  -0.903030   0.311933  -1.261696   

   feature_7  feature_8  feature_9  feature_10  ...  feature_93  feature_94  \
0   0.921947  -2.098437  -0.950326   -0.821592  ...    0.480449   -0.288309   
1  -2.478434   2.435647   0.065110   -1.080004  ...   -0.558633    0.020064   
2  -0.936861   0.854465  -0.781131    0.394717  ...    0.472318   -1.322155   
3  -0.462456   0.305236   0.666846    0.826561  ...   -1.289410    0.074772   
4   0.117821   0.522196   0.588120    0.598700  ...   -0.100881    0.798658   

   feature_95  feature_96  featur