In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold

# Build a synthetic regression dataset, tag every row with a cross-validation
# fold id, persist it as Parquet, and print a short summary.

# --- Configuration -----------------------------------------------------------
# N_SAMPLES     - number of samples (rows)
# N_FEATURES    - number of features (columns)
# N_INFORMATIVE - number of features related to the target
#                 (the rest are random noise)
# N_FOLDS       - number of cross-validation folds to assign
# SEED          - shared random seed, for reproducibility
# OUTPUT_PATH   - where the dataset is written
N_SAMPLES = 200_000
N_FEATURES = 100
N_INFORMATIVE = 20
N_FOLDS = 4
SEED = 42
OUTPUT_PATH = Path('/home/jovyan/data/synthetic_regression_data.parquet')

# --- Generate data -----------------------------------------------------------
X, y = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=N_INFORMATIVE,
    random_state=SEED,
)

# Wrap in a DataFrame with readable, 1-based column names.
feature_names = [f'feature_{i}' for i in range(1, N_FEATURES + 1)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# --- Assign folds ------------------------------------------------------------
# KFold.split yields *positional* indices, so the fold ids are filled into a
# plain NumPy array by position and attached to the frame once. This does not
# rely on the DataFrame index being the default 0..n-1 range.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
fold_ids = np.full(len(df), -1, dtype=int)  # -1 marks "not yet assigned"
for fold_id, (_, test_idx) in enumerate(kf.split(df)):
    fold_ids[test_idx] = fold_id

# Each row must land in exactly one test fold; a leftover -1 means a bug.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
df['fold_id'] = fold_ids

# --- Save to disk ------------------------------------------------------------
# Make sure the target directory exists so a fresh environment does not fail
# on the write.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT_PATH, index=False)

# --- Report ------------------------------------------------------------------
print(f"Synthetic dataset created with shape: {df.shape}")
print("Sample data:")
print(df.head())
print("Data summary:")
print(df.describe())

# Verify fold distribution
print("\nFold distribution:")
print(df['fold_id'].value_counts().sort_index())

Synthetic dataset created with shape: (200000, 102)
Sample data:
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -0.111563  -0.108128  -1.625328   0.348891  -0.635078  -1.474734   
1   0.265552  -0.510406  -0.322505  -0.829748  -0.002843   0.438090   
2   1.108337  -1.199863   0.951883  -0.506733   0.640460  -1.619802   
3  -1.413631  -1.531913  -0.352112   2.575996   0.795427  -0.799609   
4   0.602984  -0.605729  -1.355847  -0.682064  -0.570526   0.927842   

   feature_7  feature_8  feature_9  feature_10  ...  feature_93  feature_94  \
0  -0.718939   0.223580   0.641410   -0.326920  ...    1.663765    1.562248   
1  -0.573013   0.513119   1.251523    0.278698  ...    0.141794    0.495491   
2   0.620880   1.078592  -1.042822    0.344353  ...    0.680892    0.308564   
3  -0.195855   0.811692  -0.986519   -2.396855  ...    0.703334   -0.502523   
4  -0.239870   0.128459   0.377166   -1.082094  ...   -1.207290   -1.023860   

   feature_95  feature_96  featur