In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold

# Shape of the synthetic regression problem.
N_SAMPLES = 100_000   # number of samples (rows)
N_FEATURES = 100      # number of features (columns)
N_INFORMATIVE = 20    # features related to the target; the rest are random noise

# A fixed random_state keeps the generated data reproducible across runs.
X, y = make_regression(
    n_samples=N_SAMPLES, n_features=N_FEATURES,
    n_informative=N_INFORMATIVE, random_state=42,
)

# Wrap the raw arrays in a DataFrame with readable column names
# (feature_1 .. feature_N, followed by the target column).
feature_names = [f'feature_{i}' for i in range(1, N_FEATURES + 1)]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Assign every row to one of 4 shuffled cross-validation folds (ids 0..3).
# KFold.split yields *positional* indices, so the ids are written into a
# NumPy array by position instead of going through label-based df.loc,
# which is only equivalent while df keeps its default RangeIndex.
kf = KFold(n_splits=4, shuffle=True, random_state=42)
fold_ids = np.full(len(df), -1, dtype=int)  # -1 marks "not yet assigned"
for fold_id, (_, test_idx) in enumerate(kf.split(df)):
    fold_ids[test_idx] = fold_id

# Each row appears in exactly one test split, so no -1 placeholder may remain.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
df['fold_id'] = fold_ids

# Persist the dataset as both CSV and Parquet. The output directory is
# created first so a fresh environment does not fail on a missing path.
DATA_DIR = Path('/home/jovyan/data')
DATA_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(DATA_DIR / 'synthetic_regression_data.csv', index=False)
df.to_parquet(DATA_DIR / 'synthetic_regression_data.parquet', index=False)

# Quick look at what was written: shape, first rows, per-column statistics.
print(f"Synthetic dataset created with shape: {df.shape}")
print("Sample data:")
print(df.head())
print("Data summary:")
print(df.describe())

# Verify fold distribution: row count per fold id, in fold order.
print("\nFold distribution:")
print(df['fold_id'].value_counts().sort_index())

Synthetic dataset created with shape: (100000, 102)
Sample data:
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0   0.240298   0.075660   0.238052  -1.370873  -0.557766  -0.115439   
1  -1.014933  -0.231514  -1.261011  -1.005024   0.290059   1.195345   
2   1.064136   0.759604  -0.783956   0.751061   2.389619   1.098953   
3  -0.120766   0.844780  -1.636711   2.042730  -1.395969  -0.591630   
4   0.310657  -0.018187   1.028638   1.399354  -0.649027   0.561220   

   feature_7  feature_8  feature_9  feature_10  ...  feature_93  feature_94  \
0  -0.365138  -0.569114  -1.091168    0.653844  ...   -1.269756   -0.444028   
1  -1.560512  -1.070320   0.182198    0.192797  ...    0.261083   -0.244250   
2   0.632135  -0.565392  -0.668911    0.264833  ...   -0.029807   -0.060948   
3   1.471512   0.810839   1.945174   -1.703491  ...    0.823891    0.899313   
4  -0.417665   0.809148  -0.034755    0.973707  ...    1.155719   -2.096772   

   feature_95  feature_96  featur