In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


In [3]:

# Build a synthetic regression dataset, split it 80/20 into train and test
# sets, write both to CSV under OUTPUT_DIR, and print a short summary.

# Tunable settings, kept in one place so the feature count, the column names
# and the seeds cannot drift apart.
SEED = 42
N_SAMPLES = 10000
N_FEATURES = 10
NOISE = 0.1
TEST_SIZE = 0.2
OUTPUT_DIR = Path('data')

# Seed NumPy's global RNG for reproducibility. The scikit-learn calls below
# are also given an explicit random_state, so they do not rely on this.
np.random.seed(SEED)

# Generate synthetic regression data
X, y = make_regression(
    n_samples=N_SAMPLES,    # Number of samples
    n_features=N_FEATURES,  # Number of features
    noise=NOISE,            # Add some noise to make it more realistic
    random_state=SEED       # For reproducibility
)

# Convert to pandas DataFrame with meaningful column names. The names are
# derived from the actual width of X rather than a repeated literal.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Split the data into training and test sets (80-20 split)
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=SEED
)

# Sanity check: the split must not lose or duplicate rows.
assert len(train_df) + len(test_df) == len(df)

# Display shapes of training and test sets
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Save train and test sets to CSV files. to_csv does not create missing
# directories, so make sure the output directory exists first; otherwise
# this cell fails on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
train_path = OUTPUT_DIR / 'train.csv'
test_path = OUTPUT_DIR / 'test.csv'
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)
# as_posix() keeps the message identical on every platform.
print(f"\nData saved to '{train_path.as_posix()}' and '{test_path.as_posix()}'")

# Display first few rows of training data
print("\nFirst few rows of training data:")
print(train_df.head())

# Basic statistics of training data
print("\nBasic statistics of training data:")
print(train_df.describe())

Training set shape: (8000, 11)
Test set shape: (2000, 11)

Data saved to 'data/train.csv' and 'data/test.csv'

First few rows of training data:
      feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
9254   0.121086  -0.642399  -0.719407  -0.417852   1.631194  -0.945122   
1561  -1.097273  -2.381292  -0.520215   1.098382   0.182849  -0.943381   
1670  -0.370286  -0.953685  -1.104476  -0.916342   0.786311  -0.775753   
6087   0.372575  -2.286670  -0.454392   1.223723  -0.460174  -1.168506   
6669  -0.040260  -0.515006  -0.555127   0.551852   1.307583  -0.459330   

      feature_6  feature_7  feature_8  feature_9      target  
9254   0.529054   0.411321   0.699966  -1.045702 -102.487202  
1561   1.731197   0.921826  -0.576012  -0.309349  -69.353345  
1670  -0.096478   0.685991  -0.608703  -0.148002 -196.736497  
6087   0.243462  -0.370326  -1.152226   1.452286    8.755086  
6669  -1.480541  -0.824441   0.292184  -1.710496 -287.928595  

Basic statistics of training dat