In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


In [6]:

# --- Configuration -----------------------------------------------------------
# Every tunable value lives here so the seed and the feature count are each
# written exactly once.
SEED = 42
N_SAMPLES = 10000
N_FEATURES = 10
DATA_DIR = Path('data')

# Seed NumPy's legacy global RNG; the scikit-learn calls below additionally
# receive the same seed explicitly through `random_state`.
np.random.seed(SEED)

# Generate a synthetic binary classification dataset (feature matrix X, labels y)
X, y = make_classification(
    n_samples=N_SAMPLES,    # Number of samples
    n_features=N_FEATURES,  # Number of features
    n_classes=2,            # Number of classes
    random_state=SEED       # For reproducibility
)

# Convert to a pandas DataFrame. Column names are derived from the actual width
# of X so they cannot drift out of sync with `n_features`.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Split the data into training and test sets (80-20 split)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED
)

# Sanity check: the split must partition the full frame without losing rows
assert len(train_df) + len(test_df) == len(df)

# Display shapes of training and test sets
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Save train and test sets to CSV files.
# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output directory exists first (a no-op when it is already there).
DATA_DIR.mkdir(parents=True, exist_ok=True)
train_df.to_csv(DATA_DIR / 'train.csv', index=False)
test_df.to_csv(DATA_DIR / 'test.csv', index=False)
print("\nData saved to 'data/train.csv' and 'data/test.csv'")

# Display first few rows of training data
print("\nFirst few rows of training data:")
print(train_df.head())

# Basic statistics of training data
print("\nBasic statistics of training data:")
print(train_df.describe())

Training set shape: (8000, 11)
Test set shape: (2000, 11)

Data saved to 'data/train.csv' and 'data/test.csv'

First few rows of training data:
      feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
9254  -1.680163   2.271574   0.971832   0.405132   0.947550   0.224179   
1561  -0.307912   0.446997   0.148353   0.309650   0.391192   0.169165   
1670   0.918926   1.060702  -0.634354   0.592181   0.233767  -1.641776   
6087  -1.703864  -0.184569   0.845681   1.517603   1.983689   0.541266   
6669   1.213408  -1.265586  -0.625509  -0.896729  -1.242617   0.520520   

      feature_6  feature_7  feature_8  feature_9  target  
9254  -1.382286   1.042140  -0.451837  -1.648250       0  
1561  -0.205335  -0.417412  -0.279003   0.173906       1  
1670   1.055336  -0.557833   0.095316   0.156991       1  
6087  -0.249327  -1.690822   0.578749   0.193558       1  
6669   0.444067   1.063889   0.270601   0.245879       0  

Basic statistics of training data:
         feature_0   