# Data Generation Component
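This notebook generates a synthetic binary-classification dataset, splits it into train, validation, and test sets, and saves each split as a CSV file for training and evaluating the XGBoost model.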

In [None]:
# Elyra Pipeline Parameters
# Directory the train/val/test CSV files are written to; it is created if missing.
# A relative path like this one resolves against the kernel's working directory.
output_dir = '../data'

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import os

In [None]:
# Seed NumPy's global RNG for any code that relies on it.
# make_classification is seeded separately through random_state below.
np.random.seed(42)

# Single source of truth for the feature count, so the generator call and the
# column names cannot drift apart.
n_features = 20

# Generate a synthetic binary-classification problem:
#   - 15 informative features plus 5 redundant ones (linear combinations of
#     the informative features), accounting for all 20 columns
#   - class_sep=0.8 keeps the classes moderately overlapping
#   - flip_y=0.1 assigns the label of roughly 10% of samples at random (label noise)
X, y = make_classification(
    n_samples=10000,
    n_features=n_features,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
    class_sep=0.8,
    flip_y=0.1
)

# Name the columns feature_0 .. feature_{k-1}, with k taken from the generated
# matrix itself rather than from a repeated literal.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)

# Labels go in a 'target' column alongside the features.
df['target'] = y

In [None]:
def _stratified_holdout(frame, holdout_fraction):
    """Split `frame` into (kept, held_out), stratified on its 'target' column.

    `holdout_fraction` is the share of rows placed in the held-out part.
    The split is seeded with random_state=42, so it is repeatable.
    """
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=42,
        stratify=frame['target'],
    )


# Stage 1: hold out 20% of all rows as the test set.
train_val, test = _stratified_holdout(df, 0.2)

# Stage 2: 25% of the remaining 80% becomes validation (20% of the full data),
# which leaves 60% of the full data for training.
train, val = _stratified_holdout(train_val, 0.25)

In [None]:
# Ensure the destination directory exists (no error if it is already there).
os.makedirs(output_dir, exist_ok=True)

# Build each output path once. The three variables stay at notebook scope;
# the intent appears to be handing them to later pipeline components.
train_path = os.path.join(output_dir, 'train.csv')
val_path = os.path.join(output_dir, 'val.csv')
test_path = os.path.join(output_dir, 'test.csv')

# Write every split as CSV without the pandas index column.
for split_frame, split_path in (
    (train, train_path),
    (val, val_path),
    (test, test_path),
):
    split_frame.to_csv(split_path, index=False)

# Report where each split was written.
print(f'Saved train data to: {train_path}')
print(f'Saved validation data to: {val_path}')
print(f'Saved test data to: {test_path}')