# Initial data prep

This notebook takes the Iris data (features and labels) and splits it into two subsets:

- Train (used for training)
- Test (used for final model evaluation)

> Two separate Assets will likely be created on Highwind, one for each subset of the data

In [1]:
# Config: values a reader may want to tune before re-running the notebook
SAVE_DIR = "../data/"  # directory the train/test CSV files are written to
TEST_FRACTION = 0.2    # share of rows passed as test_size to the split
RANDOM_SEED = 42       # seed passed as random_state to the split

In [2]:
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Load the Iris dataset, then pull out the feature matrix (X) and the labels (y)
iris = load_iris()
X = iris.data
y = iris.target

In [4]:
# Split features and labels together so rows stay aligned.
# TEST_FRACTION sets the size of the held-out subset; RANDOM_SEED is passed as random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_SEED, test_size=TEST_FRACTION
)

In [5]:
# Wrap each subset in a DataFrame: feature columns are named after
# iris.feature_names and the labels are appended as a trailing 'target' column.

# Train subset
df_train = pd.DataFrame(X_train, columns=iris.feature_names).assign(target=y_train)

# Test subset
df_test = pd.DataFrame(X_test, columns=iris.feature_names).assign(target=y_test)

In [6]:
# Report (rows, columns) of both subsets, one per line
print(
    f"df_train: {df_train.shape}",
    f"df_test: {df_test.shape}",
    sep="\n",
)

df_train: (120, 5)
df_test: (30, 5)


In [7]:
# Preview the first three training rows (features plus the 'target' column)
df_train.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,4.6,3.6,1.0,0.2,0
1,5.7,4.4,1.5,0.4,0
2,6.7,3.1,4.4,1.4,1


In [8]:
# Save subsets separately, one CSV per subset.
# Create the output directory first: to_csv does not create missing parent
# folders, so a fresh checkout without SAVE_DIR would otherwise fail here.
os.makedirs(SAVE_DIR, exist_ok=True)
# index=False keeps the DataFrame row index out of the files.
df_train.to_csv(os.path.join(SAVE_DIR, "train.csv"), index=False)
df_test.to_csv(os.path.join(SAVE_DIR, "test.csv"), index=False)