# Initial data prep

This notebook loads the raw California housing data (features and labels) and splits it into two subsets, each saved as its own CSV file:

- Train (used for training)
- Test (used for final model evaluation)

> Two separate Assets will likely be created on Highwind, one for each subset of the data

In [1]:
# Config: every tunable value used by the cells below
TARGET_COLUMN = "MedHouseVal"  # label column that is separated from the features
SAVE_DIR = "../data/"  # directory the train/test CSV files are written to
TEST_FRACTION = 0.2  # share of rows held out for the test subset
RANDOM_SEED = 42  # passed as random_state to the train-test split

In [2]:
import os
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Fetch the California housing dataset with pandas containers (as_frame=True)
# and keep its combined frame, which holds the features and the label column
california_housing = fetch_california_housing(as_frame=True)
df = california_housing["frame"]
df.head(n=3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521


In [4]:
# Count missing cells across the whole frame (per-column counts, then summed)
total_nulls = df.isna().sum().sum()
print(f"Total null values: {total_nulls}")

Total null values: 0


In [5]:
# Separate the label column from the features without touching the raw frame:
# X is a new frame with every column except the target, y is a copy of the target column
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN].copy()

In [6]:
# Report the dimensions of the feature frame and the label series, one per line
print(f"X: {X.shape}", f"y: {y.shape}", sep="\n")

X: (20640, 8)
y: (20640,)


In [7]:
# Train-test split: hold out TEST_FRACTION of the rows for the test subset,
# with the random state pinned to RANDOM_SEED so the split is repeatable
split_options = {"test_size": TEST_FRACTION, "random_state": RANDOM_SEED}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Re-attach the label column to each feature subset so every subset is a
# single frame; assign() returns a new frame and leaves X_train / X_test as they were
df_train = X_train.assign(**{TARGET_COLUMN: y_train})  # train subset
df_test = X_test.assign(**{TARGET_COLUMN: y_test})  # test subset

In [9]:
# Report the dimensions of the recombined train and test frames, one per line
print(f"df_train: {df_train.shape}", f"df_test: {df_test.shape}", sep="\n")

df_train: (16512, 9)
df_test: (4128, 9)


In [10]:
# Preview the first three rows of the training subset
df_train.head(n=3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
14196,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03,1.03
8267,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16,3.821
17445,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48,1.726


In [11]:
# Save subsets separately
# Create the output directory first: to_csv does not create missing parent
# directories, so this cell would otherwise fail when SAVE_DIR does not exist yet.
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(SAVE_DIR, exist_ok=True)

# index=False leaves the row index out of the files; only the feature and
# label columns are written.
df_train.to_csv(os.path.join(SAVE_DIR, "train.csv"), index=False)
df_test.to_csv(os.path.join(SAVE_DIR, "test.csv"), index=False)