In [13]:
import argparse
import pathlib

import mlflow
import numpy as np
import pandas as pd
import yaml
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [14]:
# --- Input locations ---------------------------------------------------------
RAW_PATH = 'data/raw/abalone.csv'   # raw CSV read by load_data()
RAW_SCHEMA_PATH = 'schema.yml'      # YAML file loaded in main() and used as the read_csv dtype map

# --- Modelling setup ---------------------------------------------------------
TARGET_COL = 'rings'                # column split off from the features as the target

# --- Train / validation / test split -----------------------------------------
# The test fraction is not configured directly: split_data() derives it as
# 1 - TRAIN_FRAC - VAL_FRAC.
TRAIN_FRAC = 0.7
VAL_FRAC = 0.15
RANDOM_STATE = 42                   # seed passed to both train_test_split calls

In [15]:
def load_data(raw_path, raw_schema, target_col):
    """Read the raw CSV and separate the target column from the features.

    Args:
        raw_path: Location of the CSV; handed straight to ``pd.read_csv``.
        raw_schema: Handed to ``pd.read_csv`` as its ``dtype`` argument.
        target_col: Name of the column to split off as the target.

    Returns:
        A tuple ``(raw, X, y)``: the frame exactly as read, a new frame
        without ``target_col``, and the ``target_col`` column on its own.

    Raises:
        KeyError: If ``target_col`` is not a column of the CSV.
    """
    raw = pd.read_csv(raw_path, dtype=raw_schema)
    # Build X and y as objects independent of `raw`, so `raw` is returned
    # with all of its columns intact.
    y = raw[target_col].copy()
    X = raw.drop(columns=[target_col])
    return raw, X, y

In [16]:
def split_data(X, y, train_frac, val_frac, random_state):
    """Split features and target into train, validation and test partitions.

    The test fraction is whatever remains after the train and validation
    fractions: ``1 - train_frac - val_frac``. The split is done in two steps:
    first train vs. the rest, then the rest into validation vs. test.

    Args:
        X: Features, passed to ``train_test_split``.
        y: Target, passed to ``train_test_split``.
        train_frac: Fraction of the rows used for training.
        val_frac: Fraction of the rows used for validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``train_frac``, ``val_frac`` or the derived test
            fraction is not strictly positive.
    """
    test_frac = 1 - train_frac - val_frac
    # Fail early with a readable message; otherwise invalid fractions only
    # surface as a complaint about `test_size` from inside train_test_split.
    if not (train_frac > 0 and val_frac > 0 and test_frac > 0):
        raise ValueError(
            "train_frac and val_frac must be positive and sum to less than 1 "
            f"(got train_frac={train_frac}, val_frac={val_frac}, "
            f"leaving test_frac={test_frac})"
        )

    holdout_frac = val_frac + test_frac
    X_train, X_tmp, y_train, y_tmp = train_test_split(
        X, y, test_size=holdout_frac, shuffle=True, random_state=random_state
    )
    # The second split works on the hold-out rows only, so the test share has
    # to be expressed relative to the hold-out size, not the full data set.
    X_val, X_test, y_val, y_test = train_test_split(
        X_tmp, y_tmp, test_size=test_frac / holdout_frac, shuffle=True, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Make sure the directory for the split data exists.
# This cell used to build the path from `args.tmp_data_dir`, but no `args`
# object is ever created in this notebook, so it failed with NameError on a
# fresh kernel. It now targets the same 'data/split/' directory that
# write_data_to_csv() writes to.
pathlib.Path("data/split/").mkdir(parents=True, exist_ok=True)

In [17]:
def transform_data(raw_schema, X_train, X_val, X_test):
    """Fit the preprocessing pipeline on the training features and apply it to all splits.

    Numeric columns are median-imputed and standardised; the categorical
    column ``"sex"`` is imputed with the constant ``"missing"`` and one-hot
    encoded. The transformer is fitted on ``X_train`` only and then applied
    unchanged to ``X_val`` and ``X_test``.

    Args:
        raw_schema: Mapping whose keys are column names. Every key other than
            ``"sex"`` that is also a column of ``X_train`` is treated as a
            numeric feature.
        X_train: Training features (must expose ``.columns``).
        X_val: Validation features.
        X_test: Test features.

    Returns:
        ``(preprocess, X_train_transf, X_val_transf, X_test_transf)``: the
        fitted ``ColumnTransformer`` and its output for each split.
    """
    categorical_features = ["sex"]
    # main() passes the same schema it uses to read the raw CSV, so it can list
    # columns (such as the target) that load_data() has already removed from
    # the feature frame. Asking ColumnTransformer for a column that is not
    # there fails, so keep only the schema columns actually present in X_train.
    numeric_features = [
        col
        for col in raw_schema
        if col not in categorical_features and col in X_train.columns
    ]

    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            # Categories not seen during fit are ignored instead of raising.
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # Fit on the training split only, so that imputation values and scaling
    # parameters are not computed from validation/test rows.
    X_train_transf = preprocess.fit_transform(X_train)
    X_val_transf = preprocess.transform(X_val)
    X_test_transf = preprocess.transform(X_test)

    return preprocess, X_train_transf, X_val_transf, X_test_transf

In [None]:
def _as_frame(data):
    """Return ``data`` as an object that has ``.to_csv``; arrays are wrapped in a DataFrame."""
    if hasattr(data, "to_csv"):
        return data
    if hasattr(data, "toarray"):
        # Objects exposing `toarray` (e.g. sparse matrices) are densified first.
        data = data.toarray()
    return pd.DataFrame(data)


def write_data_to_csv(X, y, X_train, X_val, X_test, y_train, y_val, y_test, X_train_transf, X_val_transf, X_test_transf, base_dir='data'):
    """Write the raw, split and transformed data sets to CSV files.

    Layout below ``base_dir`` (directories are created if missing):

    * ``raw/``         -- ``X.csv``, ``y.csv``
    * ``split/``       -- ``X_{train,val,test}.csv``, ``y_{train,val,test}.csv``
    * ``transformed/`` -- ``X_{train,val,test}_transf.csv`` plus the matching
      ``y_{train,val,test}.csv`` (targets are written as-is; they are not
      transformed)

    All files are written without the index.

    Args:
        X, y: Full feature frame and target.
        X_train, X_val, X_test: Feature splits.
        y_train, y_val, y_test: Target splits.
        X_train_transf, X_val_transf, X_test_transf: Transformed feature
            splits. These may be pandas objects or plain arrays; arrays are
            wrapped in a ``DataFrame`` before writing.
        base_dir: Root output directory. Defaults to ``'data'``.
    """
    root = pathlib.Path(base_dir)
    outputs = {
        'raw': {
            'X': X,
            'y': y,
        },
        'split': {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'y_test': y_test,
        },
        # Transformed features go into their own directory. The targets are
        # written next to them so the directory can be used on its own.
        'transformed': {
            'X_train_transf': X_train_transf,
            'y_train': y_train,
            'X_val_transf': X_val_transf,
            'y_val': y_val,
            'X_test_transf': X_test_transf,
            'y_test': y_test,
        },
    }

    for subdir, tables in outputs.items():
        out_dir = root / subdir
        out_dir.mkdir(parents=True, exist_ok=True)
        for stem, data in tables.items():
            _as_frame(data).to_csv(out_dir / f'{stem}.csv', index=False)
    

In [None]:
def main():
    """Run the data-preparation pipeline inside a single MLflow run.

    Steps, in order: read the schema from ``RAW_SCHEMA_PATH``, load the raw
    CSV, split it into train/validation/test, fit and apply the preprocessing
    transformer, and write every intermediate data set to CSV. All settings
    come from the module-level constants. Returns ``None``.
    """
    with mlflow.start_run() as run:
        # NOTE(review): FullLoader can construct more than plain data types;
        # if schema.yml is a plain mapping, yaml.safe_load would be the more
        # conservative choice -- confirm before changing.
        with open(RAW_SCHEMA_PATH) as schema_file:
            raw_schema = yaml.load(schema_file, Loader=yaml.FullLoader)

        raw, X, y = load_data(RAW_PATH, raw_schema, TARGET_COL)

        X_train, X_val, X_test, y_train, y_val, y_test = split_data(
            X, y, TRAIN_FRAC, VAL_FRAC, RANDOM_STATE
        )

        preprocess, X_train_transf, X_val_transf, X_test_transf = transform_data(
            raw_schema, X_train, X_val, X_test
        )

        write_data_to_csv(
            X, y,
            X_train, X_val, X_test,
            y_train, y_val, y_test,
            X_train_transf, X_val_transf, X_test_transf,
        )

        # TODO: call the training step here (train()).
        # TODO: log everything to MLflow: the directory containing the data,
        #       the preprocessor, ...



In [None]:
# Execute the full data-preparation pipeline defined in main().
main()