In [None]:
import os
import tarfile
import tempfile
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

# Load the raw data and split it into train / validation / test frames.
# NOTE(review): both helpers and `base_directory` are defined elsewhere; the
# 'species' argument is presumably the column used to drive the split — confirm.
df = _read_data_from_input_csv_files(base_directory)
df_train, df_validation, df_test = _split_data2(df, 'species')

# Encode the `species` target as ordinal codes. The encoder is wrapped in a
# ColumnTransformer that addresses the single input column by position (0),
# so it is fed an (n_samples, 1) array rather than a DataFrame.
target_transformer = ColumnTransformer(
    transformers=[("species", OrdinalEncoder(), [0])],
)

# Learn the category -> code mapping from the training split only ...
y_train = target_transformer.fit_transform(
    df_train["species"].to_numpy().reshape(-1, 1)
)
# ... and reuse that fitted mapping for the held-out splits.
y_validation, y_test = (
    target_transformer.transform(split["species"].to_numpy().reshape(-1, 1))
    for split in (df_validation, df_test)
)

# Numeric features: replace missing values with the column mean, then
# standardize each column.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)

# Categorical features: replace missing values with the most frequent
# category, then one-hot encode.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

# Columns selected by neither entry below are dropped (ColumnTransformer's
# default `remainder`).
features_transformer = ColumnTransformer(
    transformers=[
        # Select numeric dtypes explicitly. Excluding only "object" would also
        # route category, string-extension, boolean and datetime columns into
        # the mean imputer / scaler, which is not what this pipeline is for.
        (
            "numeric",
            numeric_transformer,
            make_column_selector(dtype_include=np.number),
        ),
        ("categorical", categorical_transformer, ["island"]),
    ],
    # OneHotEncoder produces a sparse matrix by default, and ColumnTransformer
    # returns a sparse result when the stacked output is sparse enough. The
    # transformed features are concatenated with the targets via
    # np.concatenate afterwards, which needs dense arrays, so always densify.
    sparse_threshold=0,
)

# Fit the preprocessing statistics on the training split only, then apply the
# fitted transformer unchanged to the validation and test splits.
X_train = features_transformer.fit_transform(df_train)
X_validation = features_transformer.transform(df_validation)
X_test = features_transformer.transform(df_test)

# Append the encoded target as the last column of each split's feature matrix.
train, validation, test = (
    np.concatenate(features_and_target, axis=1)
    for features_and_target in (
        (X_train, y_train),
        (X_validation, y_validation),
        (X_test, y_test),
    )
)
