In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import os



In [10]:
abalone_df = pd.read_csv('data/abalone-dataset.csv')


In [11]:
abalone_df.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [12]:
def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

In [13]:
def create_dir(path):
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

In [14]:
feature_columns_names = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
]
label_column = "rings"
feature_columns_dtype = {
    "sex": str,
    "length": np.float64,
    "diameter": np.float64,
    "height": np.float64,
    "whole_weight": np.float64,
    "shucked_weight": np.float64,
    "viscera_weight": np.float64,
    "shell_weight": np.float64,
}
label_column_dtype = {"rings": np.float64}

abalone_df = pd.read_csv(
    'data/abalone-dataset.csv',
    header=None,
    names=feature_columns_names + [label_column],
    dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
)

numeric_features = list(feature_columns_names)
numeric_features.remove("sex")


numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["sex"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

y = abalone_df.pop("rings")
X_pre = preprocess.fit_transform(abalone_df)
y_pre = y.to_numpy().reshape(len(y), 1)

X = np.concatenate((y_pre, X_pre), axis=1)
base_dir = "batch_predictions"
# train = 'batch_predictions/train'
# validation = "batch"

create_dir(base_dir)
# create_dir(train)
# create_dir(train)

# if not os.path.exists(base_dir):
#     os.mkdir(base_dir)

np.random.shuffle(X)
train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])
# note train, validation test files
pd.DataFrame(train).to_csv(f"{base_dir}/train.csv", header=False, index=False)
pd.DataFrame(validation).to_csv(
    f"{base_dir}/validation.csv", header=False, index=False
)
pd.DataFrame(test).to_csv(f"{base_dir}/test.csv", header=False, index=False)