In [58]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [59]:
# Notebook-level configuration; later cells read these names directly.
train_filename = "binary_classification.csv"  # CSV loaded with pd.read_csv as the training set
output = "output"  # directory created below; fold files, config and encoders are written here
test_filename = None  # optional test CSV; None skips all test-set handling
task = None  # None -> problem type inferred from the targets; else "classification" or "regression"
idx = None  # id column name; None falls back to "id" in the next cell
targets = ["income"]  # target column name(s)
features = None  # None -> every column except the id, "kfold" and the targets
categorical_features = None  # None -> every feature whose dtype is "object"
use_gpu = False  # only stored in the model config in this notebook
num_folds = 5  # number of cross-validation splits
seed = 42  # random_state passed to the fold splitters
num_trials = 100  # only stored in the model config in this notebook
time_limit = 360  # only stored in the model config in this notebook (unit not shown here -- TODO confirm)
fast = False  # only stored in the model config in this notebook

In [60]:
# __post_init__: create the output directory and fill in default column names.

# Notebook variables used in this cell (presumably `self.*` attributes once
# this code moves into a class -- TODO confirm):
# self.output
# self.targets
# self.idx

# Disabled guard that would refuse to reuse an existing output directory.
# if os.path.exists(output):
#   raise Exception(
#       "Output directory already exists. Please specify some other directory.")

# `exist_ok=True`: an already existing directory is accepted silently.
os.makedirs(output, exist_ok=True)

# Fall back to a single target column named "target".
if targets is None:
  print("No target columns specified. Will default to `target`.")
  targets = ["target"]

# Fall back to an id column named "id".
if idx is None:
  print("No id column specified. Will default to `id`.")
  idx = "id"

No id column specified. Will default to `id`.


In [61]:
import enum

class Task(enum.Enum):
    """Task family a model can be trained for."""

    classification = 0
    regression = 1

    @staticmethod
    def from_str(task_type: str):
        """Return the member whose name equals ``task_type``.

        Raises:
            ValueError: if ``task_type`` does not name a member.
        """
        for member in Task:
            if task_type == member.name:
                return member
        raise ValueError("Invalid task type: {}".format(task_type))

    @staticmethod
    def list_str():
        """Return the member names in declaration order."""
        return [member.name for member in Task]

class Problem(enum.IntEnum):
    """Concrete learning problem, distinguished by the shape of the target."""

    binary_classification = 1
    multi_class_classification = 2
    multi_label_classification = 3
    single_column_regression = 4
    multi_column_regression = 5

    @staticmethod
    def from_str(label):
        """Return the member whose name equals ``label``.

        Raises:
            NotImplementedError: if ``label`` does not name a member.
        """
        for member in Problem:
            if label == member.name:
                return member
        raise NotImplementedError


In [62]:
from typing import List, Optional
from pydantic import BaseModel

class ModelConfig(BaseModel):
    """Validated bundle of the settings this notebook resolves and saves.

    Only `test_filename` and `time_limit` declare a default (``None``); every
    other field has to be supplied when the model is constructed.
    """

    # Input files; the test file is optional.
    train_filename: str
    test_filename: Optional[str] = None
    # Name of the id column.
    idx: str
    # Names of the target column(s).
    targets: List[str]
    # Resolved problem, as a `Problem` member.
    problem_type: Problem
    # Output directory.
    output: str
    # Names of the feature columns.
    features: List[str]
    # Number of cross-validation folds.
    num_folds: int
    use_gpu: bool
    seed: int
    # Subset of the features treated as categorical.
    categorical_features: List[str]
    num_trials: int
    time_limit: Optional[int] = None
    fast: bool

In [63]:
from sklearn.utils.multiclass import type_of_target
import numpy as np

# self.task
# self.targets
def _determine_problem(df: pd.DataFrame) -> Problem:
  """Decide which `Problem` the target column(s) of `df` represent.

  Reads the notebook-level `task` and `targets` variables.

  * `task is None`: the problem is inferred from the target values with
    scikit-learn's `type_of_target`.
  * `task == "classification"`: several target columns mean multi-label;
    a single column is binary when it holds exactly two distinct values and
    multi-class otherwise.
  * `task == "regression"`: single- or multi-column regression depending on
    the number of target columns.

  Args:
    df: frame containing every column listed in `targets`.

  Returns:
    The matching `Problem` member.

  Raises:
    Exception: if the target type cannot be mapped to a problem, or if `task`
      is neither `None`, "classification" nor "regression".
  """
  values = df[targets].values

  if task is None:
    inferred_problems = {
        "continuous": Problem.single_column_regression,
        "continuous-multioutput": Problem.multi_column_regression,
        "binary": Problem.binary_classification,
        "multiclass": Problem.multi_class_classification,
        "multilabel-indicator": Problem.multi_label_classification,
    }
    target_type = type_of_target(values)
    if target_type not in inferred_problems:
      raise Exception("Unable to infer `problem_type`. Please provide `classification` or `regression`")
    return inferred_problems[target_type]

  if task == "classification":
    if len(targets) > 1:
      return Problem.multi_label_classification
    if len(np.unique(values)) == 2:
      return Problem.binary_classification
    # One target column with more than two classes is multi-class, not
    # multi-label (which needs several indicator columns).
    return Problem.multi_class_classification

  if task == "regression":
    if len(targets) == 1:
      return Problem.single_column_regression
    return Problem.multi_column_regression

  raise Exception("Problem type not understood")


In [64]:
# self.idx
# what if `idx` is `None`?
def _inject_idxumn(df: pd.DataFrame) -> pd.DataFrame:
  """Ensure `df` has the id column named by the notebook-level `idx`.

  When the column is missing it is added in place, holding 0..len(df)-1.
  The same frame object is returned either way.
  """
  if idx in df.columns:
    return df

  df[idx] = np.arange(len(df))
  return df

In [65]:
# self.num_folds
# self.targets
# self.seed
from sklearn.model_selection import KFold, StratifiedKFold

def _create_folds(train_df: pd.DataFrame, problem: Problem) -> pd.DataFrame:
  """Assign every row of `train_df` to a validation fold.

  Reads the notebook-level `targets`, `num_folds` and `seed` variables and
  writes the fold number (0..num_folds-1) of each row into a `kfold` column.
  An existing `kfold` column is overwritten.

  Splitting strategy:
    * binary / multi-class classification: `StratifiedKFold` on the targets.
    * single-column regression: `StratifiedKFold` on the target binned into
      floor(1 + log2(n_rows)) equal-width bins, capped at 10.
    * multi-column regression and multi-label classification: plain `KFold`.

  Args:
    train_df: training frame containing the target column(s); modified in
      place.
    problem: the problem type that selects the splitting strategy.

  Returns:
    `train_df` with the `kfold` column set.

  Raises:
    Exception: if `problem` is not one of the supported problem types.
  """
  if problem in (Problem.binary_classification, Problem.multi_class_classification):
    y = train_df[targets].values
    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
  elif problem == Problem.single_column_regression:
    num_bins = min(int(np.floor(1 + np.log2(len(train_df)))), 10)
    # Stratify on the binned target. The bins stay in a local array, so no
    # temporary column has to be added to and dropped from the frame.
    y = pd.cut(train_df[targets].values.ravel(), bins=num_bins, labels=False)
    splitter = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
  elif problem in (Problem.multi_column_regression, Problem.multi_label_classification):
    # TODO: use `iterstrat` for multi-label classification
    y = train_df[targets].values
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
  else:
    raise Exception("Problem type not supported")

  # The splitters yield positional indices, so collect the fold ids in a plain
  # array; this stays correct even when the frame's index is not 0..n-1.
  fold_ids = np.full(len(train_df), -1, dtype="int64")
  for fold, (_, valid_indices) in enumerate(splitter.split(X=train_df, y=y)):
    fold_ids[valid_indices] = fold

  train_df["kfold"] = fold_ids
  return train_df

In [66]:
# self.train_filename
# self.test_filename

# START: _process_data

# Load the training data, resolve the problem type from its targets, and make
# sure the id column exists.
train_df = pd.read_csv(train_filename)
# TODO: use `reduce_memory_usage` here

problem = _determine_problem(train_df)
train_df = _inject_idxumn(train_df)

if test_filename is not None:
  test_df = pd.read_csv(test_filename)
  # TODO: use `reduce_memory_usage` here
  # Inject the id column into the test frame itself; passing `train_df` here
  # would replace the freshly loaded test data with the training frame.
  test_df = _inject_idxumn(test_df)

In [67]:
# self.idx
# self.targets

# Assign each training row to a fold, then list the bookkeeping columns
# (id, fold number, targets) that must never be used as features.
train_df = _create_folds(train_df, problem)
ignore_columns = [idx, "kfold", *targets]

In [68]:
# self.features

# No explicit feature list: use every column that is not a bookkeeping column.
if features is None:
  features = [
      column for column in train_df.columns if column not in ignore_columns
  ]

In [69]:
# Label-encode the target for binary / multi-class problems; every other
# problem type keeps its targets untouched and gets no encoder.
target_encoder = None
if problem in (Problem.binary_classification, Problem.multi_class_classification):
  flat_targets = train_df[targets].values.reshape(-1)
  target_encoder = LabelEncoder()
  target_encoder.fit(flat_targets)
  train_df.loc[:, targets] = target_encoder.transform(flat_targets)

In [70]:
# self.features
# self.categorical_features

# No explicit list: treat every feature stored with dtype "object" as categorical.
if categorical_features is None:
  categorical_features = [
      name for name in features if train_df[name].dtype == "object"
  ]

In [71]:
# self.categorical_features
# self.num_folds
# self.test_filename
# self.output

# For each fold: split train/validation rows, ordinal-encode the categorical
# features with an encoder fitted on that fold's training rows only, and write
# the resulting frames to the output directory.
has_categoricals = len(categorical_features) > 0
has_test = test_filename is not None

if has_categoricals:
  print("Encoding categorical features")

categorical_encoders = {}
for fold in range(num_folds):
  is_valid_row = train_df.kfold == fold
  fold_train = train_df[~is_valid_row].reset_index(drop=True)
  fold_valid = train_df[is_valid_row].reset_index(drop=True)

  # Each fold encodes its own copy of the test set.
  if has_test:
    test_fold = test_df.copy(deep=True)

  if has_categoricals:
    # Categories not seen in the fold's training rows are encoded as NaN.
    ordinal_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan)
    fold_train[categorical_features] = ordinal_encoder.fit_transform(
        fold_train[categorical_features].values)
    fold_valid[categorical_features] = ordinal_encoder.transform(
        fold_valid[categorical_features].values)
    if has_test:
      test_fold[categorical_features] = ordinal_encoder.transform(
          test_fold[categorical_features].values)
    categorical_encoders[fold] = ordinal_encoder

  # Feather is a binary columnar file format; see `DataFrame.to_feather`.
  train_path = os.path.join(output, f"train_fold_{fold}.feather")
  valid_path = os.path.join(output, f"valid_fold_{fold}.feather")
  fold_train.to_feather(train_path)
  fold_valid.to_feather(valid_path)
  if has_test:
    test_path = os.path.join(output, f"test_fold_{fold}.feather")
    test_fold.to_feather(test_path)

Encoding categorical features


In [72]:
# Collect every resolved setting into one mapping for `ModelConfig`.
model_config = {
    "idx": idx,
    "features": features,
    "categorical_features": categorical_features,
    "train_filename": train_filename,
    "test_filename": test_filename,
    "output": output,
    "problem_type": problem,
    "targets": targets,
    "use_gpu": use_gpu,
    "num_folds": num_folds,
    "seed": seed,
    "num_trials": num_trials,
    "time_limit": time_limit,
    "fast": fast,
}


In [73]:
import joblib

# Validate the collected settings and persist them in the output directory.
# The guard keeps this cell re-runnable: once `model_config` has been turned
# into a `ModelConfig`, unpacking it again with `**` would raise a TypeError.
if not isinstance(model_config, ModelConfig):
  model_config = ModelConfig(**model_config)
print(f"Model config: {model_config}")
print("Saving model config")
joblib.dump(model_config, f"{output}/axgb.config")

Model config: train_filename='binary_classification.csv' test_filename=None idx='id' targets=['income'] problem_type=<Problem.binary_classification: 1> output='output' features=['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'] num_folds=5 use_gpu=False seed=42 categorical_features=['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'] num_trials=100 time_limit=360 fast=False
Saving model config


['output/axgb.config']

In [74]:
# Persist the encoders created above into the output directory.
print("Saving encoders")
# Dict of fold number -> OrdinalEncoder fitted on that fold's training rows;
# empty when there are no categorical features.
joblib.dump(categorical_encoders,
            f"{output}/axgb.categorical_encoders")
# LabelEncoder for the target, or None when the problem is not binary /
# multi-class classification.
joblib.dump(target_encoder, f"{output}/axgb.target_encoder")

# END: _process_data


Saving encoders


['output/axgb.target_encoder']