In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [2]:
train_filename = "binary_classification.csv"
output = "output"
test_filename = None
task = None
idx = None
targets = ["income"]
features = None
categorical_features = None
use_gpu = False
num_folds = 5
seed = 42
num_trials = 100
time_limit = 360
fast = False

In [3]:
# __post_init__

# self.output
# self.targets
# self.idx

# if os.path.exists(output):
#   raise Exception(
#       "Output directory already exists. Please specify some other directory.")

os.makedirs(output, exist_ok=True)

if targets is None:
  print("No target columns specified. Will default to `target`.")
  targets = ["target"]

if idx is None:
  print("No id column specified. Will default to `id`.")
  idx = "id"

No id column specified. Will default to `id`.


In [4]:
import enum

class Task(enum.Enum):
    classification = 0
    regression = 1

    @staticmethod
    def from_str(task_type: str):
        if task_type == "classification":
            return Task.classification
        elif task_type == "regression":
            return Task.regression
        else:
            raise ValueError("Invalid task type: {}".format(task_type))

    @staticmethod
    def list_str():
        return ["classification", "regression"]

class Problem(enum.IntEnum):
    binary_classification = 1
    multi_class_classification = 2
    multi_label_classification = 3
    single_column_regression = 4
    multi_column_regression = 5

    @staticmethod
    def from_str(label):
        if label == "binary_classification":
            return Problem.binary_classification
        elif label == "multi_class_classification":
            return Problem.multi_class_classification
        elif label == "multi_label_classification":
            return Problem.multi_label_classification
        elif label == "single_column_regression":
            return Problem.single_column_regression
        elif label == "multi_column_regression":
            return Problem.multi_column_regression
        else:
            raise NotImplementedError


In [5]:
from typing import List, Optional
from pydantic import BaseModel

class ModelConfig(BaseModel):
    train_filename: str
    test_filename: Optional[str] = None
    idx: str
    targets: List[str]
    problem: Problem
    output: str
    features: List[str]
    num_folds: int
    use_gpu: bool
    seed: int
    categorical_features: List[str]
    num_trials: int
    time_limit: Optional[int] = None
    fast: bool

In [6]:
from sklearn.utils.multiclass import type_of_target
import numpy as np

# self.task
# self.targets
def _determine_problem(df: pd.DataFrame) -> Problem:
  values = df[targets].values
  if task is None:
    target_type = type_of_target(values)
    
    if target_type == "continuous":
      problem = Problem.single_column_regression
    elif target_type == "continuous-multioutput":
      problem = Problem.multi_column_regression
    elif target_type == "binary":
      problem = Problem.binary_classification
    elif target_type == "multiclass":
      problem = Problem.multi_class_classification
    elif target_type == "multilabel-indicator":
      problem = Problem.multi_label_classification
    else:
      raise Exception("Unable to infer `problem`. Please provide `classification` or `regression`")

    return problem

  if task == "classification":
    if len(targets) == 1:
      if (len(np.unique(values)) == 2):
        problem = Problem.binary_classification
      else:
        problem = Problem.multi_label_classification
    else:
      problem = Problem.multi_label_classification
  
  elif task == "regression":
    if len(targets) == 1:
      problem = Problem.single_column_regression
    else:
      problem = Problem.multi_column_regression
  else:
    raise Exception("Problem type not understood")

  return problem


In [7]:
# self.idx
# what if `idx` is `None`?
def _inject_idxumn(df: pd.DataFrame) -> pd.DataFrame:
  if idx not in df.columns:
    df[idx] = np.arange(len(df))
  
  return df

In [8]:
# self.num_folds
# self.targets
# self.seed
from sklearn.model_selection import KFold, StratifiedKFold

def _create_folds(train_df: pd.DataFrame, problem: Problem) -> pd.DataFrame:
  # in which case could we have `kfold` in our dataset?
  # `UnboundLocalError: local variable 'num_folds' referenced before assignment`
  # it should work correctly. remove comments when moving code to files.
  # if "kfold" in train_df.columns:
  #   num_folds = len(np.unique(train_df["kfold"]))
  #   return train_df

  train_df["kfold"] = -1
  if problem in (Problem.binary_classification, Problem.multi_class_classification):
    y = train_df[targets].values
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_indices) in enumerate(kf.split(X=train_df, y=y)):
      train_df.loc[valid_indices, "kfold"] = fold
  elif problem == Problem.single_column_regression:
    num_bins = int(np.floor(1 + np.log2(len(train_df))))
    if num_bins > 10:
      num_bins = 10
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    train_df["bins"] = pd.cut(train_df[targets].values.ravel(), bins=num_bins, labels=False)
    for fold, (_, valid_indices) in enumerate(kf.split(X=train_df, y=y)):
      train_df.loc[valid_indices, "kfold"] = fold
    train_df = train_df.drop("bins", axis=1)
  elif problem == Problem.multi_column_regression:
    y = train_df[targets].values
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_indices) in enumerate(kf.split(X=train_df, y=y)):
      train_df.loc[valid_indices, "kfold"] = fold
  elif problem == Problem.multi_label_classification: # TODO: use `iterstrat`
    y = train_df[targets].values
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_indices) in enumerate(kf.split(X=train_df, y=y)):
      train_df.loc[valid_indices, "kfold"] = fold
  else:
    raise Exception("Problem type not supported")
  
  return train_df

In [9]:
from typing import List, TypedDict
import joblib

def process_data(
    idx: int,
    num_folds: int,
    targets: List[str],
    features: List[str],
    categorical_features: List[str],
    train_filename: str,
    test_filename: str,
    output: str,
) -> ModelConfig:
  train_df = pd.read_csv(train_filename)
  # TODO: use `reduce_memory_usage` here

  problem = _determine_problem(train_df)
  train_df = _inject_idxumn(train_df)

  if test_filename is not None:
    test_df = pd.read_csv(test_filename)
    # TODO: use `reduce_memory_usage` here
    test_df = _inject_idxumn(train_df)


  train_df = _create_folds(train_df, problem)
  ignore_columns = [idx, "kfold"] + targets


  if features is None:
    features = list(train_df.columns)
    features = [x for x in features if x not in ignore_columns]


  if problem in [Problem.binary_classification, Problem.multi_class_classification]:
    target_encoder = LabelEncoder()
    target_values = train_df[targets].values
    target_encoder.fit(target_values.reshape(-1))
    train_df.loc[:, targets] = target_encoder.transform(
        target_values.reshape(-1))
  else:
    target_encoder = None


  if categorical_features is None:
    categorical_features = []
    for feature in features:
      if train_df[feature].dtype == "object":
        categorical_features.append(feature)


  if len(categorical_features) > 0:
    print("Encoding categorical features")

  categorical_encoders = {}
  for fold in range(num_folds):
    fold_train = train_df[train_df.kfold != fold].reset_index(drop=True)
    fold_valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    if test_filename is not None:
      test_fold = test_df.copy(deep=True)

    if len(categorical_features) > 0:
      ordinal_encoder = OrdinalEncoder(
          handle_unknown="use_encoded_value", unknown_value=np.nan)
      fold_train[categorical_features] = ordinal_encoder.fit_transform(
          fold_train[categorical_features].values)
      fold_valid[categorical_features] = ordinal_encoder.transform(
          fold_valid[categorical_features].values)
      if test_filename is not None:
        test_fold[categorical_features] = ordinal_encoder.transform(
            test_fold[categorical_features].values)
      categorical_encoders[fold] = ordinal_encoder

    # WHAT'S A FCKING FEATHER? look at `to_feather`
    fold_train.to_feather(os.path.join(output, f"train_fold_{fold}.feather"))
    fold_valid.to_feather(os.path.join(output, f"valid_fold_{fold}.feather"))
    if test_filename is not None:
      test_fold.to_feather(os.path.join(output, f"test_fold_{fold}.feather"))


  # save configs
  model_config = {}
  model_config["idx"] = idx
  model_config["features"] = features
  model_config["categorical_features"] = categorical_features
  model_config["train_filename"] = train_filename
  model_config["test_filename"] = test_filename
  model_config["output"] = output
  model_config["problem"] = problem
  model_config["idx"] = idx
  model_config["targets"] = targets
  model_config["use_gpu"] = use_gpu
  model_config["num_folds"] = num_folds
  model_config["seed"] = seed
  model_config["num_trials"] = num_trials
  model_config["time_limit"] = time_limit
  model_config["fast"] = fast

  model_config = ModelConfig(**model_config)

  print(f"Model config: {model_config}")
  print("Saving model config")
  joblib.dump(model_config, f"{output}/axgb.config")


  print("Saving encoders")
  joblib.dump(categorical_encoders, f"{output}/axgb.categorical_encoders")
  joblib.dump(target_encoder, f"{output}/axgb.target_encoder")

  return model_config


In [10]:
from xgboost import XGBClassifier, XGBRegressor

def fetch_model_params(model_config: ModelConfig):
  problem = model_config.problem
  direction = "minimize"
  match problem:
    case Problem.binary_classification | Problem.multi_label_classification:
      model = XGBClassifier
      predict_probabilities = True
      eval_metric = "logloss"
    case Problem.multi_class_classification:
      model = XGBClassifier
      predict_probabilities = True
      eval_metric = "mlogloss"
    case Problem.single_column_regression | Problem.multi_column_regression:
      model = XGBRegressor
      predict_probabilities = False
      eval_metric = "rmse"
    case other:
      raise NotImplementedError
  
  return model, predict_probabilities, eval_metric, direction


In [11]:
from typing import Type, Optional, TypedDict
from optuna import Trial

ParamsDict = TypedDict('ParamsDict', {
  'learning_rate': float,
  'reg_lambda': float,
  'reg_alpha': float,
  'subsample': float,
  'colsample_bytree': float,
  'max_depth': int,
  'early_stopping_rounds': int,
  'n_estimators': any,
  'tree_method': any,
  'gpu_id': int,
  'predictor': any,
  'booster': any,
  'gamma': float,
  'grow_policy': any,
})

def get_params(trial: Type[Trial], model_config: Type[ModelConfig]) -> ParamsDict:
  params: ParamsDict = {}
  params["learning_rate"] = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
  params["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
  params["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
  params["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)
  params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.1, 1.0)
  params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
  params["early_stopping_rounds"] = trial.suggest_int("early_stopping_rounds", 100, 500)
  params["n_estimators"] = trial.suggest_categorical("n_estimators", [7000, 15000, 20000])

  if model_config.use_gpu:
    params["tree_method"] = "gpu_hist"
    params["gpu_id"] = 0
    params["predictor"] = "gpu_predictor"
  else:
    params["tree_method"] = trial.suggest_categorical("tree_method", ["exact", "approx", "hist"])
    params["booster"] = trial.suggest_categorical("booster", ["gbtree", "gblinear"])
    if params["booster"] == "gbtree":
      params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
      params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

  return params


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from dataclasses import dataclass
from typing import Callable, Type, List, TypedDict, Any
from sklearn import metrics as skmetrics
from functools import partial
import copy

MetricsDict = TypedDict('MetricsDict', {
  'auc': Any,
  'logloss': Any,
  'f1': Any,
  'accuracy': Any,
  'mlogloss': Any,
  'r2': Any,
  'mse': Any,
  'mae': Any,
  'rmse': Any,
  'rmsle': Any,
})

@dataclass
class Metrics:
  problem: Problem

  def __post_init__(self):
    values: List[tuple[str, Callable]] = []

    match self.problem:
      case Problem.binary_classification:
        values.append(('auc', skmetrics.roc_auc_score))
        values.append(('logloss', skmetrics.log_loss))
        values.append(('f1', skmetrics.f1_score))
        values.append(('accuracy', skmetrics.accuracy_score))
        values.append(('precision', skmetrics.precision_score))
        values.append(('recall', skmetrics.recall_score))
      case Problem.multi_class_classification:
        values.append(('logloss', skmetrics.log_loss))
        values.append(('accuracy', skmetrics.accuracy_score))
        values.append(('mlogloss', skmetrics.log_loss))
      case [Problem.single_column_regression, Problem.multi_column_regression]:
        values.append(('r2', skmetrics.r2_score))
        values.append(('mse', skmetrics.mean_squared_error))
        values.append(('mae', skmetrics.mean_absolute_error))
        values.append(('rmse', partial(skmetrics.mean_squared_error, squared=False)))
        values.append(('rmsle', partial(skmetrics.mean_squared_log_error, squared=False)))
      case Problem.multi_label_classification:
        values.append(('r2', skmetrics.log_loss))
      case other:
        raise Exception("Invalid problem type")

    self.values = values

  def calculate(self, y_true: List[Any], y_pred: List[Any]) -> MetricsDict:
    metrics: MetricsDict = {}
    for name, callable in self.values:
      if self.problem == Problem.binary_classification:
        if name == "auc":
          metrics[name] = callable(y_true, y_pred[:, 1])
        if name == "logloss":
          metrics[name] = callable(y_true, y_pred)
        else:
          metrics[name] = callable(y_true, y_pred[:, 1] >= 0.5)
      elif self.problem == Problem.multi_class_classification:
        if name == "accuracy":
          metrics[name] = callable(y_true, np.argmax(y_pred, axis=1))
        else:
          metrics[name] = callable(y_true, y_pred)
      else:
        if name == "rmsle":
          temp_pred = copy.deepcopy(y_pred)
          temp_pred[temp_pred < 0] = 0
          metrics[name] = callable(y_pred, temp_pred)
        else:
          metrics[name] = callable(y_true, y_pred)
    
    return metrics


In [13]:
from typing import Dict

def dict_mean(dict_list) -> Dict[str, Any]:
  mean_dict = {}
  for key in dict_list[0].keys():
    mean_dict[key] = sum(d[key] for d in dict_list) / len(dict_list)

  return mean_dict

In [14]:
from typing import Literal, Type

def optimize(
  trial: any,
  xgb_model: Type[XGBClassifier] | Type[XGBRegressor],
  predict_probabilities: bool,
  eval_metric: Literal['logloss', 'mlogloss', 'rmse'],
  model_config: Type[ModelConfig],
) -> Dict[str, Any]:
  params = get_params(trial, model_config)
  early_stopping_rounds = params["early_stopping_rounds"]
  params["early_stopping_rounds"] = None

  metrics = Metrics(model_config.problem)

  scores = []

  for fold in range(model_config.num_folds):
    train_path = f"./output/train_fold_{fold}.feather"
    valid_path = f"./output/valid_fold_{fold}.feather"
    
    train_feather = pd.read_feather(train_path)
    valid_feather = pd.read_feather(valid_path)

    x_train = train_feather[model_config.features]
    x_valid = valid_feather[model_config.features]

    y_train = train_feather[model_config.targets].values
    y_valid = valid_feather[model_config.targets].values

    # train model
    model = xgb_model(
      random_state=model_config.seed,
      eval_metric=eval_metric,
      use_label_encoder=False,
      **params,
    )

    if model_config.problem in (Problem.multi_column_regression, Problem.multi_label_classification):
      y_pred = []
      models = [model] * len(model_config.targets)

      for idx, _model in enumerate(models):
        _model.fit(
          x_train,
          y_train[:, idx],
          early_stopping_rounds=early_stopping_rounds,
          eval_set=[(x_valid, y_valid[:, idx])],
          verbose=False,
        )

        if model_config.problem == Problem.multi_column_regression:
          temp = _model.predict(x_valid)
        else:
          temp = _model.predict_proba(x_valid)[:, 1]
        y_pred.append(temp)
      
      y_pred = np.column_stack(y_pred)

    else: 
      model.fit(
        x_train,
        y_train,
        early_stopping_rounds=early_stopping_rounds,
        eval_set=[(x_valid, y_valid)],
        verbose=False,
      )

      if predict_probabilities:
        y_pred = model.predict_proba(x_valid)
      else:
        y_pred = model.predict(x_valid)

    dict = metrics.calculate(y_valid, y_pred)
    scores.append(dict)
    if model_config.fast is True:
      break
  
  mean_metrics = dict_mean(scores)
  print(f"Metrics: {mean_metrics}")

  return mean_metrics[eval_metric]


In [15]:
from functools import partial
from typing import Dict
import optuna

def train_model(model_config: ModelConfig) -> Dict[str, Any]:
  model, predict_probabilities, eval_metric, direction = fetch_model_params(
      model_config)

  optimize_function = partial(
    optimize,
    xgb_model=model,
    predict_probabilities=predict_probabilities,
    eval_metric=eval_metric,
    model_config=model_config
  )

  db_path = os.path.join(model_config.output, "params.db")
  study = optuna.create_study(
    direction=direction,
    study_name="testtesttest",
    storage=f"sqlite:///{db_path}",
    load_if_exists=True
  )

  study.optimize(
    optimize_function, 
    n_trials=model_config.num_trials,
    timeout=model_config.time_limit,
  )

  return study.best_params

In [18]:
def save_valid_predictions(final_valid_predictions, model_config, target_encoder, output_file_name):
    final_valid_predictions = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index").reset_index()
    if target_encoder is None:
        final_valid_predictions.columns = [
            model_config.idx] + model_config.targets
    else:
        final_valid_predictions.columns = [
            model_config.idx] + list(target_encoder.classes_)

    final_valid_predictions.to_csv(
        os.path.join(model_config.output, output_file_name),
        index=False,
    )


def save_test_predictions(final_test_predictions, model_config, target_encoder, test_ids, output_file_name):
    final_test_predictions = np.mean(final_test_predictions, axis=0)
    if target_encoder is None:
        final_test_predictions = pd.DataFrame(
            final_test_predictions, columns=model_config.targets)
    else:
        final_test_predictions = pd.DataFrame(
            final_test_predictions, columns=list(target_encoder.classes_))
    final_test_predictions.insert(
        loc=0, column=model_config.idx, value=test_ids)
    final_test_predictions.to_csv(
        os.path.join(model_config.output, output_file_name),
        index=False,
    )


In [33]:
from typing import Dict

def predict_model(model_config: ModelConfig, best_params: Dict[str, Any]):
  early_stopping_rounds = best_params["early_stopping_rounds"]
  del best_params["early_stopping_rounds"]

  if model_config.use_gpu is True:
    best_params["tree_method"] = "gpu_hist"
    best_params["gpu_id"] = 0
    best_params["predictor"] = "gpu_predictor"

  xgb_model, predict_probabilities, eval_metric, _ = fetch_model_params(
      model_config)

  metrics = Metrics(model_config.problem)
  scores = []

  final_test_predictions = []
  final_valid_predictions = {}

  target_encoder = joblib.load(f"{model_config.output}/axgb.target_encoder")

  for fold in range(model_config.num_folds):
    print(f"training and predicting for fold {fold}")

    train_path = f"train_fold_{fold}.feather"
    valid_path = f"valid_fold_{fold}.feather"

    train_feather = pd.read_feather(os.path.join(model_config.output, train_path))
    valid_feather = pd.read_feather(os.path.join(model_config.output, valid_path))

    x_train = train_feather[model_config.features]
    x_valid = valid_feather[model_config.features]

    valid_ids = valid_feather[model_config.idx].values

    if model_config.test_filename is not None:
      test_path = f"test_fold_{fold}.feather"
      test_feather = pd.read_feather(os.path.join(model_config.output, test_path))
      x_test = test_feather[model_config.features]
      test_ids = test_feather[model_config.idx].values

    y_train = train_feather[model_config.targets].values
    y_valid = valid_feather[model_config.targets].values

    model = xgb_model(
      random_state=model_config.seed,
      eval_metric=eval_metric,
      use_label_encoder=False,
      **best_params,
    )

    if model_config.problem in (Problem.multi_column_regression, Problem.multi_label_classification):
      y_preds = []
      test_preds = []
      trained_models = []

      for idx in range(len(model_config.targets)):
        _m = copy.deepcopy(model)
        _m.fit(
          x_train,
          y_train[:, idx],
          early_stopping_rounds=early_stopping_rounds,
          eval_set=[(x_valid, y_valid[:, idx])],
          verbose=False,
        )

        trained_models.append(_m)

        if model_config.problem == Problem.multi_column_regression:
          y_pred_temp = _m.predict(x_valid)
          if model_config.test_filename is not None:
            test_pred_temp = _m.predict(x_test)
        else:
          y_pred_temp = _m.predict_proba(x_valid)[:, 1]
          if model_config.test_filename is not None:
            test_pred_temp = _m.predict_proba(x_test)[:, 1]

        y_preds.append(y_pred_temp)
        if model_config.test_filename is not None:
          test_preds.append(test_pred_temp)
        
      y_pred = np.column_stack(y_pred)
      if model_config.test_filename is not None:
        test_pred = np.column_stack(test_pred)

      axgb_model_path = f"axgb_model.{fold}"
      joblib.dump(
        trained_models, 
        os.path.join(model_config.output, axgb_model_path)
      )
    
    else:
      model.fit(
        x_train, 
        y_train, 
        early_stopping_rounds=early_stopping_rounds, 
        eval_set=[(x_valid, y_valid)],
        verbose=False,
      )

      axgb_model_path = f"axgb_model.{fold}"
      joblib.dump(
        model,
        os.path.join(model_config.output, axgb_model_path)
      )

      if predict_probabilities:
        y_pred = model.predict_proba(x_valid)
        if model_config.test_filename is not None:
          test_pred = model.predict_proba(x_test)
      else:
        y_pred = model.predict(x_valid)
        if model_config.test_filename is not None:
          test_pred = model.predict(x_test)
      
    final_valid_predictions.update(dict(zip(valid_ids, y_pred)))
    if model_config.test_filename is not None:
      final_test_predictions.append(test_pred)
    
    # calculate metrics
    metrics_dict = metrics.calculate(y_valid, y_pred)
    scores.append(dict)
    print(f"fold {fold} done!")

  mean_metrics = dict_mean(scores)
  print(f"metrics: {mean_metrics}")
  save_valid_predictions(
    final_valid_predictions, 
    model_config, 
    target_encoder, 
    "oof_predictions.csv"
  )

  if model_config.test_filename is not None:
    save_test_predictions(
      final_test_predictions, 
      model_config,
      target_encoder, 
      test_ids, 
      "test_predictions.csv"
    )
  else:
    print("No test data supplied. Only OOF predictions were generated")




In [24]:
def predict(model_config: ModelConfig, best_params: Dict[str, Any]):
  print("creating oof and test predictions")
  predict_model(model_config, best_params)

In [35]:
def train():
  model_config = process_data(
    idx=idx,
    num_folds=num_folds,
    targets=targets,
    features=features,
    categorical_features=categorical_features,
    train_filename=train_filename,
    test_filename=test_filename,
    output=output,
  )
  best_params = train_model(model_config=model_config)
  predict(model_config, best_params)

train()

Encoding categorical features


[32m[I 2022-12-26 11:09:40,503][0m Using an existing study with name 'testtesttest' instead of creating a new one.[0m


Model config: train_filename='binary_classification.csv' test_filename=None idx='id' targets=['income'] problem=<Problem.binary_classification: 1> output='output' features=['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'] num_folds=5 use_gpu=False seed=42 categorical_features=['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'] num_trials=100 time_limit=360 fast=False
Saving model config
Saving encoders


[32m[I 2022-12-26 11:11:01,339][0m Trial 102 finished with value: 0.27367066639537285 and parameters: {'learning_rate': 0.013217931011323429, 'reg_lambda': 0.0005408884267550638, 'reg_alpha': 0.0015974601554364515, 'subsample': 0.9982700092408792, 'colsample_bytree': 0.17720594184466476, 'max_depth': 5, 'early_stopping_rounds': 205, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gbtree', 'gamma': 1.565890808478196e-06, 'grow_policy': 'lossguide'}. Best is trial 80 with value: 0.2732667686585658.[0m


Metrics: {'auc': 0.798327886446119, 'logloss': 0.27367066639537285, 'f1': 0.7129964051344546, 'accuracy': 0.8733452297823554, 'precision': 0.7845530603194806, 'recall': 0.653613701694827}


[32m[I 2022-12-26 11:12:07,812][0m Trial 103 finished with value: 0.27351345572715 and parameters: {'learning_rate': 0.017119544845125963, 'reg_lambda': 1.2737286501397076e-05, 'reg_alpha': 0.0009351808961080538, 'subsample': 0.9365739873067974, 'colsample_bytree': 0.16620563638612476, 'max_depth': 5, 'early_stopping_rounds': 215, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gbtree', 'gamma': 9.857222918218443e-07, 'grow_policy': 'lossguide'}. Best is trial 80 with value: 0.2732667686585658.[0m


Metrics: {'auc': 0.7988304914681534, 'logloss': 0.27351345572715, 'f1': 0.7134750059574189, 'accuracy': 0.8733145455450846, 'precision': 0.7833865657757808, 'recall': 0.6551448017065334}


[32m[I 2022-12-26 11:13:55,899][0m Trial 104 finished with value: 0.27720611590220545 and parameters: {'learning_rate': 0.014761967394693489, 'reg_lambda': 4.3470135522887436e-05, 'reg_alpha': 0.0002625688311352254, 'subsample': 0.9402399124810431, 'colsample_bytree': 0.1048281981691513, 'max_depth': 5, 'early_stopping_rounds': 248, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gbtree', 'gamma': 6.984588236989982e-07, 'grow_policy': 'lossguide'}. Best is trial 80 with value: 0.2732667686585658.[0m


Metrics: {'auc': 0.7950278961572563, 'logloss': 0.27720611590220545, 'f1': 0.7088840291816441, 'accuracy': 0.8723011452053369, 'precision': 0.7856953657113747, 'recall': 0.6459619411818265}


[32m[I 2022-12-26 11:14:54,709][0m Trial 105 finished with value: 0.2771317775951735 and parameters: {'learning_rate': 0.01712697716097669, 'reg_lambda': 2.699579578743089e-05, 'reg_alpha': 0.0009077140399616976, 'subsample': 0.8942266816699467, 'colsample_bytree': 0.881736301209408, 'max_depth': 6, 'early_stopping_rounds': 293, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gbtree', 'gamma': 5.762852230302218e-06, 'grow_policy': 'lossguide'}. Best is trial 80 with value: 0.2732667686585658.[0m


Metrics: {'auc': 0.7983635111816358, 'logloss': 0.2771317775951735, 'f1': 0.7127504178234613, 'accuracy': 0.8730688217215163, 'precision': 0.7830505632269829, 'recall': 0.6542512942079317}


[32m[I 2022-12-26 11:16:15,967][0m Trial 106 finished with value: 0.2774291731988045 and parameters: {'learning_rate': 0.018930521435251414, 'reg_lambda': 1.2292739738376965e-05, 'reg_alpha': 0.002954327698225882, 'subsample': 0.9559460715306645, 'colsample_bytree': 0.12392428123898219, 'max_depth': 6, 'early_stopping_rounds': 235, 'n_estimators': 20000, 'tree_method': 'exact', 'booster': 'gbtree', 'gamma': 8.793567903732161e-08, 'grow_policy': 'lossguide'}. Best is trial 80 with value: 0.2732667686585658.[0m


Metrics: {'auc': 0.7938449219465766, 'logloss': 0.2774291731988045, 'f1': 0.707532567839831, 'accuracy': 0.872024765437939, 'precision': 0.7866616124142495, 'recall': 0.643029649718396}
creating oof and test predictions
training and predicting for fold 0
fold 0 done!
training and predicting for fold 1
fold 1 done!
training and predicting for fold 2
fold 2 done!
training and predicting for fold 3
fold 3 done!
training and predicting for fold 4
fold 4 done!


TypeError: unbound method dict.keys() needs an argument

In [66]:
from pydantic import create_model
from typing import Union
import json

@dataclass
class Predictor:
  model_path: str

  def __post_init__(self):
    self.model_config = joblib.load(os.path.join(self.model_path, "axgb.config"))
    self.target_encoder = joblib.load(os.path.join(self.model_path, "axgb.target_encoder"))
    self.categorical_encoders = joblib.load(os.path.join(self.model_path, "axgb.categorical_encoders"))

    self.models = []
    for fold in range(self.model_config.num_folds):
      current_model = joblib.load(os.path.join(self.model_path, f"axgb_model.{fold}"))
      self.models.append(current_model)

    _, self.predict_probabilities, _, _ = fetch_model_params(self.model_config)

  def get_prediction_schema(self):
    categorical_features = self.model_config.categorical_features
    schema = { "PredictSchema": {} }
    
    for categorical_feature in categorical_features:
      schema["PredictSchema"][categorical_feature] = "str"
    
    for feature in self.model_config.features:
      if feature not in categorical_features:
        schema["PredictSchema"][feature] = 10.0
    
    return create_model("PredictSchema", **schema["PredictSchema"])

  def _predict_data_frame(self, df: Type[pd.DataFrame]):
    categorical_features = self.model_config.categorical_features
    test_ids = df[self.model_config.idx].values

    final_predictions = []
    for fold in range(self.model_config.num_folds):
      fold_test = df.copy(deep=True)
      if len(categorical_features) > 0:
        categorical_encoder = self.categorical_encoders[fold]
        fold_test[categorical_features] = categorical_encoder.transform(
          fold_test[categorical_features].values)
      
      test_features = fold_test[self.model_config.features]

      for column in test_features.columns:
        if test_features[column].dtype == "object":
          test_features[column] = test_features[column].astype(np.int64)

      if self.model_config.problem in (
        Problem.multi_column_regression, 
        Problem.multi_label_classification
      ):
        test_predictions_mll = []
        for m_idx in range(len(self.models[fold])):
          if self.model_config.problem == Problem.multi_column_regression:
            test_prediction_temp = self.model[fold][m_idx].predict(test_features)
          else:
            test_prediction_temp = self.models[fold][m_idx].predict_proba(test_features)[:, 1]
          test_predictions_mll.append(test_prediction_temp)
      else:
        if self.predict_probabilities:
          test_predictions = self.models[fold].predict_proba(test_features)
        else:
          test_predictions = self.models[fold].predict(test_features)
      
      final_predictions.append(test_predictions)

    final_predictions = np.mean(final_predictions, axis=0)
    if self.target_encoder is None:
      final_predictions = pd.DataFrame(
        final_predictions, 
        columns=self.model_config.target_cols
      )
    else:
      final_predictions = pd.DataFrame(
        final_predictions, columns=list(self.target_encoder.classes_))
    
    final_predictions.insert(loc=0, column=self.model_config.idx, value=test_ids)
    return final_predictions

  def predict_single(self, sample: Dict[str, Union[str, int, float]] = None, fast_predict: bool = True):
    sample = json.loads(sample)
    sample_df = pd.DataFrame.from_dict(sample, orient="index").T
    sample_df[self.model_config.idx] = 0
    preds = self._predict_data_frame(sample_df)
    preds = preds.to_dict(orient="records")[0]
    return preds

  def predict_file(self, test_filename: str, out_filename: str):
    test_df = pd.read_csv(test_filename)
    # test_df = reduce_memory_usage(test_df)
    final_preds = self._predict_df(test_df)
    final_preds.to_csv(out_filename, index=False)


In [67]:
predictor = Predictor(model_path="./output/")
schema = predictor.get_prediction_schema()
schema.Config.fields

{}

In [68]:
# predictor.predict_single('{ "workclass": "Private" }')
# predictor.predict_single('{ }')
# predictor.predict_single(
#     '{ "education": "HS-grad",  "marital.status": "Never-married", "occupation": "Handlers-cleaners", "relationship": "Unmarried", "race": "Black", "sex": "Female", "native.country": "United-States", }')

value = '''{
  "age": "24",
  "workclass": "Private",
  "fnlwgt": "82804",
  "education.num": "9",
  "education": "HS-grad",
  "marital.status": "Never-married",
  "occupation": "Handlers-cleaners",
  "relationship": "Unmarried",
  "race": "Black",
  "sex": "Female",
  "capital.gain": "0",
  "capital.loss": "0",
  "hours.per.week": "40",
  "native.country": "United-States"
}'''
predictor.predict_single(value)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is tryin

{'id': 0, '<=50K': 0.9989117383956909, '>50K': 0.0010881501948460937}

In [71]:
value = '''{
  "age": "44",
  "workclass": "Private",
  "fnlwgt": "121874",
  "education": "Some-college",
  "education.num": "10",
  "marital.status": "Divorced",
  "occupation": "Sales",
  "relationship": "Unmarried",
  "race": "White",
  "sex": "Male",
  "capital.gain": "0",
  "capital.loss": "0",
  "hours.per.week": "55",
  "native.country": "United-States"
}'''
predictor.predict_single(value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is tryin

{'id': 0, '<=50K': 0.7725300192832947, '>50K': 0.22746995091438293}

In [72]:
value = '''{
  "age": "36",
  "workclass": "Private",
  "fnlwgt": "246449",
  "education": "Some-college",
  "education.num": "10",
  "marital.status": "Married-civ-spouse",
  "occupation": "Exec-managerial",
  "relationship": "Husband",
  "race": "White",
  "sex": "Male",
  "capital.gain": "0",
  "capital.loss": "0",
  "hours.per.week": "60",
  "native.country": "United-States"
}'''
predictor.predict_single(value)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features[column] = test_features[column].astype(np.int64)
A value is tryin

{'id': 0, '<=50K': 0.40971335768699646, '>50K': 0.5902866125106812}