In [107]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dalex as dx

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

import xgboost as xgb
import numpy as np
from typing import Tuple
from sklearn.metrics import balanced_accuracy_score, accuracy_score

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import sympy as sp

import numpy as np
import xgboost as xgb
from typing import Tuple

In [114]:
titanic = dx.datasets.load_titanic()
X = titanic.drop(columns='survived')
y = titanic.survived

numerical_features = ['age', 'fare', 'sibsp', 'parch']
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

categorical_features = ['gender', 'class', 'embarked']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', xgb.train(params,
           dtrain=dtrain,
           num_boost_round=15,
           obj=obj_sq,
        #    evals=[(dtrain, 'train'), (dtest, 'test')]
           ))])

# preprocess = make_column_transformer(
#     (StandardScaler(), ['age', 'fare', 'parch', 'sibsp']),
#     (OneHotEncoder(), ['gender', 'class', 'embarked']))


X_transformed = pd.get_dummies(X, drop_first=True)
# display(X.head())
# display(X_transformed.head())

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [96]:
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix, lambda_: float, f: np.ndarray) -> np.ndarray:
    y = dtrain.get_label()
    return 2 * (predt - y) + 2 * lambda_ * (predt - f)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix, lambda_:float, f: np.ndarray) -> np.ndarray:
    # y = dtrain.get_label()
    return 2 + 2 * lambda_

def obj_sq(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    f = np.ones_like(predt)
    lambda_ = 0
    grad = gradient(predt, dtrain, lambda_, f)
    hess = hessian(predt, dtrain, lambda_, f)
    if type(hess) in [int, float]:
        hess = hess*np.ones_like(grad)
    return grad, hess

In [112]:
params = {
    "max_depth": 5,
   #  "objective": "binary:logistic",
    "eval_metric": "auc"
}

model = xgb.train(params,
           dtrain=dtrain,
           num_boost_round=15,
           obj=obj_sq,
        #    evals=[(dtrain, 'train'), (dtest, 'test')]
           )

pred = model.predict(dtest)

accuracy_score(y_test, pred >= 0.5)
# print(y_test)
# print(pred)


0.8031674208144797

In [117]:
exp = dx.Explainer(model, X, y)
# exp.model_profile()

Preparation of a new explainer is initiated

  -> data              : 2207 rows 7 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 2207 values
  -> model_class       : xgboost.core.Booster (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_xgboost at 0x0000017C1C680220> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> model type        : 'model_type' not provided and cannot be extracted.
  -> model type        : Some functionalities won't be available.
  -> residual function : difference between y and yhat (default)
  -> residuals         :  'residual_function' returns an Error when executed:
DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`



  -> predicted values  : 'predict_function' returns an Error when executed: 
DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:gender: object, class: object, embarked: object



  -> predicted values  : 'predict_function' must return numpy.ndarray (1d)



In [None]:
exp.predict(X)

In [56]:
X_train

Unnamed: 0,age,fare,sibsp,parch,gender_male,class_2nd,class_3rd,class_deck crew,class_engineering crew,class_restaurant staff,class_victualling crew,embarked_Cherbourg,embarked_Queenstown,embarked_Southampton
1996,30.0,0.0000,0,0,True,False,False,False,False,False,True,False,False,True
1389,28.0,0.0000,0,0,True,False,False,False,False,True,False,False,False,True
1565,29.0,0.0000,0,0,True,False,False,False,False,False,True,False,False,True
965,41.0,13.0000,0,0,True,True,False,False,False,False,False,False,False,True
900,23.0,9.0406,0,0,True,False,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,47.0,0.0000,0,0,True,False,False,False,False,False,True,False,False,True
1095,29.0,26.0000,0,0,True,True,False,False,False,False,False,False,False,True
1130,37.0,8.1303,0,0,True,False,True,False,False,False,False,False,False,True
1294,23.0,13.0000,0,0,True,True,False,False,False,False,False,False,False,True
