In [32]:
import pandas as pd
from catboost import CatBoostClassifier
import numpy as np
from lightgbm import LGBMClassifier
from scipy.spatial.distance import cityblock
from sklearn.ensemble import StackingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.general_utils import load_data_old, get_cat_feature_names
from utils.model_extensions_utils import FocalLossObjective
from utils.plot_utils import Evaluation
# from utils.fe_utils import get_growth_features
from explainerdashboard import ClassifierExplainer, ExplainerDashboard, InlineExplainer
import matplotlib.pyplot as plt
import shap
from utils.fe_utils import get_growth_features


In [33]:
def process_df(X_temp):
    # - remove zero variance features
    # cols_to_drop = [col for col in X_temp.select_dtypes(include=np.number).columns if np.std(X_temp[col]) == 0]
    # X_temp = X_temp.drop(cols_to_drop, axis=1)
    tech_cols = ['maven', 'generic', 'docker', 'npm', 'pypi', 'gradle', 'nuget']
    usage_cols = tech_cols + ['artifacts_count', 'artifacts_size', 'binaries_count', 'binaries_size', 'items_count',
                              'number_of_permissions', 'internal_groups', 'number_of_users', 'n_env', 'n_tech',
                              'n_repos']
    X_temp['n_tech'] = (X_temp[tech_cols] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.1'] = (X_temp[[col + '.1' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.2'] = (X_temp[[col + '.2' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.3'] = (X_temp[[col + '.3' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.4'] = (X_temp[[col + '.4' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_repos'] = (X_temp[tech_cols]).sum(axis=1)
    X_temp['n_repos.1'] = (X_temp[[col + '.1' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.2'] = (X_temp[[col + '.2' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.3'] = (X_temp[[col + '.3' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.4'] = (X_temp[[col + '.4' for col in tech_cols]]).sum(axis=1)
    # X_temp['leading_tech'] = list(X_temp[tech_cols].idxmax(axis=1))
    # X_temp.loc[X_temp['leading_tech'].isin(['npm', 'gradle', 'pypi']), 'leading_tech'] = 'else'

    # - get trends features
    for col in usage_cols:
        growth_feature_monthly, growth_feature_quarter, df_fg = get_growth_features(col, X_temp.copy())
        X_temp[col + '_monthly_growth'] = growth_feature_monthly
        X_temp[col + '_quarter_growth'] = growth_feature_quarter

    # - transform to category
    cat_features = get_cat_feature_names(X_temp)
    for col in cat_features:
        X_temp[col] = X_temp[col].astype('category')

    # - drop usage features from the periods before the relevant-date
    cols_to_drop = [col for col in X_temp.columns if '.1' in col or '.2' in col or '.3' in col or '.4' in col]
    X_temp = X_temp.drop(cols_to_drop, axis=1)
    X_temp['artifacts/binaries_size'] = np.where(X_temp['binaries_size'] == 0, 0,
                                                 X_temp['artifacts_size'] / X_temp['binaries_size'])
    X_temp['artifacts/binaries_count'] = np.where(X_temp['binaries_count'] == 0, 0,
                                                  X_temp['artifacts_count'] / X_temp['binaries_count'])
    # X_temp = X_temp.drop(['total_employees_with_details', 'days_from_contact_added', 'territory', 'industry_group',
    #                       'total_employees_range'], axis=1)
    return X_temp

In [34]:
df = load_data_old('fit.sql')
df

Unnamed: 0,account_id,relevant_date,class,territory,account_id.1,relevant_date.1,period_range,artifacts_count,artifacts_size,binaries_count,...,n_ent_mentioned_sessions,n_competitor_mentioned_sessions,n_xray_mentioned_sessions,n_replys,n_sent,n_calls,n_task_xray,replys_to_sent,days_since_reply,days_since_sent
0,0011r00001iACCU,2018-08-25 00:00:00,0,EMEA,0011r00001iACCU,2018-08-25 00:00:00,3 Months,1289272,14066,638194,...,0,0,0,0,0,0,0,-1.000000,1000,1000
1,0011r00001iACCU,2019-08-09 00:00:00,0,EMEA,0011r00001iACCU,2019-08-09 00:00:00,3 Months,1844459,27343,875317,...,0,0,0,0,1,0,0,0.000000,259,85
2,0011r00001iACCU,2020-08-27 00:00:00,0,EMEA,0011r00001iACCU,2020-08-27 00:00:00,3 Months,515803,35022,397341,...,0,0,1,1,6,0,0,0.166667,9,9
3,0011r00001iACCU,2021-07-30 00:00:00,0,EMEA,0011r00001iACCU,2021-07-30 00:00:00,3 Months,642349,26485,459664,...,0,0,0,0,0,0,0,-1.000000,158,161
4,0011r00001iACGM,2019-08-30 00:00:00,0,EMEA,0011r00001iACGM,2019-08-30 00:00:00,3 Months,56486,234,48490,...,0,0,0,0,2,0,0,0.000000,1000,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5855,001w000001kgU5v,2018-08-30 00:00:00,0,Americas,001w000001kgU5v,2018-08-30 00:00:00,3 Months,132,0,128,...,0,0,0,0,0,0,0,-1.000000,1000,1000
5856,001w000001kgfaR,2018-10-16 00:00:00,0,EMEA,001w000001kgfaR,2018-10-16 00:00:00,3 Months,24491,623,14349,...,0,0,0,0,0,0,0,-1.000000,1000,1000
5857,001w000001kgfaR,2019-10-15 00:00:00,0,EMEA,001w000001kgfaR,2019-10-15 00:00:00,3 Months,36144,872,22216,...,0,0,0,0,0,0,0,-1.000000,1000,152
5858,001w000001kgfaR,2020-09-19 00:00:00,0,EMEA,001w000001kgfaR,2020-09-19 00:00:00,3 Months,61167,1073,37468,...,0,0,0,0,0,0,0,-1.000000,1000,180


In [35]:
has_won = df.groupby('account_id', as_index=False).sum('class').loc[:, ['account_id', 'class']]
has_won['has_won'] = has_won['class'].apply(lambda x: True if x > 0 else False)
has_won.drop('class', axis=1, inplace=True)
new_df = df.merge(has_won, on='account_id')
df_did_win, df_did_not_win = new_df[new_df['has_won']], new_df[~new_df['has_won']]
df_did_win = df_did_win[df_did_win['class'] == 1].groupby('account_id', as_index=False).min('relevant_date')
df_did_not_win = df_did_not_win.groupby('account_id').sample(n=1, random_state=2)
df = pd.concat([df_did_win, df_did_not_win])
df = df.sample(frac=1, random_state=2).reset_index(drop=True)

In [36]:
cols_to_drop = [col for col in df.columns if 'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
X, y = df.drop(cols_to_drop, axis=1).fillna(-1), df['class']
X = process_df(X)

In [37]:
get_cat_feature_names(X)

['territory', 'industry_group', 'total_employees_range']

In [38]:
cbc = CatBoostClassifier(cat_features=get_cat_feature_names(X), auto_class_weights='Balanced', verbose=0,
                         random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                         bootstrap_type='Bayesian')

In [39]:
X = X.reindex(sorted(X.columns), axis=1)

In [40]:
cbc.fit(X, y)

<catboost.core.CatBoostClassifier at 0x148ef0ac0>

In [41]:
df_for_predict = load_data_old('predict.sql')
cols_to_drop = [col for col in df_for_predict.columns if
                'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
df_for_predict_clean = df_for_predict.drop(cols_to_drop, axis=1)
df_for_predict_clean = process_df(df_for_predict_clean)

In [42]:
df_for_predict_clean = df_for_predict_clean.reindex(sorted(df_for_predict_clean.columns), axis=1)
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,-1,1.301997,1.014674,580153,-0.020795,-0.127819,28766,0.001388,0.069750,-1,...,0.00,-0.25,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1
1,-1,1.225902,1.472669,181474,0.056865,0.216213,916,0.063421,0.189886,-1,...,0.00,0.00,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1
2,-1,1.600799,1.038462,97716,0.011116,0.020427,81,0.012658,-0.016484,-1,...,0.00,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1
3,-1,0.948401,1.187500,11065,0.016107,0.071429,19,0.000000,0.027778,-1,...,0.00,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1
4,-1,1.854750,2.403682,1417617,-0.003362,0.050197,20501,-0.007622,0.045593,17,...,0.00,0.00,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.046034,1.021797,376613,0.025522,0.081320,1547,0.043820,0.096888,-1,...,0.25,0.25,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1
2050,-1,1.002824,1.000000,11008,0.003378,0.026594,3,0.000000,0.000000,-1,...,0.00,0.00,-1.0,0.500000,-1,EMEA,251-1K,8.0,0,-1
2051,2,1.462223,1.840855,109734,0.053731,0.131497,775,0.100434,0.361883,-1,...,0.00,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0
2052,-1,1.041752,1.025000,79917,0.014928,0.042398,82,0.006173,0.025641,-1,...,0.00,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1


In [43]:
X

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,2,2.875758,2.085175,1776094,-0.029869,0.126686,7736,-0.032646,0.112310,-1,...,0.0,0.0,-1.0,0.0,0,Americas,11-50,5.0,0,0
1,-1,0.999901,1.023256,30151,0.013675,0.082525,44,0.078947,0.010766,-1,...,0.0,0.0,-1.0,0.0,-1,Americas,51-250,2.0,0,-1
2,-1,1.042621,1.210843,40559,-0.138315,2.592367,201,-0.153844,3.980336,12,...,0.0,0.0,-1.0,0.0,-1,APAC,5K-10K,6.0,0,-1
3,-1,1.020976,1.071429,41129,0.019082,0.121332,15,0.035714,0.410714,-1,...,0.0,0.0,-1.0,0.0,-1,Americas,251-1K,3.0,0,-1
4,-1,2.455746,2.124444,28273,0.546472,2.911952,478,0.214738,0.888758,-1,...,0.0,0.0,-1.0,0.5,-1,EMEA,11-50,13.0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648,-1,1.744133,1.170863,805793,0.007835,0.040676,1953,-0.030829,0.129886,-1,...,0.0,0.0,-1.0,0.5,-1,EMEA,50K-100K,7.0,0,-1
2649,-1,1.533460,1.238035,288315,0.120742,0.548818,983,0.266244,0.446162,-1,...,0.0,0.0,-1.0,-1.0,-1,APAC,1K-5K,3.0,0,-1
2650,-1,1.110468,1.257426,445994,0.044666,-0.018604,1778,0.043788,0.015621,18,...,0.5,0.0,-1.0,0.0,-1,EMEA,1K-5K,9.0,4,-1
2651,-1,1.030727,1.006472,8185,0.179138,2.938783,933,0.010141,0.935121,-1,...,0.0,0.5,-1.0,0.0,-1,EMEA,100K+,6.0,0,-1


In [44]:
df_for_predict_clean['class_pred_proba'] = cbc.predict_proba(df_for_predict_clean)[:, 1]

In [45]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba
0,-1,1.301997,1.014674,580153,-0.020795,-0.127819,28766,0.001388,0.069750,-1,...,-0.25,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.066314
1,-1,1.225902,1.472669,181474,0.056865,0.216213,916,0.063421,0.189886,-1,...,0.00,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1,0.065718
2,-1,1.600799,1.038462,97716,0.011116,0.020427,81,0.012658,-0.016484,-1,...,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.066057
3,-1,0.948401,1.187500,11065,0.016107,0.071429,19,0.000000,0.027778,-1,...,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.065935
4,-1,1.854750,2.403682,1417617,-0.003362,0.050197,20501,-0.007622,0.045593,17,...,0.00,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1,0.071332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.046034,1.021797,376613,0.025522,0.081320,1547,0.043820,0.096888,-1,...,0.25,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1,0.066556
2050,-1,1.002824,1.000000,11008,0.003378,0.026594,3,0.000000,0.000000,-1,...,0.00,-1.0,0.500000,-1,EMEA,251-1K,8.0,0,-1,0.067442
2051,2,1.462223,1.840855,109734,0.053731,0.131497,775,0.100434,0.361883,-1,...,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.067547
2052,-1,1.041752,1.025000,79917,0.014928,0.042398,82,0.006173,0.025641,-1,...,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.066001


In [46]:
####################
# WHAT IF ANALYSIS #
####################

processed_df_for_fit = X
processed_df_for_fit['class'] = y
processed_df_for_fit['class_pred_proba'] = cbc.predict_proba(X)[:, 1]

In [47]:
high_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.85)
low_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.7)
processed_df_for_fit['class_pred'] = processed_df_for_fit['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [48]:
high_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.85)
low_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.7)
df_for_predict_clean['class_pred'] = df_for_predict_clean['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [49]:
processed_df_for_fit

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class,class_pred_proba,class_pred
0,2,2.875758,2.085175,1776094,-0.029869,0.126686,7736,-0.032646,0.112310,-1,...,0.0,0,Americas,11-50,5.0,0,0,0,0.066705,Low
1,-1,0.999901,1.023256,30151,0.013675,0.082525,44,0.078947,0.010766,-1,...,0.0,-1,Americas,51-250,2.0,0,-1,0,0.065808,Low
2,-1,1.042621,1.210843,40559,-0.138315,2.592367,201,-0.153844,3.980336,12,...,0.0,-1,APAC,5K-10K,6.0,0,-1,0,0.067767,Medium
3,-1,1.020976,1.071429,41129,0.019082,0.121332,15,0.035714,0.410714,-1,...,0.0,-1,Americas,251-1K,3.0,0,-1,0,0.066022,Low
4,-1,2.455746,2.124444,28273,0.546472,2.911952,478,0.214738,0.888758,-1,...,0.5,-1,EMEA,11-50,13.0,0,-1,0,0.066101,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648,-1,1.744133,1.170863,805793,0.007835,0.040676,1953,-0.030829,0.129886,-1,...,0.5,-1,EMEA,50K-100K,7.0,0,-1,0,0.066008,Low
2649,-1,1.533460,1.238035,288315,0.120742,0.548818,983,0.266244,0.446162,-1,...,-1.0,-1,APAC,1K-5K,3.0,0,-1,0,0.067337,Medium
2650,-1,1.110468,1.257426,445994,0.044666,-0.018604,1778,0.043788,0.015621,18,...,0.0,-1,EMEA,1K-5K,9.0,4,-1,0,0.069183,Medium
2651,-1,1.030727,1.006472,8185,0.179138,2.938783,933,0.010141,0.935121,-1,...,0.0,-1,EMEA,100K+,6.0,0,-1,0,0.066073,Low


In [50]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba,class_pred
0,-1,1.301997,1.014674,580153,-0.020795,-0.127819,28766,0.001388,0.069750,-1,...,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.066314,Low
1,-1,1.225902,1.472669,181474,0.056865,0.216213,916,0.063421,0.189886,-1,...,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1,0.065718,Low
2,-1,1.600799,1.038462,97716,0.011116,0.020427,81,0.012658,-0.016484,-1,...,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.066057,Low
3,-1,0.948401,1.187500,11065,0.016107,0.071429,19,0.000000,0.027778,-1,...,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.065935,Low
4,-1,1.854750,2.403682,1417617,-0.003362,0.050197,20501,-0.007622,0.045593,17,...,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1,0.071332,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.046034,1.021797,376613,0.025522,0.081320,1547,0.043820,0.096888,-1,...,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1,0.066556,Low
2050,-1,1.002824,1.000000,11008,0.003378,0.026594,3,0.000000,0.000000,-1,...,-1.0,0.500000,-1,EMEA,251-1K,8.0,0,-1,0.067442,Medium
2051,2,1.462223,1.840855,109734,0.053731,0.131497,775,0.100434,0.361883,-1,...,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.067547,Medium
2052,-1,1.041752,1.025000,79917,0.014928,0.042398,82,0.006173,0.025641,-1,...,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.066001,Low


In [51]:
top_model = cbc

In [52]:
scaler = StandardScaler()
bad_accounts = df_for_predict_clean[df_for_predict_clean['class_pred'] != 'High']

In [53]:
processed_df_for_fit['class_diff'] = processed_df_for_fit.apply(lambda row: 1 if
((row['class'] == 1) & (row['class_pred_proba'] < 0.5))
or ((row['class'] == 0) & (row['class_pred_proba'] >= 0.5))
else 0, axis=1)
print('diff in classes:')
print(np.sum(processed_df_for_fit['class_diff']))
print('out of')
print(processed_df_for_fit.shape[0])
train_data_for_whatif = processed_df_for_fit.loc[processed_df_for_fit['class_pred'] == 'High', :].drop(
    ['class_diff', 'class_pred_proba', 'class', 'class_pred'], axis=1)
cat_cols = get_cat_feature_names(train_data_for_whatif)
train_data_for_whatif['cat_val'] = train_data_for_whatif[cat_cols].apply(
    lambda row: '_'.join(row.values.astype(str)), axis=1)

bad_accounts['cat_val'] = bad_accounts[cat_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
bad_accounts = bad_accounts.drop(['class_pred'], axis=1)
bad_accounts = bad_accounts.drop(cat_cols, axis=1)
train_data_for_whatif = train_data_for_whatif.drop(cat_cols, axis=1)

diff in classes:
0
out of
2653



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [54]:
exclude_for_analysis = []

In [62]:
for index, row in bad_accounts.iterrows():
    row_trans = pd.DataFrame(row)
    print(row['cat_val'])
    train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cat_val'] == row['cat_val'], :]
    if train_data_subset.shape[0] == 0:
        train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cat_val'] == '-1_-1_-1', :]

    print(train_data_subset.shape[0])
    train_data_subset_w_instance = pd.concat([train_data_subset, row_trans.transpose()])
    train_data_subset_w_instance = train_data_subset_w_instance.drop(['cat_val'], axis=1)
    df_whatif_scaled = pd.DataFrame(scaler.fit_transform(train_data_subset_w_instance),
                                    columns=train_data_subset_w_instance.columns)
    df_whatif_scaled = df_whatif_scaled.fillna(0)
    sample = df_whatif_scaled.iloc[-1]
    df_whatif_scaled_wo_sample = df_whatif_scaled.iloc[:-1, :]
    dists = [cityblock(sample, df_whatif_scaled_wo_sample.iloc[i]) for i in
             (range(df_whatif_scaled_wo_sample.shape[0]))]
    closest_obs = train_data_subset.iloc[[np.argmin(dists)], range(train_data_subset.shape[1])].drop('cat_val', axis=1)
    df_concat_for_shap = pd.concat([closest_obs, row_trans.transpose()], axis=0)
    shap_values_total = shap.TreeExplainer(top_model).shap_values(df_concat_for_shap)
    shap_diff = np.subtract(shap_values_total[0], shap_values_total[1])
    max_diff_loc = np.argmax(shap_diff)
    found_feature = False
    while not found_feature:
        feature_name = list(train_data_subset.columns)[max_diff_loc]
        nb_val_bigger = int(row.iloc[max_diff_loc]) > int(closest_obs.iloc[0, max_diff_loc])
        nb_val_equal = int(row.iloc[max_diff_loc]) == int(closest_obs.iloc[0, max_diff_loc])
        str_for_parse = " to be bigger" if nb_val_bigger else " to be smaller"
        if not nb_val_equal and feature_name not in exclude_for_analysis:
            print("You should change the feature " + feature_name + str_for_parse)
            found_feature = True
        else:
            shap_diff[np.argmax(shap_diff)] = -100000000000
            max_diff_loc = np.argmax(shap_diff)



Software & Services_EMEA_251-1K
385


CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=43]=6.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.