In [2]:
import pandas as pd
from catboost import CatBoostClassifier
import numpy as np
from scipy.spatial.distance import cityblock
from sklearn.preprocessing import StandardScaler
from utils.general_utils import load_data_old, get_cat_feature_names
from utils.model_extensions_utils import FocalLossObjective
import shap
from utils.fe_utils import get_growth_features

In [3]:
def process_df(X_temp):
    tech_cols = ['maven', 'generic', 'docker', 'npm', 'pypi', 'gradle', 'nuget']
    usage_cols = tech_cols + ['artifacts_count', 'artifacts_size', 'binaries_count', 'binaries_size', 'items_count',
                              'number_of_permissions', 'internal_groups', 'number_of_users', 'n_env', 'n_tech',
                              'n_repos']
    X_temp['n_tech'] = (X_temp[tech_cols] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.1'] = (X_temp[[col + '.1' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.2'] = (X_temp[[col + '.2' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.3'] = (X_temp[[col + '.3' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.4'] = (X_temp[[col + '.4' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_repos'] = (X_temp[tech_cols]).sum(axis=1)
    X_temp['n_repos.1'] = (X_temp[[col + '.1' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.2'] = (X_temp[[col + '.2' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.3'] = (X_temp[[col + '.3' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.4'] = (X_temp[[col + '.4' for col in tech_cols]]).sum(axis=1)

    for col in usage_cols:
        growth_feature_monthly, growth_feature_quarter, df_fg = get_growth_features(col, X_temp.copy())
        X_temp[col + '_monthly_growth'] = growth_feature_monthly
        X_temp[col + '_quarter_growth'] = growth_feature_quarter

    # - transform to category
    cat_features = get_cat_feature_names(X_temp)
    for col in cat_features:
        X_temp[col] = X_temp[col].astype('category')

    # - drop usage features from the periods before the relevant-date
    cols_to_drop = [col for col in X_temp.columns if '.1' in col or '.2' in col or '.3' in col or '.4' in col]
    X_temp = X_temp.drop(cols_to_drop, axis=1)
    X_temp['artifacts/binaries_size'] = np.where(X_temp['binaries_size'] == 0, 0,
                                                 X_temp['artifacts_size'] / X_temp['binaries_size'])
    X_temp['artifacts/binaries_count'] = np.where(X_temp['binaries_count'] == 0, 0,
                                                  X_temp['artifacts_count'] / X_temp['binaries_count'])
    X_temp = X_temp.drop(['total_employees_with_details', 'days_from_contact_added', 'territory', 'industry_group',
                          'total_employees_range'], axis=1)
    return X_temp

In [4]:
df = load_data_old('fit.sql')
df

Unnamed: 0,account_id,relevant_date,class,territory,account_id.1,relevant_date.1,period_range,artifacts_count,artifacts_size,binaries_count,...,n_ent_mentioned_sessions,n_competitor_mentioned_sessions,n_xray_mentioned_sessions,n_replys,n_sent,n_calls,n_task_xray,replys_to_sent,days_since_reply,days_since_sent
0,0011r00001iACCU,2018-08-25 00:00:00,0,EMEA,0011r00001iACCU,2018-08-25 00:00:00,3 Months,1289272,14066,638194,...,0,0,0,0,0,0,0,-1.000000,1000,1000
1,0011r00001iACCU,2019-08-09 00:00:00,0,EMEA,0011r00001iACCU,2019-08-09 00:00:00,3 Months,1844459,27343,875317,...,0,0,0,0,1,0,0,0.000000,259,85
2,0011r00001iACCU,2020-08-27 00:00:00,0,EMEA,0011r00001iACCU,2020-08-27 00:00:00,3 Months,515803,35022,397341,...,0,0,1,1,6,0,0,0.166667,9,9
3,0011r00001iACCU,2021-07-30 00:00:00,0,EMEA,0011r00001iACCU,2021-07-30 00:00:00,3 Months,642349,26485,459664,...,0,0,0,0,0,0,0,-1.000000,158,161
4,0011r00001iACGM,2019-08-30 00:00:00,0,EMEA,0011r00001iACGM,2019-08-30 00:00:00,3 Months,56486,234,48490,...,0,0,0,0,2,0,0,0.000000,1000,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5877,001w000001kgU5v,2018-08-30 00:00:00,0,Americas,001w000001kgU5v,2018-08-30 00:00:00,3 Months,132,0,128,...,0,0,0,0,0,0,0,-1.000000,1000,1000
5878,001w000001kgfaR,2018-10-16 00:00:00,0,EMEA,001w000001kgfaR,2018-10-16 00:00:00,3 Months,24491,623,14349,...,0,0,0,0,0,0,0,-1.000000,1000,1000
5879,001w000001kgfaR,2019-10-15 00:00:00,0,EMEA,001w000001kgfaR,2019-10-15 00:00:00,3 Months,36144,872,22216,...,0,0,0,0,0,0,0,-1.000000,1000,152
5880,001w000001kgfaR,2020-09-19 00:00:00,0,EMEA,001w000001kgfaR,2020-09-19 00:00:00,3 Months,61167,1073,37468,...,0,0,0,0,0,0,0,-1.000000,1000,180


In [5]:
has_won = df.groupby('account_id', as_index=False).sum('class').loc[:, ['account_id', 'class']]
has_won['has_won'] = has_won['class'].apply(lambda x: True if x > 0 else False)
has_won.drop('class', axis=1, inplace=True)
new_df = df.merge(has_won, on='account_id')
df_did_win, df_did_not_win = new_df[new_df['has_won']], new_df[~new_df['has_won']]
df_did_win = df_did_win[df_did_win['class'] == 1].groupby('account_id', as_index=False).min('relevant_date')
df_did_not_win = df_did_not_win.groupby('account_id').sample(n=1, random_state=2)
df = pd.concat([df_did_win, df_did_not_win])
df = df.sample(frac=1, random_state=2).reset_index(drop=True)

In [6]:
cols_to_drop = [col for col in df.columns if 'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
X, y = df.drop(cols_to_drop, axis=1).fillna(-1), df['class']
X = process_df(X)

In [7]:
get_cat_feature_names(X)

[]

In [8]:
cbc = CatBoostClassifier(cat_features=get_cat_feature_names(X), auto_class_weights='Balanced', verbose=0,
                         random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                         bootstrap_type='Bayesian')

In [9]:
X = X.reindex(sorted(X.columns), axis=1)

In [10]:
cbc.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7fa5c87579a0>

In [11]:
df_for_predict = load_data_old('predict.sql')
cols_to_drop = [col for col in df_for_predict.columns if
                'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
df_for_predict_clean = df_for_predict.drop(cols_to_drop, axis=1)
df_for_predict_clean = process_df(df_for_predict_clean)

In [12]:
df_for_predict_clean = df_for_predict_clean.reindex(sorted(df_for_predict_clean.columns), axis=1)
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,number_of_users_quarter_growth,pricing_views,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,unresolved_jira_cases,xray_views
0,-1,1.301014,1.014757,580579,-0.012625,-0.130641,28743,-0.002787,0.065690,-1,...,0.008696,-1,1,0.00,-0.25,-1.0,0.333333,-1,0,-1
1,-1,1.226310,1.476726,182378,0.056636,0.215420,920,0.061820,0.189758,-1,...,0.093567,-1,1,0.00,0.00,-1.0,0.000000,-1,0,-1
2,-1,1.595081,1.051282,98324,0.013922,0.022989,82,0.018831,-0.010073,-1,...,0.027655,-1,0,0.00,0.00,-1.0,0.500000,-1,0,-1
3,-1,0.945803,1.117647,11099,0.016577,0.070325,19,0.000000,0.057190,-1,...,0.000000,-1,1,0.00,0.00,-1.0,0.000000,-1,0,-1
4,-1,1.853821,2.401145,1421458,-0.000187,0.048232,20549,-0.004205,0.043428,17,...,0.053678,-1,12,0.00,0.00,-1.0,0.500000,-1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048,-1,1.045947,1.021768,377680,0.024902,0.080763,1549,0.040114,0.095081,-1,...,0.057487,-1,1,-0.25,0.25,-1.0,0.250000,-1,0,-1
2049,-1,1.002824,1.000000,11009,0.003286,0.026199,3,0.000000,0.000000,-1,...,0.000000,-1,0,0.00,0.00,-1.0,0.625000,-1,0,-1
2050,2,1.463059,1.842723,110638,0.054630,0.131814,785,0.099802,0.364108,-1,...,0.195603,3,0,0.00,0.00,-1.0,0.000000,0,0,0
2051,-1,1.042163,1.012346,80233,0.016749,0.040836,82,0.006173,0.025404,-1,...,0.021535,-1,2,0.00,0.50,-1.0,0.333333,-1,0,-1


In [13]:
X

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,number_of_users_quarter_growth,pricing_views,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,unresolved_jira_cases,xray_views
0,-1,1.010826,1.005556,33612,0.006322,0.022357,181,0.028839,0.102380,-1,...,0.000000,-1,0.0,0.00,0.00,-1.0,-1.000000,-1,0,-1
1,0,1.072605,1.126799,133136,0.110948,0.108732,1253,0.061992,-0.048989,7,...,0.072050,0,1.0,-0.25,0.00,-1.0,0.500000,0,1,0
2,-1,1.010057,1.000000,425446,-0.192083,-0.015428,723,-0.156684,-0.106017,-1,...,-0.070513,-1,1.0,0.00,0.00,-1.0,-1.000000,-1,0,-1
3,-1,1.021839,1.184615,33923,0.044085,0.130560,77,0.064162,0.324329,18,...,0.110256,-1,1.0,0.00,-0.25,-1.0,0.444444,-1,0,-1
4,-1,1.824033,2.200704,151382,0.040462,0.305511,2500,0.048097,0.339663,-1,...,0.103726,-1,0.0,0.00,0.00,-1.0,0.000000,-1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2652,-1,1.028372,1.000000,16238,0.066810,0.226497,146,0.094142,0.552819,-1,...,0.031250,-1,0.0,0.00,0.00,-1.0,0.500000,-1,0,-1
2653,-1,1.054695,1.476603,479153,0.049289,0.311261,1704,0.005481,0.460036,-1,...,0.038496,-1,1.0,0.00,0.00,-1.0,0.142857,-1,1,-1
2654,-1,1.018148,1.007463,46508,-0.033047,-0.101608,135,-0.043601,-0.142927,0,...,0.095604,-1,1.0,0.25,0.00,-1.0,-1.000000,-1,0,-1
2655,-1,1.002872,1.000000,3492,0.000024,0.070046,2,0.000000,0.500000,-1,...,0.000000,-1,0.0,0.00,0.00,-1.0,-1.000000,-1,0,-1


In [14]:
df_for_predict_clean['class_pred_proba'] = cbc.predict_proba(df_for_predict_clean)[:, 1]

In [15]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pricing_views,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,unresolved_jira_cases,xray_views,class_pred_proba
0,-1,1.301014,1.014757,580579,-0.012625,-0.130641,28743,-0.002787,0.065690,-1,...,-1,1,0.00,-0.25,-1.0,0.333333,-1,0,-1,0.332641
1,-1,1.226310,1.476726,182378,0.056636,0.215420,920,0.061820,0.189758,-1,...,-1,1,0.00,0.00,-1.0,0.000000,-1,0,-1,0.279469
2,-1,1.595081,1.051282,98324,0.013922,0.022989,82,0.018831,-0.010073,-1,...,-1,0,0.00,0.00,-1.0,0.500000,-1,0,-1,0.329004
3,-1,0.945803,1.117647,11099,0.016577,0.070325,19,0.000000,0.057190,-1,...,-1,1,0.00,0.00,-1.0,0.000000,-1,0,-1,0.178409
4,-1,1.853821,2.401145,1421458,-0.000187,0.048232,20549,-0.004205,0.043428,17,...,-1,12,0.00,0.00,-1.0,0.500000,-1,0,-1,0.594196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048,-1,1.045947,1.021768,377680,0.024902,0.080763,1549,0.040114,0.095081,-1,...,-1,1,-0.25,0.25,-1.0,0.250000,-1,0,-1,0.424374
2049,-1,1.002824,1.000000,11009,0.003286,0.026199,3,0.000000,0.000000,-1,...,-1,0,0.00,0.00,-1.0,0.625000,-1,0,-1,0.272718
2050,2,1.463059,1.842723,110638,0.054630,0.131814,785,0.099802,0.364108,-1,...,3,0,0.00,0.00,-1.0,0.000000,0,0,0,0.307100
2051,-1,1.042163,1.012346,80233,0.016749,0.040836,82,0.006173,0.025404,-1,...,-1,2,0.00,0.50,-1.0,0.333333,-1,0,-1,0.357633


In [16]:
####################
# WHAT IF ANALYSIS #
####################

processed_df_for_fit = X
processed_df_for_fit['class'] = y
processed_df_for_fit['class_pred_proba'] = cbc.predict_proba(X)[:, 1]

In [17]:
high_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.85)
low_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.7)
processed_df_for_fit['class_pred'] = processed_df_for_fit['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [18]:
high_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.85)
low_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.7)
df_for_predict_clean['class_pred'] = df_for_predict_clean['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [19]:
processed_df_for_fit

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,unresolved_jira_cases,xray_views,class,class_pred_proba,class_pred
0,-1,1.010826,1.005556,33612,0.006322,0.022357,181,0.028839,0.102380,-1,...,0.00,0.00,-1.0,-1.000000,-1,0,-1,0,0.168478,Low
1,0,1.072605,1.126799,133136,0.110948,0.108732,1253,0.061992,-0.048989,7,...,-0.25,0.00,-1.0,0.500000,0,1,0,0,0.402800,Medium
2,-1,1.010057,1.000000,425446,-0.192083,-0.015428,723,-0.156684,-0.106017,-1,...,0.00,0.00,-1.0,-1.000000,-1,0,-1,0,0.269789,Low
3,-1,1.021839,1.184615,33923,0.044085,0.130560,77,0.064162,0.324329,18,...,0.00,-0.25,-1.0,0.444444,-1,0,-1,1,0.710284,High
4,-1,1.824033,2.200704,151382,0.040462,0.305511,2500,0.048097,0.339663,-1,...,0.00,0.00,-1.0,0.000000,-1,0,-1,0,0.245215,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2652,-1,1.028372,1.000000,16238,0.066810,0.226497,146,0.094142,0.552819,-1,...,0.00,0.00,-1.0,0.500000,-1,0,-1,0,0.277072,Low
2653,-1,1.054695,1.476603,479153,0.049289,0.311261,1704,0.005481,0.460036,-1,...,0.00,0.00,-1.0,0.142857,-1,1,-1,0,0.502036,High
2654,-1,1.018148,1.007463,46508,-0.033047,-0.101608,135,-0.043601,-0.142927,0,...,0.25,0.00,-1.0,-1.000000,-1,0,-1,0,0.265595,Low
2655,-1,1.002872,1.000000,3492,0.000024,0.070046,2,0.000000,0.500000,-1,...,0.00,0.00,-1.0,-1.000000,-1,0,-1,0,0.173082,Low


In [20]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,unresolved_jira_cases,xray_views,class_pred_proba,class_pred
0,-1,1.301014,1.014757,580579,-0.012625,-0.130641,28743,-0.002787,0.065690,-1,...,1,0.00,-0.25,-1.0,0.333333,-1,0,-1,0.332641,Low
1,-1,1.226310,1.476726,182378,0.056636,0.215420,920,0.061820,0.189758,-1,...,1,0.00,0.00,-1.0,0.000000,-1,0,-1,0.279469,Low
2,-1,1.595081,1.051282,98324,0.013922,0.022989,82,0.018831,-0.010073,-1,...,0,0.00,0.00,-1.0,0.500000,-1,0,-1,0.329004,Low
3,-1,0.945803,1.117647,11099,0.016577,0.070325,19,0.000000,0.057190,-1,...,1,0.00,0.00,-1.0,0.000000,-1,0,-1,0.178409,Low
4,-1,1.853821,2.401145,1421458,-0.000187,0.048232,20549,-0.004205,0.043428,17,...,12,0.00,0.00,-1.0,0.500000,-1,0,-1,0.594196,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048,-1,1.045947,1.021768,377680,0.024902,0.080763,1549,0.040114,0.095081,-1,...,1,-0.25,0.25,-1.0,0.250000,-1,0,-1,0.424374,Medium
2049,-1,1.002824,1.000000,11009,0.003286,0.026199,3,0.000000,0.000000,-1,...,0,0.00,0.00,-1.0,0.625000,-1,0,-1,0.272718,Low
2050,2,1.463059,1.842723,110638,0.054630,0.131814,785,0.099802,0.364108,-1,...,0,0.00,0.00,-1.0,0.000000,0,0,0,0.307100,Low
2051,-1,1.042163,1.012346,80233,0.016749,0.040836,82,0.006173,0.025404,-1,...,2,0.00,0.50,-1.0,0.333333,-1,0,-1,0.357633,Low


In [21]:
top_model = cbc

In [22]:
scaler = StandardScaler()
bad_accounts = df_for_predict_clean[df_for_predict_clean['class_pred'] != 'High']

In [23]:
processed_df_for_fit['class_diff'] = processed_df_for_fit.apply(lambda row: 1 if
((row['class'] == 1) & (row['class_pred_proba'] < 0.5))
or ((row['class'] == 0) & (row['class_pred_proba'] >= 0.5))
else 0, axis=1)
# Predict class for train data (Test: how many are in different class from og label to predicted?)
print('diff in classes:')
print(np.sum(processed_df_for_fit['class_diff']))
print('out of')
print(processed_df_for_fit.shape[0])

diff in classes:
5
out of
2657


In [24]:
# Filter only high class (Test: how many are high class and how many are the rest?)
print("The value counts of the predicted rating for the data of the train")
print(processed_df_for_fit['class_pred'].value_counts())

train_data_for_whatif = processed_df_for_fit.loc[processed_df_for_fit['class_pred'] == 'High', :].drop(
    ['class_diff', 'class_pred_proba', 'class', 'class_pred'], axis=1)

# For both train data and new data, add column for categorical features (Test: Value counts the cat val column, both for train and test)
cat_cols = get_cat_feature_names(train_data_for_whatif)
train_data_for_whatif['cat_val'] = train_data_for_whatif[cat_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
bad_accounts['cat_val'] = bad_accounts[cat_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
print("cat val column of train data:")
print(train_data_for_whatif['cat_val'].value_counts())
print("cat val column of real time data:")
print(bad_accounts['cat_val'].value_counts())

bad_accounts = bad_accounts.drop(['class_pred'], axis=1)
bad_accounts = bad_accounts.drop(cat_cols, axis=1)
train_data_for_whatif = train_data_for_whatif.drop(cat_cols, axis=1)

The value counts of the predicted rating for the data of the train
Low       1860
High       399
Medium     398
Name: class_pred, dtype: int64
cat val column of train data:
    399
Name: cat_val, dtype: int64
cat val column of real time data:
    1745
Name: cat_val, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [25]:
exclude_for_analysis = []

In [26]:
for index, row in bad_accounts.iterrows():
    row_trans = pd.DataFrame(row)
    # For each instance of new data in iteration, bring train data of same categorical values
    train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cat_val'] == row['cat_val'], :]
    if train_data_subset.shape[0] == 0:
        train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cat_val'] == '-1_-1_-1', :]

    # Attach the subset of train data with the current instance
    train_data_subset_w_instance = pd.concat([train_data_subset, row_trans.transpose()])

    # Remove the newly added categorical column
    train_data_subset_w_instance = train_data_subset_w_instance.drop(['cat_val'], axis=1)

    # Scale the concated df
    df_whatif_scaled = pd.DataFrame(scaler.fit_transform(train_data_subset_w_instance),
                                    columns=train_data_subset_w_instance.columns)
    df_whatif_scaled = df_whatif_scaled.fillna(0)

    # Find nearest neighbour for the instance we care for from the other data set
    sample = df_whatif_scaled.iloc[-1]
    df_whatif_scaled_wo_sample = df_whatif_scaled.iloc[:-1, :]
    dists = [cityblock(sample, df_whatif_scaled_wo_sample.iloc[i]) for i in
             (range(df_whatif_scaled_wo_sample.shape[0]))]
    closest_obs = train_data_subset.iloc[[np.argmin(dists)], range(train_data_subset.shape[1])].drop('cat_val', axis=1)

    # Calculate shap values for the neighboor's features and the current instance's features, calculate diffs
    df_concat_for_shap = pd.concat([closest_obs, row_trans.transpose()], axis=0)
    shap_values_total = shap.TreeExplainer(top_model).shap_values(df_concat_for_shap)

    # Create recommendations based on the top diff feature
    shap_diff = np.subtract(shap_values_total[0], shap_values_total[1])
    max_diff_loc = np.argmax(shap_diff)
    found_feature = False
    while not found_feature:
        feature_name = list(train_data_subset.columns)[max_diff_loc]
        nb_val_bigger = int(row.iloc[max_diff_loc]) > int(closest_obs.iloc[0, max_diff_loc])
        nb_val_equal = int(row.iloc[max_diff_loc]) == int(closest_obs.iloc[0, max_diff_loc])
        str_for_parse = " to be bigger" if nb_val_bigger else " to be smaller"
        if not nb_val_equal and feature_name not in exclude_for_analysis:
            print("You should change the feature " + feature_name + str_for_parse)
            found_feature = True
        else:
            shap_diff[np.argmax(shap_diff)] = -100000000000
            max_diff_loc = np.argmax(shap_diff)

You should change the feature days_from_permissions_change to be bigger
You should change the feature days_since_reply to be bigger
You should change the feature days_from_permissions_change to be bigger
You should change the feature days_from_permissions_change to be smaller
You should change the feature company_age to be bigger
You should change the feature docker to be smaller
You should change the feature days_since_reply to be bigger
You should change the feature days_since_reply to be bigger
You should change the feature days_since_reply to be bigger
You should change the feature days_from_permissions_change to be bigger
You should change the feature days_from_users_change to be bigger
You should change the feature days_since_reply to be bigger
You should change the feature engineers to be smaller
You should change the feature artifacts/binaries_size to be smaller
You should change the feature n_tech to be smaller
You should change the feature days_from_users_change to be bigger


ValueError: cannot convert float NaN to integer