In [99]:
import pandas as pd
from catboost import CatBoostClassifier
import numpy as np
from scipy.spatial.distance import cityblock
from sklearn.preprocessing import StandardScaler
from utils.general_utils import load_data_old, get_cat_feature_names
from utils.model_extensions_utils import FocalLossObjective
import shap
from utils.fe_utils import get_growth_features
from kmodes.kmodes import KModes
from tqdm import tqdm
import json

In [100]:
def process_df(X_temp):
    tech_cols = ['maven', 'generic', 'docker', 'npm', 'pypi', 'gradle', 'nuget']
    usage_cols = tech_cols + ['artifacts_count', 'artifacts_size', 'binaries_count', 'binaries_size', 'items_count',
                              'number_of_permissions', 'internal_groups', 'number_of_users', 'n_env', 'n_tech',
                              'n_repos']

    X_temp['n_tech'] = (X_temp[tech_cols] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.1'] = (X_temp[[col + '.1' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.2'] = (X_temp[[col + '.2' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.3'] = (X_temp[[col + '.3' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.4'] = (X_temp[[col + '.4' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_repos'] = (X_temp[tech_cols]).sum(axis=1)
    X_temp['n_repos.1'] = (X_temp[[col + '.1' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.2'] = (X_temp[[col + '.2' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.3'] = (X_temp[[col + '.3' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.4'] = (X_temp[[col + '.4' for col in tech_cols]]).sum(axis=1)
    for col in usage_cols:
        growth_feature_monthly, growth_feature_quarter, df_fg = get_growth_features(col, X_temp.copy())
        X_temp[col + '_monthly_growth'] = growth_feature_monthly
        X_temp[col + '_quarter_growth'] = growth_feature_quarter

    # - transform to category
    cat_features = get_cat_feature_names(X_temp)
    for col in cat_features:
        X_temp[col] = X_temp[col].astype('category')

    # - drop usage features from the periods before the relevant-date
    cols_to_drop = [col for col in X_temp.columns if '.1' in col or '.2' in col or '.3' in col or '.4' in col]
    X_temp = X_temp.drop(cols_to_drop, axis=1)
    X_temp['artifacts/binaries_size'] = np.where(X_temp['binaries_size'] == 0, 0,
                                                 X_temp['artifacts_size'] / X_temp['binaries_size'])
    X_temp['artifacts/binaries_count'] = np.where(X_temp['binaries_count'] == 0, 0,
                                                  X_temp['artifacts_count'] / X_temp['binaries_count'])
    # cat_features.remove('territory')
    # cat_features.remove('industry_group')
    # X_temp = X_temp.drop(cat_features, axis=1)
    # X_temp = X_temp.drop(['total_employees_with_details', 'days_from_contact_added', 'territory', 'industry_group',
    #                       'total_employees_range'], axis=1)
    return X_temp

In [101]:
df = load_data_old('fit.sql')
# df = pd.read_csv('data/fit.csv', delimiter=';')

In [102]:
df.columns

Index(['account_id', 'relevant_date', 'class', 'territory', 'account_id.1',
       'relevant_date.1', 'period_range', 'artifacts_count', 'artifacts_size',
       'binaries_count',
       ...
       'n_ent_mentioned_sessions', 'n_competitor_mentioned_sessions',
       'n_xray_mentioned_sessions', 'n_replys', 'n_sent', 'n_calls',
       'n_task_xray', 'replys_to_sent', 'days_since_reply', 'days_since_sent'],
      dtype='object', length=188)

In [103]:
has_won = df.groupby('account_id', as_index=False).sum('class').loc[:, ['account_id', 'class']]
has_won['has_won'] = has_won['class'].apply(lambda x: True if x > 0 else False)
has_won.drop('class', axis=1, inplace=True)
new_df = df.merge(has_won[['account_id', 'has_won']], on='account_id')
df_did_win, df_did_not_win = new_df[new_df['has_won']], new_df[~new_df['has_won']]
df_did_win = df_did_win.merge(df_did_win.groupby('account_id', as_index=False).agg({'relevant_date': 'min'}),
                              on=['account_id', 'relevant_date'])
df_did_not_win = df_did_not_win.groupby('account_id', as_index=False).sample(n=1, random_state=2)
df = pd.concat([df_did_win, df_did_not_win])
df = df.sample(frac=1, random_state=2).reset_index(drop=True)

In [104]:
cols_to_drop = [col for col in df.columns if 'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col or 'class_pred_proba' in col or 'class_pred' in col or 'class_diff' in col]
X, y = df.drop(cols_to_drop, axis=1), df['class']

In [105]:
X = process_df(X)

In [106]:
cbc = CatBoostClassifier(cat_features=get_cat_feature_names(X), auto_class_weights='Balanced', verbose=0,
                         random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                         bootstrap_type='Bayesian')
cbc_no_cats = CatBoostClassifier(cat_features=[], auto_class_weights='Balanced', verbose=0,
                                 random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                                 bootstrap_type='Bayesian')

In [107]:
X = X.reindex(sorted(X.columns), axis=1)

In [108]:
X.head()

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,-1,1.062204,1.002037,483819,0.062013,0.142322,4919,0.055707,0.140696,18,...,0.0,0.0,-1.0,0.0,-1,EMEA,5K-10K,8.0,0,-1
1,-1,5.83126,2.887931,49694,0.172579,0.359019,335,0.122308,0.193611,-1,...,0.0,0.0,-1.0,0.5,-1,Americas,51-250,3.0,0,-1
2,-1,1.021197,0.0,2457,0.418271,4.335647,0,0.0,0.0,-1,...,0.0,0.0,-1.0,0.0,-1,Americas,5K-10K,17.0,0,-1
3,0,1.366859,2.322176,132491,0.022089,0.059766,555,0.017667,0.093904,-1,...,0.0,0.0,-1.0,0.0,0,Americas,251-1K,14.0,0,0
4,-1,1.259227,1.269625,14398,0.438131,3.735021,372,0.425771,2.967368,-1,...,0.0,0.0,-1.0,0.4,-1,Americas,11-50,2.0,0,-1


In [109]:
cbc.fit(X, y)
cbc_no_cats.fit(X.drop(get_cat_feature_names(X), axis=1), y)

<catboost.core.CatBoostClassifier at 0x7fe93357a130>

In [110]:
df_for_predict = load_data_old('predict.sql')
# df_for_predict = pd.read_csv('data/predict.csv', delimiter=';')
cols_to_drop = [col for col in df_for_predict.columns if
                'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
df_for_predict_clean = df_for_predict.drop(cols_to_drop, axis=1)
df_for_predict_clean = process_df(df_for_predict_clean)

In [111]:
df_for_predict_clean = df_for_predict_clean.reindex(sorted(df_for_predict_clean.columns), axis=1)
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,-1,1.299624,1.015124,581040,-0.001590,-0.129872,28391,-0.012926,0.049607,-1,...,0.00,0.00,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1
1,-1,1.227835,1.480831,184357,0.055054,0.213569,927,0.058176,0.187031,-1,...,0.00,0.00,-1.0,0.250000,-1,EMEA,11-50,4.0,0,-1
2,-1,1.584233,1.051282,99531,0.019620,0.028304,82,0.012423,-0.010073,-1,...,0.00,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1
3,-1,0.946677,1.117647,11167,0.017653,0.066509,19,0.000000,0.027778,-1,...,0.00,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1
4,-1,1.852663,2.398301,1427608,0.005369,0.044077,20611,0.001652,0.038832,17,...,0.00,0.00,-1.0,0.333333,-1,EMEA,51-250,25.0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,-1,1.045206,1.021753,379278,0.023281,0.078413,1550,0.032952,0.091284,-1,...,-0.25,0.25,-1.0,0.200000,-1,EMEA,51-250,6.0,0,-1
2043,-1,1.002913,1.000000,11019,0.003421,0.025776,3,0.000000,0.000000,-1,...,0.00,0.00,-1.0,0.555556,-1,EMEA,251-1K,8.0,0,-1
2044,1,1.463433,1.843318,112178,0.054128,0.130921,800,0.095192,0.361026,-1,...,0.00,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0
2045,-1,1.043248,1.012346,80737,0.021620,0.042968,82,0.006173,0.025404,-1,...,0.00,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1


In [112]:
cbc.get_cat_feature_indices()

[43, 103, 104]

In [113]:
df_for_predict_clean['class_pred_proba'] = cbc.predict_proba(df_for_predict_clean)[:, 1]

In [114]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba
0,-1,1.299624,1.015124,581040,-0.001590,-0.129872,28391,-0.012926,0.049607,-1,...,0.00,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.185556
1,-1,1.227835,1.480831,184357,0.055054,0.213569,927,0.058176,0.187031,-1,...,0.00,-1.0,0.250000,-1,EMEA,11-50,4.0,0,-1,0.207364
2,-1,1.584233,1.051282,99531,0.019620,0.028304,82,0.012423,-0.010073,-1,...,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.156874
3,-1,0.946677,1.117647,11167,0.017653,0.066509,19,0.000000,0.027778,-1,...,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.184829
4,-1,1.852663,2.398301,1427608,0.005369,0.044077,20611,0.001652,0.038832,17,...,0.00,-1.0,0.333333,-1,EMEA,51-250,25.0,0,-1,0.358735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,-1,1.045206,1.021753,379278,0.023281,0.078413,1550,0.032952,0.091284,-1,...,0.25,-1.0,0.200000,-1,EMEA,51-250,6.0,0,-1,0.210583
2043,-1,1.002913,1.000000,11019,0.003421,0.025776,3,0.000000,0.000000,-1,...,0.00,-1.0,0.555556,-1,EMEA,251-1K,8.0,0,-1,0.156572
2044,1,1.463433,1.843318,112178,0.054128,0.130921,800,0.095192,0.361026,-1,...,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.294523
2045,-1,1.043248,1.012346,80737,0.021620,0.042968,82,0.006173,0.025404,-1,...,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.165239


In [115]:
####################
# WHAT IF ANALYSIS #
####################

processed_df_for_fit = X
processed_df_for_fit['class'] = y
processed_df_for_fit['class_pred_proba'] = cbc.predict_proba(X)[:, 1]

In [116]:
high_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.85)
low_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.7)
processed_df_for_fit['class_pred'] = processed_df_for_fit['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [117]:
high_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.85)
low_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.7)
df_for_predict_clean['class_pred'] = df_for_predict_clean['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [118]:
processed_df_for_fit

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class,class_pred_proba,class_pred
0,-1,1.062204,1.002037,483819,0.062013,0.142322,4919,0.055707,0.140696,18,...,0.000000,-1,EMEA,5K-10K,8.0,0,-1,0,0.356963,High
1,-1,5.831260,2.887931,49694,0.172579,0.359019,335,0.122308,0.193611,-1,...,0.500000,-1,Americas,51-250,3.0,0,-1,0,0.153785,Low
2,-1,1.021197,0.000000,2457,0.418271,4.335647,0,0.000000,0.000000,-1,...,0.000000,-1,Americas,5K-10K,17.0,0,-1,0,0.255831,Low
3,0,1.366859,2.322176,132491,0.022089,0.059766,555,0.017667,0.093904,-1,...,0.000000,0,Americas,251-1K,14.0,0,0,0,0.256460,Low
4,-1,1.259227,1.269625,14398,0.438131,3.735021,372,0.425771,2.967368,-1,...,0.400000,-1,Americas,11-50,2.0,0,-1,0,0.177179,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2790,-1,4.398353,1.047945,77996,0.031556,0.124813,153,0.038071,0.085117,-1,...,-1.000000,-1,EMEA,10K-50K,10.0,0,-1,0,0.230537,Low
2791,-1,1.413096,2.887574,59993,-0.005199,0.012895,488,0.000002,-0.013055,-1,...,0.375000,-1,Americas,251-1K,15.0,0,-1,0,0.268804,Low
2792,-1,1.281619,1.554261,3423187,0.076761,0.235121,10140,0.098865,0.297066,-1,...,-1.000000,-1,EMEA,51-250,9.0,0,-1,0,0.275068,Medium
2793,-1,1.078910,1.282596,202861,0.024082,0.082488,2174,0.061149,0.209359,-1,...,0.333333,-1,EMEA,1K-5K,16.0,0,-1,0,0.207447,Low


In [119]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba,class_pred
0,-1,1.299624,1.015124,581040,-0.001590,-0.129872,28391,-0.012926,0.049607,-1,...,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.185556,Low
1,-1,1.227835,1.480831,184357,0.055054,0.213569,927,0.058176,0.187031,-1,...,-1.0,0.250000,-1,EMEA,11-50,4.0,0,-1,0.207364,Low
2,-1,1.584233,1.051282,99531,0.019620,0.028304,82,0.012423,-0.010073,-1,...,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.156874,Low
3,-1,0.946677,1.117647,11167,0.017653,0.066509,19,0.000000,0.027778,-1,...,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.184829,Low
4,-1,1.852663,2.398301,1427608,0.005369,0.044077,20611,0.001652,0.038832,17,...,-1.0,0.333333,-1,EMEA,51-250,25.0,0,-1,0.358735,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042,-1,1.045206,1.021753,379278,0.023281,0.078413,1550,0.032952,0.091284,-1,...,-1.0,0.200000,-1,EMEA,51-250,6.0,0,-1,0.210583,Low
2043,-1,1.002913,1.000000,11019,0.003421,0.025776,3,0.000000,0.000000,-1,...,-1.0,0.555556,-1,EMEA,251-1K,8.0,0,-1,0.156572,Low
2044,1,1.463433,1.843318,112178,0.054128,0.130921,800,0.095192,0.361026,-1,...,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.294523,High
2045,-1,1.043248,1.012346,80737,0.021620,0.042968,82,0.006173,0.025404,-1,...,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.165239,Low


In [120]:
top_model = cbc

In [121]:
scaler = StandardScaler()
bad_accounts = df_for_predict_clean[df_for_predict_clean['class_pred'] != 'High']
bad_accounts_og_df = df_for_predict[df_for_predict_clean['class_pred'] != 'High']

In [122]:
processed_df_for_fit['class_diff'] = processed_df_for_fit.apply(lambda row: 1 if
((row['class'] == 1) & (row['class_pred_proba'] < 0.5))
or ((row['class'] == 0) & (row['class_pred_proba'] >= 0.5))
else 0, axis=1)

# Predict class for train data (Test: how many are in different class from og label to predicted?)
print('diff in classes:')
print(np.sum(processed_df_for_fit['class_diff']))
print('out of')
print(processed_df_for_fit.shape[0])

diff in classes:
64
out of
2795


In [123]:
# Filter only high class (Test: how many are high class and how many are the rest?)
print("The value counts of the predicted rating for the data of the train")
print(processed_df_for_fit['class_pred'].value_counts(dropna=False))

train_data_for_whatif = processed_df_for_fit.loc[processed_df_for_fit['class_pred'] == 'High', :].drop(
    ['class_diff', 'class_pred_proba', 'class', 'class_pred'], axis=1)

# For both train data and new data, use K-Modes to create a cluster column (only for cat features)
cat_cols = get_cat_feature_names(train_data_for_whatif)
bad_accounts = bad_accounts.drop(['class_pred', 'class_pred_proba'], axis=1)
km = KModes(n_clusters=5, init='Huang', n_init=5, verbose=2, n_jobs=-1, max_iter=5000, random_state=2)
df_for_kmodes = pd.concat([train_data_for_whatif[cat_cols], bad_accounts[cat_cols]])
km.fit(df_for_kmodes)
bad_accounts['cluster'] = km.predict(bad_accounts[cat_cols])
train_data_for_whatif['cluster'] = km.predict(train_data_for_whatif[cat_cols])

bad_accounts = bad_accounts.drop(cat_cols, axis=1)
train_data_for_whatif = train_data_for_whatif.drop(cat_cols, axis=1)

The value counts of the predicted rating for the data of the train
Low       1956
High       420
Medium     419
Name: class_pred, dtype: int64
Best run was number 3


In [124]:
bad_accounts

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,total_employees_with_details,unresolved_jira_cases,xray_views,cluster
0,-1,1.299624,1.015124,581040,-0.001590,-0.129872,28391,-0.012926,0.049607,-1,...,1,0.00,0.00,-1.0,0.333333,-1,11.0,0,-1,0
1,-1,1.227835,1.480831,184357,0.055054,0.213569,927,0.058176,0.187031,-1,...,1,0.00,0.00,-1.0,0.250000,-1,4.0,0,-1,0
2,-1,1.584233,1.051282,99531,0.019620,0.028304,82,0.012423,-0.010073,-1,...,0,0.00,0.00,-1.0,0.500000,-1,3.0,0,-1,3
3,-1,0.946677,1.117647,11167,0.017653,0.066509,19,0.000000,0.027778,-1,...,1,0.00,0.00,-1.0,0.000000,-1,8.0,0,-1,2
5,-1,1.737976,4.618598,93119,0.000968,0.036109,3427,0.000438,0.006644,-1,...,6,0.50,0.50,-1.0,0.250000,-1,3.0,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,-1,2.133634,1.325243,438146,-0.060622,-0.282971,819,0.015334,0.103240,-1,...,2,0.50,0.50,-1.0,0.000000,-1,3.0,0,-1,3
2042,-1,1.045206,1.021753,379278,0.023281,0.078413,1550,0.032952,0.091284,-1,...,1,-0.25,0.25,-1.0,0.200000,-1,6.0,0,-1,3
2043,-1,1.002913,1.000000,11019,0.003421,0.025776,3,0.000000,0.000000,-1,...,0,0.00,0.00,-1.0,0.555556,-1,8.0,0,-1,0
2045,-1,1.043248,1.012346,80737,0.021620,0.042968,82,0.006173,0.025404,-1,...,2,0.00,0.50,-1.0,0.333333,-1,4.0,0,-1,0


In [125]:
train_data_for_whatif

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,total_employees_with_details,unresolved_jira_cases,xray_views,cluster
0,-1,1.062204,1.002037,483819,0.062013,0.142322,4919,0.055707,0.140696,18,...,0.0,0.000,0.000,-1.0,0.000000,-1,8.0,0,-1,0
14,-1,1.225715,1.826087,24681,0.318772,6.534135,84,0.138578,2.513636,18,...,1.0,0.000,0.000,-1.0,0.666667,-1,28.0,0,-1,2
26,0,1.158051,1.503546,166808,0.035976,0.081427,848,0.080182,0.127626,-1,...,3.0,0.000,-0.250,-1.0,0.250000,0,11.0,0,5,0
31,0,1.055284,1.083333,9506,0.098664,0.248446,13,0.683333,1.291667,28,...,1.0,0.000,0.000,-1.0,0.000000,0,12.0,0,2,0
38,2,1.413304,2.300330,91681,14.447174,22.978342,697,13.717908,14.502500,-1,...,0.0,0.000,0.000,-1.0,0.200000,0,9.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2756,-1,1.161397,1.000000,10506,0.897664,0.910573,4,0.500000,1.500000,5,...,0.0,0.000,0.000,-1.0,0.484848,-1,14.0,0,-1,1
2764,-1,3.761033,10.876923,885603,0.035376,0.087365,4242,0.038719,0.067310,-1,...,3.0,0.000,0.000,-1.0,-1.000000,-1,4.0,0,-1,1
2785,0,1.006095,1.294118,157814,0.017031,0.125071,88,0.005747,0.063439,-1,...,3.0,0.250,1.000,-1.0,0.333333,0,26.0,0,0,0
2786,-1,1.161397,1.000000,10506,0.897664,0.910573,4,0.500000,1.500000,5,...,0.0,0.000,0.000,-1.0,0.484848,-1,14.0,0,-1,1


In [126]:
what_if_features = ['days_since_reply', 'n_sent', 'n_calls',
                    'n_task_xray', 'n_replys', 'n_trials', 'n_ent_trials',
                    'unresolved_jira_cases', 'n_training', 'n_poor_cases',
                    'number_of_permissions', 'binaries_count', 'artifacts_count',
                    'binaries_size', 'artifacts_size', 'n_tech']

In [None]:
final_payload = []
i = 0
for index, row in tqdm(bad_accounts.iterrows(), total=(bad_accounts.shape[0])):
    # if i > 100:
    #     break

    row = row.reindex(sorted(row.index), axis=1)
    row_trans = pd.DataFrame(row)
    # For each instance of new data in iteration, bring train data of same categorical values
    train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cluster'] == row['cluster'], :]
    train_data_subset_account_ids = df.loc[processed_df_for_fit['class_pred'] == 'High', :]
    train_data_subset_account_ids = train_data_subset_account_ids.loc[
        train_data_for_whatif['cluster'] == row['cluster'], 'account_id']
    if train_data_subset.shape[0] == 0:
        train_data_subset = train_data_for_whatif

    # Attach the subset of train data with the current instance (Test: whats the shape before and after attach)
    train_data_subset_w_instance = pd.concat([train_data_subset, row_trans.transpose()])

    # Remove the newly added categorical column
    train_data_subset_w_instance = train_data_subset_w_instance.drop(['cluster'], axis=1)

    # Scale the concated df
    df_whatif_scaled = pd.DataFrame(scaler.fit_transform(train_data_subset_w_instance),
                                    columns=train_data_subset_w_instance.columns)
    df_whatif_scaled = df_whatif_scaled.fillna(0)

    # Find nearest neighbour for the instance we care for from the other data set
    sample = df_whatif_scaled.iloc[-1]
    df_whatif_scaled_wo_sample = df_whatif_scaled.iloc[:-1, :]
    dists = [cityblock(sample, df_whatif_scaled_wo_sample.iloc[i]) for i in
             (range(df_whatif_scaled_wo_sample.shape[0]))]
    closest_obs = train_data_subset.iloc[[np.argmin(dists)], range(train_data_subset.shape[1])].drop('cluster', axis=1)
    closest_obs_account_id = train_data_subset_account_ids.iloc[[np.argmin(dists)]]

    # Calculate shap values for the neighbor's features and the current instance's features, calculate diffs
    df_concat_for_shap = pd.concat([closest_obs, row_trans.transpose()], axis=0).drop('cluster', axis=1)
    shap_values_total = shap.TreeExplainer(cbc_no_cats).shap_values(df_concat_for_shap)

    # Create recommendations based on the top diff feature
    shap_diff = np.subtract(shap_values_total[0], shap_values_total[1])
    max_diff_loc = np.argmax(shap_diff)
    found_feature = False
    while not found_feature:
        feature_name = list(train_data_subset.columns)[max_diff_loc]
        curr_val = row.iloc[max_diff_loc]
        nb_val = closest_obs.iloc[0, max_diff_loc]
        if np.isnan(curr_val):
            curr_val = 0
        if np.isnan(nb_val):
            nb_val = 0
        val_diff = nb_val - curr_val
        if val_diff != 0 and feature_name in what_if_features:
            final_payload.append({'what_if': {'account_id': bad_accounts_og_df.loc[index, 'account_id'],
                                              'feature_name': feature_name,
                                              'change_val': val_diff,
                                              'neighbor_id': closest_obs_account_id.iloc[0]}})
            found_feature = True
        else:
            shap_diff[np.argmax(shap_diff)] = np.NINF
            max_diff_loc = np.argmax(shap_diff)

    i += 1

 54%|█████▍    | 939/1740 [04:59<04:27,  2.99it/s]

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/5000, moves: 0, cost: 2516.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/5000, moves: 0, cost: 2429.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/5000, moves: 623, cost: 2336.0
Run 4, iteration: 2/5000, moves: 48, cost: 2336.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/5000, moves: 817, cost: 2113.0
Run 3, iteration: 2/5000, moves: 133, cost: 2113.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/5000, moves: 911, cost: 2390.0
Run 5, iteration: 2/5000, moves: 87, cost: 2390.0


 85%|████████▌ | 1481/1740 [10:17<04:26,  1.03s/it]  

In [None]:
print(len(final_payload))

In [None]:
feature_dict = {}
for dict in final_payload:
    feature_name = dict['what_if']['feature_name']
    change_val = dict['what_if']['change_val']
    if feature_name in feature_dict:
        feature_dict[feature_name] = (feature_dict[feature_name][0] + 1, feature_dict[feature_name][1] + change_val)
    else:
        feature_dict[feature_name] = (1, change_val)
    print(json.dumps(dict, indent=4))

In [131]:
sorted(feature_dict.items(), key=lambda x: x[1][0], reverse=True)

[('days_since_reply', (817, 648978.0)),
 ('unresolved_jira_cases', (411, -2154.0)),
 ('binaries_size', (352, -631929.0)),
 ('number_of_permissions', (64, 535.721274509804)),
 ('artifacts_count', (31, 8512405.0)),
 ('binaries_count', (24, 82899.0)),
 ('artifacts_size', (16, 1691.0)),
 ('n_tech', (9, 44.0)),
 ('n_task_xray', (8, 166.0)),
 ('n_sent', (5, 26.0)),
 ('n_replys', (3, 33.0))]