In [640]:
import pandas as pd
from catboost import CatBoostClassifier
import numpy as np
from scipy.spatial.distance import cityblock
from sklearn.preprocessing import StandardScaler
from utils.general_utils import load_data_old, get_cat_feature_names
from utils.model_extensions_utils import FocalLossObjective
import shap
from utils.fe_utils import get_growth_features
from kmodes.kmodes import KModes

In [641]:
def process_df(X_temp):
    tech_cols = ['maven', 'generic', 'docker', 'npm', 'pypi', 'gradle', 'nuget']
    usage_cols = tech_cols + ['artifacts_count', 'artifacts_size', 'binaries_count', 'binaries_size', 'items_count',
                              'number_of_permissions', 'internal_groups', 'number_of_users', 'n_env', 'n_tech',
                              'n_repos']

    X_temp['n_tech'] = (X_temp[tech_cols] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.1'] = (X_temp[[col + '.1' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.2'] = (X_temp[[col + '.2' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.3'] = (X_temp[[col + '.3' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_tech.4'] = (X_temp[[col + '.4' for col in tech_cols]] != 0).astype(int).sum(axis=1)
    X_temp['n_repos'] = (X_temp[tech_cols]).sum(axis=1)
    X_temp['n_repos.1'] = (X_temp[[col + '.1' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.2'] = (X_temp[[col + '.2' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.3'] = (X_temp[[col + '.3' for col in tech_cols]]).sum(axis=1)
    X_temp['n_repos.4'] = (X_temp[[col + '.4' for col in tech_cols]]).sum(axis=1)
    for col in usage_cols:
        growth_feature_monthly, growth_feature_quarter, df_fg = get_growth_features(col, X_temp.copy())
        X_temp[col + '_monthly_growth'] = growth_feature_monthly
        X_temp[col + '_quarter_growth'] = growth_feature_quarter

    # - transform to category
    cat_features = get_cat_feature_names(X_temp)
    for col in cat_features:
        X_temp[col] = X_temp[col].astype('category')

    # - drop usage features from the periods before the relevant-date
    cols_to_drop = [col for col in X_temp.columns if '.1' in col or '.2' in col or '.3' in col or '.4' in col]
    X_temp = X_temp.drop(cols_to_drop, axis=1)
    X_temp['artifacts/binaries_size'] = np.where(X_temp['binaries_size'] == 0, 0,
                                                 X_temp['artifacts_size'] / X_temp['binaries_size'])
    X_temp['artifacts/binaries_count'] = np.where(X_temp['binaries_count'] == 0, 0,
                                                  X_temp['artifacts_count'] / X_temp['binaries_count'])
    # cat_features.remove('territory')
    # cat_features.remove('industry_group')
    # X_temp = X_temp.drop(cat_features, axis=1)
    # X_temp = X_temp.drop(['total_employees_with_details', 'days_from_contact_added', 'territory', 'industry_group',
    #                       'total_employees_range'], axis=1)
    return X_temp

In [642]:
# df = load_data_old('fit.sql')
df = pd.read_csv('data/fit.csv', delimiter=';')

In [643]:
has_won = df.groupby('account_id', as_index=False).sum('class').loc[:, ['account_id', 'class']]
has_won['has_won'] = has_won['class'].apply(lambda x: True if x > 0 else False)
has_won.drop('class', axis=1, inplace=True)
new_df = df.merge(has_won[['account_id', 'has_won']], on='account_id')
df_did_win, df_did_not_win = new_df[new_df['has_won']], new_df[~new_df['has_won']]
df_did_win = df_did_win.merge(df_did_win.groupby('account_id', as_index=False).agg({'relevant_date': 'min'}),  on=['account_id', 'relevant_date'])
df_did_not_win = df_did_not_win.groupby('account_id', as_index=False).sample(n=1, random_state=2)
df = pd.concat([df_did_win, df_did_not_win])
df = df.sample(frac=1, random_state=2).reset_index(drop=True)

In [644]:
cols_to_drop = [col for col in df.columns if 'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col or 'class_pred_proba' in col or 'class_pred' in col or 'class_diff' in col]
X, y = df.drop(cols_to_drop, axis=1), df['class']

In [645]:
X = process_df(X)

In [646]:
cbc = CatBoostClassifier(cat_features=get_cat_feature_names(X), auto_class_weights='Balanced', verbose=0,
                         random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                         bootstrap_type='Bayesian')
cbc_no_cats = CatBoostClassifier(cat_features=[], auto_class_weights='Balanced', verbose=0,
                         random_state=5, loss_function=FocalLossObjective(), eval_metric="Logloss",
                         bootstrap_type='Bayesian')

In [647]:
X = X.reindex(sorted(X.columns), axis=1)

In [648]:
X.head()

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,-1,0.354839,0.0,11,0.0,0.0,0,0.0,0.0,-1,...,0.0,0.0,-1.0,0.0,-1,Americas,51-250,3.0,0,-1
1,-1,0.999321,1.0,35321,0.064754,0.038032,10,0.055556,-0.322823,41,...,0.0,0.0,-1.0,0.0,-1,Americas,251-1K,17.0,0,-1
2,-1,4.392331,9.666667,27149,0.543505,4.498503,116,0.740681,2.290043,-1,...,0.0,0.0,-1.0,0.0,-1,EMEA,251-1K,7.0,0,-1
3,-1,1.225715,1.826087,24681,0.318772,6.534135,84,0.138578,2.513636,18,...,0.0,0.0,-1.0,0.666667,-1,Americas,1K-5K,28.0,0,-1
4,-1,1.020973,0.933333,12073,0.036175,0.340701,14,0.038462,0.094697,-1,...,0.0,0.0,-1.0,0.25,-1,EMEA,1-10,2.0,0,-1


In [649]:
cbc.fit(X, y)
cbc_no_cats.fit(X.drop(get_cat_feature_names(X), axis=1), y)

<catboost.core.CatBoostClassifier at 0x7f95329b9220>

In [650]:
# df_for_predict = load_data_old('predict.sql')
df_for_predict = pd.read_csv('data/predict.csv', delimiter=';')
cols_to_drop = [col for col in df_for_predict.columns if
                'period_range' in col or 'relevant_date' in col or 'account_id' in col
                or 'class' in col or 'has_won' in col]
df_for_predict_clean = df_for_predict.drop(cols_to_drop, axis=1)
df_for_predict_clean = process_df(df_for_predict_clean)

In [651]:
df_for_predict_clean = df_for_predict_clean.reindex(sorted(df_for_predict_clean.columns), axis=1)
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views
0,-1,1.300404,1.014774,580352,-0.010603,-0.131726,28710,-0.003963,0.063740,-1,...,0.00,-0.25,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1
1,-1,1.226489,1.475962,182585,0.056289,0.214977,921,0.061756,0.189489,-1,...,0.00,0.00,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1
2,-1,1.593653,1.051282,98475,0.014636,0.023652,82,0.012423,-0.010073,-1,...,0.00,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1
3,-1,0.947454,1.117647,11107,0.016707,0.069873,19,0.000000,0.057190,-1,...,0.00,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1
4,-1,1.853653,2.400888,1422037,0.000362,0.047562,20554,-0.003600,0.042657,17,...,0.00,0.00,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.045952,1.021768,377916,0.024678,0.080504,1549,0.039030,0.094248,-1,...,-0.25,0.25,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1
2050,-1,1.002824,1.000000,11009,0.003240,0.026102,3,0.000000,0.000000,-1,...,0.00,0.00,-1.0,0.625000,-1,EMEA,251-1K,8.0,0,-1
2051,2,1.463218,1.843091,110849,0.054692,0.131860,787,0.099509,0.364253,-1,...,0.00,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0
2052,-1,1.042327,1.012346,80304,0.017158,0.040923,82,0.006173,0.025404,-1,...,0.00,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1


In [652]:
cbc.get_cat_feature_indices()

[43, 103, 104]

In [653]:
df_for_predict_clean['class_pred_proba'] = cbc.predict_proba(df_for_predict_clean)[:, 1]

In [654]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba
0,-1,1.300404,1.014774,580352,-0.010603,-0.131726,28710,-0.003963,0.063740,-1,...,-0.25,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.167474
1,-1,1.226489,1.475962,182585,0.056289,0.214977,921,0.061756,0.189489,-1,...,0.00,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1,0.204060
2,-1,1.593653,1.051282,98475,0.014636,0.023652,82,0.012423,-0.010073,-1,...,0.00,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.166906
3,-1,0.947454,1.117647,11107,0.016707,0.069873,19,0.000000,0.057190,-1,...,0.00,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.174760
4,-1,1.853653,2.400888,1422037,0.000362,0.047562,20554,-0.003600,0.042657,17,...,0.00,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1,0.308810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.045952,1.021768,377916,0.024678,0.080504,1549,0.039030,0.094248,-1,...,0.25,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1,0.194128
2050,-1,1.002824,1.000000,11009,0.003240,0.026102,3,0.000000,0.000000,-1,...,0.00,-1.0,0.625000,-1,EMEA,251-1K,8.0,0,-1,0.156841
2051,2,1.463218,1.843091,110849,0.054692,0.131860,787,0.099509,0.364253,-1,...,0.00,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.355329
2052,-1,1.042327,1.012346,80304,0.017158,0.040923,82,0.006173,0.025404,-1,...,0.50,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.181356


In [655]:
####################
# WHAT IF ANALYSIS #
####################

processed_df_for_fit = X
processed_df_for_fit['class'] = y
processed_df_for_fit['class_pred_proba'] = cbc.predict_proba(X)[:, 1]

In [656]:
high_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.85)
low_bar_for_proba = processed_df_for_fit['class_pred_proba'].quantile(.7)
processed_df_for_fit['class_pred'] = processed_df_for_fit['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [657]:
high_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.85)
low_bar_for_proba = df_for_predict_clean['class_pred_proba'].quantile(.7)
df_for_predict_clean['class_pred'] = df_for_predict_clean['class_pred_proba'].apply(
    lambda x: 'High' if x >= high_bar_for_proba else 'Medium' if x >= low_bar_for_proba else 'Low')

In [658]:
processed_df_for_fit

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class,class_pred_proba,class_pred
0,-1,0.354839,0.000000,11,0.000000,0.000000,0,0.000000,0.000000,-1,...,0.000000,-1,Americas,51-250,3.0,0,-1,0,0.154141,Low
1,-1,0.999321,1.000000,35321,0.064754,0.038032,10,0.055556,-0.322823,41,...,0.000000,-1,Americas,251-1K,17.0,0,-1,0,0.266735,Low
2,-1,4.392331,9.666667,27149,0.543505,4.498503,116,0.740681,2.290043,-1,...,0.000000,-1,EMEA,251-1K,7.0,0,-1,1,0.705475,High
3,-1,1.225715,1.826087,24681,0.318772,6.534135,84,0.138578,2.513636,18,...,0.666667,-1,Americas,1K-5K,28.0,0,-1,0,0.374733,High
4,-1,1.020973,0.933333,12073,0.036175,0.340701,14,0.038462,0.094697,-1,...,0.250000,-1,EMEA,1-10,2.0,0,-1,0,0.124967,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2779,-1,1.559849,1.430380,10738,-0.305527,-0.503121,339,-0.281288,-0.216247,16,...,0.000000,-1,EMEA,50K-100K,2.0,0,-1,0,0.267687,Low
2780,-1,2.071819,4.472771,841695,0.025767,0.136682,16755,0.069766,0.182638,-1,...,0.000000,-1,Americas,51-250,18.0,0,-1,0,0.231598,Low
2781,-1,1.161719,1.021583,242604,0.038393,0.130800,568,0.032378,0.084456,-1,...,-1.000000,-1,EMEA,51-250,11.0,0,-1,0,0.195564,Low
2782,-1,1.008543,1.182796,53476,0.024462,0.129615,110,-0.004505,-0.008806,-1,...,0.200000,-1,APAC,5K-10K,9.0,0,-1,0,0.128133,Low


In [659]:
df_for_predict_clean

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,qoe_score,replys_to_sent,support_views_views,territory,total_employees_range,total_employees_with_details,unresolved_jira_cases,xray_views,class_pred_proba,class_pred
0,-1,1.300404,1.014774,580352,-0.010603,-0.131726,28710,-0.003963,0.063740,-1,...,-1.0,0.333333,-1,EMEA,251-1K,11.0,0,-1,0.167474,Low
1,-1,1.226489,1.475962,182585,0.056289,0.214977,921,0.061756,0.189489,-1,...,-1.0,0.000000,-1,EMEA,11-50,4.0,0,-1,0.204060,Low
2,-1,1.593653,1.051282,98475,0.014636,0.023652,82,0.012423,-0.010073,-1,...,-1.0,0.500000,-1,EMEA,51-250,3.0,0,-1,0.166906,Low
3,-1,0.947454,1.117647,11107,0.016707,0.069873,19,0.000000,0.057190,-1,...,-1.0,0.000000,-1,Americas,51-250,8.0,0,-1,0.174760,Low
4,-1,1.853653,2.400888,1422037,0.000362,0.047562,20554,-0.003600,0.042657,17,...,-1.0,0.500000,-1,EMEA,51-250,25.0,0,-1,0.308810,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,-1,1.045952,1.021768,377916,0.024678,0.080504,1549,0.039030,0.094248,-1,...,-1.0,0.250000,-1,EMEA,51-250,6.0,0,-1,0.194128,Low
2050,-1,1.002824,1.000000,11009,0.003240,0.026102,3,0.000000,0.000000,-1,...,-1.0,0.625000,-1,EMEA,251-1K,8.0,0,-1,0.156841,Low
2051,2,1.463218,1.843091,110849,0.054692,0.131860,787,0.099509,0.364253,-1,...,-1.0,0.000000,0,EMEA,10K-50K,6.0,0,0,0.355329,High
2052,-1,1.042327,1.012346,80304,0.017158,0.040923,82,0.006173,0.025404,-1,...,-1.0,0.333333,-1,EMEA,1K-5K,4.0,0,-1,0.181356,Low


In [660]:
top_model = cbc

In [661]:
scaler = StandardScaler()
bad_accounts = df_for_predict_clean[df_for_predict_clean['class_pred'] != 'High']

In [662]:
processed_df_for_fit['class_diff'] = processed_df_for_fit.apply(lambda row: 1 if
                                    ((row['class'] == 1) & (row['class_pred_proba'] < 0.5))
                                    or ((row['class'] == 0) & (row['class_pred_proba'] >= 0.5))
                                    else 0, axis=1)

# Predict class for train data (Test: how many are in different class from og label to predicted?)
print('diff in classes:')
print(np.sum(processed_df_for_fit['class_diff']))
print('out of')
print(processed_df_for_fit.shape[0])

diff in classes:
64
out of
2784


In [663]:
# Filter only high class (Test: how many are high class and how many are the rest?)
print("The value counts of the predicted rating for the data of the train")
print(processed_df_for_fit['class_pred'].value_counts(dropna=False))

train_data_for_whatif = processed_df_for_fit.loc[processed_df_for_fit['class_pred'] == 'High', :].drop(
    ['class_diff', 'class_pred_proba', 'class', 'class_pred'], axis=1)

# For both train data and new data, add column for categorical features (Test: Value counts the cat val column, both for train and test)
cat_cols = get_cat_feature_names(train_data_for_whatif)
bad_accounts = bad_accounts.drop(['class_pred', 'class_pred_proba'], axis=1)
km = KModes(n_clusters=5, init='Huang', n_init=5, verbose=2, n_jobs=-1, max_iter=5000, random_state=2)
df_for_kmodes = pd.concat([train_data_for_whatif[cat_cols], bad_accounts[cat_cols]])
km.fit(df_for_kmodes)
bad_accounts['cluster'] = km.predict(bad_accounts[cat_cols])
train_data_for_whatif['cluster'] = km.predict(train_data_for_whatif[cat_cols])

print("cluster column of train data:")
print(train_data_for_whatif['cluster'].value_counts(dropna=False))
print("")
print("cluster column of real time data:")
print(bad_accounts['cluster'].value_counts(dropna=False))

bad_accounts = bad_accounts.drop(cat_cols, axis=1)
train_data_for_whatif = train_data_for_whatif.drop(cat_cols, axis=1)

The value counts of the predicted rating for the data of the train
Low       1949
High       418
Medium     417
Name: class_pred, dtype: int64
Best run was number 3
cluster column of train data:
1    194
2    123
3     55
4     34
0     12
Name: cluster, dtype: int64

cluster column of real time data:
3    628
2    490
1    277
4    215
0    136
Name: cluster, dtype: int64


In [664]:
bad_accounts

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,total_employees_with_details,unresolved_jira_cases,xray_views,cluster
0,-1,1.300404,1.014774,580352,-0.010603,-0.131726,28710,-0.003963,0.063740,-1,...,1,0.00,-0.25,-1.0,0.333333,-1,11.0,0,-1,2
1,-1,1.226489,1.475962,182585,0.056289,0.214977,921,0.061756,0.189489,-1,...,1,0.00,0.00,-1.0,0.000000,-1,4.0,0,-1,3
2,-1,1.593653,1.051282,98475,0.014636,0.023652,82,0.012423,-0.010073,-1,...,0,0.00,0.00,-1.0,0.500000,-1,3.0,0,-1,3
3,-1,0.947454,1.117647,11107,0.016707,0.069873,19,0.000000,0.057190,-1,...,1,0.00,0.00,-1.0,0.000000,-1,8.0,0,-1,4
5,-1,1.738619,4.623482,93070,0.000882,0.036750,3426,0.000438,0.006947,-1,...,6,0.50,0.50,-1.0,0.333333,-1,3.0,0,-1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2048,-1,2.149842,1.331738,449575,-0.048608,-0.270471,835,0.027813,0.116413,-1,...,2,0.25,0.50,-1.0,0.000000,-1,3.0,0,-1,3
2049,-1,1.045952,1.021768,377916,0.024678,0.080504,1549,0.039030,0.094248,-1,...,1,-0.25,0.25,-1.0,0.250000,-1,6.0,0,-1,3
2050,-1,1.002824,1.000000,11009,0.003240,0.026102,3,0.000000,0.000000,-1,...,0,0.00,0.00,-1.0,0.625000,-1,8.0,0,-1,2
2052,-1,1.042327,1.012346,80304,0.017158,0.040923,82,0.006173,0.025404,-1,...,2,0.00,0.50,-1.0,0.333333,-1,4.0,0,-1,3


In [665]:
train_data_for_whatif

Unnamed: 0,artifactory_views,artifacts/binaries_count,artifacts/binaries_size,artifacts_count,artifacts_count_monthly_growth,artifacts_count_quarter_growth,artifacts_size,artifacts_size_monthly_growth,artifacts_size_quarter_growth,avg_resolution_days,...,pypi,pypi_monthly_growth,pypi_quarter_growth,qoe_score,replys_to_sent,support_views_views,total_employees_with_details,unresolved_jira_cases,xray_views,cluster
2,-1,4.392331,9.666667,27149,0.543505,4.498503,116,0.740681,2.290043,-1,...,0.0,0.0,0.00,-1.0,0.000000,-1,7.0,0,-1,2
3,-1,1.225715,1.826087,24681,0.318772,6.534135,84,0.138578,2.513636,18,...,1.0,0.0,0.00,-1.0,0.666667,-1,28.0,0,-1,2
23,-1,1.039786,1.009375,5828,0.056772,-0.078578,323,0.158065,-0.143521,11,...,0.0,0.0,0.00,-1.0,0.250000,-1,6.0,1,-1,2
25,-1,1.161397,1.000000,10506,0.897664,0.910573,4,0.500000,1.500000,5,...,0.0,0.0,0.00,-1.0,0.484848,-1,14.0,0,-1,1
33,-1,2.179312,2.139556,933053,0.034048,0.291701,4722,0.048379,0.368138,-1,...,1.0,0.0,0.00,-1.0,1.000000,-1,15.0,1,-1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2752,0,1.036412,1.121569,116386,0.076920,0.173732,572,0.034031,0.129011,16,...,1.0,0.0,-0.25,-1.0,0.500000,2,19.0,0,0,2
2755,-1,3.761033,10.876923,885603,0.035376,0.087365,4242,0.038719,0.067310,-1,...,3.0,0.0,0.00,-1.0,-1.000000,-1,4.0,0,-1,1
2770,0,1.007743,1.004785,157739,0.377536,0.381820,420,0.238001,0.264706,-1,...,0.0,0.0,0.00,-1.0,0.000000,0,23.0,0,0,1
2775,-1,1.161397,1.000000,10506,0.897664,0.910573,4,0.500000,1.500000,5,...,0.0,0.0,0.00,-1.0,0.484848,-1,14.0,0,-1,1


In [666]:
exclude_for_analysis = ['total_employees_with_details',
                        'nuget',
                        'n_jira_cases',
                        'number_of_users_monthly_growth',
                        'n_contacts',
                        'days_from_users_change',
                        'maven_quarter_growth',
                        'company_age']

In [667]:
for index, row in bad_accounts.iterrows():
    row = row.reindex(sorted(row.index), axis=1)
    row_trans = pd.DataFrame(row)
    # For each instance of new data in iteration, bring train data of same categorical values
    train_data_subset = train_data_for_whatif.loc[train_data_for_whatif['cluster'] == row['cluster'], :]
    # print(train_data_subset.shape)
    if train_data_subset.shape[0] == 0:
        train_data_subset = train_data_for_whatif

    # Attach the subset of train data with the current instance (Test: whats the shape before and after attach)
    train_data_subset_w_instance = pd.concat([train_data_subset, row_trans.transpose()])
    # print("shape of train data:")
    # print(train_data_subset.shape)
    # print("shape of data with the row:")
    # print(train_data_subset_w_instance.shape)
    # Remove the newly added categorical column
    train_data_subset_w_instance = train_data_subset_w_instance.drop(['cluster'], axis=1)

    # Scale the concated df
    df_whatif_scaled = pd.DataFrame(scaler.fit_transform(train_data_subset_w_instance),
                                    columns=train_data_subset_w_instance.columns)
    df_whatif_scaled = df_whatif_scaled.fillna(0)

    # Find nearest neighbour for the instance we care for from the other data set
    sample = df_whatif_scaled.iloc[-1]
    df_whatif_scaled_wo_sample = df_whatif_scaled.iloc[:-1, :]
    dists = [cityblock(sample, df_whatif_scaled_wo_sample.iloc[i]) for i in
             (range(df_whatif_scaled_wo_sample.shape[0]))]
    closest_obs = train_data_subset.iloc[[np.argmin(dists)], range(train_data_subset.shape[1])].drop('cluster', axis=1)

    # Calculate shap values for the neighbor's features and the current instance's features, calculate diffs
    df_concat_for_shap = pd.concat([closest_obs, row_trans.transpose()], axis=0).drop('cluster', axis=1)
    shap_values_total = shap.TreeExplainer(cbc_no_cats).shap_values(df_concat_for_shap)

    # Create recommendations based on the top diff feature
    shap_diff = np.subtract(shap_values_total[0], shap_values_total[1])
    max_diff_loc = np.argmax(shap_diff)
    found_feature = False
    while not found_feature:
        feature_name = list(train_data_subset.columns)[max_diff_loc]
        curr_val = row.iloc[max_diff_loc]
        nb_val = closest_obs.iloc[0, max_diff_loc]
        if np.isnan(curr_val):
            curr_val = 0
        if np.isnan(nb_val):
            nb_val = 0
        nb_val_bigger = int(curr_val) < int(nb_val)
        nb_val_equal = int(curr_val) == int(nb_val)
        str_for_parse = " to be bigger" if nb_val_bigger else " to be smaller"
        if not nb_val_equal and feature_name not in exclude_for_analysis:
            print("You should change the feature " + feature_name + str_for_parse)
            found_feature = True
        else:
            shap_diff[np.argmax(shap_diff)] = -100000000000
            max_diff_loc = np.argmax(shap_diff)




You should change the feature days_from_permissions_change to be bigger
You should change the feature days_since_sent to be bigger
You should change the feature unresolved_jira_cases to be smaller
You should change the feature days_from_permissions_change to be bigger
You should change the feature maven to be bigger
You should change the feature unresolved_jira_cases to be smaller
You should change the feature days_since_sent to be bigger
You should change the feature days_from_permissions_change to be bigger
You should change the feature maven to be bigger
You should change the feature days_from_permissions_change to be bigger
You should change the feature unresolved_jira_cases to be smaller
You should change the feature days_from_permissions_change to be bigger
You should change the feature days_since_sent to be bigger
You should change the feature days_from_contact_added to be bigger
You should change the feature maven to be bigger
You should change the feature days_since_sent to be