In [1]:
from hana_ml import DataFrame, ConnectionContext
from hana_ml.dataframe import create_dataframe_from_pandas
from hana_ml.algorithms.apl.gradient_boosting_classification import GradientBoostingBinaryClassifier
from hana_ml.algorithms.pal.partition import train_test_val_split
from hana_ml.visualizers.unified_report import UnifiedReport

from shap._explanation import Explanation
from shap.plots import force

In [2]:
hana_address = '8c7eed2e-f460-4a82-abfa-e3ede36923d7.hna1.prod-eu10.hanacloud.ondemand.com' 
hana_port = 443
hana_user = 'MLCU_008' 
hana_password = 'Welcome22' 
hana_encrypt = 'true'

# Instantiate connection object
conn = ConnectionContext(address = hana_address,
                                   port = 443, 
                                   user = hana_user, 
                                   password = hana_password, 
                                   encrypt = hana_encrypt,
                                   sslValidateCertificate = 'false' 
                                  )


At this point we are assuming that table EMP_CHURN contains the data from the CSV file. This step has been done in the "Data Upload" notebook.

We will split up the data into a train and test set. We do not need a validation set as the APL will already split the train set into a train and validation set internally. The test set here is simply used as a hold-out set which will not get used for training the algorithm.

In [3]:
emp_churn_all = DataFrame(conn, 'select * from EMP_CHURN')

In [4]:
train, test, valid = train_test_val_split(emp_churn_all, testing_percentage=0.2, validation_size=0)

Now store the train and test datasets into their own tables:

In [5]:
train.save('EMP_CHURN_TRAIN', force=True)
test.save('EMP_CHURN_TEST', force=True)

<hana_ml.dataframe.DataFrame at 0x1f759b9b2b0>

The data from the train table will be used to fit a classifier:

In [6]:
model = GradientBoostingBinaryClassifier(variable_auto_selection = True)

model.set_params(
    extra_applyout_settings={
        'APL/ApplyExtraMode': 'Advanced Apply Settings',
        'APL/ApplyPredictedValue': 'true',
        'APL/ApplyProbability': 'true',       
        'APL/ApplyDecision': 'true',   
        'APL/ApplyContribution': 'all' 
    })

model.fit(train, label='FLIGHT_RISK', key='EMPLOYEE_ID')

Now apply the trained model on the hold-out dataset and view a few examples:

In [7]:
apply_out = model.predict(test)
pdf_apply_out = apply_out.collect()

print(pdf_apply_out.head(10))

   EMPLOYEE_ID TRUE_LABEL PREDICTED  gb_score_FLIGHT_RISK  \
0        10170         No        No             -3.891939   
1        10173         No        No             -5.304201   
2        10200        Yes       Yes             -0.859471   
3        10202        Yes        No             -2.189888   
4        10263        Yes       Yes             -0.570763   
5        10314         No        No             -2.926042   
6        10351         No        No             -2.209723   
7        10360         No        No             -2.693893   
8        10361         No        No             -2.322585   
9        10367         No        No             -2.241893   

   gb_proba_FLIGHT_RISK  gb_contrib_AGE  gb_contrib_GENERATION  \
0              0.019998        0.247021               0.028988   
1              0.004946       -0.017063              -0.165065   
2              0.297450       -0.054138               0.007168   
3              0.100662        0.098701               0.007315  

We will use the AUC metric for retrieving the classifier's performance. This will be in the interval [0..1] where we would like it to be close to 1.

In [8]:
model.get_performance_metrics()['AUC']

0.9073

Let's see an overview of the feature importances. This lists the variables from the dataset contributing the most towards the flight risk from top to bottom.

The numbers display their share of importance, summing up to 1.

In [9]:
# Disabled the unified report because this does not show correctly in Github.
# Enable the below line if using Jupyter Lab or vs.code for a graphical model report

#UnifiedReport(model).build().display()

In [10]:
model.get_feature_importances()

{'ExactSHAP': OrderedDict([('FUNCTIONALAREACHANGETYPE', 0.22382256388664246),
              ('PROMOTION_WITHIN_LAST_3_YEARS', 0.14958792924880981),
              ('TIMEINPREVPOSITIONMONTH', 0.0930989533662796),
              ('EMPLOYMENT_TYPE_2', 0.08993068337440491),
              ('JOBLEVELCHANGETYPE', 0.04940509423613548),
              ('AGE', 0.035911694169044495),
              ('TENURE_MONTHS', 0.0341409295797348),
              ('SALARY', 0.033187136054039),
              ('PREVCOUNTRYLON', 0.030870283022522926),
              ('RISK_OF_LOSS', 0.030296780169010162),
              ('GENDER', 0.027394775301218033),
              ('PREVIOUS_PERFORMANCE_RATING', 0.02657267265021801),
              ('CHANGE_IN_PERFORMANCE_RATING', 0.020326007157564163),
              ('PREVCOUNTRYLAT', 0.01904531568288803),
              ('CURRENT_COUNTRY', 0.017893513664603233),
              ('CURCOUNTRYLON', 0.017140885815024376),
              ('IMPACT_OF_LOSS', 0.015592319890856743),
          

It appears that the field FUNCTIONALAREACHANGETYPE is the most important towards predicting the target. Lets see what this field contains:

In [11]:
emp_churn_all.agg([('count', 'FUNCTIONALAREACHANGETYPE', 'COUNT')], group_by='FUNCTIONALAREACHANGETYPE').collect()

Unnamed: 0,FUNCTIONALAREACHANGETYPE,COUNT
0,No change,5220
1,Cross-Functional Move,5995
2,External Hire,1301
3,Intra-Functional Move,6599


In [12]:
dict = { 'TRUE_LABEL': 'Target Actual',
        'PREDICTED': 'Target Predicted',
        'gb_score_FLIGHT_RISK': 'Score', 
        'gb_proba_FLIGHT_RISK': 'Probability',        
        'gb_contrib_constant_bias': 'Shap Baseline' }

pdf_apply_out.rename(columns=dict, inplace=True)    
pdf_apply_out

Unnamed: 0,EMPLOYEE_ID,Target Actual,Target Predicted,Score,Probability,gb_contrib_AGE,gb_contrib_GENERATION,gb_contrib_CRITICAL_JOB_ROLE,gb_contrib_RISK_OF_LOSS,gb_contrib_IMPACT_OF_LOSS,...,gb_contrib_CURRENT_FUNCTIONAL_AREA,gb_contrib_CURRENT_PERFORMANCE_RATING,gb_contrib_CURRENT_COUNTRY,gb_contrib_CURCOUNTRYLAT,gb_contrib_CURCOUNTRYLON,gb_contrib_PROMOTION_WITHIN_LAST_3_YEARS,gb_contrib_CHANGE_IN_PERFORMANCE_RATING,gb_contrib_FUNCTIONALAREACHANGETYPE,gb_contrib_JOBLEVELCHANGETYPE,Shap Baseline
0,10170,No,No,-3.891939,0.019998,0.247021,0.028988,0.045069,0.163816,-0.013497,...,0.064489,-0.007611,-0.030123,-0.053542,-0.146821,-1.490310,-0.037205,0.293928,0.014528,-2.150049
1,10173,No,No,-5.304201,0.004946,-0.017063,-0.165065,0.000337,-0.188765,0.020104,...,0.046731,-0.050097,-0.059287,-0.064130,-0.081472,-2.003907,-0.048646,0.303917,0.055184,-2.150049
2,10200,Yes,Yes,-0.859471,0.297450,-0.054138,0.007168,0.048145,0.234632,-0.005541,...,0.071783,-0.020964,-0.079900,-0.082027,-0.064566,0.322989,-0.080420,0.248444,0.132959,-2.150049
3,10202,Yes,No,-2.189888,0.100662,0.098701,0.007315,0.042063,0.164019,-0.002429,...,0.065240,-0.019133,-0.082907,-0.055073,-0.043885,0.251193,-0.094343,0.279285,0.131885,-2.150049
4,10263,Yes,Yes,-0.570763,0.361061,0.235758,0.009927,0.088582,0.358052,0.168477,...,0.083957,-0.033510,-0.091182,-0.070672,-0.069805,0.301546,-0.074188,0.295470,0.152678,-2.150049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,26308,Yes,No,-1.652868,0.160722,0.019125,0.016585,-0.005845,0.043071,-0.031282,...,-0.021770,0.084911,0.116575,0.002158,-0.077312,0.070799,-0.046156,0.401105,-0.079677,-2.150049
3819,26365,No,No,-1.957439,0.123744,0.119508,0.009971,0.020864,-0.079019,-0.052387,...,-0.058383,0.050199,0.011649,0.010622,0.069525,0.158636,0.014948,0.409577,-0.066535,-2.150049
3820,26421,No,No,-6.261555,0.001905,0.096133,0.009787,0.033689,-0.032666,-0.076967,...,-0.110927,-0.129136,0.182304,-0.039753,0.155854,0.133417,-0.092817,-3.529732,-0.121927,-2.150049
3821,26476,No,No,-7.174850,0.000765,0.001067,0.011733,-0.062623,-0.051021,-0.098142,...,0.042523,-0.070609,0.019733,0.053831,-0.112566,-1.674788,-0.041436,-3.312919,0.684118,-2.150049


In [13]:
pdf_apply_out.columns

#cols = [hdr.replace('gb_contrib_', 'Shap ') for hdr in pdf_apply_out.columns]     
#cols

pdf_apply_out.columns = [hdr.replace('gb_contrib_', 'Shap ') for hdr in pdf_apply_out.columns]     

col_list = [col for col in pdf_apply_out.columns if col.startswith('Shap')]
pdf_apply_out['Total Shap'] = pdf_apply_out[col_list].sum(axis=1)

pdf_apply_out

Unnamed: 0,EMPLOYEE_ID,Target Actual,Target Predicted,Score,Probability,Shap AGE,Shap GENERATION,Shap CRITICAL_JOB_ROLE,Shap RISK_OF_LOSS,Shap IMPACT_OF_LOSS,...,Shap CURRENT_PERFORMANCE_RATING,Shap CURRENT_COUNTRY,Shap CURCOUNTRYLAT,Shap CURCOUNTRYLON,Shap PROMOTION_WITHIN_LAST_3_YEARS,Shap CHANGE_IN_PERFORMANCE_RATING,Shap FUNCTIONALAREACHANGETYPE,Shap JOBLEVELCHANGETYPE,Shap Baseline,Total Shap
0,10170,No,No,-3.891939,0.019998,0.247021,0.028988,0.045069,0.163816,-0.013497,...,-0.007611,-0.030123,-0.053542,-0.146821,-1.490310,-0.037205,0.293928,0.014528,-2.150049,-3.891939
1,10173,No,No,-5.304201,0.004946,-0.017063,-0.165065,0.000337,-0.188765,0.020104,...,-0.050097,-0.059287,-0.064130,-0.081472,-2.003907,-0.048646,0.303917,0.055184,-2.150049,-5.304202
2,10200,Yes,Yes,-0.859471,0.297450,-0.054138,0.007168,0.048145,0.234632,-0.005541,...,-0.020964,-0.079900,-0.082027,-0.064566,0.322989,-0.080420,0.248444,0.132959,-2.150049,-0.859472
3,10202,Yes,No,-2.189888,0.100662,0.098701,0.007315,0.042063,0.164019,-0.002429,...,-0.019133,-0.082907,-0.055073,-0.043885,0.251193,-0.094343,0.279285,0.131885,-2.150049,-2.189888
4,10263,Yes,Yes,-0.570763,0.361061,0.235758,0.009927,0.088582,0.358052,0.168477,...,-0.033510,-0.091182,-0.070672,-0.069805,0.301546,-0.074188,0.295470,0.152678,-2.150049,-0.570762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,26308,Yes,No,-1.652868,0.160722,0.019125,0.016585,-0.005845,0.043071,-0.031282,...,0.084911,0.116575,0.002158,-0.077312,0.070799,-0.046156,0.401105,-0.079677,-2.150049,-1.652868
3819,26365,No,No,-1.957439,0.123744,0.119508,0.009971,0.020864,-0.079019,-0.052387,...,0.050199,0.011649,0.010622,0.069525,0.158636,0.014948,0.409577,-0.066535,-2.150049,-1.957439
3820,26421,No,No,-6.261555,0.001905,0.096133,0.009787,0.033689,-0.032666,-0.076967,...,-0.129136,0.182304,-0.039753,0.155854,0.133417,-0.092817,-3.529732,-0.121927,-2.150049,-6.261555
3821,26476,No,No,-7.174850,0.000765,0.001067,0.011733,-0.062623,-0.051021,-0.098142,...,-0.070609,0.019733,0.053831,-0.112566,-1.674788,-0.041436,-3.312919,0.684118,-2.150049,-7.174849


In [14]:
shap_columns = col_list[:-5]
predictors_names = [c[5:] for c in shap_columns if not c.startswith('DATE')]
shap_values = pdf_apply_out[shap_columns].values

In [15]:
pdf_apply_out

Unnamed: 0,EMPLOYEE_ID,Target Actual,Target Predicted,Score,Probability,Shap AGE,Shap GENERATION,Shap CRITICAL_JOB_ROLE,Shap RISK_OF_LOSS,Shap IMPACT_OF_LOSS,...,Shap CURRENT_PERFORMANCE_RATING,Shap CURRENT_COUNTRY,Shap CURCOUNTRYLAT,Shap CURCOUNTRYLON,Shap PROMOTION_WITHIN_LAST_3_YEARS,Shap CHANGE_IN_PERFORMANCE_RATING,Shap FUNCTIONALAREACHANGETYPE,Shap JOBLEVELCHANGETYPE,Shap Baseline,Total Shap
0,10170,No,No,-3.891939,0.019998,0.247021,0.028988,0.045069,0.163816,-0.013497,...,-0.007611,-0.030123,-0.053542,-0.146821,-1.490310,-0.037205,0.293928,0.014528,-2.150049,-3.891939
1,10173,No,No,-5.304201,0.004946,-0.017063,-0.165065,0.000337,-0.188765,0.020104,...,-0.050097,-0.059287,-0.064130,-0.081472,-2.003907,-0.048646,0.303917,0.055184,-2.150049,-5.304202
2,10200,Yes,Yes,-0.859471,0.297450,-0.054138,0.007168,0.048145,0.234632,-0.005541,...,-0.020964,-0.079900,-0.082027,-0.064566,0.322989,-0.080420,0.248444,0.132959,-2.150049,-0.859472
3,10202,Yes,No,-2.189888,0.100662,0.098701,0.007315,0.042063,0.164019,-0.002429,...,-0.019133,-0.082907,-0.055073,-0.043885,0.251193,-0.094343,0.279285,0.131885,-2.150049,-2.189888
4,10263,Yes,Yes,-0.570763,0.361061,0.235758,0.009927,0.088582,0.358052,0.168477,...,-0.033510,-0.091182,-0.070672,-0.069805,0.301546,-0.074188,0.295470,0.152678,-2.150049,-0.570762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,26308,Yes,No,-1.652868,0.160722,0.019125,0.016585,-0.005845,0.043071,-0.031282,...,0.084911,0.116575,0.002158,-0.077312,0.070799,-0.046156,0.401105,-0.079677,-2.150049,-1.652868
3819,26365,No,No,-1.957439,0.123744,0.119508,0.009971,0.020864,-0.079019,-0.052387,...,0.050199,0.011649,0.010622,0.069525,0.158636,0.014948,0.409577,-0.066535,-2.150049,-1.957439
3820,26421,No,No,-6.261555,0.001905,0.096133,0.009787,0.033689,-0.032666,-0.076967,...,-0.129136,0.182304,-0.039753,0.155854,0.133417,-0.092817,-3.529732,-0.121927,-2.150049,-6.261555
3821,26476,No,No,-7.174850,0.000765,0.001067,0.011733,-0.062623,-0.051021,-0.098142,...,-0.070609,0.019733,0.053831,-0.112566,-1.674788,-0.041436,-3.312919,0.684118,-2.150049,-7.174849


In [16]:
actual_values = test.collect()[predictors_names].values
baseline = pdf_apply_out['Shap Baseline']
xpl = Explanation(shap_values, base_values=baseline, data=actual_values, feature_names=predictors_names)

In [17]:
#idx = pdf_apply_out[pdf_apply_out['EMPLOYEE_ID'] == 10101].index.values.astype(int)[0]

# Just take a random record (10)
idx = 10


#xpl[idx].data = [str(item) for item in xpl[idx].data]
#xpl[idx].values = [str(item) for item in xpl[idx].values]
#xpl[0].base_values 

#force(xpl[0].base_values, xpl[idx].values, features=xpl[idx].data, feature_names=predictors_names, matplotlib=False)


ls = [type(item) for item in predictors_names]
print(ls)


[<class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>]


In [18]:
idx = pdf_apply_out[pdf_apply_out['EMPLOYEE_ID'] == 25815].index.values.astype(int)[0]
idx

IndexError: index 0 is out of bounds for axis 0 with size 0

This tells that field FUNCTIONALAREACHANGETYPE incidates whether the employee has recently been allowed an Intra-functional or Cross-functional move or whether there has been no change (we will ignore the External hires as the company has no influence on their churn).

Now let's examing the employees for which the model predicts they will be churning:

In [19]:
emp_flightrisk = apply_out.filter('PREDICTED = \'Yes\'')
num_flightrisk = emp_flightrisk.describe('EMPLOYEE_ID').collect()['count'].values[0]
print('Number of employees in test set with positive flight risk: {}'.format(num_flightrisk))

Number of employees in test set with positive flight risk: 427


The above shows the number of employees in the test set with a risk of churning according to the statistical model.

Now we will take this table of employees with a flight risk and join it with the original table as loaded from the input CSV. This will list all details of the employees with a churn risk.

In [20]:
emp_flightrisk_new = emp_flightrisk.alias('L').join(emp_churn_all.alias('R'), 'L.EMPLOYEE_ID = R.EMPLOYEE_ID', select=[
    ('L.EMPLOYEE_ID', 'EMPLOYEE_ID'),
    'AGE', 'AGE_GROUP10', 'AGE_GROUP5', 'GENERATION', 'CRITICAL_JOB_ROLE', 'RISK_OF_LOSS', 'IMPACT_OF_LOSS', 
    'FUTURE_LEADER', 'GENDER', 'MGR_EMP', 'MINORITY', 'TENURE_MONTHS', 'TENURE_INTERVAL_YEARS', 'TENURE_INTERVALL_DESC',
    'SALARY', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_2', 'HIGH_POTENTIAL', 'PREVIOUS_FUNCTIONAL_AREA', 'PREVIOUS_JOB_LEVEL', 
    'PREVIOUS_CAREER_PATH', 'PREVIOUS_PERFORMANCE_RATING', 'PREVIOUS_COUNTRY', 'PREVCOUNTRYLAT', 'PREVCOUNTRYLON', 
    'PREVIOUS_REGION', 'TIMEINPREVPOSITIONMONTH', 'CURRENT_FUNCTIONAL_AREA', 'CURRENT_JOB_LEVEL', 'CURRENT_CAREER_PATH',
    'CURRENT_PERFORMANCE_RATING', 'CURRENT_REGION', 'CURRENT_COUNTRY', 'CURCOUNTRYLAT', 'CURCOUNTRYLON', 
    'PROMOTION_WITHIN_LAST_3_YEARS', 'CHANGED_POSITION_WITHIN_LAST_2_YEARS', 'CHANGE_IN_PERFORMANCE_RATING',
    'FUNCTIONALAREACHANGETYPE', 'JOBLEVELCHANGETYPE', 'HEADS'
])

pdf_emp_flightrisk = emp_flightrisk_new.collect()

Now let's see what the functional area change types are for those employees with a flight risk:

In [21]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Intra-Functional Move    237
No change                110
Cross-Functional Move     76
External Hire              4
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

We will remove the external hires from the dataset, as the company cannot influence their function in the same way as internal employees:

In [22]:
pdf_emp_flightrisk.drop( pdf_emp_flightrisk[(pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'] == 'External Hire')].index, inplace=True)

In [23]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Intra-Functional Move    237
No change                110
Cross-Functional Move     76
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

To check the effect of the functional area change type for these employees with a churn risk, their functional area change type will be set to 'Cross-functional move':

In [24]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'] = 'Cross-Functional Move'

pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Cross-Functional Move    423
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

In [25]:
create_dataframe_from_pandas(conn, pdf_emp_flightrisk, 'CHURNING_EMPLOYEES', force=True)

100%|██████████| 1/1 [00:00<00:00,  7.99it/s]


<hana_ml.dataframe.DataFrame at 0x1f75a1c6760>

In [26]:
emp_churning = DataFrame(conn, 'select * from CHURNING_EMPLOYEES')
apply_out_new = model.predict(emp_churning)

In [27]:
emp_flightrisk_new_pos = apply_out_new.filter('PREDICTED = \'Yes\'')
num_flightrisk_new = emp_flightrisk_new_pos.describe('EMPLOYEE_ID').collect()['count'].values[0]
num_flightrisk_delta = num_flightrisk - num_flightrisk_new
print('Number of employees in test set with positive flight risk after change in Functional Area Change Type from No change to Cross-Functional Move: {}'.format(num_flightrisk_new))

print('This is down {}, which means that {:.1f}% of employees can possibly be prevented from churning by allowing them a Cross-Functional Move'.format(num_flightrisk_delta, num_flightrisk_delta / num_flightrisk * 100))

Number of employees in test set with positive flight risk after change in Functional Area Change Type from No change to Cross-Functional Move: 163
This is down 264, which means that 61.8% of employees can possibly be prevented from churning by allowing them a Cross-Functional Move
