In [1]:
from hana_ml import DataFrame, ConnectionContext
from hana_ml.dataframe import create_dataframe_from_pandas
from hana_ml.algorithms.apl.gradient_boosting_classification import GradientBoostingBinaryClassifier
from hana_ml.algorithms.pal.partition import train_test_val_split
from hana_ml.visualizers.unified_report import UnifiedReport

import os
from dotenv import load_dotenv

In [2]:
# Loads database connection parameters from .env file.
# If this gives an error, copy file .env.example to .env and change it according to your parameters
load_dotenv()

# Instantiate connection object
conn = ConnectionContext(
    address = os.getenv('DB_HOST'),
    port = os.getenv('DB_PORT'), 
    user = os.getenv('DB_USER'), 
    password = os.getenv('DB_PASS'), 
    encrypt = 'true',
    sslValidateCertificate = 'false' 
)

At this point we are assuming that table EMP_CHURN contains the data from the CSV file. This step has been done in the "Data Upload" notebook.

We will split up the data into a train and test set. We do not need a validation set as the APL will already split the train set into a train and validation set internally. The test set here is simply used as a hold-out set which will not get used for training the algorithm.

In [3]:
emp_churn_all = DataFrame(conn, 'select * from EMP_CHURN')

In [4]:
train, test, valid = train_test_val_split(emp_churn_all, testing_percentage=0.2, validation_size=0)

Now store the train and test datasets into their own tables:

In [5]:
train.save('EMP_CHURN_TRAIN', force=True)
test.save('EMP_CHURN_TEST', force=True)

<hana_ml.dataframe.DataFrame at 0x206dbbf6f10>

The data from the train table will be used to fit a classifier:

In [6]:
model = GradientBoostingBinaryClassifier(variable_auto_selection = True)

model.set_params(
    extra_applyout_settings={
        'APL/ApplyExtraMode': 'Advanced Apply Settings',
        'APL/ApplyPredictedValue': 'true',
        'APL/ApplyProbability': 'false',       
        'APL/ApplyDecision': 'true',   
    })

model.fit(train, label='FLIGHT_RISK', key='EMPLOYEE_ID')

Now apply the trained model on the hold-out dataset and view a few examples:

In [7]:
apply_out = model.predict(test)
pdf_apply_out = apply_out.collect()

print(pdf_apply_out.head(10))

   EMPLOYEE_ID TRUE_LABEL PREDICTED  gb_score_FLIGHT_RISK
0        10158         No        No             -5.249740
1        10182         No        No             -4.838394
2        10200        Yes       Yes             -0.586952
3        10259         No        No             -4.663995
4        10282         No        No             -3.881936
5        10328         No        No             -1.877677
6        10332        Yes        No             -1.365033
7        10338         No        No             -2.078447
8        10343         No       Yes             -0.962751
9        10361         No        No             -2.160242


We will use the AUC metric for retrieving the classifier's performance. This will be in the interval [0..1] where we would like it to be close to 1.

In [8]:
model.get_performance_metrics()['AUC']

0.905

Let's see an overview of the feature importances. This lists the variables from the dataset contributing the most towards the flight risk from top to bottom.

The numbers display their share of importance, summing up to 1.

In [9]:
# Disabled the unified report because this does not show correctly in Github.
# Enable the below line if using Jupyter Lab or vs.code for a graphical model report

#UnifiedReport(model).build().display()

In [10]:
model.get_feature_importances()

{'ExactSHAP': OrderedDict([('FUNCTIONALAREACHANGETYPE', 0.2440330535173416),
              ('PROMOTION_WITHIN_LAST_3_YEARS', 0.14001020789146423),
              ('TIMEINPREVPOSITIONMONTH', 0.10031360387802124),
              ('EMPLOYMENT_TYPE_2', 0.0914246216416359),
              ('AGE', 0.03436921909451485),
              ('SALARY', 0.0322422981262207),
              ('JOBLEVELCHANGETYPE', 0.0318293422460556),
              ('RISK_OF_LOSS', 0.029726015403866768),
              ('TENURE_MONTHS', 0.028254985809326172),
              ('GENDER', 0.027538727968931198),
              ('PREVCOUNTRYLON', 0.02592954970896244),
              ('PREVIOUS_PERFORMANCE_RATING', 0.025640953332185745),
              ('PREVCOUNTRYLAT', 0.023161407560110092),
              ('CURCOUNTRYLON', 0.020921478047966957),
              ('IMPACT_OF_LOSS', 0.01535748690366745),
              ('CHANGE_IN_PERFORMANCE_RATING', 0.014801138080656528),
              ('CRITICAL_JOB_ROLE', 0.01358804851770401),
         

It appears that the field FUNCTIONALAREACHANGETYPE is the most important towards predicting the target. Lets see what this field contains:

In [11]:
emp_churn_all.agg([('count', 'FUNCTIONALAREACHANGETYPE', 'COUNT')], group_by='FUNCTIONALAREACHANGETYPE').collect()

Unnamed: 0,FUNCTIONALAREACHANGETYPE,COUNT
0,No change,5220
1,Cross-Functional Move,5995
2,External Hire,1301
3,Intra-Functional Move,6599


This tells that field FUNCTIONALAREACHANGETYPE incidates whether the employee has recently been allowed an Intra-functional or Cross-functional move or whether there has been no change (we will ignore the External hires as the company has no influence on their churn).

Now let's examing the employees for which the model predicts they will be churning:

In [12]:
emp_flightrisk = apply_out.filter('PREDICTED = \'Yes\'')
num_flightrisk = emp_flightrisk.describe('EMPLOYEE_ID').collect()['count'].values[0]
print('Number of employees in test set with positive flight risk: {}'.format(num_flightrisk))

Number of employees in test set with positive flight risk: 468


The above shows the number of employees in the test set with a risk of churning according to the statistical model.

Now we will take this table of employees with a flight risk and join it with the original table as loaded from the input CSV. This will list all details of the employees with a churn risk.

In [13]:
emp_flightrisk_new = emp_flightrisk.alias('L').join(emp_churn_all.alias('R'), 'L.EMPLOYEE_ID = R.EMPLOYEE_ID', select=[
    ('L.EMPLOYEE_ID', 'EMPLOYEE_ID'),
    'AGE', 'AGE_GROUP10', 'AGE_GROUP5', 'GENERATION', 'CRITICAL_JOB_ROLE', 'RISK_OF_LOSS', 'IMPACT_OF_LOSS', 
    'FUTURE_LEADER', 'GENDER', 'MGR_EMP', 'MINORITY', 'TENURE_MONTHS', 'TENURE_INTERVAL_YEARS', 'TENURE_INTERVALL_DESC',
    'SALARY', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_2', 'HIGH_POTENTIAL', 'PREVIOUS_FUNCTIONAL_AREA', 'PREVIOUS_JOB_LEVEL', 
    'PREVIOUS_CAREER_PATH', 'PREVIOUS_PERFORMANCE_RATING', 'PREVIOUS_COUNTRY', 'PREVCOUNTRYLAT', 'PREVCOUNTRYLON', 
    'PREVIOUS_REGION', 'TIMEINPREVPOSITIONMONTH', 'CURRENT_FUNCTIONAL_AREA', 'CURRENT_JOB_LEVEL', 'CURRENT_CAREER_PATH',
    'CURRENT_PERFORMANCE_RATING', 'CURRENT_REGION', 'CURRENT_COUNTRY', 'CURCOUNTRYLAT', 'CURCOUNTRYLON', 
    'PROMOTION_WITHIN_LAST_3_YEARS', 'CHANGED_POSITION_WITHIN_LAST_2_YEARS', 'CHANGE_IN_PERFORMANCE_RATING',
    'FUNCTIONALAREACHANGETYPE', 'JOBLEVELCHANGETYPE', 'HEADS'
])

pdf_emp_flightrisk = emp_flightrisk_new.collect()

Now let's see what the functional area change types are for those employees with a flight risk:

In [14]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Intra-Functional Move    282
No change                 96
Cross-Functional Move     82
External Hire              8
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

We will remove the external hires from the dataset, as the company cannot influence their function in the same way as internal employees:

In [15]:
pdf_emp_flightrisk.drop( pdf_emp_flightrisk[(pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'] == 'External Hire')].index, inplace=True)

In [16]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Intra-Functional Move    282
No change                 96
Cross-Functional Move     82
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

To check the effect of the functional area change type for these employees with a churn risk, their functional area change type will be set to 'Cross-functional move':

In [17]:
pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'] = 'Cross-Functional Move'

pdf_emp_flightrisk['FUNCTIONALAREACHANGETYPE'].value_counts()

Cross-Functional Move    460
Name: FUNCTIONALAREACHANGETYPE, dtype: int64

In [18]:
create_dataframe_from_pandas(conn, pdf_emp_flightrisk, 'CHURNING_EMPLOYEES', force=True)

100%|██████████| 1/1 [00:00<00:00,  6.52it/s]


<hana_ml.dataframe.DataFrame at 0x206d85abdf0>

In [19]:
emp_churning = DataFrame(conn, 'select * from CHURNING_EMPLOYEES')
apply_out_new = model.predict(emp_churning)

In [20]:
emp_flightrisk_new_pos = apply_out_new.filter('PREDICTED = \'Yes\'')
num_flightrisk_new = emp_flightrisk_new_pos.describe('EMPLOYEE_ID').collect()['count'].values[0]
num_flightrisk_delta = num_flightrisk - num_flightrisk_new
print('Number of employees in test set with positive flight risk after change in Functional Area Change Type from No change to Cross-Functional Move: {}'.format(num_flightrisk_new))

print('This is down {}, which means that {:.1f}% of employees can possibly be prevented from churning by allowing them a Cross-Functional Move'.format(num_flightrisk_delta, num_flightrisk_delta / num_flightrisk * 100))

Number of employees in test set with positive flight risk after change in Functional Area Change Type from No change to Cross-Functional Move: 146
This is down 322, which means that 68.8% of employees can possibly be prevented from churning by allowing them a Cross-Functional Move
