In [1]:
import os
from getpass import getpass

from hana_ml import DataFrame, ConnectionContext
from hana_ml.dataframe import create_dataframe_from_pandas
from hana_ml.algorithms.apl.classification import AutoClassifier
from hana_ml.algorithms.pal.partition import train_test_val_split

In [2]:
# Connection settings are read from environment variables so that no secret is
# stored in the notebook. The previous host, port and user remain as defaults.
hana_address = os.environ.get('HANA_ADDRESS', '8c7eed2e-f460-4a82-abfa-e3ede36923d7.hna1.prod-eu10.hanacloud.ondemand.com')
hana_port = int(os.environ.get('HANA_PORT', '443'))
hana_user = os.environ.get('HANA_USER', 'MLCU_008')
# The password is never hard-coded: use HANA_PASSWORD if set, otherwise prompt for it.
hana_password = os.environ.get('HANA_PASSWORD') or getpass('Password for HANA user {}: '.format(hana_user))
hana_encrypt = 'true'

# Instantiate connection object
conn = ConnectionContext(address = hana_address,
                         port = hana_port,  # use the configured port instead of a second literal 443
                         user = hana_user,
                         password = hana_password,
                         encrypt = hana_encrypt,
                         # NOTE(review): certificate validation is switched off here, which leaves the
                         # encrypted connection open to man-in-the-middle attacks. Kept as in the original;
                         # enable validation once a trust store is confirmed to be available.
                         sslValidateCertificate = 'false'
                        )


At this point we are assuming that table EMP_CHURN contains the data from the CSV file. This step has been done in the "Data Upload" notebook.

We will split up the data into a train and test set. We do not need a validation set as the APL will already split the train set into a train and validation set internally. The test set here is simply used as a hold-out set which will not get used for training the algorithm.

In [3]:
# DataFrame over all columns and rows of the EMP_CHURN table.
emp_churn_query = 'select * from EMP_CHURN'
emp_churn_all = DataFrame(conn, emp_churn_query)

In [4]:
# Hold out 20% of the rows as a test set; no validation partition is requested
# (validation_size=0). A fixed, non-zero random_seed is passed so that re-running
# the notebook partitions the rows the same way instead of seeding from system time.
train, test, valid = train_test_val_split(
    emp_churn_all,
    random_seed=42,
    testing_percentage=0.2,
    validation_size=0,
)

Now store the train and test datasets into their own tables:

In [5]:
# force=True replaces tables left over from a previous run, so this cell can be
# re-executed (Restart & Run All) without failing because the tables already exist.
train.save('EMP_CHURN_TRAIN', force=True)
test.save('EMP_CHURN_TEST', force=True)

<hana_ml.dataframe.DataFrame at 0x193073b9cd0>

The data from the train table will be used to fit a classifier:

In [6]:
# Configure the APL classifier with automatic variable selection switched on.
model = AutoClassifier(
    conn_context=conn,
    variable_auto_selection=True,
)

# Optional, currently disabled: extra apply-out settings, e.g.
#   model.set_params(extra_applyout_settings={'APL/ApplyContribution': 'all'})

# Train on the training partition: FLIGHT_RISK is the label, EMPLOYEE_ID the row key.
model.fit(
    train,
    label='FLIGHT_RISK',
    key='EMPLOYEE_ID',
)

Now apply the trained model on the hold-out dataset and view a few examples:

In [7]:
# Score the hold-out set with the trained model.
apply_out = model.predict(test)
# Leave the collected frame as the cell's last expression so Jupyter renders it
# as a table instead of the plain-text dump produced by print().
apply_out.head(100).collect()

    EMPLOYEE_ID TRUE_LABEL PREDICTED  PROBABILITY
0         10102         No        No     0.945495
1         10198         No        No     0.999998
2         10256         No        No     0.998772
3         10259         No        No     0.999998
4         10261        Yes        No     0.928935
..          ...        ...       ...          ...
95        14821         No       Yes     0.303692
96        14833         No        No     0.928768
97        15242         No        No     0.878350
98        15609         No        No     0.875925
99        15611         No        No     0.977075

[100 rows x 4 columns]


We will use the AUC metric to assess the classifier's performance. It lies in the interval [0, 1], and we would like it to be as close to 1 as possible.

In [8]:
# Fetch the model's performance indicators and show the AUC entry.
performance_metrics = model.get_performance_metrics()
performance_metrics['AUC']

0.8225

Let's see an overview of the feature importances. This lists the variables from the dataset contributing the most towards the flight risk from top to bottom.

The numbers display their share of importance, summing up to 1.

In [9]:
# Variable importances as reported by the fitted model.
feature_importances = model.get_feature_importances()
feature_importances

OrderedDict([('EMPLOYMENT_TYPE_2', 0.18583554773494607),
             ('TIMEINPREVPOSITIONMONTH', 0.16786299514880204),
             ('FUNCTIONALAREACHANGETYPE', 0.1127601179230471),
             ('PROMOTION_WITHIN_LAST_3_YEARS', 0.09419017758315447),
             ('CHANGE_IN_PERFORMANCE_RATING', 0.08047987880415516),
             ('AGE', 0.051665987778959865),
             ('CURCOUNTRYLON', 0.050919529343584014),
             ('GENDER', 0.04890419383768541),
             ('CHANGED_POSITION_WITHIN_LAST_2_YEARS', 0.036813988734119486),
             ('PREVIOUS_PERFORMANCE_RATING', 0.03089791873322329),
             ('GENERATION', 0.024702433374270548),
             ('CURRENT_FUNCTIONAL_AREA', 0.02415320601669405),
             ('CRITICAL_JOB_ROLE', 0.021989952642134276),
             ('PREVIOUS_JOB_LEVEL', 0.020181007115439107),
             ('PREVIOUS_FUNCTIONAL_AREA', 0.01822230259028073),
             ('RISK_OF_LOSS', 0.01652352857750571),
             ('CURRENT_CAREER_PATH', 0.013897

It appears that the field EMPLOYMENT_TYPE_2 is the most important for predicting the target. Let's see what this field contains:

In [30]:
# Count the rows for each distinct EMPLOYMENT_TYPE_2 value.
employment_type_counts = emp_churn_all.agg(
    [('count', 'EMPLOYMENT_TYPE_2', 'COUNT')],
    group_by='EMPLOYMENT_TYPE_2',
)
employment_type_counts.collect()

Unnamed: 0,EMPLOYMENT_TYPE_2,COUNT
0,Regular,16835
1,Temporary,2280


This tells us that EMPLOYMENT_TYPE_2 indicates whether the employee's contract is a regular or a temporary one.

Now let's examine the employees whom the model predicts to churn:

In [11]:
# Keep only the hold-out rows that the model classifies as a flight risk.
emp_flightrisk = apply_out.filter("PREDICTED = 'Yes'")
# count() issues a plain row count; describe() would compute a full set of
# summary statistics on EMPLOYEE_ID just to read a single number from it.
num_flightrisk = emp_flightrisk.count()
print('Number of employees in test set with positive flight risk: {}'.format(num_flightrisk))

Number of employees in test set with positive flight risk: 465


The above shows the number of employees in the test set with a risk of churning according to the statistical model.

Now we will take this table of employees with a flight risk and join it with the original table as loaded from the input CSV. This will list all details of the employees with a churn risk.

In [13]:
# Columns taken over from the joined tables, in output order.
detail_columns = [
    'AGE',
    'AGE_GROUP10',
    'AGE_GROUP5',
    'GENERATION',
    'CRITICAL_JOB_ROLE',
    'RISK_OF_LOSS',
    'IMPACT_OF_LOSS',
    'FUTURE_LEADER',
    'GENDER',
    'MGR_EMP',
    'MINORITY',
    'TENURE_MONTHS',
    'TENURE_INTERVAL_YEARS',
    'TENURE_INTERVALL_DESC',
    'SALARY',
    'EMPLOYMENT_TYPE',
    'EMPLOYMENT_TYPE_2',
    'HIGH_POTENTIAL',
    'PREVIOUS_FUNCTIONAL_AREA',
    'PREVIOUS_JOB_LEVEL',
    'PREVIOUS_CAREER_PATH',
    'PREVIOUS_PERFORMANCE_RATING',
    'PREVIOUS_COUNTRY',
    'PREVCOUNTRYLAT',
    'PREVCOUNTRYLON',
    'PREVIOUS_REGION',
    'TIMEINPREVPOSITIONMONTH',
    'CURRENT_FUNCTIONAL_AREA',
    'CURRENT_JOB_LEVEL',
    'CURRENT_CAREER_PATH',
    'CURRENT_PERFORMANCE_RATING',
    'CURRENT_REGION',
    'CURRENT_COUNTRY',
    'CURCOUNTRYLAT',
    'CURCOUNTRYLON',
    'PROMOTION_WITHIN_LAST_3_YEARS',
    'CHANGED_POSITION_WITHIN_LAST_2_YEARS',
    'CHANGE_IN_PERFORMANCE_RATING',
    'FUNCTIONALAREACHANGETYPE',
    'JOBLEVELCHANGETYPE',
    'HEADS',
    'FLIGHT_RISK',
]

# Join the predicted flight-risk rows (alias L) with the full employee table
# (alias R) on EMPLOYEE_ID; the key is taken from the prediction side.
flightrisk_side = emp_flightrisk.alias('L')
employee_side = emp_churn_all.alias('R')
emp_flightrisk_new = flightrisk_side.join(
    employee_side,
    'L.EMPLOYEE_ID = R.EMPLOYEE_ID',
    select=[('L.EMPLOYEE_ID', 'EMPLOYEE_ID')] + detail_columns,
)

# Fetch the join result into a pandas DataFrame on the client.
pdf_emp_flightrisk = emp_flightrisk_new.collect()

To check the effect of the employment type for these employees with a churn risk, their status will be modified to be a regular (non-temporary) employee:

In [14]:
# What-if scenario: overwrite the employment type of every selected employee with 'Regular'.
pdf_emp_flightrisk = pdf_emp_flightrisk.assign(EMPLOYMENT_TYPE_2='Regular')

In [15]:
# Upload the modified pandas frame into the CHURNING_EMPLOYEES table (force=True, as before).
create_dataframe_from_pandas(
    conn,
    pdf_emp_flightrisk,
    'CHURNING_EMPLOYEES',
    force=True,
)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.00it/s]


<hana_ml.dataframe.DataFrame at 0x19304800ca0>

In [16]:
# Read the uploaded what-if table back as a DataFrame ...
churning_query = 'select * from CHURNING_EMPLOYEES'
emp_churning = DataFrame(conn, churning_query)
# ... and score it with the already trained model.
apply_out_new = model.predict(emp_churning)

In [35]:
# Rows that are still predicted as a flight risk after the employment type was changed.
emp_flightrisk_new_pos = apply_out_new.filter("PREDICTED = 'Yes'")
# count() issues a plain row count; describe() would compute a full set of
# summary statistics on EMPLOYEE_ID just to read a single number from it.
num_flightrisk_new = emp_flightrisk_new_pos.count()
print('Number of employees in test set with positive flight risk after change in employment type from Temporary to Regular: {}'.format(num_flightrisk_new))

# Compute the reduction once and reuse it for both placeholders of the message.
num_prevented = num_flightrisk - num_flightrisk_new
print('This is a difference of {}, which means that {} employees can possibly be prevented from churning by providing them a Regular contract'.format(num_prevented, num_prevented))

Number of employees in test set with positive flight risk after change in employment type from Temporary to Regular: 172
This is a difference of 293, which means that 293 employees can possibly be prevented from churning by providing them a Regular contract
