In [4]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv", sep=",", header=0)
test = pd.read_csv("test.csv", sep=",", header=0)


## Exploratory Data Analysis

In [5]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [28]:
train.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [7]:
from pandas_profiling import ProfileReport
exploratory_report_train = ProfileReport(train, title="Train Profiling Report", explorative = True)
exploratory_report_train.to_widgets()

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valuâ€¦

In [25]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [37]:
# number of promotion per department
train.groupby('department')['is_promoted'].sum()

department
Analytics             512
Finance               206
HR                    136
Legal                  53
Operations           1023
Procurement           688
R&D                    69
Sales & Marketing    1213
Technology            768
Name: is_promoted, dtype: int64

In [44]:
train.groupby('education')['is_promoted'].sum()

education
Bachelor's          3008
Below Secondary       67
Master's & above    1471
Name: is_promoted, dtype: int64

In [17]:
train["is_promoted"].value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

In [30]:
train["gender"].value_counts()
16312/(16312+38496)

0.29762078528681946

In [19]:
4668/(4668+50140)

0.08517004816815063

In [27]:
train.corr()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
employee_id,1.0,-0.005121,0.000437,0.004533,0.001274,-0.002501,0.00842,-0.000586,0.001206
no_of_trainings,-0.005121,1.0,-0.081278,-0.063126,-0.057275,-0.045576,-0.007628,0.042517,-0.024896
age,0.000437,-0.081278,1.0,0.006008,0.657111,-0.025592,-0.008169,-0.04838,-0.017166
previous_year_rating,0.004533,-0.063126,0.006008,1.0,0.000253,0.351578,0.027738,0.075139,0.15932
length_of_service,0.001274,-0.057275,0.657111,0.000253,1.0,-0.077693,-0.039927,-0.038122,-0.01067
KPIs_met >80%,-0.002501,-0.045576,-0.025592,0.351578,-0.077693,1.0,0.097,0.078391,0.221582
awards_won?,0.00842,-0.007628,-0.008169,0.027738,-0.039927,0.097,1.0,0.072138,0.195871
avg_training_score,-0.000586,0.042517,-0.04838,0.075139,-0.038122,0.078391,0.072138,1.0,0.181147
is_promoted,0.001206,-0.024896,-0.017166,0.15932,-0.01067,0.221582,0.195871,0.181147,1.0


### Data exploration findings:
- 8.5% of employees are promoted
- 9 departments, 54808 employees
- Nulls: education 2409, previous_year_rating 4124
- Top 5 promotions (departments): Sales & Marketing, Operations, Technology, Procurement, Analytics
- High correlation (> 0.18) between is_promoted and other variables: KPIs_met>80%, awards_won?, avg_training_score
- Gender: m 38496 (roughly 70%), f 16312 (roughly 30%)


## Data clean

### Train

In [46]:
# numericalize department, education, gender
train.loc[train['department'] == 'Analytics', ['department']] = 0
train.loc[train['department'] == 'Finance', ['department']] = 1
train.loc[train['department'] == 'HR', ['department']] = 2
train.loc[train['department'] == 'Legal', ['department']] = 3
train.loc[train['department'] == 'Operations', ['department']] = 4
train.loc[train['department'] == 'Procurement', ['department']] = 5
train.loc[train['department'] == 'R&D', ['department']] = 6
train.loc[train['department'] == 'Sales & Marketing', ['department']] = 7
train.loc[train['department'] == 'Technology', ['department']] = 8

train.loc[train['education'] == "Master's & above", ['education']] = 2
train.loc[train['education'] == "Bachelor's", ['education']] = 1
train.loc[train['education'] == "Below Secondary", ['education']] = 0

train['gender'] = train['gender'].astype('category').cat.codes



### Test

In [45]:
# numericalize department, education, gender
test.loc[test['department'] == 'Analytics', ['department']] = 0
test.loc[test['department'] == 'Finance', ['department']] = 1
test.loc[test['department'] == 'HR', ['department']] = 2
test.loc[test['department'] == 'Legal', ['department']] = 3
test.loc[test['department'] == 'Operations', ['department']] = 4
test.loc[test['department'] == 'Procurement', ['department']] = 5
test.loc[test['department'] == 'R&D', ['department']] = 6
test.loc[test['department'] == 'Sales & Marketing', ['department']] = 7
test.loc[test['department'] == 'Technology', ['department']] = 8

test.loc[test['education'] == "Master's & above", ['education']] = 2
test.loc[test['education'] == "Bachelor's", ['education']] = 1
test.loc[test['education'] == "Below Secondary", ['education']] = 0

test['gender'] = test['gender'].astype('category').cat.codes

In [47]:
test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,8,region_26,1,1,sourcing,1,24,,1,1,0,77
1,74430,2,region_4,1,0,other,1,31,3.0,5,0,0,51
2,72255,7,region_13,1,1,other,1,31,1.0,4,0,0,47
3,38562,5,region_2,1,0,other,3,31,2.0,9,0,0,65
4,64486,1,region_29,1,1,sourcing,1,30,4.0,7,0,0,61


## Validation model
Goal: we want to predict if an employee will get promoted

In [31]:
y_train=(train['is_promoted']).astype(int)
X_train=train[['CompPrice','Income','Advertising','Population','Price','ShelveLoc','Age','Education','Urban','US']]



# fit 
clf = tree.DecisionTreeClassifier(class_weight=None, 
                                  criterion='gini', 
                                  max_depth=10, 
                                  max_features=None, 
                                  max_leaf_nodes=None, 
                                  min_samples_leaf=2, 
                                  min_samples_split=10, 
                                  min_weight_fraction_leaf=0.0, 
                                  presort=False, 
                                  random_state=100, 
                                  splitter='best')
clf = clf.fit(X_train, y_train) 

# export estimated tree into dot graphic file
dot_data = tree.export_graphviz(clf, out_file='Dtree.dot', feature_names=X.columns)

# plot
plt.figure(figsize=(15,12))
tree.plot_tree(clf);

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61
