In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, plot_roc_curve, RocCurveDisplay


In [282]:
# Load the raw table, discard the hire date, and one-hot encode the two
# categorical columns. drop_first=True omits the first level of each variable
# so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    pd.read_csv('../20220911_data.csv').drop(columns=['hire_date']),
    columns=['gender', 'ethnicity'],
    drop_first=True,
)
    

In [283]:
# Separate the label column from the predictors.
target = 'terminated_in_first_year'
X = df.drop(columns=[target])
y = df[target]


In [284]:
# Replace the block of columns whose names contain 'strengths', 'weakness' or
# 'compared_to_others' with the smallest set of principal components that
# explains at least 75% of their variance, then split into train/test sets.
#
# NOTE(review): the scaler and both PCA fits below see every row, including the
# rows that end up in the test split. That leaks test-set information into the
# features; consider fitting them on the training rows only (e.g. inside a
# sklearn Pipeline) if the test score is meant to be an unbiased estimate.
cols = X.columns.str.contains('strengths|weakness|compared_to_others')

# Scale each selected column to [0, 1] so no single column dominates the PCA.
crosschq_vars = X.iloc[:, cols].values
crosschq_vars = MinMaxScaler().fit_transform(crosschq_vars)

# Full-rank PCA, used only to read off the cumulative explained variance.
pca = PCA(n_components=crosschq_vars.shape[1])
pca.fit(crosschq_vars)

var_explained = pca.explained_variance_ratio_.cumsum()

# np.argmax gives the zero-based POSITION of the first component at which the
# cumulative variance reaches 75%; the number of components to keep is that
# position plus one. Without the +1 the retained components fall short of the
# threshold (and PCA would be asked for 0 components when the first one alone
# already suffices).
components = int(np.argmax(var_explained >= 0.75)) + 1

pca75 = PCA(n_components=components)
crosschq_vars75 = pca75.fit_transform(crosschq_vars)

X = X.iloc[:, ~cols]
# Reuse X's index so the join below aligns row-for-row even if X does not
# carry a default 0..n-1 index.
crosschq_vars75 = pd.DataFrame(
    crosschq_vars75,
    index=X.index,
    columns=[f'pcomponent_{i}' for i in range(crosschq_vars75.shape[1])],
)
X = X.join(crosschq_vars75)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


In [None]:
#####################
### DECISION TREE ###
#####################

In [None]:
# Baseline tree grown with entropy splits and no ccp_alpha specified; it is the
# starting point for the cost-complexity pruning analysis described at
# https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning
clf = DecisionTreeClassifier(random_state=0, criterion='entropy')
clf.fit(X=X_train, y=y_train)

# Training accuracy can be inspected with clf.score(X_train, y_train).

In [None]:
# Tabulate the cost-complexity pruning path of the baseline tree (one row per
# candidate alpha) and plot impurity against alpha.
# See https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html
path = pd.DataFrame(clf.cost_complexity_pruning_path(X_train, y_train))

sns.lineplot(data=path, x='ccp_alphas', y='impurities')

In [None]:
# Randomized hyper-parameter search for the decision tree.
# The grid below has 2 * 2 * 40 * 2 = 320 combinations, of which 150 are
# sampled. The depth / leaf-size entries are kept for reference but disabled.
parameters = {
    'criterion': ('entropy', 'gini'),
    'splitter': ('best', 'random'),
    'ccp_alpha': np.arange(0, 0.04, 0.001),
    'class_weight': ('balanced', None),
#     'max_depth': np.arange(5, 30, 1),
#     'min_samples_split': np.arange(2, 10, 1),
#     'min_samples_leaf': np.arange(1, 5, 1),
}

clf = DecisionTreeClassifier(random_state=0)
# random_state pins which 150 candidates are drawn; without it every run of
# this cell evaluates a different subset and can report different best_params_.
cv = RandomizedSearchCV(clf, parameters, n_iter=150, random_state=0)
cv.fit(X_train, y_train)


In [None]:
# Collect the search results in a frame. class_weight=None is relabelled as the
# string 'None' so the column can be filtered and plotted like any other label.
cv_results = pd.DataFrame(cv.cv_results_)
not_balanced = cv_results['param_class_weight'] != 'balanced'
cv_results.loc[not_balanced, 'param_class_weight'] = 'None'

best_clf = cv.best_estimator_
print(cv.best_params_, cv.best_score_)

# Mean CV score against ccp_alpha for the unweighted candidates only,
# one line per split criterion.
sns.lineplot(
    data=cv_results.loc[not_balanced],
    x='param_ccp_alpha',
    y='mean_test_score',
    hue='param_criterion',
)


In [None]:
# Accuracy of the selected tree on the training and held-out sets.
print(
    'train score:', best_clf.score(X_train, y_train), '||',
    'test score:', best_clf.score(X_test, y_test),
)

# confusion_matrix expects (y_true, y_pred). With the arguments in that order
# rows are the true classes and columns the predicted classes; passing the
# predictions first transposes the matrix and swaps false positives with
# false negatives.
confusion_matrix(y_train, best_clf.predict(X_train))

In [None]:
# ROC curve of the selected tree on the training data.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
RocCurveDisplay.from_estimator(best_clf, X_train, y_train)