In [44]:
import pandas as pd

from category_encoders import OrdinalEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv('../data/clean_HR_Analytics.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [21]:
# Vertical split 
target = "Attrition"
X = df.drop(columns=target)
y = df[target]

In [22]:
label = LabelEncoder()
y_encoded = label.fit_transform(y)

ordinal = OrdinalEncoder()
X_encoded = ordinal.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [60]:
algo = {
    'logistic_regression': {
        'model': LogisticRegression(),
        'params': {}

    },

    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'standardscaler__with_mean': [True, False],
            'randomforestclassifier__n_estimators': range(30, 100, 10),
            'randomforestclassifier__max_depth': range(10, 50, 5)
        }
    },

    'adaboost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'adaboostclassifier__n_estimators': range(20, 100, 10)
        }
    },

    'gradient_boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'gradientboostingclassifier__n_estimators': range(2, 5, 1),
            'gradientboostingclassifier__max_depth': range(20, 40, 5)
        }
    },

    'xgbclassifier': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'xgbclassifier__n_estimators': range(100, 301, 100),
            'xgbclassifier__max_depth': range(3, 12, 2),
            'xgbclassifier__learning_rate': [0.1, 0.01, 0.001]
        }
    }
}

In [61]:
score = []
for algo_name, mp in algo.items():
    pipeline = make_pipeline(
        StandardScaler(),
        mp['model']
    )

    model = GridSearchCV(
        pipeline,
        param_grid=mp['params'],
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    score.append({
        'model': algo_name,
        'Train acc': train_acc,
        'Test acc': test_acc,
        'accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'best_score': model.best_score_,
        'best_params': model.best_params_

    })

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 112 candidates, totalling 560 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [62]:
results = pd.DataFrame(score, columns=['model', 'Train acc', 'Test acc', 'accuracy', 'Precision', 'Recall', 'best_score', 'best_params'])
results.head()

Unnamed: 0,model,Train acc,Test acc,accuracy,Precision,Recall,best_score,best_params
0,logistic_regression,0.875,0.887755,0.887755,0.666667,0.307692,0.860559,{}
1,random_forest,0.982993,0.87415,0.87415,0.625,0.128205,0.856304,"{'randomforestclassifier__max_depth': 10, 'ran..."
2,adaboost,0.908163,0.857143,0.857143,0.434783,0.25641,0.877566,{'adaboostclassifier__n_estimators': 50}
3,gradient_boosting,0.831633,0.867347,0.867347,0.0,0.0,0.831634,"{'gradientboostingclassifier__max_depth': 20, ..."
4,xgbclassifier,0.948129,0.863946,0.863946,0.454545,0.128205,0.861392,"{'xgbclassifier__learning_rate': 0.01, 'xgbcla..."
