In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Locations of the feature matrix and the target for the employee-departure training set.
FEATURES_URL = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X.csv'
TARGET_URL = 'https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y.csv'

# Read both CSVs in full; append a row slice such as [:100000] to each frame for quicker experiments.
X_train, y_train = (pd.read_csv(csv_url) for csv_url in (FEATURES_URL, TARGET_URL))

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans the salary columns and adds derived features.

    ``transform`` works on a copy of the input frame and:

    * rewrites string values in ``Salary`` / ``PreviousSalary`` by replacing
      ``'K'`` with ``'000'``, then casts both columns to ``int``;
    * adds ``Raise``: ``Salary / PreviousSalary - 1``;
    * adds ``ReviewDiff``: ``SelfReview - SupervisorReview``;
    * adds ``ReviewOverRaise``: ``SelfReview / (Raise + 0.0001)``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with integer salary columns and three extra feature columns."""
        salary_columns = ['Salary', 'PreviousSalary']

        def expand_k_suffix(value):
            # Only strings are rewritten; any other value is passed through as-is.
            if isinstance(value, str):
                return value.replace('K', '000')
            return value

        engineered = X.copy()
        engineered[salary_columns] = engineered[salary_columns].map(expand_k_suffix).astype(int)

        # Relative change between previous and current salary.
        raise_ratio = engineered['Salary'] / engineered['PreviousSalary'] - 1
        engineered['Raise'] = raise_ratio
        engineered['ReviewDiff'] = engineered['SelfReview'] - engineered['SupervisorReview']
        # The small constant keeps the denominator away from zero when there was no raise.
        engineered['ReviewOverRaise'] = engineered['SelfReview'] / (raise_ratio + .0001)
        return engineered

In [3]:
# Columns sent through the numeric branch of the preprocessor
# (Raise, ReviewOverRaise and ReviewDiff are created by FeatureEngineering).
num = [
    'SupervisorReview',
    'SelfReview',
    'Raise',
    'Salary',
    'ReviewOverRaise',
    'PreviousSalary',
    'StressLevel',
    'ReviewDiff',
]

# Columns sent through the categorical (one-hot) branch of the preprocessor.
cat = [
    'Gender',
    'DepartmentCode',
    'Distance',
]

In [4]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
# Levels not seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num),
        ("cat", categorical_transformer, cat),
    ]
)

# Feature engineering runs first so the derived columns exist when the preprocessor selects them.
full_pipeline = Pipeline([
    ("feature_engineering", FeatureEngineering()),
    ("preprocessing", preprocessor),
])

In [5]:
# Fit every stage of the preprocessing pipeline on the training frame and transform it in one pass.
# NOTE(review): this matrix is later handed to a cross-validated search, so the imputer/scaler
# statistics fitted here also cover each fold's validation rows — consider moving the
# preprocessing steps inside the searched pipeline to avoid that leakage.
X_preprocessed = full_pipeline.fit_transform(X_train)

In [6]:
# Show (rows, columns) of the preprocessed matrix as the cell output.
X_preprocessed.shape

(100000, 21)

In [7]:
def optimize_model(model_pipeline: Pipeline, 
                   X_train: pd.DataFrame, 
                   y_train: pd.DataFrame, 
                   param_space: dict, 
                   n_iter: int, 
                   cv_splits: int = 5,
                   scoring: str = 'f1'):
    """
    Tune a pipeline with a randomized hyperparameter search and return the best refit model.

    ``n_iter`` candidate settings are drawn from ``param_space`` (with a fixed
    ``random_state`` of 42), each is scored with cross-validation, and the best
    candidate is refit on the full training data.

    Parameters:
    model_pipeline (Pipeline): A scikit-learn Pipeline object
    X_train (pd.DataFrame): The training feature matrix
    y_train (pd.DataFrame): The response vector for training. A one-column
        DataFrame is reduced to a Series before fitting.
    param_space (dict): The hyperparameter space to search
    n_iter (int): The number of candidate settings to sample
    cv_splits (int): The number of cross-validation splits, forwarded as ``cv``.
        No temporal ordering is enforced by this splitter.
    scoring (str): The scoring metric used to rank candidates (default ``'f1'``)

    Returns:
    RandomizedSearchCV: The fitted RandomizedSearchCV object
    Pipeline: The best model found by the RandomizedSearchCV
    """
    # Imported here so the function does not depend on a later notebook cell having been run.
    from sklearn.model_selection import RandomizedSearchCV

    # A one-column DataFrame is a column vector; pass a 1-D target to the estimator instead
    # (presumably the source of the column-vector warning seen when fitting — TODO confirm).
    if isinstance(y_train, pd.DataFrame) and y_train.shape[1] == 1:
        y_train = y_train.iloc[:, 0]

    random_search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=param_space,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv_splits,  # previously accepted but never passed to the search
        random_state=42,
        n_jobs=-1,
        refit=True,
    )

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    return random_search, best_model

In [8]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter space sampled by the randomized search for the random forest.
# Keys are prefixed with the pipeline step name ("model") followed by "__".
param_space = {
    'model__n_estimators': [600, 750, 900],
    'model__max_depth': [15, 20, 25],
    'model__min_samples_split': [6, 7, 8],
    'model__min_samples_leaf': [4, 6, 8],
    'model__bootstrap': [True],
    'model__n_jobs': [-1],
    'model__max_samples': [0.5, 0.6, 0.7, 0.8, 0.9],  
    # Fractions of the feature count. The last entry must be the float 1.0: an integer 1
    # would be read as "exactly one feature per split", not "all features".
    'model__max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
    'model__random_state': [42],
}

# Single-step pipeline so the "model__" parameter prefix above resolves to the classifier.
random_forest_pipeline = Pipeline([
    ('model',  RandomForestClassifier(n_jobs=-1, random_state=42))
])

# Sample 5 candidate settings from the space and keep the best refit pipeline.
random_search, best_model = optimize_model(
    random_forest_pipeline,
    X_preprocessed,
    y_train,
    param_space,
    n_iter=5,
)


  return fit_method(estimator, *args, **kwargs)


CPU times: total: 5min 47s
Wall time: 9min 30s


In [9]:
# best_score_ is the mean cross-validated score of the winning candidate; the search in
# optimize_model scores with 'f1', so the value printed here is an F1 score shown as a percentage.
print(f'Validation Best Score: {random_search.best_score_:.2%}')

Validation Best Score: 86.19%


In [10]:
# Pull the step named "model" out of the refit best pipeline so the notebook displays it.
random_search.best_estimator_["model"]