In [1]:
import pandas as pd
import numpy as np

class SME():
    
    def __init__(self):
        self.asked = 0
        self.df = self.get()
        
    def get(self):
        X_train = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X.csv')
        y_train = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y.csv')

        X_train['Left'] = y_train
        return X_train
    
    def ask(self, valuedict):
        self.asked += 1
        #  print(self.asked)
        
        if self.asked > 500:
            raise Exception("Sorry, you have asked enough")

        arr = []
        for prop in valuedict:
            val = valuedict[prop]

            if val is None: 
                continue

            val = f"'{val}'" if isinstance(val,str) else val
            # print(prop, val)
            arr.append(f'{prop} == {val}')

            query = ' and '.join(arr)
        result = self.df.query(query)

        if len(result) == 0:
            raise Exception("I don't know")
        
        return (int)(result['Left'].mean())

In [2]:
sme = SME()

# INPUT ALL EDA ALLOWABLE HERE

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans

In [5]:
n_clusters = 500

sme.df['SelfReview'] = sme.df['SelfReview'].fillna(sme.df['SelfReview'].median())
sme.df['SupervisorReview'] = sme.df['SupervisorReview'].fillna(sme.df['SupervisorReview'].median())
sme.df[['Salary', 'PreviousSalary']] = sme.df[['Salary', 'PreviousSalary']].map(lambda x: x.replace('K', '000') if isinstance(x, str) else x)
sme.df[['Salary', 'PreviousSalary']] = sme.df[['Salary', 'PreviousSalary']].astype(int)

sme.df['Raise'] = (sme.df['Salary'] / sme.df['PreviousSalary']) - 1
sme.df['ReviewDiff'] = sme.df['SelfReview'] - sme.df['SupervisorReview']
sme.df['ReviewOverRaise'] = sme.df['SelfReview'] / (sme.df['Raise'] +.0001)

sme.df['LongDist'] = np.where(sme.df['Distance'].isin(['~20miles', '>30miles']), 1, 0)

sme.df['HighStress'] = np.where(sme.df['StressLevel'] > 3.5, 1, 0)


num = ['HighStress', 'LongDist','Gender', 'SupervisorReview','SelfReview', 'Raise', 'Salary','ReviewOverRaise', 'PreviousSalary'] #, 'LongDist', ReviewOverRaise','ReviewDiff', 'StressLevel'
cat = ['DepartmentCode']

In [6]:
featurepreprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num)
    ])
preprocessor = Pipeline(steps=[
    ('features', featurepreprocessor),
    ('pca', PCA(n_components=.9)),#.9
    ('kmeans', KMeans(n_clusters=n_clusters, random_state=42, n_init='auto'))
    ])

In [7]:
#X_dist = preprocessor.fit_transform(sme.df[num+cat])
pre = featurepreprocessor.fit_transform(sme.df[num+cat])

kmeans = KMeans(n_clusters = n_clusters, n_init='auto')

X_dist = kmeans.fit_transform(pre)
X_dist

#What are the reporesentative points? (e.i the ones closest to each centroid)
representative_idx = np.argmin(X_dist, axis=0)
#representative_idx.shape

#What do the reporesentative points look like?
X_representative = sme.df.iloc[representative_idx]

#Look up the leave/stay value of hte 500 representative points
y_representative= []
for i in representative_idx:
    y_representative.append(sme.ask({'RecordId': i+100})) #All record Id's are 100 more than the index value

#Propagate the label value to each point in the cluster.
y_train_propagated = np.empty(len(sme.df), dtype=np.int32)
for i in range(n_clusters):
    y_train_propagated[kmeans.labels_ == i] = y_representative[i]

In [8]:
X = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X.csv')
y= y_train_propagated


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:

y_ans = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y.csv')

# Preprocessing for ML Learning Pipeline

## Feature Engineering Pipeline

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X[['Salary', 'PreviousSalary']] = X[['Salary', 'PreviousSalary']].map(lambda x: x.replace('K', '000') if isinstance(x, str) else x)
        X[['Salary', 'PreviousSalary']] = X[['Salary', 'PreviousSalary']].astype(int)
        X['Raise'] = (X['Salary'] / X['PreviousSalary']) - 1
        X['ReviewDiff'] = X['SelfReview'] - X['SupervisorReview']
        X['ReviewOverRaise'] = X['SelfReview'] / (X['Raise'] +.0001)
        X['LongDist'] = np.where(X['Distance'].isin(['~20miles', '>30miles']), 1, 0)
        X['HighStress'] = np.where(X['StressLevel'] > 3.5, 1, 0)
        
        return X

## Rest of Preprocessing Pipeline (categorical and numerical handling)

In [13]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num),
    ("cat", categorical_transformer, cat)
])

full_pipeline = Pipeline(steps=[
    ("feature_engineering", FeatureEngineering()),
    ("preprocessing", preprocessor)
])

## List Out Numerical, Catagorical Features

In [14]:
num = ['SupervisorReview',
       'SelfReview', 
       'Raise', 
       'Salary',
       'ReviewOverRaise', 
       'PreviousSalary', 
       'StressLevel', 
       'ReviewDiff']
cat = ['Gender', 
       'DepartmentCode', 
       'Distance']

## Fit Preprocessing Pipeline to training data

In [15]:
X_preprocessed = full_pipeline.fit_transform(X_train)
X_preprocessed_use = X_preprocessed

In [16]:
def optimize_model(model_pipeline: Pipeline, 
                   X_train: pd.DataFrame, 
                   y_train: pd.DataFrame, 
                   param_space: dict, 
                   n_iter: int, 
                   cv_splits=5):
    """
    This function takes in model pipeline, training set, hyperparameter space and number of iterations for RandomizedSearchCV.
    It will then optimize the given model based on the provided hyperparameter set and selected scoring metric, and return the best model.
    It does this by randomly selecting a set of hyperparameters from the provided hyperparameter space and evaluating the model.

    Parameters:
    model_pipeline (Pipeline): A scikit-learn Pipeline object
    X_train (pd.DataFrame): The training feature matrix
    y_train (pd.DataFrame): The response vector for training
    param_space (dict): The hyperparameter space to search
    n_iter (int): The number of iterations to search
    cv_splits (int): The number of cross-validation splits

    Returns:
    RandomizedSearchCV: A RandomizedSearchCV object
    Pipeline: The best model found by the RandomizedSearchCV
    """
        
    random_search = RandomizedSearchCV(
        estimator=model_pipeline,
        param_distributions=param_space,
        n_iter=n_iter,
        scoring='recall', # we choose f1 score
        #cv=timeseries_split, 
        random_state=42,
        n_jobs=-1,
        refit=True,
    )
    
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    return random_search, best_model

In [17]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# this is our hyperparameter space for random forest
param_space = {
    'model__n_estimators': [100, 600, 750, 900],
    'model__max_depth': [10, 15, 20, 25],
    'model__min_samples_split': [6, 7, 8, 15],
    'model__min_samples_leaf': [4, 6, 8, 15],
    'model__bootstrap': [True],
    'model__n_jobs': [-1],
    'model__max_samples': [0.5, 0.6, 0.7, 0.8, 0.9],  
    'model__max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1], 
    'model__random_state': [42],
}

random_forest_pipeline = Pipeline([
    ('model',  RandomForestClassifier(random_state=42))#n_jobs=-1, 
])

random_search, best_model = optimize_model(random_forest_pipeline, X_preprocessed_use[:10000], y_train[:10000], param_space, 5)

CPU times: total: 31.2 s
Wall time: 47.7 s


In [18]:
print(f'Validation Best Score: {random_search.best_score_:.2%}')

Validation Best Score: 91.59%


In [61]:
random_search.best_estimator_

In [63]:
p = Pipeline([('full', full_pipeline),
              ('clf', RandomForestClassifier(max_depth=20, max_features=0.9, max_samples=0.9,
                       min_samples_leaf=4, min_samples_split=15,
                       n_estimators=750, n_jobs=-1, random_state=42))
             ])

In [65]:
p.fit(X_train, y_train)
p

In [66]:
y_pred = p.predict(X_test)

In [85]:
from sklearn.metrics import classification_report
print("Classification Report:\n")
print(classification_report(y_ans, y_pred))

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.82      0.81     63963
           1       0.67      0.65      0.66     36037

    accuracy                           0.76    100000
   macro avg       0.74      0.74      0.74    100000
weighted avg       0.76      0.76      0.76    100000



In [83]:
y_ans = y_ans.iloc[X_test.index]

In [89]:
print(np.unique(y_test, return_counts=True))
print(np.unique(y_pred, return_counts=True))

(array([0, 1]), array([65310, 34690], dtype=int64))
(array([0, 1]), array([65352, 34648], dtype=int64))


In [95]:
np.sum(y_test != y_pred)

108

In [97]:
np.unique(y_test, return_counts=True)

(array([0, 1]), array([65310, 34690], dtype=int64))