### Module 4 - Project Delivery


**Components in this Project:**
- Exploratory Analysis: Chasing Aha! Moments
- ABT Construction: Building a Base for Successful Modeling
- Classification Algorithms: The Value of Versatility
- Model Evaluation: Sophisticated Performance Metrics, AUROC
- <span style="color:royalblue">Project Delivery: From Raw Data to Final Prediction</span>


Kairong Chen

Last edited: 01/30/2022


#### This Module includes:
1. Confirm Your Model
    - Load the original analytical base table that was used to train the model
    - Split it into the same training and test sets (with the same random seed)
    - See if we get the same AUROC on the test set as we got in the previous module
2. Write Pre-Modeling Functions
    - clean_data(): from Module 2: ABT Construction
    - engineer_features(): from Module 2: ABT Construction
3. Construct Custom Model Class
    - `self.__init__()`
    - Functions in classes must have `self` as the first argument
4. Keep it in Jupyter Notebook



#### Important Concepts

In [1]:
%load_ext nb_black
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

pd.set_option("display.max_columns", 100)

# Pickle for reading model files
import pickle

# Scikit-Learn's train_test_split function
from sklearn.model_selection import train_test_split

# Area Under ROC Curve
from sklearn.metrics import roc_auc_score

<IPython.core.display.Javascript object>

In [2]:
# Load the pickled model from final_model.pkl into `clf`
# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source
with open("final_model.pkl", "rb") as f:
    clf = pickle.load(f)

<IPython.core.display.Javascript object>

#### Confirm the Model

In [3]:
# Show the unpickled estimator so its steps and hyperparameters can be checked
print(clf)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_features=0.33, random_state=123))])


<IPython.core.display.Javascript object>

In [4]:
# Load the analytical base table (ABT) that the model was trained on
abt = pd.read_csv("analytical_base_table.csv")

<IPython.core.display.Javascript object>

In [5]:
# Target: the `status` column
y = abt["status"]

# Inputs: every remaining column
X = abt.drop(columns="status")

# 80/20 split, stratified on the target; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y,
)

<IPython.core.display.Javascript object>

In [6]:
# Class probabilities for the held-out set, keeping only the positive class (1)
pred = [row[1] for row in clf.predict_proba(X_test)]

# Report area under the ROC curve on the test set
print("AUROC:", roc_auc_score(y_test, pred))

AUROC: 0.9915194952019338


<IPython.core.display.Javascript object>

#### Write Pre-Modeling Functions

In [7]:
# Load the raw, unseen employee records (not yet cleaned or feature-engineered)
raw_data = pd.read_csv("unseen_employee_data.csv")

# Display first 5 rows
raw_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


<IPython.core.display.Javascript object>

In [8]:
def clean_data(df):
    """Clean raw employee records into the form used to build the ABT.

    Steps, in order:
      1. drop exact duplicate rows;
      2. drop rows whose ``department`` is ``"temp"``;
      3. fill missing ``filed_complaint`` / ``recently_promoted`` with 0;
      4. rename department ``"information_technology"`` to ``"IT"`` and label
         missing departments ``"Missing"``;
      5. add ``last_evaluation_missing`` (1 where ``last_evaluation`` was
         null), then fill missing ``last_evaluation`` with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing at least ``department``, ``filed_complaint``,
        ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is left unmodified.
    """
    # Drop duplicates
    df = df.drop_duplicates()

    # Drop temporary workers. The explicit copy makes the result an
    # independent frame, so the column assignments below neither raise
    # SettingWithCopyWarning nor depend on chained in-place mutation (which
    # is a silent no-op under pandas Copy-on-Write).
    df = df[df["department"] != "temp"].copy()

    # Missing filed_complaint / recently_promoted values should be 0
    df["filed_complaint"] = df["filed_complaint"].fillna(0)
    df["recently_promoted"] = df["recently_promoted"].fillna(0)

    # 'information_technology' should be 'IT'; missing departments become 'Missing'
    df["department"] = (
        df["department"].replace("information_technology", "IT").fillna("Missing")
    )

    # Indicator must be computed BEFORE the fill, otherwise it is always 0
    df["last_evaluation_missing"] = df["last_evaluation"].isnull().astype(int)

    # Fill missing values in last_evaluation with 0
    df["last_evaluation"] = df["last_evaluation"].fillna(0)

    # Return cleaned dataframe
    return df

<IPython.core.display.Javascript object>

In [9]:
# Create cleaned_new_data
cleaned_data = clean_data(raw_data)

# Display first 5 rows
cleaned_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,high,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,low,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,low,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,high,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,low,0.441219,3.0,0


<IPython.core.display.Javascript object>

In [10]:
def engineer_features(df):
    """Add the engineered indicator features and one-hot encode categoricals.

    Indicator features (stored as 0/1 integers):
      * ``underperformer``: ``last_evaluation < 0.6`` and the evaluation was
        not missing (``last_evaluation_missing == 0``);
      * ``unhappy``: ``satisfaction < 0.2``;
      * ``overachiever``: ``last_evaluation > 0.8`` and ``satisfaction > 0.7``.

    ``department`` and ``salary`` are then replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned records containing ``last_evaluation``,
        ``last_evaluation_missing``, ``satisfaction``, ``department`` and
        ``salary``.

    Returns
    -------
    pandas.DataFrame
        A new, augmented frame; the input frame is left unmodified.
    """
    # Create indicator features
    underperformer = (df["last_evaluation"] < 0.6) & (
        df["last_evaluation_missing"] == 0
    )
    unhappy = df["satisfaction"] < 0.2
    overachiever = (df["last_evaluation"] > 0.8) & (df["satisfaction"] > 0.7)

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain columns as a side effect of calling this function.
    df = df.assign(
        underperformer=underperformer.astype(int),
        unhappy=unhappy.astype(int),
        overachiever=overachiever.astype(int),
    )

    # Create new dataframe with dummy features
    # NOTE(review): dummy columns are derived from the categories present in
    # `df`; a batch lacking some department/salary level will yield fewer
    # columns than the training table — confirm inputs cover all levels.
    df = pd.get_dummies(df, columns=["department", "salary"])

    # Return augmented DataFrame
    return df

<IPython.core.display.Javascript object>

In [11]:
# Create augmented_new_data
augmented_data = engineer_features(cleaned_data)

# Display first 5 rows
augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


<IPython.core.display.Javascript object>

In [12]:
# Predict probabilities
pred = clf.predict_proba(augmented_data)

# Print first 5 predictions
print(pred[:5])

[[1.   0.  ]
 [0.98 0.02]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


<IPython.core.display.Javascript object>

#### Construct Custom Model Class

In [13]:
class EmployeeRetentionModel:
    """Bundle a pickled classifier with the pre-modeling steps it expects.

    The class loads a fitted model from disk and exposes ``predict_proba``,
    which can optionally clean raw records and engineer features before
    handing them to the model. None of the methods modify the DataFrame they
    are given.
    """

    def __init__(self, model_location):
        """Load the pickled model stored at ``model_location``.

        NOTE: ``pickle.load`` can execute arbitrary code; only pass paths to
        files from a trusted source.
        """
        with open(model_location, "rb") as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Predict class probabilities for ``X_new``.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Records to score.
        clean : bool, default True
            Run ``clean_data`` first (use for raw records).
        augment : bool, default True
            Run ``engineer_features`` before predicting (use unless the
            features are already engineered).

        Returns
        -------
        tuple
            ``(X_new, probabilities)``: the frame actually passed to the
            model and the result of the model's ``predict_proba``.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of raw employee records.

        Drops duplicates and ``"temp"`` department rows, fills missing
        ``filed_complaint`` / ``recently_promoted`` with 0, normalises the
        department labels, and adds ``last_evaluation_missing`` before
        filling missing ``last_evaluation`` with 0.
        """
        # Drop duplicates
        df = df.drop_duplicates()

        # Drop temporary workers. The explicit copy makes the result an
        # independent frame, so the assignments below neither raise
        # SettingWithCopyWarning nor rely on chained in-place mutation (a
        # silent no-op under pandas Copy-on-Write).
        df = df[df["department"] != "temp"].copy()

        # Missing filed_complaint / recently_promoted values should be 0
        df["filed_complaint"] = df["filed_complaint"].fillna(0)
        df["recently_promoted"] = df["recently_promoted"].fillna(0)

        # 'information_technology' should be 'IT'; missing -> 'Missing'
        df["department"] = (
            df["department"].replace("information_technology", "IT").fillna("Missing")
        )

        # Indicator must be computed BEFORE the fill, otherwise it is always 0
        df["last_evaluation_missing"] = df["last_evaluation"].isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        df["last_evaluation"] = df["last_evaluation"].fillna(0)

        # Return cleaned dataframe
        return df

    def engineer_features(self, df):
        """Return a copy of ``df`` with indicator and dummy features added.

        Adds the 0/1 columns ``underperformer``, ``unhappy`` and
        ``overachiever``, then one-hot encodes ``department`` and ``salary``.
        """
        # Create indicator features
        underperformer = (df["last_evaluation"] < 0.6) & (
            df["last_evaluation_missing"] == 0
        )
        unhappy = df["satisfaction"] < 0.2
        overachiever = (df["last_evaluation"] > 0.8) & (df["satisfaction"] > 0.7)

        # assign() returns a new frame, so the caller's DataFrame does not
        # silently gain columns as a side effect.
        df = df.assign(
            underperformer=underperformer.astype(int),
            unhappy=unhappy.astype(int),
            overachiever=overachiever.astype(int),
        )

        # Create new dataframe with dummy features
        # NOTE(review): dummy columns depend on the categories present in
        # `df`; confirm each batch covers every level seen in training.
        df = pd.get_dummies(df, columns=["department", "salary"])

        # Return augmented DataFrame
        return df

<IPython.core.display.Javascript object>

In [14]:
# Initialize an instance, loading the pickled model from final_model.pkl
retention_model = EmployeeRetentionModel("final_model.pkl")

<IPython.core.display.Javascript object>

In [15]:
# Predict raw data
_, pred1 = retention_model.predict_proba(raw_data, clean=True, augment=True)

# Predict cleaned data
_, pred2 = retention_model.predict_proba(cleaned_data, clean=False, augment=True)

# Predict cleaned and augmented data
_, pred3 = retention_model.predict_proba(augmented_data, clean=False, augment=False)

<IPython.core.display.Javascript object>

In [16]:
# Sanity check: all three prediction paths must agree (expected output: True)
all(np.array_equal(left, right) for left, right in ((pred1, pred2), (pred2, pred3)))

True

<IPython.core.display.Javascript object>