In [None]:
# The code was removed by Watson Studio for sharing.

# Train a simple employee promotion prediction model

## Use Case
This notebook demonstrates the implementation of a Random Forest Classification model for predicting employee promotions. The focus is on leveraging model training with employee historical data for accurate predictions. Employee promotion prediction involves identifying factors that contribute to promotion decisions based on various features.

## What you'll learn in this notebook
Random Forest Classification: Random Forest is an ensemble learning method that constructs a multitude of decision trees during training and outputs the mode of the classes (classification) or mean prediction (regression) of the individual trees for robust and accurate predictions. To help understand and implement the Random Forest algorithm for predicting employee promotions, this notebook covers the process of extracting relevant features, training the Random Forest model, and evaluating its performance in predictions.

## Table of Contents

1.  [Step 1: Load and prepare the data](#load_data)

1.  [Step 2: Balance the dataset](#balance_dataset)

1.  [Step 3: Data Preprocessing and Train-Test split](#process_data)

1.  [Step 4: Capture model metadata for AI Governance](#capture_metadata)

1.  [Step 5: Train the Random Forest classifier model](#train_model)

1.  [Step 6: Model Evaluation](#evaluate_data)

1.  [Step 7: Save the classification model](#save_model)

In [None]:
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# IBM Cloud API key, passed to the AI Governance facts client further down.
# It is read from the environment so the secret never appears in the notebook;
# a KeyError is raised here if the variable is not set.
IBM_API_KEY=os.environ['IBM_API_KEY']

<a id="load_data"></a>
## Step 1: Load and prepare the data

In [None]:
# Download the data asset "epp_train.csv" from project storage and store it
# under the same name in the local file system.
# NOTE(review): `wslib` is not defined in any visible cell; presumably it is
# created by the hidden setup cell above — confirm before running on a fresh kernel.
wslib.download_file("epp_train.csv", "epp_train.csv")

In [None]:
# Column order used for the rest of the notebook; the target `is_promoted` is placed last
column_order = [
    'employee_id', 'department', 'region', 'education', 'gender',
    'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
    'length_of_service', 'kpis_met_above_80_percent', 'any_awards_won',
    'avg_training_score', 'is_promoted',
]

# Load the downloaded CSV and select the columns in that order
# (a column missing from the file raises KeyError)
employee_data = pd.read_csv("epp_train.csv")[column_order]

# Preview the first rows
employee_data.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded dataset
employee_data.shape

In [None]:
# List the column labels of the loaded dataset
employee_data.columns

In [None]:
# Remove the employee identifier plus the recruitment-channel and region columns,
# so they are not part of the data that is encoded and used for training
unused_columns = ["employee_id", "recruitment_channel", "region"]
employee_data = employee_data.drop(unused_columns, axis=1)

<a id="balance_dataset"></a>
## Step 2: Balance the dataset

Balancing the dataset with respect to the "is_promoted" variable is essential to avoid bias in the predictive model. When the dataset is imbalanced, with one class significantly outnumbering the other (e.g., promoted or not promoted employees), the model may exhibit a tendency to favor the majority class. In the context of employee promotion prediction, imbalance could lead to inaccurate predictions, especially if the majority of instances involve non-promotion.

In [None]:
# Separate the two classes and show the class distribution before balancing
class_0_data = employee_data[employee_data['is_promoted'] == 0]
class_1_data = employee_data[employee_data['is_promoted'] == 1]
print(employee_data['is_promoted'].value_counts())

# Number of rows in the promoted class (is_promoted == 1). Counting rows directly
# replaces `.count()[0]`, which counted the non-null values of whichever column
# happened to be first and relied on deprecated positional indexing of a
# label-indexed Series.
n = len(class_1_data)

# Randomly undersample class 0 down to the size of class 1 (seeded so the sample
# is reproducible). `sample` raises ValueError if class 0 has fewer than n rows.
class_0_sampled = class_0_data.sample(n, random_state=42)

# Combine the two classes to create a balanced dataset and show the new distribution
employee_data = pd.concat([class_0_sampled, class_1_data])
print(employee_data['is_promoted'].value_counts())

In [None]:
# Replace the index left over from sampling/concatenation with a fresh 0..N-1 index.
# Reassignment is used instead of inplace=True so the cell produces a new frame
# rather than mutating the existing one.
employee_data = employee_data.reset_index(drop=True)

<a id="process_data"></a>
## Step 3: Data Preprocessing and Train-Test split
This step handles missing values, encodes the categorical columns, splits the dataset into training and test sets, and checks the class distribution in the training data.

In [None]:
# Handle missing values: missing education gets the most frequent education level,
# missing previous-year ratings get 1.
# A single DataFrame-level fillna replaces `employee_data[col].fillna(..., inplace=True)`:
# that form is chained assignment, which emits a FutureWarning in pandas 2.x and, with
# Copy-on-Write enabled, modifies only a temporary copy and leaves the frame unchanged.
employee_data = employee_data.fillna({
    "education": employee_data["education"].mode()[0],
    "previous_year_rating": 1,
})

# One-hot encode every object-typed column; drop_first removes the first level of
# each column so the dummy columns are not redundant
categorical_columns = employee_data.select_dtypes(include=['object']).columns.tolist()
X_encoded = pd.get_dummies(employee_data, columns=categorical_columns, drop_first=True)

# Split the data into features (X) and target (y)
y = X_encoded["is_promoted"]
X_encoded = X_encoded.drop("is_promoted", axis=1)

# Hold out 20% of the rows as the test set (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Check the class distribution in the training set
class_counts = y_train.value_counts()
print("Class Distribution in Training Data:")
print(class_counts)


<a id="capture_metadata"></a>
## Step 4: Capture model metadata for AI Governance
Collect the model metadata for effortless report generation using the IBM AI Governance Python library (`ibm-aigov-facts-client`).

In [None]:
#!pip3 install --upgrade ibm-aigov-facts-client  --no-cache | tail -n 1
from ibm_aigov_facts_client import AIGovFactsClient

If you run into the error shown below, restart the kernel, comment out the pip installation of `ibm_aigov_facts_client`, and proceed:
 ```   24 from .utils import canonicalize_version
---> 25 from .version import LegacyVersion, Version, parse
     27 ParsedVersion = Union[Version, LegacyVersion]
     28 UnparsedVersion = Union[Version, LegacyVersion, str]

ImportError: cannot import name 'LegacyVersion' from 'packaging.version' (/opt/conda/envs/Python-RT23.1/lib/python3.10/site-packages/packaging/version.py)
```

In [None]:
## Factsheet metadata: the name of the deployed model. The same value is reused below
## as the facts-client experiment name, the training-data reference id, the deployment
## name, and the name of the saved external model asset.
EXPERIMENT_NAME='sap-hr-usecase-employee-promotion'

In [None]:
### Build the schema metadata used for the AI factsheet

# One field descriptor per training feature: the column name and its pandas dtype as a string
column_info_list = [
    {
        "name": feature_name,
        "nullable": True,
        "metadata": {},
        "type": str(feature_dtype),
    }
    for feature_name, feature_dtype in X_train.dtypes.items()
]

# "s3"-type reference to epp_train.csv in bucket "bucket-sap-epp"; the connection values
# are read from environment variables (KeyError if any of them is unset).
# NOTE(review): no later visible cell reads this variable — confirm it is still needed.
training_data_references = [
    dict(
        id=EXPERIMENT_NAME,
        type="s3",
        connection=dict(
            access_key_id=os.environ['COS_API_KEY'],
            endpoint_url=os.environ['COS_ENDPOINT_URL'],
            resource_instance_id=os.environ['COS_INSTANCE_ID'],
        ),
        location=dict(bucket="bucket-sap-epp", path="epp_train.csv"),
        schema=dict(id="training_schema", fields=column_info_list),
    )
]

# Input schema for the external model: feature/name/type for every training column
inpunt_schema = [
    {"feature": feature_name, "name": feature_name, "type": str(feature_dtype)}
    for feature_name, feature_dtype in X_train.dtypes.items()
]

In [None]:
## Create the AI Governance facts client for an external model; autologging is
## switched off, so facts are logged manually in the cells below
facts_client = AIGovFactsClient(
    api_key=IBM_API_KEY,
    experiment_name=EXPERIMENT_NAME,
    enable_autolog=False,
    external_model=True,
)

# Display the AI use cases returned by the client
facts_client.assets.get_ai_usecases()

In [None]:
from ibm_aigov_facts_client.supporting_classes.factsheet_utils import DeploymentDetails,TrainingDataReference,ExternalModelSchemas, ModelDetails

## Model details: framework, algorithm, label column and runtime
model_details = ModelDetails(
    model_type="scikit-learn_1.1",
    input_type="object",
    algorithm="RandomForestClassifier",
    label_type="class_counts",
    label_column="is_promoted",
    prediction_type="Binary Classification",
    software_spec="runtime-23.1-py3.10",
    provider="Custom Environment",
)

## Training data reference: URL-type connection plus the bucket/path of the CSV
## and the per-column schema built earlier
trainingdataref = TrainingDataReference(
    id=EXPERIMENT_NAME,
    type="url",
    connection={
        "url": "https://cloud.ibm.com/objectstorage/crn%3Av1%3Abluemix%3Apublic%3Acloud-object-storage%3Aglobal%3Aa%2Fe65910fa61ce9072d64902d03f3d4774%3A2d882273-f864-4224-bde6-d4a74b3143ae%3A%3A?bucket=bucket-sap-epp&bucketRegion=us-south&endpoint=s3.us-south.cloud-object-storage.appdomain.cloud&paneId=bucket_overview",
    },
    location={
        "bucket": "bucket-sap-epp",
        "path": "epp_train.csv",
        "source": "epp_train.csv",
    },
    schema={
        "id": "training_schema",
        "fields": column_info_list,
    },
)

## Deployment details: online deployment identified by its base URL and scoring endpoint
deployment = DeploymentDetails(
    identifier='http://169.46.68.130:8080/',
    name=EXPERIMENT_NAME,
    deployment_type="online",
    scoring_endpoint="/v2/predict_and_log",
)

## Input and output schema definition: the encoded feature columns in,
## the single `is_promoted` field out
external_schemas = ExternalModelSchemas(
    input=[{"fields": inpunt_schema}],
    output=[
        {
            "fields": [
                {'feature': 'is_promoted', 'name': 'is_promoted', 'type': 'uint8'},
            ]
        }
    ],
)

In [None]:
## Save the external model facts to watsonx.governance, combining the deployment,
## training-data, model and schema objects prepared above
watsonx_factsheet = facts_client.external_model_facts.save_external_model_asset(
    model_identifier="sap-hr-use-case",
    name=EXPERIMENT_NAME,
    description="SAP AI Core integration, HR promotion usecase",
    deployment_details=deployment,
    training_data_reference=trainingdataref,
    model_details=model_details,
    schemas=external_schemas,
)

<a id="train_model"></a>
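## Step 5: Train the Random Forest classifier model
Initialize and train a Random Forest classifier on the balanced dataset, then generate predictions for the training and test sets. No class weights are passed to the classifier, because the classes were already balanced by undersampling in Step 2.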

In [None]:
# Build a 100-tree Random Forest (seeded; no class_weight is passed, since the data
# was balanced by undersampling earlier) and fit it on the training split.
# `fit` returns the estimator itself, so the fitted model is bound directly.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=100).fit(X_train, y_train)

# Predictions on the training split
yt_pred = rf_classifier.predict(X_train)

# Predictions on the held-out test split
y_pred = rf_classifier.predict(X_test)

In [None]:
# List of feature columns after one-hot encoding (the target column has been removed)
X_encoded.columns

<a id="evaluate_data"></a>
## Step 6: Model Evaluation
Evaluating the training and testing accuracy scores of a Random Forest Classifier for employee promotion prediction.

In [None]:
# Accuracy on the training split versus the held-out test split
train_accuracy = accuracy_score(y_train, yt_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("train_accuracy",train_accuracy)
print("test_accuracy",test_accuracy)

In [None]:
# Ten hand-written sample rows for a quick prediction check, given column by column
# (one list entry per row, rows labelled 0..9).
# NOTE(review): the columns are expected to match the encoded feature columns the
# classifier was trained on — confirm against X_encoded.columns.
pred_df = pd.DataFrame(
    {
        'no_of_trainings': [1, 1, 1, 2, 1, 2, 1, 1, 1, 1],
        'age': [35, 30, 34, 39, 45, 31, 31, 33, 28, 32],
        'previous_year_rating': [5.0, 5.0, 3.0, 1.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0],
        'length_of_service': [8, 4, 7, 10, 2, 7, 5, 6, 5, 5],
        'kpis_met_above_80_percent': [1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        'any_awards_won': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'avg_training_score': [49, 60, 50, 50, 73, 85, 59, 63, 83, 54],
        'department_Finance': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'department_HR': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'department_Legal': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'department_Operations': [0, 1, 0, 0, 0, 0, 1, 1, 0, 0],
        'department_Procurement': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'department_R&D': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'department_Sales & Marketing': [1, 0, 1, 1, 0, 0, 0, 0, 0, 1],
        'department_Technology': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        'education_Below Secondary': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        "education_Master's & above": [1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        'gender_m': [0, 1, 1, 1, 1, 1, 0, 1, 1, 1],
    },
    index=list(range(10)),
)

In [None]:
# Class probabilities for the hand-written sample rows (one row of probabilities per sample).
# Note: `y_prob` is reassigned in a later cell to the class-1 probabilities of the test set.
y_prob = rf_classifier.predict_proba(pred_df)
y_prob

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision, recall and F1-score of the test-set predictions
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the raw values in the order precision, recall, F1
print(precision)
print(recall)
print(f1)

In [None]:
# Probability of class 1 (second column of predict_proba) for every test row
y_prob = rf_classifier.predict_proba(X_test)[:, 1]

# To experiment with a different decision threshold, recompute the predictions as
# y_pred = (y_prob >= custom_threshold).astype(int), e.g. with custom_threshold = 0.4

# Per-class precision/recall/F1 table
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

# Headline metrics: F1, recall and precision from the class predictions,
# AUC from the class-1 probabilities
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_prob)
print(f"F1 Score: {f1:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print(f"AUC Score: {auc_score:.2f}")

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the diagonal reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

<a id="save_model"></a>
## Step 7: Save the model as pickle
Saving a trained Random Forest Classifier model to a file and uploading it as a project asset for future use.

In [None]:
import joblib

# Save the trained classifier to a local file
joblib.dump(rf_classifier, 'epp_model_rf_nw.pkl')

# Load the model back from the file and verify the round trip: the reloaded model
# must reproduce the in-memory model's test-set predictions. Previously the reloaded
# object was never used, so a corrupt or mismatched file would have gone unnoticed.
# (joblib files are pickles: only load files from a trusted source, as here.)
loaded_model = joblib.load('epp_model_rf_nw.pkl')
assert (loaded_model.predict(X_test) == rf_classifier.predict(X_test)).all(), \
    "Reloaded model does not reproduce the trained model's predictions"

# Upload the verified model file as a project asset, replacing any existing asset of that name
wslib.upload_file(file_path="epp_model_rf_nw.pkl", asset_name="epp_model_rf_nw.pkl", overwrite=True)