In [7]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
from project_lib import Project
project = Project(spark.sparkContext, 'a2a29465-a371-4b43-b30a-1ecd79492c0f', 'p-360fa5c487a1377ce74f8ef5f9af1d6d0e3744c9')
pc = project.project_context


# 1. Insert project token, API key, and region

<img src="https://cp4d-outcomes.techzone.ibm.com/img/data-fabric-lab/trusted-ai/project_token_for_notebook.png" width=400 align=left>

Click the **three vertical dots** icon above and select **Insert project token** to provide this notebook API access to your project.

The code inserted above will have a line that looks like this:

`project = Project(spark.sparkContext, 'xxxxxxxx-xxx-xxxx-xxxx-xxxxxxxxxx', 'p-xxxxxxxxxxxxxxxxxx')`

The first variable value from the cell above (the one that does **not** begin with `p-`) is your project ID, and should be pasted into the cell below as the value for `PROJECT_ID`. The API key you created earlier in the lab should be pasted into the cell below as the value for `API_KEY`.

The project ID value is also available on the **Manage** tab of your project, in the **General** section.

The **LOCATION** value below will depend on where you provisioned your services. According to the [WML Client documentation](https://ibm-wml-api-pyclient.mybluemix.net/#authentication), valid values for **LOCATION** are:
* Dallas: https://us-south.ml.cloud.ibm.com
* London: https://eu-gb.ml.cloud.ibm.com
* Frankfurt: https://eu-de.ml.cloud.ibm.com
* Tokyo: https://jp-tok.ml.cloud.ibm.com

Run the cell above, and continue running cells individually until you reach step 2.

In [8]:
API_KEY = 'V20GxKBJhE770COJBUSgto2R197lKy25OKR6n5o97_VD'
PROJECT_ID = 'a2a29465-a371-4b43-b30a-1ecd79492c0f'
LOCATION = 'https://us-south.ml.cloud.ibm.com'

In [9]:
if "p-" in PROJECT_ID:
    raise Exception("You have not correctly set the value for your PROJECT_ID. The value beginning with 'p-' is your project access \
    token. Please copy the value of the project_id into the previous cell and re-run it.")

The first model you will create in this notebook uses the scikit-learn framework. The `sklearn` package is available by default in Watson Studio Python environments, and does not need to be installed.

In [10]:
import sklearn
sklearn.__version__

'1.0.2'

The next cell uses the API key and location variables defined above to authenticate with your Watson Machine Learning service. An error in this cell likely means that you do not have access to a WML service, or that the API key or location provided above is incorrect.

In [11]:
from ibm_watson_machine_learning import APIClient

wml_credentials = {
    "apikey": API_KEY,
    "url": LOCATION
}

wml_client = APIClient(wml_credentials)

# <span style="color:red">2. !!--STOP--!! Insert data to code below</span>

Place your cursor in the empty code cell below. Then click the **Find and add data** icon in the upper right corner of the screen -- it looks like a grid of ones and zeroes.

<img src="https://cp4d-outcomes.techzone.ibm.com/img/data-fabric-lab/trusted-ai/find_and_add_data.png" width=400 align=left>

From the **Files** tab in the window that opens, locate the *modeling_records_2022.csv* file from the list of files beneath the drag and drop area, and use the **Insert to code** dropdown beneath it to select **pandas DataFrame**. A code block is automatically inserted into the empty cell that will import your data into a dataframe. Like the `sklearn` package, `pandas` is automatically provided in Watson Studio Python environments.

## <span style="color:red">IMPORTANT: replace all instances of `df_data_x` with `df_data_1` in the code</span>

The automated dataframe will likely use the `df_data_3` variable to hold the data. Update the last two lines of code to import data into the `df_data_1` variable for the rest of the notebook to work correctly. The last lines of your cell should look like this:

<img src="https://cp4d-outcomes.techzone.ibm.com/img/data-fabric-lab/trusted-ai/dataframe_insert.png" width=300 align=left>

Run the inserted code cell below. If you have correctly imported the data, you will see a table populated with employee data. Continue running cells individually until you reach step 3.

In [12]:

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
cos_client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='hS34jXqg4cO0rmMfwxn-w7X_yX1vMuZVoijkItsXXo6L',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.private.us.cloud-object-storage.appdomain.cloud')

bucket = 'trustedail3techlab-donotdelete-pr-ivzkukectdytod'
object_key = 'modeling_records_2022.csv'

body = cos_client.get_object(Bucket=bucket,Key=object_key)['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()


Unnamed: 0,POSITION_CODE,DEPARTMENT_CODE,DAYS_WITH_COMPANY,COMMUTE_TIME,AGE_BEGIN_PERIOD,GENDER_CODE,ATTRITION,PERIOD_TOTAL_DAYS,STARTING_SALARY,ENDING_SALARY,...,VACATION_DAYS_TAKEN,SICK_DAYS_TAKEN,PROMOTIONS,NB_MANAGERS,DAYS_IN_POSITION,DAYS_SINCE_LAST_RAISE,RANKING_CODE,OVERTIME,DBLOVERTIME,TRAVEL
0,1200,200,1825,29,55,0,0,330,159230.77,161538.46,...,28,10.5,0,1,1825,0,3,0.0,0.0,0
1,1200,200,2615,0,49,0,0,180,181153.85,183846.15,...,15,4.0,0,1,2615,60,3,0.0,0.0,0
2,1300,320,1609,30,44,1,0,330,129692.31,132923.08,...,20,15.0,0,1,1609,150,3,0.0,0.0,0
3,1300,320,2035,13,45,0,0,330,146769.23,150461.54,...,28,8.0,0,1,2035,210,3,0.0,0.0,0
4,1400,380,1885,31,44,0,0,330,146769.23,150461.54,...,26,11.5,0,1,1885,60,3,0.0,0.0,0


The next cell splits the training data into the feature columns and the label columns, and then further splits the data further into a training data set and a testing data set. If this cell generates an error, it is likely because you have not imported the data into the `df_data_1` variable as described above. You will need to alter the previous cell to use `df_data_1` and then rerun it.

In [13]:
from sklearn.model_selection import train_test_split

X = df_data_1.drop(['ATTRITION'], axis=1)  # Features
y = df_data_1['ATTRITION']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15) # 85% training and 15% test

Now you will tell Watson Machine Learning to use the current project to store the model.

In [14]:
X.columns.tolist()

['POSITION_CODE',
 'DEPARTMENT_CODE',
 'DAYS_WITH_COMPANY',
 'COMMUTE_TIME',
 'AGE_BEGIN_PERIOD',
 'GENDER_CODE',
 'PERIOD_TOTAL_DAYS',
 'STARTING_SALARY',
 'ENDING_SALARY',
 'NB_INCREASES',
 'BONUS',
 'NB_BONUS',
 'VACATION_DAYS_TAKEN',
 'SICK_DAYS_TAKEN',
 'PROMOTIONS',
 'NB_MANAGERS',
 'DAYS_IN_POSITION',
 'DAYS_SINCE_LAST_RAISE',
 'RANKING_CODE',
 'OVERTIME',
 'DBLOVERTIME',
 'TRAVEL']

The cell below tells the Watson Machine Learning client to save the models in the current project. If you receive an error here, it is likely because you did not correctly set your project ID at the beginning of the notebook.

In [15]:
wml_client.set.default_project(PROJECT_ID)

'SUCCESS'

The following cell provides connection information to the model training data, which will be stored with the model and in FactSheets. You could use the Cloud Object Storage information for this particular project by changing the credentials to match those from above where you inserted the file to code, but for simplicity's sake, you will use a pre-existing file.

In [16]:
training_data_references = [
                {
                    "id": "attrition",
                    "type": "container",
                    "connection": {},
                    "location": {
                        "path": "modeling_records_2022.csv"
                    },

                    #"type": "s3",
                    #"connection": {
                    #    "access_key_id": "yqcPbWZ0AQPHleHVerrR4Wx5e9pymBdMgydbEra5zCif",
                    #    "endpoint_url": "https://s3.us.cloud-object-storage.appdomain.cloud",
                    #    "resource_instance_id": "crn:v1:bluemix:public:cloud-object-storage:global:a/7d8b3c34272c0980d973d3e40be9e9d2:2883ef10-23f1-4592-8582-2f2ef4973639::"
                    #},
                    #"location": {
                    #    "bucket": "faststartlab-donotdelete-pr-nhfd4jnhlxgpc7",
                    #    "path": "modeling_records_2022.csv"
                    #},
                    "schema": {
                        "id": "training_schema",
                        "fields": [
                            {"name": "POSITION_CODE", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "DEPARTMENT_CODE", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "DAYS_WITH_COMPANY", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "COMMUTE_TIME", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "AGE_BEGIN_PERIOD", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "GENDER_CODE", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "PERIOD_TOTAL_DAYS", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "STARTING_SALARY", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "ENDING_SALARY", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "NB_INCREASES", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "BONUS", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "NB_BONUS", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "VACATION_DAYS_TAKEN", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "SICK_DAYS_TAKEN", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "PROMOTIONS", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "NB_MANAGERS", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "DAYS_IN_POSITION", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "DAYS_SINCE_LAST_RAISE", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "RANKING_CODE", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "OVERTIME", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "DBLOVERTIME", "nullable": True, "metadata": {}, "type": "double"},
                            {"name": "TRAVEL", "nullable": True, "metadata": {}, "type": "double"}
                        ]
                    }
                }
            ]

The cell below will authenticate with the IBM FactSheet service using credentials you have already supplied and initialize FactSheet monitoring for this model. Note that Python notebooks in Watson Studio have full support for `pip install`, which allows you to add whatever libraries you need to the notebook environment. For example, if you wanted to use Python to parse command line arguments, you could run `!pip install argparse`.

If you receive an error related to the project access token, it is likely because you either did not insert the project access token as instructed at the beginning of the notebook, or did not run the cell after it was inserted. You will need to return to the beginning of the notebook, ensure the cell is inserted, and execute it.

In [17]:
try:
    from ibm_aigov_facts_client import AIGovFactsClient
except:
    !pip install -U ibm-aigov-facts-client
    from ibm_aigov_facts_client import AIGovFactsClient
        
PROJECT_UID= os.environ['PROJECT_ID']
CPD_URL=os.environ['RUNTIME_ENV_APSX_URL'][len('https://api.'):]
CONTAINER_ID=PROJECT_ID
CONTAINER_TYPE='project'
EXPERIMENT_NAME='predictive_attrition'

PROJECT_ACCESS_TOKEN=project.project_context.accessToken.replace('Bearer ','')

facts_client = AIGovFactsClient(api_key=API_KEY,experiment_name=EXPERIMENT_NAME,container_type=CONTAINER_TYPE,container_id=CONTAINER_ID,set_as_current_experiment=True)

Collecting ibm-aigov-facts-client
  Downloading ibm_aigov_facts_client-1.0.28-py3-none-any.whl (106 kB)
[K     |████████████████████████████████| 106 kB 26.0 MB/s eta 0:00:01
Collecting mlflow-skinny==1.28.0
  Downloading mlflow_skinny-1.28.0-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 76.6 MB/s eta 0:00:01
[?25hCollecting sqlparse>=0.3.1
  Downloading sqlparse-0.4.3-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.2 MB/s  eta 0:00:01
Collecting docutils<0.16,>=0.10
  Downloading docutils-0.15.2-py3-none-any.whl (547 kB)
[K     |████████████████████████████████| 547 kB 73.0 MB/s eta 0:00:01
Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
Collecting databricks-cli<1,>=0.8.7
  Downloading databricks-cli-0.17.3.tar.gz (77 kB)
[K     |████████████████████████████████| 77 kB 5.0 MB/s  eta 0:00:01
Collecting gitpython<4,>=2.1.0
  Downloading GitPython-3.1.29-py3-none-any.whl (182 kB)
[K     |██

The next two cells construct metadata for your model. This will be saved with the model itself and will appear on its FactSheet. If you get errors trying to save the model, they will most likely be from the metadata contained in the model properties, specifically the `TYPE` and `SOFTWARE_SPEC_UID`, which frequently change as Watson Studio adds support for new versions of Python, and removes support for outdated versions. You can get a list of current supported specifications by running `wml_client.software_specifications.list()`.

In [18]:
fields=X_train.columns.tolist()
metadata_dict = {'target_col' : 'ATTRITION', 'fields':fields}

In [19]:
software_spec_uid = wml_client.software_specifications.get_id_by_name("runtime-22.1-py3.9")
print("Software Specification ID: {}".format(software_spec_uid))
model_props = {
    wml_client._models.ConfigurationMetaNames.NAME:"{}".format("attrition challenger - sklearn"),
    wml_client._models.ConfigurationMetaNames.TYPE: "scikit-learn_1.0",
    wml_client._models.ConfigurationMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
    wml_client._models.ConfigurationMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
    wml_client._models.ConfigurationMetaNames.LABEL_FIELD: "ATTRITION",
    wml_client._models.ConfigurationMetaNames.CUSTOM: metadata_dict
}

facts_client.export_facts.prepare_model_meta(wml_client=wml_client,meta_props=model_props)

Software Specification ID: 12b83a17-24d8-5082-900f-0ab31fbfd3cb


{'name': 'attrition challenger - sklearn',
 'type': 'scikit-learn_1.0',
 'software_spec': '12b83a17-24d8-5082-900f-0ab31fbfd3cb',
 'training_data_references': [{'id': 'attrition',
   'type': 'container',
   'connection': {},
   'location': {'path': 'modeling_records_2022.csv'},
   'schema': {'id': 'training_schema',
    'fields': [{'name': 'POSITION_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'DEPARTMENT_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'DAYS_WITH_COMPANY',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'COMMUTE_TIME',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'AGE_BEGIN_PERIOD',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'GENDER_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'PERIOD_TOTAL_DAYS',
      'nullable': T

The next three cells fit the data the the model using a Random Forest classifier, run predictions on the test data, and then print out the accuracy for how the model did on the test data. Finally, the notebook calculates and displays feature importance. For more information on Random Forest classifiers, see [here](https://www.ibm.com/cloud/learn/random-forest).

In [20]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

2022/11/30 05:00:20 INFO : logging results to factsheet for run_id e3287da02c95472aa3aa93f00b6417ae


INFO:ibm_aigov_facts_client.store.autolog.general_payload_store:logging results to factsheet for run_id e3287da02c95472aa3aa93f00b6417ae


2022/11/30 05:00:21 INFO : Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


INFO:ibm_aigov_facts_client.base_classes.auth:Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


In [21]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9722222222222222


In [22]:
feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

COMMUTE_TIME             0.228825
OVERTIME                 0.159387
DBLOVERTIME              0.087004
DAYS_IN_POSITION         0.055499
STARTING_SALARY          0.052594
ENDING_SALARY            0.050040
DAYS_WITH_COMPANY        0.046151
PERIOD_TOTAL_DAYS        0.044675
AGE_BEGIN_PERIOD         0.043954
DAYS_SINCE_LAST_RAISE    0.042997
VACATION_DAYS_TAKEN      0.035166
RANKING_CODE             0.033583
SICK_DAYS_TAKEN          0.030783
BONUS                    0.030751
POSITION_CODE            0.028626
DEPARTMENT_CODE          0.015183
GENDER_CODE              0.005895
NB_BONUS                 0.004466
NB_INCREASES             0.004420
NB_MANAGERS              0.000000
PROMOTIONS               0.000000
TRAVEL                   0.000000
dtype: float64

The next three cells export data from the model you just created to the FactSheet. The first lists experiments tracked by FactSheets. The second writes the URL and other info on this notebook as custom data to the FactSheet. Note that any data can be written to the FactSheet that might be helpful for model validators.

In [23]:
facts_client.runs.list_runs_by_experiment('1')

Unnamed: 0,run_id,experiment_id,published,artifact_uri,start_time,end_time
0,e3287da02c95472aa3aa93f00b6417ae,1,True,file:///home/spark/shared/mlruns/1/e3287da02c9...,2022-11-30 05:00:17.841000+00:00,2022-11-30 05:00:21.577000+00:00


In [24]:
nb_name = "attrition model creation and deployment"
nb_asset_id = "tbd"
nb_asset_url = "https://" + CPD_URL + "/analytics/notebooks/v2/" + nb_asset_id + "?projectid=" + PROJECT_UID + "&context=cpdaas"

latestRunId = facts_client.runs.list_runs_by_experiment('1').sort_values('start_time').iloc[-1]['run_id']
facts_client.runs.set_tags(latestRunId, {"Notebook name": nb_name, "Notebook id": nb_asset_id, "Notebook URL" : nb_asset_url})
facts_client.export_facts.export_payload(latestRunId)

2022/11/30 05:00:22 INFO : Initiating logging to factsheet for run_id......e3287da02c95472aa3aa93f00b6417ae


INFO:ibm_aigov_facts_client.export.export_facts:Initiating logging to factsheet for run_id......e3287da02c95472aa3aa93f00b6417ae


2022/11/30 05:00:23 INFO : Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


INFO:ibm_aigov_facts_client.base_classes.auth:Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


In [25]:
RUN_ID=facts_client.runs.get_current_run_id()
facts_client.export_facts.export_payload(RUN_ID)

2022/11/30 05:00:24 INFO : Initiating logging to factsheet for run_id......e3287da02c95472aa3aa93f00b6417ae


INFO:ibm_aigov_facts_client.export.export_facts:Initiating logging to factsheet for run_id......e3287da02c95472aa3aa93f00b6417ae


2022/11/30 05:00:25 INFO : Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


INFO:ibm_aigov_facts_client.base_classes.auth:Successfully logged results to Factsheet service for run_id e3287da02c95472aa3aa93f00b6417ae under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


Finally, the model is stored to the project with all of the metadata defined above.

In [26]:
print("Storing model...")
published_model_details = wml_client.repository.store_model(
    model=clf, 
    meta_props=model_props,
    training_target=['ATTRITION'],
    training_data=X)
model_uid = wml_client.repository.get_model_id(published_model_details)

print("Done")
print("Model ID: {}".format(model_uid))

Storing model...
Done
Model ID: c1ffac5e-8a2e-43db-9305-32549caec122


INFO:ibm_aigov_facts_client.store.autolog.spark_payload_store:logging results to factsheet for run_id 37d294878159474aa0296df76913ab16
INFO:ibm_aigov_facts_client.base_classes.auth:Successfully logged results to Factsheet service for run_id 37d294878159474aa0296df76913ab16 under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f


Next, the notebook uses Apache Spark to create a second model. Because you specified a Spark environment when you created this notebook, the `pyspark` runtime will be available without needing to be installed via `pip`.

In [27]:
try:
    from pyspark.sql import SparkSession
except:
    print('Error: Spark runtime is missing. If you are using Watson Studio change the notebook runtime to Spark by clicking \
    the Vew notebook info button above (the lowercase i in a circle). Click on the Environment tab and use the Environment \
    definition dropdown to select an environment with Spark and Python.')
    raise
spark.version

'3.3.1'

# <span style="color:red">3. !!--STOP--!! Insert data to code below</span>

Place your cursor in the empty code cell below. Then click the **Find and add data** icon in the upper right corner of the screen like you did in step 2. Locate the *modeling_records_2022.csv* file, click its associated **Insert to code** dropdown, and select **SparkSession DataFrame**.

## <span style="color:red">IMPORTANT: replace all instances of `df_data_x` with `df_data_2` in the code</span>

The automated dataframe will likely use the `df_data_3` variable to hold the data. Update the last two lines of code to import data into the `df_data_2` variable for the rest of the notebook to work correctly. The last lines of your cell should look like this:

<img src="https://cp4d-outcomes.techzone.ibm.com/img/data-fabric-lab/trusted-ai/dataframe_insert_2.png" width=700 align=left>

Run the inserted code cell below. If you have correctly imported the data, you will see a table populated with employee data. The remainder of the notebook is very similar to the training of the sklearn model. It will enable FactSheets for the second model, train a Spark Gradient Boost Classifier, and then save that model to the project. You may run the rest of the notebook to its conclusion.

In [28]:

import ibmos2spark, os
# @hidden_cell
metadata = {
    'endpoint': 'https://s3.private.us.cloud-object-storage.appdomain.cloud',
    'service_id': 'iam-ServiceId-5e082bf0-eb96-485e-a8d2-7b2fdee746f5',
    'iam_service_endpoint': 'https://iam.cloud.ibm.com/oidc/token',
    'api_key': 'hS34jXqg4cO0rmMfwxn-w7X_yX1vMuZVoijkItsXXo6L'
}

configuration_name = 'os_f9043eb056184e3eb6fc28c38e5f9558_configs'
cos = ibmos2spark.CloudObjectStorage(sc, metadata, configuration_name, 'bluemix_cos')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_data_2 = spark.read\
  .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
  .option('header', 'true')\
  .load(cos.url('modeling_records_2022.csv', 'trustedail3techlab-donotdelete-pr-ivzkukectdytod'))
df_data_2.take(5)


[Row(POSITION_CODE='1200', DEPARTMENT_CODE='200', DAYS_WITH_COMPANY='1825', COMMUTE_TIME='29', AGE_BEGIN_PERIOD='55', GENDER_CODE='0', ATTRITION='0', PERIOD_TOTAL_DAYS='330', STARTING_SALARY='159230.77', ENDING_SALARY='161538.46', NB_INCREASES='1', BONUS='0', NB_BONUS='0', VACATION_DAYS_TAKEN='28', SICK_DAYS_TAKEN='10.5', PROMOTIONS='0', NB_MANAGERS='1', DAYS_IN_POSITION='1825', DAYS_SINCE_LAST_RAISE='0', RANKING_CODE='3', OVERTIME='0', DBLOVERTIME='0', TRAVEL='0'),
 Row(POSITION_CODE='1200', DEPARTMENT_CODE='200', DAYS_WITH_COMPANY='2615', COMMUTE_TIME='0', AGE_BEGIN_PERIOD='49', GENDER_CODE='0', ATTRITION='0', PERIOD_TOTAL_DAYS='180', STARTING_SALARY='181153.85', ENDING_SALARY='183846.15', NB_INCREASES='1', BONUS='0', NB_BONUS='0', VACATION_DAYS_TAKEN='15', SICK_DAYS_TAKEN='4', PROMOTIONS='0', NB_MANAGERS='1', DAYS_IN_POSITION='2615', DAYS_SINCE_LAST_RAISE='60', RANKING_CODE='3', OVERTIME='0', DBLOVERTIME='0', TRAVEL='0'),
 Row(POSITION_CODE='1300', DEPARTMENT_CODE='320', DAYS_WITH_C

Similar to the `sklearn` model, you need to specify metadata for the spark model.

In [29]:
software_spec_uid = wml_client.software_specifications.get_id_by_name("spark-mllib_3.2")
print("Software Specification ID: {}".format(software_spec_uid))
model_props = {
    wml_client._models.ConfigurationMetaNames.NAME:"{}".format("attrition challenger - spark"),
    wml_client._models.ConfigurationMetaNames.TYPE: "mllib_3.2",
    wml_client._models.ConfigurationMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
    wml_client._models.ConfigurationMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
    wml_client._models.ConfigurationMetaNames.LABEL_FIELD: "ATTRITION"
}

facts_client.export_facts.prepare_model_meta(wml_client=wml_client,meta_props=model_props)

Software Specification ID: 20047f72-0a98-58c7-9ff5-a77b012eb8f5


{'name': 'attrition challenger - spark',
 'type': 'mllib_3.2',
 'software_spec': '20047f72-0a98-58c7-9ff5-a77b012eb8f5',
 'training_data_references': [{'id': 'attrition',
   'type': 'container',
   'connection': {},
   'location': {'path': 'modeling_records_2022.csv'},
   'schema': {'id': 'training_schema',
    'fields': [{'name': 'POSITION_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'DEPARTMENT_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'DAYS_WITH_COMPANY',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'COMMUTE_TIME',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'AGE_BEGIN_PERIOD',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'GENDER_CODE',
      'nullable': True,
      'metadata': {},
      'type': 'double'},
     {'name': 'PERIOD_TOTAL_DAYS',
      'nullable': True,
    

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, Model

For the second model, you will create a Gradient Boosted Tree classifier. For more information on Gradient Boosting, see [here](https://www.ibm.com/cloud/learn/boosting).

In [31]:
from pyspark.sql.types import FloatType
for field in fields:
    df_data_2=df_data_2.withColumn(field,df_data_2[field].cast("float").alias(field))
df_data_2=df_data_2.withColumn('ATTRITION',df_data_2['ATTRITION'].cast("int").alias('ATTRITTION'))
df_data_2.take(5)

[Row(POSITION_CODE=1200.0, DEPARTMENT_CODE=200.0, DAYS_WITH_COMPANY=1825.0, COMMUTE_TIME=29.0, AGE_BEGIN_PERIOD=55.0, GENDER_CODE=0.0, ATTRITION=0, PERIOD_TOTAL_DAYS=330.0, STARTING_SALARY=159230.765625, ENDING_SALARY=161538.453125, NB_INCREASES=1.0, BONUS=0.0, NB_BONUS=0.0, VACATION_DAYS_TAKEN=28.0, SICK_DAYS_TAKEN=10.5, PROMOTIONS=0.0, NB_MANAGERS=1.0, DAYS_IN_POSITION=1825.0, DAYS_SINCE_LAST_RAISE=0.0, RANKING_CODE=3.0, OVERTIME=0.0, DBLOVERTIME=0.0, TRAVEL=0.0),
 Row(POSITION_CODE=1200.0, DEPARTMENT_CODE=200.0, DAYS_WITH_COMPANY=2615.0, COMMUTE_TIME=0.0, AGE_BEGIN_PERIOD=49.0, GENDER_CODE=0.0, ATTRITION=0, PERIOD_TOTAL_DAYS=180.0, STARTING_SALARY=181153.84375, ENDING_SALARY=183846.15625, NB_INCREASES=1.0, BONUS=0.0, NB_BONUS=0.0, VACATION_DAYS_TAKEN=15.0, SICK_DAYS_TAKEN=4.0, PROMOTIONS=0.0, NB_MANAGERS=1.0, DAYS_IN_POSITION=2615.0, DAYS_SINCE_LAST_RAISE=60.0, RANKING_CODE=3.0, OVERTIME=0.0, DBLOVERTIME=0.0, TRAVEL=0.0),
 Row(POSITION_CODE=1300.0, DEPARTMENT_CODE=320.0, DAYS_WITH_C

In [32]:
va = VectorAssembler(inputCols = fields, outputCol='features')
va_df = va.transform(df_data_2)
va_df = va_df.select(['features', 'ATTRITION'])
va_df.show(3)

+--------------------+---------+
|            features|ATTRITION|
+--------------------+---------+
|[1200.0,200.0,182...|        0|
|[1200.0,200.0,261...|        0|
|[1300.0,320.0,160...|        0|
+--------------------+---------+
only showing top 3 rows



In [33]:
gbtc = GBTClassifier(labelCol="ATTRITION", maxIter=20)

pipeline = Pipeline(stages=[va, gbtc])

In [34]:
split_data = df_data_2.randomSplit([0.8, 0.2], 24)
train_data = split_data[0]
test_data = split_data[1]

print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))

Number of training records: 203
Number of testing records : 34


In [35]:
spark_model = pipeline.fit(train_data)

pred = spark_model.transform(test_data)
pred.show(3) 

2022/11/30 05:01:26 INFO : logging results to factsheet for run_id 37d294878159474aa0296df76913ab16
2022/11/30 05:01:27 INFO : Successfully logged results to Factsheet service for run_id 37d294878159474aa0296df76913ab16 under asset_id: d2be6bc2-85fb-45a6-a23c-cd11828ac9ea and project_id : a2a29465-a371-4b43-b30a-1ecd79492c0f
+-------------+---------------+-----------------+------------+----------------+-----------+---------+-----------------+---------------+-------------+------------+--------+--------+-------------------+---------------+----------+-----------+----------------+---------------------+------------+--------+-----------+------+--------------------+--------------------+--------------------+----------+
|POSITION_CODE|DEPARTMENT_CODE|DAYS_WITH_COMPANY|COMMUTE_TIME|AGE_BEGIN_PERIOD|GENDER_CODE|ATTRITION|PERIOD_TOTAL_DAYS|STARTING_SALARY|ENDING_SALARY|NB_INCREASES|   BONUS|NB_BONUS|VACATION_DAYS_TAKEN|SICK_DAYS_TAKEN|PROMOTIONS|NB_MANAGERS|DAYS_IN_POSITION|DAYS_SINCE_LAST_RAISE|R

In [36]:
evaluator = BinaryClassificationEvaluator()
evaluator.setLabelCol("ATTRITION")
print("Test Area Under ROC: " + str(evaluator.evaluate(pred, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.8524305555555556


In [37]:
print("Storing spark model...")
published_model_details = wml_client.repository.store_model(
    model=spark_model, 
    meta_props=model_props,
    training_target=['ATTRITION'],
    training_data=train_data,
    pipeline=pipeline
)
model_uid = wml_client.repository.get_model_id(published_model_details)

print("Done")
print("Model ID: {}".format(model_uid))

Storing spark model...
Done
Model ID: 19e8ace4-5cff-4c9c-bacd-80ddff9f2d76


# Congratulations!

You have completed this notebook. You can now return to the [Data and AI Live Demos lab page](https://cp4d-outcomes.techzone.ibm.com/data-fabric-lab/trusted-ai) and continue with the lab.