#Import & initialize IBM AI Governance Python library & client 

In [None]:
!pip install --extra-index-url https://test.pypi.org/simple/ ibm-aigov-facts-client

In [22]:
import os
PROJECT_UID= os.environ['PROJECT_ID']
os.environ['FACTS_CLIENT_ENV'] = "dev";
CONTAINER_ID=PROJECT_UID
CONTAINER_TYPE="project"
EXPERIMENT_NAME="MyExperiment"
API_KEY = 'Your credentials go here'
DB_PW = """Your credentials go here"""
DB_USER = 'Your credentials go here
print(PROJECT_UID)

bbafca27-1fc6-44aa-9c72-73c2044db2af


In [14]:
from ibm_aigov_facts_client import AIGovFactsClient
facts_client = AIGovFactsClient(api_key=API_KEY,experiment_name=EXPERIMENT_NAME,container_type=CONTAINER_TYPE,container_id=CONTAINER_ID),set_as_current_experiment=True

2021/11/16 13:58:27 INFO : Experiment successfully created with ID 3 and name experiment5
2021/11/16 13:58:27 INFO : Autolog enabled Successfully


#Import  & initialize IBM WML library
Non-IBM runtimes are supported by AI Governance as well

In [None]:
!pip install ibm-watson-machine-learning

In [6]:
wml_credentials = {
    "apikey": API_KEY,
    "url": 'https://us-south.ml.cloud.ibm.com'
}

In [7]:
from ibm_watson_machine_learning import APIClient
wml_client = APIClient(wml_credentials)
wml_client.set.default_project(PROJECT_UID)

'SUCCESS'

In [15]:
data_asset_id = "31ab850e-d6da-4852-9ab7-e4129b094643" #Credit Risk Training data table
data_asset_path = "https://dataplatform.dev.cloud.ibm.com/projects/" + PROJECT_UID + "/data-assets/"+ data_asset_id + "/preview?context=cpdaas"
print(data_asset_path)

https://dataplatform.dev.cloud.ibm.com/projects/bbafca27-1fc6-44aa-9c72-73c2044db2af/data-assets/31ab850e-d6da-4852-9ab7-e4129b094643/preview?context=cpdaas


#Main section - regular ML development
The example below uses Spark-based learning of credit risk - but really anything can be done here.
No IBM code needs to be added to this main section - but AI Gov will still capture facts from there.

In [16]:
try:
    from pyspark.sql import SparkSession
except:
    print('Error: Spark runtime is missing. If you are using Watson Studio change the notebook runtime to Spark.')
    raise

In [17]:
# @hidden_cell
import os
from pyspark.sql import SparkSession
sparkSession = (SparkSession.builder
    .config("spark.jars.packages", facts_client.ORG_FACTS_SPARK)
    .getOrCreate())

DB2_NWK38558_url = 'jdbc:db2://{}:{}/{}:sslConnection=true;'.format(
    'b70af05b-76e4-4bca-a1f5-23dbb4c6a74e.c1ogj3sd0tgtu0lqde00.databases.appdomain.cloud',
    32716,
    'bludb'
)
data_df = sparkSession.read.format('jdbc') \
    .option('url', DB2_NWK38558_url) \
    .option('dbtable', '"NWK38558"."CREDIT_RISK_TRAINING"') \
    .option('user', DB_USER) \
    .option('password', DB_PW).load()
data_df.show(5)

+--------------+------------+--------------------+-----------+----------+---------------+------------------+------------------+------+------------+------------------------+-----------------+---+----------------+-------+--------------------+-------+----------+---------+-------------+-------+
|CheckingStatus|LoanDuration|       CreditHistory|LoanPurpose|LoanAmount|ExistingSavings|EmploymentDuration|InstallmentPercent|   Sex|OthersOnLoan|CurrentResidenceDuration|     OwnsProperty|Age|InstallmentPlans|Housing|ExistingCreditsCount|    Job|Dependents|Telephone|ForeignWorker|   Risk|
+--------------+------------+--------------------+-----------+----------+---------------+------------------+------------------+------+------------+------------------------+-----------------+---+----------------+-------+--------------------+-------+----------+---------+-------------+-------+
|      0_to_200|          31|credits_paid_to_date|      other|      1889|     100_to_500|            less_1|                

In [18]:
spark_df = data_df
(train_data, test_data) = spark_df.randomSplit([0.8, 0.2], 24)

MODEL_NAME = "Loan automation (Spark)"
DEPLOYMENT_NAME = MODEL_NAME + " - Deployment"

print("Number of records for training: " + str(train_data.count()))
print("Number of records for evaluation: " + str(test_data.count()))

Number of records for training: 4005
Number of records for evaluation: 995


In [19]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.feature import SQLTransformer

features = [x for x in spark_df.columns if x != 'Risk']
categorical_features = ['CheckingStatus', 'CreditHistory', 'LoanPurpose', 'ExistingSavings', 'EmploymentDuration', 'Sex', 'OthersOnLoan', 'OwnsProperty', 'InstallmentPlans', 'Housing', 'Job', 'Telephone', 'ForeignWorker']
categorical_num_features = [x + '_IX' for x in categorical_features]
si_list = [StringIndexer(inputCol=x, outputCol=y) for x, y in zip(categorical_features, categorical_num_features)]
va_features = VectorAssembler(inputCols=categorical_num_features + [x for x in features if x not in categorical_features], outputCol="features")

In [20]:
si_label = StringIndexer(inputCol="Risk", outputCol="label").fit(spark_df)
label_converter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=si_label.labels)

In [21]:
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(featuresCol="features")
feature_filter = SQLTransformer(statement="SELECT * FROM __THIS__")
pipeline = Pipeline(stages= si_list + [si_label, va_features, classifier, label_converter, feature_filter])
model = pipeline.fit(train_data)

In [12]:
predictions = model.transform(test_data)
evaluatorDT = BinaryClassificationEvaluator(rawPredictionCol="prediction")
area_under_curve = evaluatorDT.evaluate(predictions)

print("areaUnderROC = %g" % area_under_curve)

areaUnderROC = 0.730641


# IBM WML code (defining model metadata)

In [18]:
software_spec_uid = wml_client.software_specifications.get_id_by_name("spark-mllib_2.4")
print("Software Specification ID: {}".format(software_spec_uid))
model_props = {
        wml_client._models.ConfigurationMetaNames.NAME:"{}".format(MODEL_NAME),
        wml_client._models.ConfigurationMetaNames.TYPE: "mllib_2.4",
        wml_client._models.ConfigurationMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
        wml_client._models.ConfigurationMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
        wml_client._models.ConfigurationMetaNames.LABEL_FIELD: "Risk",
        wml_client._models.ConfigurationMetaNames.CUSTOM: { "experiment_name": EXPERIMENT_NAME}
}

Software Specification ID: 390d21f8-e58b-4fac-9c55-d7ceda621326


#IBM AI Gov code to persist the auto-collected facts from main section

In [19]:
facts_client.export_facts.prepare_model_meta(wml_client=wml_client,meta_props=model_props)

# IBM WML code:  persisting model as an asset

In [20]:
print("Storing model ...")
published_model_details = wml_client.repository.store_model(
    model=model, 
    meta_props=model_props, 
    training_data=train_data, 
    pipeline=pipeline)

model_uid = wml_client.repository.get_model_uid(published_model_details)
print("Done")
print("Model ID: {}".format(model_uid))

Storing model ...
Done
Model ID: f50e3538-25c5-4672-9d04-0ce8fda133ed


Copyright © 2020, 2021 IBM. This notebook and its source code are released under the terms of the MIT License.