# Introduction - Customer Churn Prediction notebook
In this notebook, we illustrate how you can train a model for Churn Prediction using scikit learn. After training the model, you step through the instructions to deploy the model using Watson Machine Learning.

This notebook is a variation of the original notebook reference in this github repo: https://github.com/elenalowery/cpd4_demo/blob/master/assets/jupyterlab/Predict_Customer_Churn_CPD4.ipynb


In [38]:
# Install required Python modules
!pip install sklearn-pandas




## Step 1: Review Use Case
The analytics use case implemented in this notebook is telco churn prediction. It is a simple use case which illustrates typical process for model development and deployment using Cloud Pak for Data.

In [2]:
import itc_utils.flight_service as itcfs
readClient = itcfs.get_flight_client()

nb_data_request = {
    'data_name': """CUSTOMER_TRANSACTION_DATA_shaped""",
    'interaction_properties': {
    }
}

flightInfo = itcfs.get_flight_info(readClient, nb_data_request=nb_data_request)

data_df_4 = itcfs.read_pandas_and_concat(readClient, flightInfo)
data_df_4.head(10)



Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,...,CREDITCARD,DOB,ADDRESS_1,CITY,STATE,ZIP,ZIP4,LONGITUDE,LATITUDE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,...,1814138742463990,11/11/47,159 HUTTON ST BSMT A,ABSECON,NJ,8201,0,,,T
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,...,6494421570624020,8/7/63,1724 WHITEHAVEN,GLYNDON,MN,56547,0,,,F
2,1005,24,0,5,0,CH,Budget,Standard,29,4,...,3218719749799130,7/18/43,95 W 25TH ST APT 1,WAPPINGERS FALLS,NY,12590,1723,,,F
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,...,3016220340639660,10/24/42,66 KULLA DR,RICHLAND,NE,68601,0,-97.377539,41.441233,T
4,1008,0,0,4,2,CC,Budget,Standard,4,2,...,7070216281705930,7/17/32,5621 MCCARTY RD,EVERETT,WA,98205,0,,,F
5,1009,29,0,9,0,CC,Budget,Intnl_discount,38,2,...,4919386224173850,6/21/99,2000 CALLE 4,CAROLINA,PR,979,0,,,F
6,1010,13,0,40,0,CC,Budget,Standard,53,4,...,9402647500670250,1/9/24,3801 YOSEMITE BLVD STE F,HOUSTON,TX,77024,7776,,,F
7,1016,16,0,114,0,CH,Budget,Standard,130,1,...,8522562667441740,3/12/72,843 EUCLID ST APT 101S,KIRKLAND,WA,98034,0,-122.209175,47.709619,T
8,1017,7,0,6,0,CC,Budget,Standard,13,3,...,2981966329479580,11/9/77,3801 MAC CV,NEW YORK,NY,10019,0,-73.990852,40.768196,F
9,1018,21,0,87,0,CC,Budget,Standard,108,1,...,3074091066861440,1/10/06,390 W BROADWAY ST,BUTLER,NJ,7405,0,,,F


In [3]:
# COPY the dataFrame into a new dataFrame called *data*
# Note that when you insert the data, it may get written to a different dataFrame name; in this example, it is assumed the dataFrame is data_df_1
data=data_df_4.copy()

In [7]:
# List all the columns
print(data.columns)

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE', 'CREDITCARD',
       'DOB', 'ADDRESS_1', 'CITY', 'STATE', 'ZIP', 'ZIP4', 'LONGITUDE',
       'LATITUDE', 'CHURN'],
      dtype='object')


In [8]:
# Keep only the columns that are relevant for churn prediction
data = data[['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER','STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE', 'CHURN']]
data.head()


Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,T
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,F
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,F
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,T
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,F


## Step 2: Build the Random Forest model

In [9]:
import pandas as pd
import sklearn
pd.options.display.max_columns = 999

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import chi2_contingency,ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score

import numpy as np

import urllib3, requests, json

In [10]:
#convert CHURN to 1/0
le = LabelEncoder()
data.loc[:,'CHURN']= le.fit_transform(data.loc[:,'CHURN'])
data.head()

Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,1
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,0
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,0
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,1
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,0


In [11]:
# define the label and features
y = np.float32(data.CHURN)
x = data.drop(['CHURN'], axis = 1)

In [12]:
x.columns

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE'],
      dtype='object')

In [13]:
# Apply the LabelEncoder to encode the input features in numeric form where applicable
from sklearn_pandas import DataFrameMapper

mapper = DataFrameMapper(
    [('GENDER', LabelEncoder()),
     ('STATUS', LabelEncoder()),
     ('CHILDREN', None),
     ('ESTINCOME',None),
     ('CAROWNER', LabelEncoder()),
     ('AGE',None),
     ('LONGDISTANCE',None),
     ('INTERNATIONAL',None),
     ('LOCAL',None),
     ('DROPPED',None),
     ('PAYMETHOD',LabelEncoder()),
     ('LOCALBILLTYPE',LabelEncoder()),
     ('LONGDISTANCEBILLTYPE',LabelEncoder()),
     ('USAGE',None),
     ('RATEPLAN',None)
    ]
)

In [14]:
# split the data to training and testing set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
# fit the model

import sklearn.pipeline
from sklearn.preprocessing import OneHotEncoder

random_forest = RandomForestClassifier()
steps = [('mapper', mapper),('RandonForestClassifier', random_forest)]
pipeline = sklearn.pipeline.Pipeline(steps)
model=pipeline.fit( X_train, y_train )
model

Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[('GENDER', LabelEncoder()),
                                           ('STATUS', LabelEncoder()),
                                           ('CHILDREN', None),
                                           ('ESTINCOME', None),
                                           ('CAROWNER', LabelEncoder()),
                                           ('AGE', None),
                                           ('LONGDISTANCE', None),
                                           ('INTERNATIONAL', None),
                                           ('LOCAL', None), ('DROPPED', None),
                                           ('PAYMETHOD', LabelEncoder()),
                                           ('LOCALBILLTYPE', LabelEncoder()),
                                           ('LONGDISTANCEBILLTYPE',
                                            LabelEncoder()),
                               

In [16]:
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )

### test your predictions using sklearn.classification_report()

report = sklearn.metrics.classification_report( y_test, y_prediction )
### and print the report
print(report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       168
         1.0       0.98      0.99      0.99       115

    accuracy                           0.99       283
   macro avg       0.99      0.99      0.99       283
weighted avg       0.99      0.99      0.99       283



## Step 3 - WML Deployment
In the next set of cells, we deploy the trained model using Watson Machine Learning. To do so:
- Leverage the WML Python client to create a deployment space. 
- Store the trained model to both, the current project and the deployment space. 
- Deploy the model to WML as an online deployment so it is accessible via REST APIs

In [4]:
import os
cpdtoken=os.environ['USER_ACCESS_TOKEN']
wml_credentials = {
"token": cpdtoken,
"instance_id" : "openshift",
"url": os.environ['RUNTIME_ENV_APSX_URL'],
"version": "4.0"
}

from ibm_watson_machine_learning import APIClient
client = APIClient(wml_credentials)



In [27]:
!pip list | grep watson

ibm-watson-machine-learning 1.0.173
ibm-watson-openscale        3.0.12
ibm-watson-studio-lib       3.0.6


In [19]:
# For details on working with spaces, check the following notebook
# https://github.com/IBM/watson-machine-learning-samples/blob/master/cpd4.0/notebooks/python_sdk/instance-management/Space%20management.ipynb
# Create deployment space
space_metadata = {
    'name': 'churn_qa_space',
    'description': 'Quality Assurance deployment space for churn models',
}
space_details = client.spaces.store(space_metadata)
print(space_details)

Space has been created. However some background setup activities might still be on-going. Check for 'status' field in the response. It has to show 'active' before space can be used. If its not 'active', you can monitor the state with a call to spaces.get_details(space_id)
{'entity': {'compute': [{'crn': 'crn:v1:cpd:private:pm-20:private:a/cpduser:99999999-9999-9999-9999-999999999999::', 'guid': '99999999-9999-9999-9999-999999999999', 'name': 'Watson Machine Learning', 'type': 'machine_learning'}], 'description': 'Quality Assurance deployment space for churn models', 'members': [{'id': '1000331002', 'role': 'admin', 'state': 'active', 'type': 'user'}], 'name': 'churn_qa_space', 'scope': {'bss_account_id': 'cpdaccount'}, 'status': {'state': 'preparing'}}, 'metadata': {'created_at': '2022-01-02T16:01:29.008Z', 'creator_id': '1000331002', 'id': '275cec19-291d-4f21-b1c7-db78012ac24d', 'url': '/v2/spaces/275cec19-291d-4f21-b1c7-db78012ac24d'}}


In [20]:
# Confirm deployment space is created
client.spaces.list(limit=10)

------------------------------------  --------------  ------------------------
ID                                    NAME            CREATED
275cec19-291d-4f21-b1c7-db78012ac24d  churn_qa_space  2022-01-02T16:01:29.008Z
------------------------------------  --------------  ------------------------


In [21]:
# Set default space ID so we can deploy models.
space_id = client.spaces.get_id(space_details)
client.set.default_space(space_id)

Unsetting the project_id ...


'SUCCESS'

In [39]:
client.spaces.list()

Note: 'limit' is not provided. Only first 50 records will be displayed if the number of records exceed 50
------------------------------------  --------------  ------------------------
ID                                    NAME            CREATED
275cec19-291d-4f21-b1c7-db78012ac24d  churn_qa_space  2022-01-02T16:01:29.008Z
------------------------------------  --------------  ------------------------


In [44]:
client.spaces.list()

Note: 'limit' is not provided. Only first 50 records will be displayed if the number of records exceed 50
------------------------------------  --------------  ------------------------
ID                                    NAME            CREATED
275cec19-291d-4f21-b1c7-db78012ac24d  churn_qa_space  2022-01-02T16:01:29.008Z
------------------------------------  --------------  ------------------------


In [5]:
space_id='275cec19-291d-4f21-b1c7-db78012ac24d'
client.set.default_space(space_id)

'SUCCESS'

In [6]:
client.software_specifications.list()

--------------------------  ------------------------------------  ----
NAME                        ASSET_ID                              TYPE
default_py3.6               0062b8c9-8b7d-44a0-a9b9-46c416adcbd9  base
pytorch-onnx_1.3-py3.7-edt  069ea134-3346-5748-b513-49120e15d288  base
scikit-learn_0.20-py3.6     09c5a1d0-9c1e-4473-a344-eb7b665ff687  base
spark-mllib_3.0-scala_2.12  09f4cff0-90a7-5899-b9ed-1ef348aebdee  base
ai-function_0.1-py3.6       0cdb0f1e-5376-4f4d-92dd-da3b69aa9bda  base
shiny-r3.6                  0e6e79df-875e-4f24-8ae9-62dcc2148306  base
pytorch_1.1-py3.6           10ac12d6-6b30-4ccd-8392-3e922c096a92  base
scikit-learn_0.22-py3.6     154010fa-5b3b-4ac1-82af-4d5ee5abbc85  base
pytorch-onnx_1.7-py3.8-edt  1b199910-c7d5-5af4-b8f1-e86b760f9779  base
default_r3.6                1b70aec3-ab34-4b87-8aa0-a4a3c8296a36  base
do_py3.8                    295addb5-9ef9-547e-9bf4-92ae3563e720  base
autoai-ts_3.8-py3.8         2aa0c932-798f-5ae9-abd6-15e0c2402fb5  base
tensor

In [17]:
client.software_specifications.list()

--------------------------  ------------------------------------  ----
NAME                        ASSET_ID                              TYPE
default_py3.6               0062b8c9-8b7d-44a0-a9b9-46c416adcbd9  base
pytorch-onnx_1.3-py3.7-edt  069ea134-3346-5748-b513-49120e15d288  base
scikit-learn_0.20-py3.6     09c5a1d0-9c1e-4473-a344-eb7b665ff687  base
spark-mllib_3.0-scala_2.12  09f4cff0-90a7-5899-b9ed-1ef348aebdee  base
ai-function_0.1-py3.6       0cdb0f1e-5376-4f4d-92dd-da3b69aa9bda  base
shiny-r3.6                  0e6e79df-875e-4f24-8ae9-62dcc2148306  base
pytorch_1.1-py3.6           10ac12d6-6b30-4ccd-8392-3e922c096a92  base
scikit-learn_0.22-py3.6     154010fa-5b3b-4ac1-82af-4d5ee5abbc85  base
pytorch-onnx_1.7-py3.8-edt  1b199910-c7d5-5af4-b8f1-e86b760f9779  base
default_r3.6                1b70aec3-ab34-4b87-8aa0-a4a3c8296a36  base
do_py3.8                    295addb5-9ef9-547e-9bf4-92ae3563e720  base
autoai-ts_3.8-py3.8         2aa0c932-798f-5ae9-abd6-15e0c2402fb5  base
tensor

In [30]:
client.repository.ModelMetaNames.get()

In [None]:
client.software_specifications.ConfigurationMetaNames.show()

In [35]:
client.software_specifications.list()

Failure during list sw_specs. (GET https://internal-nginx-svc:12443/v2/software_specifications?version=2021-06-24&space_id=275cec19-291d-4f21-b1c7-db78012ac24d&userfs=true)
Status code: 500, body: {"trace":"8b3f1176-bead-4f4c-ad64-dec8a4e3129b","errors":[{"code":"invalid_response","message":"Invalid error response for search software specification asset. Details: {\"code\":400,\"error\":\"Bad Request\",\"reason\":\"Required request parameters missing\",\"message\":\"The server cannot or will not process the request due to an apparent client error (e.g. malformed request syntax).\"}"}]}


ApiRequestFailure: Failure during list sw_specs. (GET https://internal-nginx-svc:12443/v2/software_specifications?version=2021-06-24&space_id=275cec19-291d-4f21-b1c7-db78012ac24d&userfs=true)
Status code: 500, body: {"trace":"8b3f1176-bead-4f4c-ad64-dec8a4e3129b","errors":[{"code":"invalid_response","message":"Invalid error response for search software specification asset. Details: {\"code\":400,\"error\":\"Bad Request\",\"reason\":\"Required request parameters missing\",\"message\":\"The server cannot or will not process the request due to an apparent client error (e.g. malformed request syntax).\"}"}]}

In [19]:
model_name = 'customer_churn_model'
#software_spec_uid = client.software_specifications.get_uid_by_name('default_py3.8_opence')
software_spec_uid = client.software_specifications.get_id_by_name("default_py3.8")   

metadata = {
    client.repository.ModelMetaNames.NAME: model_name,
    client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
    client.repository.ModelMetaNames.TYPE: "scikit-learn_0.23"
}

stored_model_details = client.repository.store_model(pipeline,
                                               meta_props=metadata,
                                               training_data=X_train,
                                               training_target=y_train)

In [20]:
# Confirm the model is stored in WML repository
client.repository.list_models()

------------------------------------  --------------------  ------------------------  -----------------
ID                                    NAME                  CREATED                   TYPE
23b8f6f5-c936-4eaa-a392-961e483e7ca2  customer_churn_model  2022-01-02T18:36:16.002Z  scikit-learn_0.23
------------------------------------  --------------------  ------------------------  -----------------


In [21]:
# Deploy the model
deploy_metadata = {
    client.deployments.ConfigurationMetaNames.NAME: "Churn Prediction Model Deployment",
    client.deployments.ConfigurationMetaNames.ONLINE: {}
}

published_model_uid = client.repository.get_model_uid(stored_model_details)
created_deployment = client.deployments.create(published_model_uid, meta_props=deploy_metadata)




#######################################################################################

Synchronous deployment creation for uid: '23b8f6f5-c936-4eaa-a392-961e483e7ca2' started

#######################################################################################


initializing
Note: online_url is deprecated and will be removed in a future release. Use serving_urls instead.
..............................
ready


------------------------------------------------------------------------------------------------
Successfully finished deployment creation, deployment_uid='21ac8162-8327-484a-8dae-8a25aa1147dd'
------------------------------------------------------------------------------------------------




In [22]:
deployment_uid = client.deployments.get_uid(created_deployment)
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
print(scoring_endpoint)

https://internal-nginx-svc:12443/ml/v4/deployments/21ac8162-8327-484a-8dae-8a25aa1147dd/predictions


In [23]:
# Score the model on a test dataset
scoring_payload = {
    "input_data": [{
        'fields': ['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER','STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE'],
        'values': [[1,28,0,60,0,"Auto","FreeLocal","Standard",89,4,"F","M",1,23000,"N",45]]}]
}





In [24]:
predictions = client.deployments.score(deployment_uid, scoring_payload)
print(json.dumps(predictions, indent=2))

{
  "predictions": [
    {
      "fields": [
        "prediction",
        "probability"
      ],
      "values": [
        [
          0.0,
          [
            0.84,
            0.16
          ]
        ]
      ]
    }
  ]
}
