# Introduction - Customer Churn Prediction notebook
This notebook shows how to train a customer churn prediction model with scikit-learn. After training the model, you step through the instructions to deploy it using Watson Machine Learning.

This notebook is a variation of the original notebook referenced in this GitHub repo: https://github.com/elenalowery/cpd4_demo/blob/master/assets/jupyterlab/Predict_Customer_Churn_CPD4.ipynb


In [1]:
# NOTE(review): placeholder cell; it only prints a fixed message and defines nothing used later.
print("Adding print statement")

Adding print statement


In [1]:
# Install required Python modules
!pip install sklearn-pandas


Collecting sklearn-pandas
  Downloading sklearn_pandas-2.2.0-py2.py3-none-any.whl (10 kB)
Collecting scipy>=1.5.1
  Downloading scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.3 MB)
[K     |████████████████████████████████| 39.3 MB 21.2 MB/s eta 0:00:01
Installing collected packages: scipy, sklearn-pandas
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
Successfully installed scipy-1.7.3 sklearn-pandas-2.2.0


## Step 1: Review Use Case
The analytics use case implemented in this notebook is telco churn prediction. It is a simple use case that illustrates a typical process for model development and deployment using Cloud Pak for Data.

In [3]:
# @hidden_cell
# Load a sample of the customer table from the project's "db2cloud" connection
# into the DataFrame `data_df_5`.

from ibm_watson_studio_lib import access_project_or_space
wslib = access_project_or_space()

# Connection details (host, database, credentials) come from the project
# connection, so no secrets are written into the notebook.
db2cloud_metadata = wslib.get_connection("db2cloud")

import os, ibm_db, ibm_db_dbi as dbi, pandas as pd
# display/HTML are imported from their public location, IPython.display.
from IPython.display import display, HTML

db2cloud_dsn = 'DATABASE={};HOSTNAME={};PORT={};PROTOCOL=TCPIP;UID={uid};PWD={pwd};SECURITY=SSL'.format(
    db2cloud_metadata['database'],
    db2cloud_metadata['host'],
    db2cloud_metadata.get('port', 50000),
    uid=db2cloud_metadata['username'],
    pwd=db2cloud_metadata['password']
)

# NOTE:
#  A row limit has been applied to the query to enable sample previewing.
#  Adjust the display message and query as needed by editing the following lines:
display(HTML("A row limit of 5000 has been applied to the query to enable sample previewing. If the data set is larger, only the first 5000 rows will be loaded."))
query = 'SELECT * FROM "XCV64422"."CUSTOMER_DATA" FETCH FIRST 5000 ROWS ONLY'

db2cloud_connection = dbi.connect(db2cloud_dsn)
try:
    data_df_5 = pd.read_sql_query(query, con=db2cloud_connection)
finally:
    # The rows are now held in memory; release the database connection
    # whether or not the query succeeded.
    db2cloud_connection.close()

data_df_5.head()


Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,T
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,F
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,F
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,T
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,F


In [4]:
# Work on an independent copy named `data` so the frame loaded from the database stays untouched.
# Note: the name of the inserted DataFrame depends on how the data was added to the notebook;
# here it is data_df_5 - adjust the name below if yours differs.
data = data_df_5.copy(deep=True)

In [5]:
# List all the columns of the working DataFrame
print(data.columns)

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE', 'CHURN'],
      dtype='object')


In [6]:
# Select the columns used for churn prediction.
# The list currently names every column printed above, in the same order;
# trim it here if some columns should be excluded.
data = data[
    [
        'ID',
        'LONGDISTANCE',
        'INTERNATIONAL',
        'LOCAL',
        'DROPPED',
        'PAYMETHOD',
        'LOCALBILLTYPE',
        'LONGDISTANCEBILLTYPE',
        'USAGE',
        'RATEPLAN',
        'GENDER',
        'STATUS',
        'CHILDREN',
        'ESTINCOME',
        'CAROWNER',
        'AGE',
        'CHURN',
    ]
]
data.head()


Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,T
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,F
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,F
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,T
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,F


## Step 2: Build the Random Forest model

In [7]:
# Core data handling; allow up to 999 columns when a DataFrame is displayed.
import pandas as pd
import sklearn
pd.options.display.max_columns = 999

# Silence every Python warning raised after this point.
# NOTE(review): this also hides deprecation notices from the libraries imported below.
import warnings
warnings.filterwarnings('ignore')

# Statistics and plotting helpers; figures render inline in the notebook.
from scipy.stats import chi2_contingency,ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Train/test splitting.
from sklearn.model_selection import train_test_split

# Preprocessing, the classifier and the evaluation metrics.
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score

import numpy as np

# HTTP/JSON utilities; json is used at the end of the notebook to pretty-print the scoring response.
import urllib3, requests, json

In [8]:
# Convert the CHURN label from 'T'/'F' to 1/0.
le = LabelEncoder()
# assign() builds a new frame whose CHURN column takes the encoder's integer
# dtype. Writing through data.loc[:, 'CHURN'] instead sets values into the
# existing string column, which newer pandas versions keep as object dtype.
# Re-running this cell is safe: encoding 0/1 again yields 0/1.
data = data.assign(CHURN=le.fit_transform(data['CHURN']))
data.head()

Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,1
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,0
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,0
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,1
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,0


In [9]:
# Split the frame into the target vector (CHURN as float32) and the feature table (everything else)
y = np.float32(data['CHURN'])
x = data.drop(columns=['CHURN'])

In [10]:
# Show the feature columns; CHURN was dropped from x in the previous cell.
x.columns

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE'],
      dtype='object')

In [11]:
# Build the feature mapper: string-valued columns each get their own LabelEncoder,
# the remaining columns are listed with no transformer (None).
from sklearn_pandas import DataFrameMapper

mapper = DataFrameMapper(
    [
        (column, LabelEncoder() if needs_encoding else None)
        # (column name, whether a LabelEncoder is attached) - order is preserved
        for column, needs_encoding in [
            ('GENDER', True),
            ('STATUS', True),
            ('CHILDREN', False),
            ('ESTINCOME', False),
            ('CAROWNER', True),
            ('AGE', False),
            ('LONGDISTANCE', False),
            ('INTERNATIONAL', False),
            ('LOCAL', False),
            ('DROPPED', False),
            ('PAYMETHOD', True),
            ('LOCALBILLTYPE', True),
            ('LONGDISTANCEBILLTYPE', True),
            ('USAGE', False),
            ('RATEPLAN', False),
        ]
    ]
)

In [12]:
# Hold out 20% of the rows for testing. The split is seeded (42) and
# stratified on the label so both parts keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Fit the model: the feature mapper and the random forest are chained into one
# pipeline, so the same preprocessing runs at training and at prediction time.

import sklearn.pipeline
from sklearn.preprocessing import OneHotEncoder

# Seed the forest so repeated runs train the same model and report the same
# metrics; 42 is the seed already used for the train/test split.
random_forest = RandomForestClassifier(random_state=42)
# NOTE: the step name keeps its original spelling so any existing reference to it stays valid.
steps = [('mapper', mapper),('RandonForestClassifier', random_forest)]
pipeline = sklearn.pipeline.Pipeline(steps)
# fit() returns the fitted pipeline; it is displayed as the cell output.
model=pipeline.fit( X_train, y_train )
model

Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[('GENDER', LabelEncoder()),
                                           ('STATUS', LabelEncoder()),
                                           ('CHILDREN', None),
                                           ('ESTINCOME', None),
                                           ('CAROWNER', LabelEncoder()),
                                           ('AGE', None),
                                           ('LONGDISTANCE', None),
                                           ('INTERNATIONAL', None),
                                           ('LOCAL', None), ('DROPPED', None),
                                           ('PAYMETHOD', LabelEncoder()),
                                           ('LOCALBILLTYPE', LabelEncoder()),
                                           ('LONGDISTANCEBILLTYPE',
                                            LabelEncoder()),
                               

In [14]:
# Predict churn for the held-out rows with the fitted pipeline
y_prediction = pipeline.predict(X_test)

# Compare predictions against the true labels and print the text report
report = sklearn.metrics.classification_report(y_true=y_test, y_pred=y_prediction)
print(report)

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98       168
         1.0       0.97      0.99      0.98       115

    accuracy                           0.98       283
   macro avg       0.98      0.98      0.98       283
weighted avg       0.98      0.98      0.98       283



## Step 3 - WML Deployment
In the next set of cells, we deploy the trained model using Watson Machine Learning. To do so:
- Leverage the WML Python client to create a deployment space. 
- Store the trained model in both the current project and the deployment space.
- Deploy the model to WML as an online deployment so it is accessible via REST APIs

In [14]:
import os

# Build the WML credentials from values the runtime exposes as environment
# variables, so no token is written into the notebook.
cpdtoken = os.environ['USER_ACCESS_TOKEN']
wml_credentials = dict(
    token=cpdtoken,
    instance_id="openshift",
    url=os.environ['RUNTIME_ENV_APSX_URL'],
    version="4.0",
)

# Create the Watson Machine Learning client used by the rest of the notebook
from ibm_watson_machine_learning import APIClient
client = APIClient(wml_credentials)



In [16]:
# Associate WML client with current project.
# Disabled: this notebook sets a default deployment space further down instead of a default project.
# project_id = os.environ['PROJECT_ID']
# client.set.default_project(project_id)

'SUCCESS'

In [15]:
# List the deployment spaces visible to this client (called without a limit).
client.spaces.list()

Note: 'limit' is not provided. Only first 50 records will be displayed if the number of records exceed 50
------------------------------------  --------------  ------------------------
ID                                    NAME            CREATED
87ecd661-f8b0-4452-b7fb-83aa57558148  churn_qa_space  2022-01-11T16:53:50.712Z
------------------------------------  --------------  ------------------------


In [16]:
# ID of the 'churn_qa_space' deployment space, as shown in the listing above.
# NOTE(review): this value is environment-specific; a later cell reassigns
# space_id from client.spaces.get_id(space_details).
space_id='87ecd661-f8b0-4452-b7fb-83aa57558148'

In [20]:
# For details on working with spaces, check the following notebook
# https://github.com/IBM/watson-machine-learning-samples/blob/master/cpd4.0/notebooks/python_sdk/instance-management/Space%20management.ipynb
# Create the deployment space, or reuse it when a space with the same name
# already exists, so re-running the notebook does not create duplicate spaces.
space_metadata = {
    'name': 'churn_qa_space',
    'description': 'Quality Assurance deployment space for churn models',
}

# get_details() without an ID returns all visible spaces under 'resources'.
existing_spaces = [
    space for space in client.spaces.get_details().get('resources', [])
    if space.get('entity', {}).get('name') == space_metadata['name']
]

if existing_spaces:
    # Same details structure as a freshly stored space (entity + metadata).
    space_details = existing_spaces[0]
else:
    space_details = client.spaces.store(space_metadata)
print(space_details)

Space has been created. However some background setup activities might still be on-going. Check for 'status' field in the response. It has to show 'active' before space can be used. If its not 'active', you can monitor the state with a call to spaces.get_details(space_id)
{'entity': {'compute': [{'crn': 'crn:v1:cpd:private:pm-20:private:a/cpduser:99999999-9999-9999-9999-999999999999::', 'guid': '99999999-9999-9999-9999-999999999999', 'name': 'Watson Machine Learning', 'type': 'machine_learning'}], 'description': 'Quality Assurance deployment space for churn models', 'members': [{'id': '1000330999', 'role': 'admin', 'state': 'active', 'type': 'user'}], 'name': 'churn_qa_space', 'scope': {'bss_account_id': 'cpdaccount'}, 'status': {'state': 'preparing'}}, 'metadata': {'created_at': '2022-01-11T16:53:50.712Z', 'creator_id': '1000330999', 'id': '87ecd661-f8b0-4452-b7fb-83aa57558148', 'url': '/v2/spaces/87ecd661-f8b0-4452-b7fb-83aa57558148'}}


In [21]:
# Extract the space ID from the details returned when the space was stored.
space_id = client.spaces.get_id(space_details)

In [17]:

# Fetch the current details of the space; its status must be 'active' before the space can be used.
space_details=client.spaces.get_details(space_id)

In [18]:
# Display the space details (check 'status' -> 'state').
space_details

{'entity': {'compute': [{'crn': 'crn:v1:cpd:private:pm-20:private:a/cpduser:99999999-9999-9999-9999-999999999999::',
    'guid': '99999999-9999-9999-9999-999999999999',
    'name': 'Watson Machine Learning',
    'type': 'machine_learning'}],
  'description': 'Quality Assurance deployment space for churn models',
  'name': 'churn_qa_space',
  'scope': {'bss_account_id': 'cpdaccount'},
  'status': {'state': 'active'}},
 'metadata': {'created_at': '2022-01-11T16:53:50.712Z',
  'creator_id': '1000330999',
  'id': '87ecd661-f8b0-4452-b7fb-83aa57558148',
  'updated_at': '2022-01-11T16:53:55.063Z',
  'url': '/v2/spaces/87ecd661-f8b0-4452-b7fb-83aa57558148'}}

In [19]:
# Make this deployment space the client's default space for the calls that follow.
client.set.default_space(space_id)

'SUCCESS'

In [20]:
# Confirm the deployment space is created (list at most 10 spaces)
client.spaces.list(limit=10)

------------------------------------  --------------  ------------------------
ID                                    NAME            CREATED
87ecd661-f8b0-4452-b7fb-83aa57558148  churn_qa_space  2022-01-11T16:53:50.712Z
------------------------------------  --------------  ------------------------


In [21]:
# Save the fitted pipeline to the WML repository, together with the training
# data and target it was fitted on.
model_name = 'customer_churn_model'
software_spec_uid = client.software_specifications.get_uid_by_name('default_py3.8')

# Model metadata: display name, runtime software specification and model type
metadata = {}
metadata[client.repository.ModelMetaNames.NAME] = model_name
metadata[client.repository.ModelMetaNames.SOFTWARE_SPEC_UID] = software_spec_uid
metadata[client.repository.ModelMetaNames.TYPE] = "scikit-learn_0.23"

stored_model_details = client.repository.store_model(
    pipeline,
    meta_props=metadata,
    training_data=X_train,
    training_target=y_train,
)

In [22]:
# Confirm the model is stored in WML repository
client.repository.list_models()

------------------------------------  --------------------  ------------------------  -----------------
ID                                    NAME                  CREATED                   TYPE
f638e9cb-f189-4248-abbf-6c5a3d2b634c  customer_churn_model  2022-01-11T17:35:41.002Z  scikit-learn_0.23
------------------------------------  --------------------  ------------------------  -----------------


In [None]:
# Deploy the stored model as an online deployment
published_model_uid = client.repository.get_model_uid(stored_model_details)

# Deployment settings: a display name plus an (empty) online configuration
deploy_metadata = {
    client.deployments.ConfigurationMetaNames.NAME: "Churn Prediction Model Deployment",
    client.deployments.ConfigurationMetaNames.ONLINE: {},
}

created_deployment = client.deployments.create(
    published_model_uid,
    meta_props=deploy_metadata,
)


In [None]:
# Read the deployment ID (used for scoring below) and the scoring URL from the deployment details.
deployment_uid = client.deployments.get_uid(created_deployment)
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
print(scoring_endpoint)

In [None]:
# Build the scoring request: one sample customer, with values listed in the
# same order as the field names.
scoring_payload = {
    "input_data": [
        {
            'fields': [
                'ID',
                'LONGDISTANCE',
                'INTERNATIONAL',
                'LOCAL',
                'DROPPED',
                'PAYMETHOD',
                'LOCALBILLTYPE',
                'LONGDISTANCEBILLTYPE',
                'USAGE',
                'RATEPLAN',
                'GENDER',
                'STATUS',
                'CHILDREN',
                'ESTINCOME',
                'CAROWNER',
                'AGE',
            ],
            'values': [
                [1, 28, 0, 60, 0, "Auto", "FreeLocal", "Standard", 89, 4, "F", "M", 1, 23000, "N", 45],
            ],
        }
    ]
}





In [None]:
# Send the payload to the online deployment and pretty-print the JSON response.
predictions = client.deployments.score(deployment_uid, scoring_payload)
print(json.dumps(predictions, indent=2))