## Financial Product Recommendation Engine

This Notebook depicts steps in creating and saving a model in IBM cloud using Watson Studio

Author : Mukesh R, Sam Prasanna R

### Step 1

The Below code blocks helps to create connections required to save the model and connect to WML

In [34]:
api_key = 'TQPDo_OY4aY8q1ObQvD61EbdfmTmT_Ykt2tFukflU3uV'
location = 'us-south'

In [35]:
wml_credentials = {
    "apikey": api_key,
    "url": 'https://' + location + '.ml.cloud.ibm.com'
}

In [36]:
!pip install -U ibm-watson-machine-learning

Requirement already up-to-date: ibm-watson-machine-learning in /opt/conda/envs/Python36/lib/python3.6/site-packages (1.0.10)


In [37]:
from ibm_watson_machine_learning import APIClient

client = APIClient(wml_credentials)

In [38]:
space_id = '5594f595-ad05-400a-b370-d9d789f6d82e'

In [39]:
client.spaces.list(limit=10)

------------------------------------  -----------------------  ------------------------
ID                                    NAME                     CREATED
f6f6501f-c8e1-4c4f-9c65-59ecafdcc386  lticiti-hackathon-space  2020-09-05T19:00:04.389Z
5594f595-ad05-400a-b370-d9d789f6d82e  IBM_Hackathon            2020-09-05T18:23:48.412Z
------------------------------------  -----------------------  ------------------------


In [40]:
client.set.default_space(space_id)

'SUCCESS'

### Step2 

Connect to the Dataset uploaded in Projects in Watson Studio

In [41]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_954c520c2490443bac77531256d3249e = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='nOPlww_HdGiX7UzssnBKunTqIreAxvSySJqq7pUcy1_k',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_954c520c2490443bac77531256d3249e.get_object(Bucket='ibmhackathon-donotdelete-pr-xbhtv6ninapien',Key='train_loan.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()


Unnamed: 0,age,job,marital status,education,credit default?,housing loan?,Vehicle loan,y
0,30,unemployed,married,primary,no,no,no,no
1,33,services,married,secondary,no,yes,yes,no
2,35,management,single,tertiary,no,yes,no,no
3,30,management,married,tertiary,no,yes,yes,no
4,59,blue-collar,married,secondary,no,yes,no,no


In [42]:
obj_df = df_data_1.copy()
obj_df.head()

Unnamed: 0,age,job,marital status,education,credit default?,housing loan?,Vehicle loan,y
0,30,unemployed,married,primary,no,no,no,no
1,33,services,married,secondary,no,yes,yes,no
2,35,management,single,tertiary,no,yes,no,no
3,30,management,married,tertiary,no,yes,yes,no
4,59,blue-collar,married,secondary,no,yes,no,no


### Step 3 

Check for nulls in the records and Replace them with Mean/Mode/Median or Default Category based on the type of data

Since the ML algorithms doesnt perform well with Null in data

In [43]:
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,age,job,marital status,education,credit default?,housing loan?,Vehicle loan,y
79,40,,married,secondary,no,yes,no,no
97,30,admin.,single,,no,no,no,no
108,56,unemployed,,primary,no,no,no,yes
109,27,housemaid,married,primary,no,,,no
168,48,technician,married,tertiary,,no,no,no
1000,38,admin.,married,secondary,no,yes,,no
1004,42,,single,secondary,no,yes,yes,yes
1016,33,services,married,secondary,no,,no,no


In [44]:

obj_df["job"] = obj_df["job"].astype('category')
obj_df.dtypes

obj_df["job_cat"] = obj_df["job"].cat.codes
obj_df.head()

obj_df["marital status "] = obj_df["marital status "].astype('category')
obj_df.dtypes

obj_df["marital status_cat"] = obj_df["marital status "].cat.codes
obj_df.head()


obj_df["education"] = obj_df["education"].astype('category')
obj_df.dtypes

obj_df["education_cat"] = obj_df["education"].cat.codes
obj_df.head()


obj_df["credit default?"] = obj_df["credit default?"].astype('category')
obj_df.dtypes

obj_df["credit default_cat"] = obj_df["credit default?"].cat.codes
obj_df.head()


obj_df["housing loan?"] = obj_df["housing loan?"].astype('category')
obj_df.dtypes

obj_df["housing loan_cat"] = obj_df["housing loan?"].cat.codes
obj_df.head()

obj_df["Vehicle loan"] = obj_df["Vehicle loan"].astype('category')
obj_df.dtypes

obj_df["Vehicle loan_cat"] = obj_df["Vehicle loan"].cat.codes
obj_df.head()


obj_df["y"] = obj_df["y"].astype('category')
obj_df.dtypes

obj_df["y_cat"] = obj_df["y"].cat.codes
obj_df.head()#Replace the NAN's with Mode, Since it was given "For Categorical Value replace it with Mode"
obj_df['job'].fillna(obj_df['job'].mode()[0], inplace=True)

obj_df['marital status '].fillna(obj_df['marital status '].mode()[0], inplace=True)
obj_df['education'].fillna(obj_df['education'].mode()[0], inplace=True)
obj_df['credit default?'].fillna(obj_df['credit default?'].mode()[0], inplace=True)
obj_df['housing loan?'].fillna(obj_df['housing loan?'].mode()[0], inplace=True)
obj_df['Vehicle loan'].fillna(obj_df['Vehicle loan'].mode()[0], inplace=True)



### Step 4

Check for imbalance target variable and either do Upsampling / Downsampling depending on the dataset

In [45]:
obj_df['y'].value_counts()

no     897
yes    124
Name: y, dtype: int64

In [46]:
#Steps to balance the class variables
# Separate majority and minority classes
df_majority = obj_df[obj_df.y=='no']
df_minority = obj_df[obj_df.y=='yes']

In [47]:
from sklearn.utils import resample
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=897,    # to match majority class
                                 random_state=123) # reproducible results

In [48]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.y_cat.value_counts()

1    897
0    897
Name: y_cat, dtype: int64

In [49]:
df_upsampled.head()

Unnamed: 0,age,job,marital status,education,credit default?,housing loan?,Vehicle loan,y,job_cat,marital status_cat,education_cat,credit default_cat,housing loan_cat,Vehicle loan_cat,y_cat
0,30,unemployed,married,primary,no,no,no,no,10,1,0,0,0,0,0
1,33,services,married,secondary,no,yes,yes,no,7,1,1,0,2,1,0
2,35,management,single,tertiary,no,yes,no,no,4,2,2,0,2,0,0
3,30,management,married,tertiary,no,yes,yes,no,4,1,2,0,2,1,0
4,59,blue-collar,married,secondary,no,yes,no,no,1,1,1,0,2,0,0


### Step 5

Since this is POC to show how classification can be helpful in acheiving usecases and sample dataset has very less columns we did not do feature engineering to either remove column which doesnot have impact on target varaibles (eg : Customer ID) using PCA and other methods

The below step invloves getting the required input and output dataframes

Also, Splitting the data into 70 - 30 (Train - Test Split) to Rate the model performance using Confusion Matrix

In this we have used accuracy has the measure to check performance of different algorithms

In [50]:
Y = df_upsampled.iloc[:,14]  
X = df_upsampled.iloc[:,[0,8,9,10,11,12,13]]

In [51]:
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

### Step 6

Define Model, train and evaluate them

In [53]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy")

# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
Y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

Accuracy: 0.9313543599257885


In [54]:
from sklearn.ensemble import RandomForestClassifier

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100,
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
model.fit(X_train,Y_train)

#Predict the response for test dataset
Y_pred = model.predict(X_test)

print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

Accuracy: 0.9257884972170687


In [55]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

# Train the model using the training sets
model.fit(X_train,Y_train)

#Predict Output
predicted= model.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.5862708719851577


In [56]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()

# Train the model using the training sets
model.fit(X_train,Y_train)

#Predict Output
predicted= model.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.5677179962894249


In [25]:
from sklearn import svm

svm = svm.SVC()

# Train the model using the training sets
svm.fit(X_train,Y_train)

#Predict Output
predicted= svm.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.7922077922077922


In [26]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier(alpha=0.1, hidden_layer_sizes=(10,))

# Train the model using the training sets
MLP.fit(X_train,Y_train)

#Predict Output
predicted= MLP.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.5788497217068646


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

#ada = AdaBoostClassifier(n_estimators=100)

ada = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, random_state=3).fit(X_train, Y_train)

#Predict Output
predicted= ada.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.7847866419294991


In [33]:
from xgboost import XGBClassifier

# fit model no training data
model = XGBClassifier(max_depth = 10, subsample=0.8,eta = 0.1)
model.fit(X_train, Y_train)

predicted= model.predict(X_test) 

print("Accuracy:",metrics.accuracy_score(Y_test, predicted))

Accuracy: 0.9016697588126159


### Step 7 

Save the model to IBM deployment space to deploy it as API and Make Prediction in Single/Batch

In [30]:
sofware_spec_uid = client.software_specifications.get_id_by_name("scikit-learn_0.20-py3.6")

In [31]:
metadata = {
            client.repository.ModelMetaNames.NAME: 'Scikit model',
            client.repository.ModelMetaNames.TYPE: 'scikit-learn_0.20',
            client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: sofware_spec_uid
}

published_model = client.repository.store_model(
    model=clf,
    meta_props=metadata,
    training_data=X_train,
    training_target=Y_train)

In [32]:
import json

published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)
print(json.dumps(model_details, indent=2))

{
  "entity": {
    "label_column": "y_cat",
    "software_spec": {
      "id": "09c5a1d0-9c1e-4473-a344-eb7b665ff687",
      "name": "scikit-learn_0.20-py3.6"
    },
    "training_data_references": [
      {
        "connection": {
          "access_key_id": "not_applicable",
          "endpoint_url": "not_applicable",
          "secret_access_key": "not_applicable"
        },
        "id": "1",
        "location": {},
        "schema": {
          "fields": [
            {
              "name": "age",
              "type": "int64"
            },
            {
              "name": "job_cat",
              "type": "int8"
            },
            {
              "name": "marital status_cat",
              "type": "int8"
            },
            {
              "name": "education_cat",
              "type": "int8"
            },
            {
              "name": "credit default_cat",
              "type": "int8"
            },
            {
              "name": "housing loan_cat"

In [33]:
models_details = client.repository.list_models()

------------------------------------  ------------  ------------------------  -----------------
ID                                    NAME          CREATED                   TYPE
1eeda5a5-2a92-40a2-9e7c-837a26b6584a  Scikit model  2020-09-05T18:37:46.002Z  scikit-learn_0.20
------------------------------------  ------------  ------------------------  -----------------


In [34]:
metadata = {
    client.deployments.ConfigurationMetaNames.NAME: "Personal Loan",
    client.deployments.ConfigurationMetaNames.ONLINE: {}
}

created_deployment = client.deployments.create(published_model_uid, meta_props=metadata)



#######################################################################################

Synchronous deployment creation for uid: '1eeda5a5-2a92-40a2-9e7c-837a26b6584a' started

#######################################################################################


initializing
ready


------------------------------------------------------------------------------------------------
Successfully finished deployment creation, deployment_uid='d8f0e83a-0f90-4afb-9022-14b6d7922191'
------------------------------------------------------------------------------------------------




In [35]:
deployment_uid = client.deployments.get_uid(created_deployment)

In [36]:
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
print(scoring_endpoint)

https://us-south.ml.cloud.ibm.com/ml/v4/deployments/d8f0e83a-0f90-4afb-9022-14b6d7922191/predictions


In [37]:
client.deployments.list()

------------------------------------  -------------  -----  ------------------------
GUID                                  NAME           STATE  CREATED
d8f0e83a-0f90-4afb-9022-14b6d7922191  Personal Loan  ready  2020-09-05T18:38:34.421Z
------------------------------------  -------------  -----  ------------------------
