<h1><center>Credit Card Example<br/>scikit-learn to ONNX</center></h1>

### imports

In [1]:
import numpy as np
import pandas as pd
import onnxruntime as rt
import sklearn
import skl2onnx
import onnx

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, normalize
from sklearn.metrics import accuracy_score

In [2]:
# Load the watermark extension and print the versions of the imported packages,
# so the environment that produced the outputs below is recorded in the notebook.
%load_ext watermark
%watermark --iversions

numpy       1.18.1
sklearn     0.23.2
onnxruntime 1.4.0
onnx        1.7.0
skl2onnx    1.7.0
pandas      1.1.1



# Data Load

In [3]:
# Read the customer history CSV (path is relative to the notebook's working directory).
cust_df = pd.read_csv('cust_history_new_1K.csv')

# Report the size of the frame: rows are observations, columns are variables.
row_count, column_count = cust_df.shape
print("There are " + str(row_count) + " observations in the customer history dataset.")
print("There are " + str(column_count) + " variables in the dataset.")

#cust_df.head()

There are 1000 observations in the customer history dataset.
There are 19 variables in the dataset.


# Data Preparation

## Split Dataframe into Features and Label

In [4]:
# Everything except the IS_DEFAULT column is a feature ...
cust_df_X = cust_df.drop(columns=['IS_DEFAULT'])
# ... and IS_DEFAULT alone (kept as a one-column DataFrame) is the label.
cust_df_Y = cust_df.loc[:, ['IS_DEFAULT']]

print(f'cust_df_X.shape={cust_df_X.shape}, cust_df_Y.shape={cust_df_Y.shape}')


cust_df_X.shape=(1000, 18), cust_df_Y.shape=(1000, 1)


## Transform Label

In [5]:
# Encode the IS_DEFAULT label values as integer class ids.
#
# The column is read from the original `cust_df` rather than from `cust_df_Y`:
# this cell rebinds `cust_df_Y` to the encoded NumPy array, so indexing
# `cust_df_Y['IS_DEFAULT']` would fail if the cell were executed a second time.
# Reading from `cust_df` yields the same values and keeps the cell re-runnable.
le = LabelEncoder()
cust_df_Y = le.fit_transform(cust_df['IS_DEFAULT'])

In [6]:
# Confirm the container type of the encoded labels.
type(cust_df_Y)

numpy.ndarray

## Transform Features

### One hot encoding for categorical Columns

In [7]:
# Columns that hold categories and must be expanded into indicator columns.
categoricalColumns = ['CREDIT_HISTORY', 'TRANSACTION_CATEGORY', 'ACCOUNT_TYPE', 'ACCOUNT_AGE',
                      'STATE', 'IS_URBAN', 'IS_STATE_BORDER', 'HAS_CO_APPLICANT', 'HAS_GUARANTOR',
                      'OWN_REAL_ESTATE', 'OTHER_INSTALMENT_PLAN',
                      'OWN_RESIDENCE', 'RFM_SCORE', 'OWN_CAR', 'SHIP_INTERNATIONAL']

# One-hot encode starting from the raw frame (minus the label column) instead of
# from `cust_df_X` itself. This cell overwrites `cust_df_X`, so encoding its own
# previous output would raise a KeyError on re-run because the categorical
# columns no longer exist after the first pass. The first-run result is unchanged.
cust_df_X = pd.get_dummies(cust_df.drop(columns=['IS_DEFAULT']), columns=categoricalColumns)

#cust_df_X.head()

## Normalize Features

In [8]:
# Min-max scale every feature column, then L1-normalize each row (axis=1).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook, so test-set statistics influence the
# training features — confirm this is acceptable for the example.
min_max_scaler = MinMaxScaler()
features = normalize(
    min_max_scaler.fit_transform(cust_df_X),
    norm='l1',
    axis=1,
)

# Wrap the transformed array back into a DataFrame with the encoded column names.
cust_df_X = pd.DataFrame(data=features, columns=cust_df_X.columns)
#cust_df_X.head()

## Split Train and Test Dataset

In [9]:
# Features as a plain array; labels reshaped into a single-column 2-D array.
features = cust_df_X.to_numpy()
label = np.reshape(cust_df_Y, (-1, 1))

# 70/30 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=42,
    stratify=label,
)

print(f'X_train.shape={X_train.shape} Y_train.shape={y_train.shape}')
print(f'X_test.shape={X_test.shape} Y_test.shape={y_test.shape}')

X_train.shape=(700, 51) Y_train.shape=(700, 1)
X_test.shape=(300, 51) Y_test.shape=(300, 1)


# sklearn Train

In [10]:
# Import the estimator class and create an unfitted logistic-regression model
# using the constructor's default arguments.
from sklearn.linear_model import LogisticRegression
sklearn_lr = LogisticRegression()

In [11]:
# Fit on the training split; the (n, 1) label column is flattened to 1-D first.
sklearn_lr.fit(X_train, np.ravel(y_train))

LogisticRegression()

In [12]:
# Shape of the fitted coefficient matrix.
sklearn_lr.coef_.shape

(1, 51)

In [13]:
# Element type of the fitted coefficients (relevant when choosing the ONNX input tensor type).
sklearn_lr.coef_.dtype

dtype('float64')

In [14]:
# Display the fitted coefficient values.
sklearn_lr.coef_

array([[ 0.77483405,  0.41529331,  0.00611661,  0.6085692 , -1.29612335,
        -0.08430527,  0.11118008,  0.58089865,  0.29378692, -0.88450482,
         0.09702741,  0.73011745,  0.16123889,  0.08285559, -0.56030214,
         0.14239107, -0.27800247, -1.11633683, -0.30734996,  1.4795175 ,
         0.1050459 , -0.71337319,  0.13243678, -0.45112553,  0.84723536,
        -0.01254426, -0.45229007,  0.87193758, -0.48688393, -0.38505152,
         0.30527083,  0.47275722, -0.55253791, -0.28904623,  0.20926555,
         0.13984383, -0.21962452,  0.73056698, -0.81034766, -0.78696036,
         0.70717967,  0.75650276, -0.83628345,  0.0874876 , -0.2666159 ,
        -0.12455267,  0.22390028,  0.09057764, -0.17035833,  0.19092994,
        -0.27071063]])

In [15]:
# Shape of the fitted intercept vector.
sklearn_lr.intercept_.shape

(1,)

In [16]:
# Predict class labels for the hold-out split with the fitted scikit-learn model
# and report the fraction of predictions that match the true labels.
sklearn_prediction = sklearn_lr.predict(X_test)
print(f'sklearn ml accuracy score = {accuracy_score(y_test,sklearn_prediction)}')

sklearn ml accuracy score = 0.7


In [17]:
# Shape of the scikit-learn prediction vector (one label per test row).
sklearn_prediction.shape

(300,)

# Convert to ONNX format

In [18]:
from skl2onnx import convert_sklearn

In [19]:
from skl2onnx.common.data_types import DoubleTensorType

# Declare the ONNX graph input: a float64 tensor with a dynamic batch dimension
# (None) and one column per training feature. The width is taken from the
# training matrix instead of being hard-coded, so the exported signature stays
# correct if the number of one-hot encoded columns changes with the data.
n_features = X_train.shape[1]
initial_type = [('double_input', DoubleTensorType([None, n_features]))]

# Only the fitted classifier is converted; the scaling/normalization applied to
# the features earlier in the notebook is not part of the exported graph, so
# inputs must be preprocessed the same way before scoring.
onx = convert_sklearn(sklearn_lr, initial_types=initial_type)

# Serialize the ONNX model to disk.
with open("cc_default.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# View the ONNX model

In [20]:
# Round-trip check: reload the serialized "cc_default.onnx" file, validate it,
# and dump its graph definition.

# Load the ONNX model
model = onnx.load("cc_default.onnx")

# Check that the IR is well formed
onnx.checker.check_model(model)

# Print a human readable representation of the graph
print(model.graph)

node {
  input: "double_input"
  output: "label"
  output: "probability_tensor"
  name: "LinearClassifier"
  op_type: "LinearClassifier"
  attribute {
    name: "classlabels_ints"
    ints: 0
    ints: 1
    type: INTS
  }
  attribute {
    name: "coefficients"
    floats: -0.7748340368270874
    floats: -0.41529330611228943
    floats: -0.006116610951721668
    floats: -0.6085692048072815
    floats: 1.2961233854293823
    floats: 0.08430526405572891
    floats: -0.11118007451295853
    floats: -0.580898642539978
    floats: -0.2937869131565094
    floats: 0.8845047950744629
    floats: -0.09702741354703903
    floats: -0.7301174402236938
    floats: -0.161238893866539
    floats: -0.08285558968782425
    floats: 0.5603021383285522
    floats: -0.14239107072353363
    floats: 0.27800247073173523
    floats: 1.1163368225097656
    floats: 0.30734995007514954
    floats: -1.4795174598693848
    floats: -0.10504589974880219
    floats: 0.7133731842041016
    floats: -0.13243678212165833


#### View the model in https://lutzroeder.github.io/netron/

# Use ONNX runtime for scoring

In [21]:
# Score the test split with ONNX Runtime using the exported model file.
sess = rt.InferenceSession("cc_default.onnx")

# Look up the graph's first input and first output (the predicted label) by name.
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name

# Exactly one output is requested, so the returned list unpacks to a single array.
(pred_onx,) = sess.run([label_name], {input_name: X_test})

In [22]:
# Shape of the ONNX Runtime prediction vector, for comparison with the scikit-learn one.
pred_onx.shape

(300,)