# Scikit-Learn Model Deployments for SVC, RF, Boosting Classification Algorithms

### Importing libraries

In [1]:
import sys
import numpy as np
import json
from Crypto.Cipher import AES
from sklearn.tree import _tree

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Generate Decision-Tree as List

In [2]:
def tree_to_list(tree, feature_names, num):
    """
    Serialise a fitted scikit-learn decision tree into a nested-list JSON string.
    Format: ["feature name", threshold, left subtree, right subtree]; a leaf is
    written as the plain list of values stored at that node (tree_.value[node][0]).
    Input: tree (fitted estimator exposing ``tree_``), feature_names (one name per
           input column), num (tree index; kept for call compatibility, not used)
    Output: string that json.loads() turns back into the nested list
    """
    tree_ = tree.tree_

    def recurse(node):
        feature_index = tree_.feature[node]
        if feature_index == _tree.TREE_UNDEFINED:
            # Leaf node. tolist() yields built-in floats; formatting a list of
            # NumPy scalars instead emits their repr ("np.float64(1.0)" on
            # NumPy >= 2), which json.loads() cannot parse.
            return "{}".format(tree_.value[node][0].tolist())
        # Split node: children are rendered first and spliced into one template,
        # so the string is not rebuilt by repeated concatenation.
        return '["{}",{},{},{}]'.format(
            feature_names[feature_index],
            float(tree_.threshold[node]),
            recurse(tree_.children_left[node]),
            recurse(tree_.children_right[node]),
        )

    return recurse(0)

### Predict from a Decision-Tree stored in List format

In [3]:
def tree_predict_list(x_input_l, lst):
    """
    Function to get the prediction value where trees are stored in list format recursively
    Format: [input feature (column index as str), node value, left leaf (yes), right leaf (no)]
    Input: x_input_l (input for obtaining the tree prediction), lst (tree in list format)
    Output: pred (leaf reached for the provided input, returned exactly as stored)

    Only the branch selected by the input is followed, so the cost is the depth
    of the tree rather than its total size. Anything that is not a split node
    (a number, a one-element list, a list of per-class values) is treated as a
    leaf, which also covers a tree whose root is itself a leaf.
    """
    node = lst
    # A split node is exactly [str, threshold, left, right]; leaves never start
    # with a string, so this test cannot mistake a 4-class leaf for a split.
    while isinstance(node, list) and len(node) == 4 and isinstance(node[0], str):
        feature, threshold, left_branch, right_branch = node
        # If input feature <= node value then left branch else right branch
        node = left_branch if x_input_l[int(feature)] <= threshold else right_branch
    return node

In [4]:
def svr_decision_function(k, nv, a, b, cs):
    """
    Rebuild the decision values of a stored SVC for one sample and convert the
    pairwise (one-vs-one) values into one-vs-rest scores.
    Despite the "svr" in the name, the only caller in this file uses it for the
    'SVC' model type.
    Input: k (kernel value of the sample against each support vector; the caller
              passes it reshaped to (-1, 1)),
           nv (number of support vectors per class, stored as 'n_support'),
           a (dual coefficients, stored as 'dual_coef'),
           b (intercepts, one per class pair, stored as 'intercept'),
           cs (class names; only their count is used)
    Output: result of ovr_decision_function for this sample

    NOTE(review): scikit-learn appears to store sign-flipped dual_coef_ and
    intercept_ for two-class models - confirm the vote direction used below is
    still right when len(cs) == 2.
    """
    # Support vectors are grouped by class: class i owns rows start[i]..end[i]-1
    start = [sum(nv[:i]) for i in range(len(nv))]
    end = [start[i] + nv[i] for i in range(len(nv))]

    # Calculate: sum(a_p * k(x_p, x)) between every 2 classes
    # One entry per class pair (i, j) with i < j, in the same nested order that
    # the vote loop below and ovr_decision_function use.
    c = [ sum(a[ i ][p] * k[p] for p in range(start[j], end[j])) +
          sum(a[j-1][q] * k[q] for q in range(start[i], end[i]))
                for i in range(len(nv)) for j in range(i+1,len(nv))]

    # Add the intercept term
    decision = [c[i] + b[i] for i in range(len(b))]
    # Transpose so that pairs run along the second axis (decision[:, p] below)
    decision = np.array([x for x in decision]).T
    
    # Obtain the vote based on the decision value
    # A positive pairwise value votes 0 (first class of the pair), otherwise 1.
    votes = [[(0 if decision[:, p] > 0 else 1) for p,(i,j) in enumerate((i,j) 
                                           for i in range(len(cs))
                                           for j in range(i+1,len(cs)))]]
    
    # Obtain the One-vs-Rest Decision value
    # The confidences are passed with their sign negated.
    decision_ovr = ovr_decision_function(np.array(votes), -decision, len(cs))
    return decision_ovr

In [5]:
# One-vs-One decision function into One-vs-Rest decision function 
# From Scikit-Learn source code
def ovr_decision_function(predictions, confidences, n_classes):
    """Turn one-vs-one results into a continuous one-vs-rest score per class.

    The score is the number of pairwise votes a class received plus a small
    confidence-based term that only serves to break ties between classes
    with equal vote counts.

    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_classifiers)
        Outcome of each pairwise classifier: 0 votes for the first class of
        the pair, 1 for the second.
    confidences : array-like, shape (n_samples, n_classifiers)
        Decision value (or probability of the second class) of each pairwise
        classifier.
    n_classes : int
        Number of classes; there must be
        ``n_classes * (n_classes - 1) / 2`` pairwise classifiers.

    Returns
    -------
    Array of shape (n_samples, n_classes) holding votes plus tie-breaker.
    """
    sample_count = predictions.shape[0]
    vote_counts = np.zeros((sample_count, n_classes))
    confidence_totals = np.zeros((sample_count, n_classes))

    # Pairwise classifiers are ordered (0,1), (0,2), ..., (1,2), ...
    class_pairs = [(first, second)
                   for first in range(n_classes)
                   for second in range(first + 1, n_classes)]

    for column, (first, second) in enumerate(class_pairs):
        pair_confidence = confidences[:, column]
        pair_prediction = predictions[:, column]
        confidence_totals[:, first] -= pair_confidence
        confidence_totals[:, second] += pair_confidence
        vote_counts[pair_prediction == 0, first] += 1
        vote_counts[pair_prediction == 1, second] += 1

    # Squash the summed confidences into (-1/3, 1/3) with the monotonic map
    # x -> x / (3 * (|x| + 1)). Staying strictly below 1/2 in magnitude means
    # the tie-breaker can never overturn a lead of one whole vote.
    tie_breaker = confidence_totals / (3 * (np.abs(confidence_totals) + 1))

    return vote_counts + tie_breaker

### Encrypt & Decrypt model

In [6]:
def encrypt_json(json_obj, secret_key=b'abcdefghijklmnop', iv=b'abcdefghijklmKEY'):
    """
    Scikit-Learn Model in JSON format is converted into encrypted JSON.
    Input: json_obj (JSON-serialisable object),
           secret_key (AES key, optional), iv (CBC initialisation vector, optional)
    Output: Encrypted JSON (bytes, AES in CBC mode).

    SECURITY NOTE: the default key and IV are constants embedded in the source
    and the IV is reused for every file, so the defaults only obscure the
    model. Pass a real secret_key / iv when confidentiality matters, and use
    the same values when decrypting.
    """
    block_size = AES.block_size
    plaintext = json.dumps(json_obj).encode("utf8")
    # Pad the encoded bytes, not the text: the cipher works on bytes, and the
    # two lengths differ as soon as the text contains a non-ASCII character.
    # Each padding byte holds the padding length (1..block_size).
    pad_len = block_size - len(plaintext) % block_size
    plaintext += bytes([pad_len]) * pad_len
    cipher = AES.new(secret_key, AES.MODE_CBC, iv)
    return cipher.encrypt(plaintext)
    
def decrypt_json(model_name, folder_path="saved_models\\",
                 secret_key=b'abcdefghijklmnop', init_vector=b'abcdefghijklmKEY'):
    """
    Models saved as encrypted json files are loaded and decrypted
    Input: model_name (name of model to be loaded),
           folder_path (prefix put in front of the model name; optional,
                        defaults to the saved_models folder),
           secret_key / init_vector (AES key and CBC IV used when the file
                                     was encrypted; optional)
    Output: decrypted model in json/dictionary format
    Raises: ValueError when the decrypted data does not end in valid padding
            (wrong key or IV, or a truncated/corrupted file)
    """
    filename = folder_path + model_name + '_json_enc.txt'
    with open(filename, 'rb') as op_file:
        json_enc = op_file.read()

    cipher = AES.new(secret_key, AES.MODE_CBC, init_vector)
    decoded = cipher.decrypt(json_enc)

    # The last byte gives the padding length and every padding byte repeats
    # that value. Check this before trimming so that a bad key or a damaged
    # file is reported clearly instead of feeding garbage to the JSON parser.
    pad_len = decoded[-1] if decoded else 0
    if not 1 <= pad_len <= AES.block_size or decoded[-pad_len:] != bytes([pad_len]) * pad_len:
        raise ValueError("Invalid padding in '{}': wrong key/IV or corrupted file".format(filename))

    return json.loads(decoded[:-pad_len])


### Save function: SVC/RF/Boosting model --> JSON --> Encrypted JSON

In [7]:
def save_model_json(folder_path, model_name, model_save, model_input, model_output, model_type):
    """
    Export a fitted model to a plain JSON file and to an encrypted JSON file.
    Input: folder_path (prefix put in front of the file names),
           model_name (base name of the two output files),
           model_save (fitted model; for 'SVC' it is indexed with [1] to reach
                       the SVC, so a two-step pipeline is expected),
           model_input (training features, 2-D),
           model_output (training targets; class names and class counts are
                         derived from these),
           model_type ('SVC', 'RF', 'GradientBoost' or 'AdaBoost')
    Output: None; writes <model_name>_class.json (plain text) and
            <model_name>_json_enc.txt (output of encrypt_json)
    Raises: ValueError for an unsupported model_type (nothing is written)
    """
    model_par_len = model_input.shape[1]
    feature_names = [str(j) for j in range(model_par_len)]

    py_filename_json = folder_path + model_name + '_class.json'
    py_filename_enc = folder_path + model_name + '_json_enc.txt'

    # Derive the classes from the targets that were passed in, not from a
    # global variable that happens to exist in the calling notebook.
    values, counts = np.unique(model_output, return_counts=True)
    class_names = [str(x) for x in values]

    dict_file = {'model_type': model_type}
    if model_type == 'SVC':
        svc = model_save[1]
        features = np.asarray(model_input, dtype=float)
        feature_mean = features.mean(axis=0)
        feature_std = features.std(axis=0)
        # A constant column has zero spread; scale it by 1 (as StandardScaler
        # does) so neither the stored values nor the variance become NaN/inf.
        feature_std = np.where(feature_std == 0, 1.0, feature_std)
        # Variance of the standardised inputs, needed to resolve gamma='scale'
        input_variance = ((features - feature_mean) / feature_std).var()

        dict_file['class_names'] = class_names
        dict_file['kernel'] = svc.kernel
        dict_file['degree'] = svc.degree
        dict_file['gamma'] = svc.gamma
        dict_file['coef0'] = svc.coef0
        dict_file['epsilon'] = svc.epsilon
        dict_file['n_features'] = model_par_len
        dict_file['input_variance'] = float(input_variance)
        dict_file['intercept'] = svc.intercept_.tolist()
        dict_file['mean_value'] = feature_mean.tolist()
        dict_file['std_value'] = feature_std.tolist()
        dict_file['dual_coef'] = svc.dual_coef_.tolist()
        dict_file['support_vectors'] = svc.support_vectors_.tolist()
        dict_file['n_support'] = svc.n_support_.tolist()
    elif model_type == 'RF':
        dict_file['class_names'] = class_names
        # Iterate the fitted estimators themselves rather than n_estimators
        for i, estimator in enumerate(model_save.estimators_):
            dict_file['tree_' + str(i)] = json.loads(tree_to_list(estimator, feature_names, i))
    elif model_type == 'GradientBoost':
        dict_file['learning_rate'] = model_save.learning_rate
        dict_file['class_names'] = class_names

        if len(values) > 2:
            # Multi-class start value: log of each class prior
            total = counts.sum()
            dict_file['base_estimator_value'] = [float(np.log(x / total)) for x in counts]
        else:
            # Two-class start value: log-odds of the second class
            dict_file['base_estimator_value'] = [float(np.log(counts[1] / counts[0]))]

        # Each boosting stage holds one tree per output; estimators_ can be
        # shorter than n_estimators when training stopped early.
        for i, stage in enumerate(model_save.estimators_):
            dict_file['tree_' + str(i)] = [
                json.loads(tree_to_list(stage_tree, feature_names, i)) for stage_tree in stage
            ]
    elif model_type == 'AdaBoost':
        dict_file['algorithm'] = model_save.algorithm
        dict_file['estimator_weights'] = model_save.estimator_weights_.tolist()
        dict_file['class_names'] = class_names

        for i, estimator in enumerate(model_save.estimators_):
            dict_file['tree_' + str(i)] = json.loads(tree_to_list(estimator, feature_names, i))
    else:
        # Without this, an unusable file containing only the type would be written
        raise ValueError("Unsupported model_type: {!r}".format(model_type))

    # Plain-text copy of the model (written next to the encrypted one)
    with open(py_filename_json, "w") as write_file:
        json.dump(dict_file, write_file)

    json_ciphertext = encrypt_json(dict_file)

    with open(py_filename_enc, "wb") as write_file:
        write_file.write(json_ciphertext)

### Predict function: SVC/RF/Boosting model

In [8]:
def _tree_leaf_probabilities(model, x_sample):
    """Return, for every stored tree, the leaf reached by x_sample normalised to sum to 1."""
    leaves = [tree_predict_list(x_sample, model[t]) for t in model.keys() if 'tree_' in t]
    return [[x / sum(leaf) for x in leaf] for leaf in leaves]


def _predict_svc_sample(model, x_sample, class_names):
    """Predict the class of one sample with a stored SVC model."""
    # Preprocess data: Standardization. A stored spread of 0 (constant
    # feature) is treated as 1, as StandardScaler does, to avoid dividing by zero.
    std_value = np.asarray(model['std_value'], dtype=float)
    std_value = np.where(std_value == 0, 1.0, std_value)
    x_input_sample = (x_sample - np.asarray(model['mean_value'])) / std_value

    # Model support vectors, co-efficients & intercept
    sup_vecs = np.array(model['support_vectors'])
    n_support = np.array(model['n_support'])
    dual_coefs = np.array(model['dual_coef'])
    intercept = model['intercept']
    n_featrs = model['n_features']
    inp_var = model['input_variance']
    gam_in = model['gamma']
    if gam_in == 'auto':
        gamma = 1 / n_featrs
    elif gam_in == 'scale':
        gamma = 1 / (n_featrs * inp_var)
    else:
        gamma = float(gam_in)

    kernel = model['kernel']
    if kernel == 'rbf':
        diff = sup_vecs - x_input_sample
        # Squared Euclidean distance to every support vector
        kernal_out = np.exp(-gamma * np.sum(diff * diff, axis=1))
    elif kernel in ('linear', 'sigmoid', 'poly'):
        input_support = np.dot(sup_vecs, x_input_sample.reshape(1, -1).T)
        if kernel == 'linear':
            kernal_out = input_support
        elif kernel == 'sigmoid':
            kernal_out = np.tanh(gamma * input_support + model['coef0'])
        else:
            kernal_out = (gamma * input_support + model['coef0']) ** model['degree']
    else:
        raise ValueError("Unsupported kernel: {!r}".format(kernel))

    kernal_out = kernal_out.reshape(-1, 1)
    decision_fn = svr_decision_function(kernal_out, n_support, dual_coefs, intercept, class_names)[0]
    decision_fn = decision_fn.tolist()
    # Obtain the class name where decision function value of the sample is max
    return class_names[decision_fn.index(max(decision_fn))]


def _predict_rf_sample(model, x_sample, class_names):
    """Predict the class of one sample with a stored random forest (trees need no scaling)."""
    sample_trees_proba = _tree_leaf_probabilities(model, x_sample)
    # Average the per-tree probabilities class by class
    sample_proba = [sum(x) / len(x) for x in zip(*sample_trees_proba)]
    # Obtain the class name where probability is max
    return class_names[sample_proba.index(max(sample_proba))]


def _predict_gradient_boost_sample(model, x_sample, class_names):
    """Predict the class of one sample with a stored gradient boosting model."""
    n_classes = len(class_names)
    base_value = np.array(model['base_estimator_value'])
    learning_rate = model['learning_rate']
    stages = [model[t] for t in model.keys() if 'tree_' in t]
    # Every stage holds one tree per class (multi-class) or a single tree (two classes)
    trees_per_stage = n_classes if n_classes > 2 else 1
    sample_pred_residual = np.array(
        [[tree_predict_list(x_sample, stage[p]) for p in range(trees_per_stage)] for stage in stages]
    )

    # Raw score = base estimator value + learning rate * sum of the trees' outputs
    sample_pred_update = np.sum(-(learning_rate * sample_pred_residual), axis=0)
    sample_pred_fin = base_value - np.array(sample_pred_update).flatten()

    # Final probability: softmax for multi-class, logistic for two classes
    if n_classes > 2:
        sample_pred_proba = np.exp(sample_pred_fin) / np.sum(np.exp(sample_pred_fin))
    else:
        sample_pred_proba = np.exp(sample_pred_fin) / (1 + np.exp(sample_pred_fin))
        sample_pred_proba = np.array([1 - sample_pred_proba[0], sample_pred_proba[0]])
    # Obtain the class name where probability is max
    return class_names[np.argmax(sample_pred_proba)]


def _predict_adaboost_sample(model, x_sample, class_names):
    """Predict the class of one sample with a stored AdaBoost model."""
    n_classes = len(class_names)
    weights = np.array(model['estimator_weights'])
    sample_trees_proba = _tree_leaf_probabilities(model, x_sample)

    algorithm = model['algorithm']
    if algorithm == 'SAMME':
        # 'SAMME' adds each estimator's weight to the class that estimator predicts
        sample_trees_class = [class_names[x.index(max(x))] for x in sample_trees_proba]
        sample_trees_class_bool = [x == np.array(class_names) for x in sample_trees_class]
        sample_trees_decision = np.array(
            [x * weights[i] for i, x in enumerate(sample_trees_class_bool)]
        )
    elif algorithm == 'SAMME.R':
        # 'SAMME.R' uses the log-probabilities of every estimator.
        # Clip probability from 0 to small value before taking log
        clipped = [[max(x, 2.22044605e-16) for x in sublist] for sublist in sample_trees_proba]
        sample_trees_log_proba = np.log(clipped)
        sample_trees_decision = np.array(
            [[(n_classes - 1) * (x - (1 / n_classes) * sum(xx)) for x in xx]
             for xx in sample_trees_log_proba]
        )
    else:
        raise ValueError("Unsupported AdaBoost algorithm: {!r}".format(algorithm))

    sample_trees_decision = sum(sample_trees_decision) / sum(weights)
    sample_trees_decision /= (n_classes - 1)
    # Probability from Decision: softmax
    exp_decision = [np.exp(x) for x in sample_trees_decision]
    exp_total = sum(exp_decision)
    sample_proba = [x / exp_total for x in exp_decision]
    # Obtain the class name where probability is max
    return class_names[sample_proba.index(max(sample_proba))]


def custom_predict_json(model, x_input):
    '''
    Machine Learning models predict function is recreated using the stored values
    Input: model (dictionary with 'model_type', 'class_names' and the values
                  stored for that model type),
           x_input (2-D array, one row per sample)
    Output: list with one entry per sample: the predicted class name, or NaN
            when the prediction for that sample raised an error (the error is
            printed to stderr). An unrecognised model_type gives an empty list.
    '''
    output_list = []
    class_names = model['class_names']
    predictors = {
        'SVC': _predict_svc_sample,
        'RF': _predict_rf_sample,
        'GradientBoost': _predict_gradient_boost_sample,
        'AdaBoost': _predict_adaboost_sample,
    }
    predict_sample = predictors.get(model['model_type'])
    if predict_sample is None:
        return output_list

    for sample_i in range(x_input.shape[0]):
        # Best effort per sample: one failing row must not abort the batch
        try:
            sample_pred = predict_sample(model, x_input[sample_i], class_names)
        except Exception as error:
            sample_pred = np.nan
            print(error, file=sys.stderr)
        output_list.append(sample_pred)
    return output_list

### Testing

In [9]:
# SVC testing (the variables are named "SVR", but the estimator is a classifier)
# Round trip: fit with scikit-learn, export, reload, and compare predictions.
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)
y = [1, 2, 1, 2, 0, 0, 1, 4, 2, 0]
X = rng.randn(n_samples, n_features)

# Model Training
algo = SVC(kernel='rbf', degree=3, decision_function_shape = 'ovr',
           gamma=3, coef0=0.0, 
           tol=0.001, C=5.0)
# Step 0 standardises the inputs, step 1 is the SVC that save_model_json reads via model_save[1]
model_SVR_pl = Pipeline([('standardize',StandardScaler()),('svr',algo)])
model_SVR_sk = model_SVR_pl.fit(X, y)
print('Sklearn prediction: ', model_SVR_sk.predict(X))
#print('Sklearn decision_function: \n', model_SVR_sk[1].decision_function(model_SVR_pl[0].transform(X)))

# Export the model, load it back from the encrypted file and predict with the custom code
folder_path = "saved_models\\"
save_model_json(folder_path, 'test_model_svr', model_SVR_sk, X, y,'SVC')
test_model_svr = decrypt_json('test_model_svr')
print('Custom prediction: ', custom_predict_json(test_model_svr, X))

Sklearn prediction:  [1 2 1 2 0 0 1 4 2 0]
Custom prediction:  ['1', '2', '1', '2', '0', '0', '1', '4', '2', '0']


In [10]:
# RF testing
# Round trip: fit with scikit-learn, export, reload, and compare predictions.
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)
y = [1, 0, 1, 0, 3, 0, 1, 1, 0, 2]
X = rng.randn(n_samples, n_features)

# Model Training
algo = RandomForestClassifier(n_estimators = 50, 
                             random_state=0, n_jobs=-1)
model_RF_sk = algo.fit(X, y)
print('Sklearn prediction: ', model_RF_sk.predict(X))
#print('Sklearn predict_proba: ', model_RF_sk.predict_proba(X))

# Export the model, load it back from the encrypted file and predict with the custom code
folder_path = "saved_models\\"
save_model_json(folder_path, 'test_model_rf', model_RF_sk, X, y,'RF')
test_model_rf = decrypt_json('test_model_rf')
print('Custom prediction: ', custom_predict_json(test_model_rf, X))

Sklearn prediction:  [1 0 1 0 3 0 1 1 0 2]
Custom prediction:  ['1', '0', '1', '0', '3', '0', '1', '1', '0', '2']


In [16]:
# GradientBoost testing
# Round trip: fit with scikit-learn, export, reload, and compare predictions.
# The "xgb" in the names below refers to scikit-learn's GradientBoostingClassifier.
n_samples, n_features = 12, 5
rng = np.random.RandomState(0)
y = [2, 3, 2, 0, 0, 2, 2, 0, 1, 2, 2, 1]
X = rng.randn(n_samples, n_features)

# Model Training
algo = GradientBoostingClassifier(n_estimators = 10, 
                                 learning_rate = 0.1, random_state=0)
model_GB_sk = algo.fit(X, y)
print('Sklearn prediction: ', model_GB_sk.predict(X))
#print('Sklearn predict_proba: ', model_GB_sk.predict_proba(X))

# Export the model, load it back from the encrypted file and predict with the custom code
folder_path = "saved_models\\"
save_model_json(folder_path, 'test_model_xgb', model_GB_sk, X, y,'GradientBoost')
test_model_xgb = decrypt_json('test_model_xgb')
print('Custom prediction: ', custom_predict_json(test_model_xgb, X))

Sklearn prediction:  [2 3 2 0 0 2 2 0 1 2 2 1]
Custom prediction:  ['2', '3', '2', '0', '0', '2', '2', '0', '1', '2', '2', '1']


In [12]:
# AdaBoost testing
# Round trip: fit with scikit-learn, export, reload, and compare predictions.
n_samples, n_features = 12, 5
rng = np.random.RandomState(0)
# Heavily imbalanced two-class target: a single sample of class 2
y = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]
X = rng.randn(n_samples, n_features)

# Model Training
algo = AdaBoostClassifier(n_estimators = 10, algorithm = 'SAMME',
                         learning_rate = 0.1, random_state=0)
model_AGB_sk = algo.fit(X, y)
print('Sklearn prediction: ', model_AGB_sk.predict(X))
#print('Sklearn predict_proba: ', model_AGB_sk.predict_proba(X))

# Export the model, load it back from the encrypted file and predict with the custom code
folder_path = "saved_models\\"
save_model_json(folder_path, 'test_model_agb', model_AGB_sk, X, y,'AdaBoost')
# NOTE: this rebinds test_model_xgb (the name used by the GradientBoost cell) to the AdaBoost model
test_model_xgb = decrypt_json('test_model_agb')
print('Custom prediction: ', custom_predict_json(test_model_xgb, X))

Sklearn prediction:  [1 1 1 1 1 1 1 1 1 1 1 2]
Custom prediction:  ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '2']
