# XMLizer for Sklearn and XGBoost models

In [1]:
import re
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import _tree

class Tree:
    """Represents a node in a decision tree, identified by a unique integer id
    
    Attributes:
        children (list of int): The id-s of the children associated with this node
        depth (int): The depth of this node in the tree
        id (int): The unique id of this node
        parent (id): The id of the parent node
        payload (tuple): Describes what this node does,
            i.e. is it a non-terminal node (cut) or a terminal (leaf) 
    """
    def __init__(self, id, children, parent, depth, payload):
        self.id = id
        self.children = children
        self.parent = parent
        self.depth = depth
        self.payload = payload

    def __repr__(self):
        return "Tree id:{id} children:{children} parent:{parent}, depth:{depth}, payload:{payload}".format(**{
            "id": self.id,
            "children": str(self.children),
            "parent": self.parent,
            "depth": self.depth,
            "payload": self.payload,
        })

    def print_out(self, node_dict):
        """Recursively prints a node and its children, given a dictionary with all the available nodes
        
        Args:
            node_dict (dict id->node): All the available nodes
        
        Returns:
            nothing
        """
        print((self.depth + 1) * "-" + str(self))
        for ch in self.children:
            node_dict[ch].print_out(node_dict)

    def to_tmva(self, nodetree, scale):
        """Writes out a TMVA-compatible XML string for a given node in the decision tree
        
        Args:
            nodetree (dict int->Tree): The dictionary of the full tree
            scale (float): A scaling coefficient for the TMVA leaves (TMVA = sklearn * scale)
        
        Returns:
            string: XML with the node
        
        """

        kind = "c"
        if self.parent != -1:
            idx = nodetree[self.parent].children.index(self.id)
            if idx == 0:
                kind = "l"
            elif idx == 1:
                kind = "r"
        
        #handle leaf (terminal) node
        if len(self.children) == 0:

            return '<Node pos="{0}" depth="{1}" NCoef="0" \
    IVar="{2}" Cut="{3:17E}" cType="1" \
    res="{4:17E}" rms="0.0e-00" \
    purity="{5:.8E}" nType="-99">'.format(
                kind,
                self.depth + 1,
                -1,
                0.0,
                self.payload[1] * scale,
                0.0
            )
        #handle non-leaf node
        else:
            return '<Node pos="{0}" depth="{1}" NCoef="0" \
    IVar="{2}" Cut="{3:17E}" cType="1" \
    res="{4:17E}" rms="0.0" \
    purity="{5:.8E}" nType="0">'.format(
            kind,
            self.depth + 1,
            self.payload[1],
            self.payload[2],
            0.0, 0.0
        )

def sklearn_to_nodetree(cls, nodetree, sklearn_tree, node_id=0, parent_id=-1, depth=-1):
    """Recursively converts a sklearn GradientBoosting{Classifier,Regressor} to a generic representation
    
    Args:
        nodetree (dict id->Node): The output dictionary with the nodes
        sklearn_tree (DecisionTreeRegressor): The input decision tree
        node_id (int): the id of the root node
        parent_id (int): the id of the parent node
        depth (int): The current depth
    
    Returns:
        dict int->Tree: The output node tree
    """

    #if the left (or right) child node id is -1, then this node is already a leaf node
    if sklearn_tree.children_left[node_id] == _tree.TREE_LEAF:
        n = Tree(
            node_id,
            [],
            parent_id,
            depth,
            ("val", sklearn_tree.value[node_id][0,0]/cls.n_estimators)
        )
        nodetree[node_id] = n
        if parent_id in nodetree:
            nodetree[parent_id].children += [node_id]
    #this is not a leaf node
    else:
        n = Tree(
            node_id,
            [],
            parent_id,
            depth,
            ("cut", sklearn_tree.feature[node_id], sklearn_tree.threshold[node_id])
        )
        nodetree[node_id] = n
        if parent_id in nodetree:
            nodetree[parent_id].children += [node_id]

    left_child = sklearn_tree.children_left[node_id]
    right_child = sklearn_tree.children_right[node_id]
    if left_child != _tree.TREE_LEAF:
        sklearn_to_nodetree(cls, nodetree, sklearn_tree, left_child, node_id, depth+1)
    if right_child != _tree.TREE_LEAF:
        sklearn_to_nodetree(cls, nodetree, sklearn_tree, right_child, node_id, depth+1)

    return nodetree

def xgbtree_to_nodetree(tree, features):
    """Converts an xgboost tree dump to an internal Tree representation
    
    Args:
        tree (string): The model dump from xgboost using model.booster().get_dump()[ntree]
    
    Returns:
        dict int->Tree: The tree structure
    """
    _NODEPAT = re.compile(r'(\d+):\[(.+)\]')
    _LEAFPAT = re.compile(r'(\d+):(leaf=.+)')

    parent_stack = []
    prev_depth = -1
    prev_index = -1
    nodes = {}

    for node in tree.split("\n"):
        node_depth = node.count("\t")

        is_node = False
        is_leaf = False

        match = _NODEPAT.match(node.strip())
        if match is not None:
            node_index = int(match.group(1))
            node_variable, threshold = match.group(2).split("<")
            node_variable = int(features.index(node_variable))
            threshold = float(threshold)
            is_node = True

        match = _LEAFPAT.match(node.strip())
        if match is not None:
            node_index = int(match.group(1))
            val = float(match.group(2).split("=")[1])
            is_leaf = True

        if not (is_node or is_leaf):
            continue

        #keep track of the parent of this node
        istack = prev_depth
        while istack < node_depth:
            parent_stack += [prev_index]
            istack += 1
        istack = node_depth
        while istack < prev_depth:
            parent_stack.pop()
            istack += 1
        my_parent = parent_stack[-1]

        #create the node
        if is_node:
            nodes[node_index] = Tree(node_index, [], my_parent, node_depth, ("cut", node_variable, threshold))
        elif is_leaf:
            nodes[node_index] = Tree(node_index, [], my_parent, node_depth, ("val", val))

        #insert node into final node dict
        if my_parent in nodes:
            nodes[my_parent].children += [node_index]

        prev_depth = node_depth
        prev_index = node_index

    #nodes[0].print_out(nodes)

    return nodes

class BDT(object):
    def __init__(self, trees, kind, feature_names, target_names, max_depth, learning_rate):
        self.trees = trees
        self.kind = kind
        self.ntrees = len(trees)

        self.feature_names = feature_names
        self.target_names = target_names

        self.max_depth = max_depth
        self.learning_rate = learning_rate



    def to_tmva(self, outfile_name, mva_name="bdt"):

        #Create list of variables
        #we assume that all variables are 'simple', that is, not expressions
        varstring = ""
        for i in range(len(self.feature_names)):
            varstring += '<Variable VarIndex="{0}" Expression="{1}" Label="{1}" Title="{1}" Unit="" Internal="{1}" Type="F" Min="{2:.64E}" Max="{3:.64E}"/>\n'.format(
                i, self.feature_names[i], 0, 0
            )

        if self.kind == "regression":
            class_string = ""
            num_classes = 1
            analysis_type = "Regression"

            #for regression, just one class
            for icls, clsname in enumerate(["Regression"]):
                class_string += '<Class Name="{0}" Index="{1}"/>\n'.format(
                    clsname, icls
                )

            #as many targets as given (n>1: vector valued regression)
            target_string = ""
            num_targets = len(self.target_names)
            if num_targets > 1:
                raise Exception("TMVA does not support regression with vector values, need to specify a scalar target")
            for itgt, tgtname in enumerate(self.target_names):
                target_string += '<Target Name="{0}" TargetIndex="{1}" Expression="{0}" Label="{0}" Title="{0}" Unit="" Internal="{0}" Type="F" Min="{2:.64E}" Max="{3:.64E}"/>\n'.format(
                    tgtname, itgt, 0.0, 0.0
                )

        elif self.kind == "binary" or self.kind == "multiclass":
            class_string = ""
            num_classes = len(self.target_names)

            #Decide between multiclass or binary
            if self.kind == "binary":
                analysis_type = "Classification"
            elif self.kind == "multiclass":
                analysis_type = "Multiclass"

            for icls, clsname in enumerate(self.target_names):
                class_string += '<Class Name="{0}" Index="{1}"/>\n'.format(
                    clsname, icls
                )
            num_targets = 0
            target_string = ""

          
        outfile = open(outfile_name, "w")
        outfile.write(
        """
        <?xml version="1.0"?>
        <MethodSetup Method="BDT::{mva_name}">
        <GeneralInfo>
        <Info name="TMVA Release" value=""/>
        <Info name="ROOT Release" value=""/>
        <Info name="Creator" value="mlglue"/>
        <Info name="Date" value=""/>
        <Info name="Host" value=""/>
        <Info name="Dir" value=""/>
        <Info name="Training events" value="-1"/>
        <Info name="TrainingTime" value="-1"/>
        <Info name="AnalysisType" value="{analysis_type}"/>
        </GeneralInfo>
        <Options>
        <Option name="NTrees" modified="Yes">{ntrees}</Option>
        <Option name="MaxDepth" modified="Yes">{maxdepth}</Option>
        <Option name="BoostType" modified="Yes">Grad</Option>
        <Option name="Shrinkage" modified="Yes">{learnrate}</Option>
        <Option name="UseNvars" modified="Yes">{usenvars}</Option>
        </Options>

        <Variables NVar="{nvars}">
        {varstring}
        </Variables>

        <Classes NClass="{nclasses}">
        {class_string}
        </Classes>

        <Targets NTrgt="{ntargets}">
        {target_string}
        </Targets>

        <Transformations NTransformations="0"/>
        <MVAPdfs/>
        <Weights NTrees="{ntrees}" AnalysisType="1">
        """.format(**{
                "analysis_type": analysis_type,
                "mva_name": mva_name,
                "ntrees": self.ntrees,
                "maxdepth": self.max_depth,
                "usenvars": len(self.feature_names),
                "nvars": len(self.feature_names),
                "varstring": varstring,
                "learnrate": self.learning_rate,
                
                "nclasses": num_classes,
                "class_string": class_string,

                "ntargets": num_targets,
                "target_string": target_string

                }
            )
        )

        #Loop over decision trees, in scikit that's a 2D array (N_estimators, N_classes)
        #if binary classification, N_classes = 1
        itree = 0
        for tree in self.trees:
            outfile.write(
                '<BinaryTree type="DecisionTree" boostWeight="0.0" itree="{0}">\n'.format(
                    itree, self.learning_rate
                )
            )

            #convert internal representation to TMVA tree
            #re-weight each node by 1/N (N - num trees per class)
            tree_to_tmva(outfile, tree, 0, 1.0)
            #tree_to_tmva(outfile, tree, 0, 14)

            outfile.write('</BinaryTree>\n')
            itree += 1

        #done with output
        outfile.write("""
          </Weights>
        </MethodSetup>
        """)
        outfile.close()

    def setup_tmva(self, bdtfile):
        from ROOT import TMVA
        self.reader = TMVA.Reader("!Color:Silent:!Error")

        self.vardict = {}
        #all variables must be float32
        for ivar in range(0, len(self.feature_names)):
            self.vardict[ivar] = np.array([0], dtype=np.float32)
            self.reader.AddVariable(self.feature_names[ivar], self.vardict[ivar])
        self.tmva = self.reader.BookMVA("bdt", bdtfile)

    def eval_tmva(self, features):
        for ivar, varname in enumerate(self.feature_names):
            self.vardict[ivar][0] = features[ivar]

        if self.kind == "multiclass":
            ret = self.reader.EvaluateMulticlass("bdt")
            ret = np.array([r for r in ret])
        elif self.kind == "binary":
            ret = self.reader.EvaluateMVA("bdt")
        elif self.kind == "regression":
            ret = self.reader.EvaluateRegression("bdt")
            ret = np.array([r for r in ret])
        return ret

class BDTxgboost(BDT):
    def __init__(self, model, feature_names, target_names):
        
        self.model = model
        kind = None
        if model.objective.startswith("binary:logistic"):
            kind = "binary"
        elif model.objective.startswith("multiclass"):
            kind = "multiclass"
        else:
            kind = "regression"
        #print(model.objective, kind)

        trees = []
        for tree_dump in model.get_booster().get_dump():
            tree = xgbtree_to_nodetree(tree_dump, feature_names)
            trees += [tree]

        super(BDTxgboost, self).__init__(trees, kind, feature_names, target_names, model.max_depth, model.learning_rate)

    def eval(self, features):
        #proba = self.model.predict_proba(features)[:, 1]
        proba = self.model.predict(features)#[:, 1]

        #invert sigmoid
        proba = -np.log(abs(1.0/proba - 1.0))

        #apply TMVA transformation
        proba = 2.0 / (1.0 + np.exp(-2.0*proba)) - 1
        
        return proba

class BDTsklearn(BDT):

    def __init__(self, model, feature_names, target_names):
        
        self.model = model
        self.feature_names = feature_names

        kind = None
        if isinstance(model, GradientBoostingRegressor):
            kind = "regression"
        elif isinstance(model, GradientBoostingClassifier):
            if len(target_names) == 2:
                kind = "binary"
            else:
                kind = "multiclass"

        trees = []
        #Loop over decision trees, in scikit that's a 2D array (N_estimators, N_classes)
        for sklearn_trees in model.estimators_:
             #write trees for different classes next to each other
            for class_tree in sklearn_trees:
                nodetree = {}
                sklearn_to_nodetree(model, nodetree, class_tree.tree_, 0, -1, -1)
                trees += [nodetree]

        super(BDTsklearn, self).__init__(trees, kind, feature_names, target_names, model.max_depth, model.learning_rate)


    def eval(self, vals):
        """A TMVA-compatible evaluation function for a scikit-learn classifier
        
        Args:
            vals (numpy array): An array (n_samples, n_features) of the input variables
        
        Returns:
            numpy array: (n_samples, n_classes) array of the output
        """
        
        #need to scale the same way as done in TMVA    
        scale = 1.0 / self.model.n_estimators

        if isinstance(self.model, GradientBoostingClassifier):
            #multiclass classification
            #according to TMVA::MethodBDT::GetMulticlassValues()
            if self.model.n_classes_ > 2:
                ret = np.zeros((vals.shape[0], self.model.n_classes_))
                for iclass in range(self.model.n_classes_):
                    for itree, t in enumerate(self.model.estimators_[:, iclass]):
                        r = t.predict(vals)
                        ret[:, iclass] += r * scale

                norm = np.zeros(ret.shape)
                for i in range(self.model.n_classes_):
                    for j in range(self.model.n_classes_):
                        if i != j:
                            norm[:, i] += np.exp(ret[:, j] - ret[:, i])

                ret = 1.0 / (1.0 + norm)        
                return ret
            #binary classification
            elif self.model.n_classes_ == 2:
                ret = np.zeros(vals.shape[0])

                for itree, t in enumerate(self.model.estimators_[:, 0]):
                    r = t.predict(vals)
                    ret += r * scale
                return 2.0/(1.0 + np.exp(-2.0 * ret)) - 1
        elif isinstance(self.model, GradientBoostingRegressor):
            ret = np.zeros((vals.shape[0], self.model.n_classes_))
            for iclass in range(self.model.n_classes_):
                for itree, t in enumerate(self.model.estimators_[:, iclass]):
                    r = t.predict(vals)
                    ret[:, iclass] += r * scale
                    #ret[:, iclass] += r
            return ret

def tree_to_tmva(outfile, nodetree, current_node, scale):
    """Recursively writes out a decision tree as an XML
    
    Args:
        outfile (TYPE): Output file, must be writeable
        nodetree (TYPE): The dictionary with the nodes
        current_node (int): current node ID
        scale (float): The scale factor for each leaf
    
    Returns:
        nothing
    """
    outfile.write((nodetree[current_node].depth + 1)*"    " + nodetree[current_node].to_tmva(nodetree, scale) + "\n")
    for child in nodetree[current_node].children:
        tree_to_tmva(outfile, nodetree, child, scale)
    outfile.write((nodetree[current_node].depth + 1)*"    " + "</Node>\n")

define functions to open pickled model

In [2]:
import pickle

def save_obj(obj,dest):
    with open(dest,'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(source):
    with open(source,'rb') as f:
        return pickle.load(f)

load pickled models (these two models do the same regression, the only diference is one is trained with sklearn and the other with xgboost)

In [3]:
C2model_skl = load_obj('model_c2_th_PU200_skl.pkl')

In [4]:
C2model_xgb = load_obj('model_c2_th_PU200_xgb.pkl')

define features and dummy target for regression

In [5]:
C2InputVariables = ["cl3d_abseta", "cl3d_coreshowerlength", "cl3d_meanz", "cl3d_showerlength", "cl3d_spptot", "cl3d_srrmean"]
dummy_target = []

convert sklearn and xgboost models to xml

In [6]:
bdt_skl = BDTsklearn(C2model_skl, C2InputVariables, dummy_target)
bdt_skl.to_tmva("test_C2model_skl.xml")

In [7]:
bdt_xgb = BDTxgboost(C2model_xgb, C2InputVariables, dummy_target)
bdt_xgb.to_tmva("test_C2model_xgb.xml")

define the test dataset we want to use

In [8]:
import pandas as pd

cl3d_abseta = [2.8745, 2.88256, 2.86664, 2.89197, 2.72434, 2.73504, 2.75481, 2.68101, 2.66192, 2.63181, 2.64703, 2.62997, 2.65954, 2.6181, 2.5422, 2.56137, 2.54953, 2.57309, 2.50743, 2.5057, 2.52051, 2.52633, 2.4293, 2.42664, 2.4182, 2.39965, 2.35732, 2.34091, 2.3147, 2.31275, 2.28152, 2.24629, 2.2117, 2.12399, 2.05333, 1.8717, 1.82165, 1.6596, 2.88612, 2.88321, 2.75897, 2.71611, 2.70154, 2.6351, 2.64355, 2.67783, 2.66998, 2.58127, 2.55352, 2.59189, 2.56282, 2.51148, 2.52532, 2.3997, 2.4195, 2.43531, 2.40516, 2.3719, 2.30698, 2.30281, 2.30739, 2.30509, 2.29121, 2.14166, 2.06986, 2.0375, 1.9779, 1.85, 1.76858, 1.61684]
cl3d_coreshowerlength = [15, 10, 10, 13, 9, 14, 6, 13, 7, 9, 9, 7, 10, 6, 5, 9, 9, 7, 8, 8, 6, 9, 8, 8, 9, 9, 6, 4, 8, 8, 7, 8, 6, 2, 3, 9, 2, 3, 5, 10, 9, 9, 9, 10, 14, 8, 6, 13, 11, 9, 9, 13, 11, 4, 3, 12, 10, 12, 2, 5, 9, 7, 8, 7, 8, 3, 2, 1, 3, 2]
cl3d_meanz = [353.433, 352.166, 351.025, 338.287, 336.707, 342.583, 345.876, 338.568, 364.488, 342.452, 340.689, 344.031, 344.735, 345.175, 351.872, 342.418, 335.01, 338.58, 337.631, 350.078, 337.768, 337.283, 339.598, 350.685, 342.497, 353.366, 336.253, 337.511, 332.004, 333.614, 339.715, 339.783, 330.752, 383.223, 341.934, 332.152, 347.329, 371.682, 344.674, 332.14, 402.36, 337.351, 373.887, 347.432, 352.756, 333.718, 342.755, 342.173, 336.226, 369.574, 336.392, 352.538, 333.815, 343.102, 347.92, 354.024, 345.011, 341.24, 362.57, 351.978, 337.245, 364.898, 390.632, 351.11, 333.213, 338.638, 354.095, 340.207, 330.957, 378.024]
cl3d_showerlength = [36, 39, 40, 35, 34, 33, 37, 42, 42, 29, 30, 36, 42, 30, 35, 38, 29, 35, 31, 40, 29, 32, 32, 37, 39, 30, 33, 32, 21, 27, 29, 31, 25, 31, 19, 17, 15, 28, 33, 27, 47, 33, 37, 46, 41, 33, 31, 35, 30, 50, 29, 35, 27, 28, 25, 37, 30, 29, 34, 41, 21, 40, 46, 32, 25, 29, 31, 5, 9, 25]
cl3d_spptot = [0.0507148, 0.0560651, 0.0335778, 0.0615488, 0.0571281, 0.0423954, 0.0526622, 0.0513792, 0.044169, 0.046669, 0.0485373, 0.0439432, 0.0418449, 0.0412898, 0.0447453, 0.0419937, 0.0334273, 0.0420075, 0.0428482, 0.0328301, 0.0358253, 0.0350243, 0.0391379, 0.0292497, 0.0521896, 0.0234807, 0.0196477, 0.0367944, 0.0248518, 0.0279796, 0.0225257, 0.030757, 0.0198919, 0.0109972, 0.0120821, 0.0215738, 0.00401706, 0.00320102, 0.0669115, 0.0629402, 0.0328888, 0.0417385, 0.0324387, 0.0383691, 0.0463479, 0.0583387, 0.0447267, 0.0403691, 0.0426642, 0.0453902, 0.0345599, 0.047415, 0.0471672, 0.0324563, 0.0302662, 0.027225, 0.0284459, 0.0385829, 0.0303255, 0.0232581, 0.024943, 0.0342036, 0.0425497, 0.0175971, 0.0250334, 0.0226588, 0.0166786, 0.00840113, 0.00449695, 0.00657583]
cl3d_srrmean = [0.00345892, 0.00411237, 0.00350957, 0.00409898, 0.00382347, 0.00295203, 0.0044008, 0.00364478, 0.00330204, 0.0034307, 0.00222936, 0.00393587, 0.00484258, 0.00354911, 0.00279548, 0.00377017, 0.00422407, 0.00418136, 0.00445737, 0.00408817, 0.00384533, 0.00374325, 0.00380875, 0.00351983, 0.00295819, 0.00298812, 0.00216117, 0.00311949, 0.00274665, 0.00417613, 0.00337179, 0.0040137, 0.00297939, 0.000631064, 0.000904475, 0.00370579, 0.00116965, 0.000182741, 0.00290323, 0.00494255, 0.00293023, 0.00473473, 0.00185686, 0.00415876, 0.00302697, 0.00469527, 0.00210816, 0.00384406, 0.0038524, 0.00400991, 0.00269227, 0.00311919, 0.00409912, 0.00414462, 0.00264463, 0.00338798, 0.00306862, 0.00384011, 0.000930512, 0.00259441, 0.00275698, 0.00408488, 0.000707584, 0.00240011, 0.00259081, 0.000947295, 0.0022118, 0, 0.00244491, 0.00103596]

df = pd.DataFrame()

df['cl3d_abseta'] = cl3d_abseta
df['cl3d_coreshowerlength'] = cl3d_coreshowerlength
df['cl3d_meanz'] = cl3d_meanz
df['cl3d_showerlength'] = cl3d_showerlength
df['cl3d_spptot'] = cl3d_spptot
df['cl3d_srrmean'] = cl3d_srrmean

define the output datafrane to store results

In [9]:
results = pd.DataFrame(columns=["raw_SKL_pred", "xml_SKL_pred", "tmva_SKL_pred", "raw_XGB_pred", "xml_XGB_pred", "tmva_XGB_pred" ])

evaluate the model from sklearn using the functions defined above and the tmva model

In [10]:
bdt_skl.setup_tmva('test_C2model_skl.xml')
predA = []
predB = []
for irow in range(df.shape[0]):
    predA.append( bdt_skl.eval_tmva(np.array(df.iloc[irow, :])) )
    predB.append( bdt_skl.eval(np.array(df.iloc[irow, :]).reshape(1, -1)) )
    
results["xml_SKL_pred"] = predB
results["tmva_SKL_pred"] = predA

Welcome to JupyROOT 6.14/04






evaluate the model from xgboost using the functions defined above and the tmva model

In [11]:
bdt_xgb.setup_tmva('test_C2model_xgb.xml')
predA = []
predB = []
for irow in range(df.shape[0]):
    predA.append( bdt_xgb.eval_tmva(np.array(df.iloc[irow, :])) )
    #predB.append( bdt_xgb.eval(np.array(df.iloc[0, :]).reshape(1,-1)) )
    
feats = ["cl3d_showerlength", "cl3d_coreshowerlength", "cl3d_abseta",  "cl3d_spptot", "cl3d_srrmean", "cl3d_meanz"]
results["xml_XGB_pred"] = bdt_xgb.eval(df[feats])
results["tmva_XGB_pred"] = predA

now do the prediction with the raw sklearn and xgboost models without passing them through the XMLizer

In [12]:
C2features = ["cl3d_showerlength", "cl3d_coreshowerlength", "cl3d_abseta", "cl3d_spptot", "cl3d_srrmean", "cl3d_meanz"]
results['raw_SKL_pred'] = C2model_skl.predict(df[C2features])
results['raw_XGB_pred'] = C2model_xgb.predict(df[C2features])

check results compatibility

In [13]:
results

Unnamed: 0,raw_SKL_pred,xml_SKL_pred,tmva_SKL_pred,raw_XGB_pred,xml_XGB_pred,tmva_XGB_pred
0,0.978202,[[0.06612757715084647]],[0.06612757593393326],0.934846,0.990332,[0.4348461329936981]
1,0.905109,[[0.06231558136060292]],[0.06231557950377464],0.900859,0.976067,[0.4008587896823883]
2,1.150202,[[0.06231558136060292]],[0.06231557950377464],1.243343,0.926216,[0.7433426976203918]
3,0.745283,[[0.0597697656408076]],[0.05976976454257965],0.748780,0.797650,[0.2487800419330597]
4,0.821703,[[0.06704656942859288]],[0.06704656779766083],0.848534,0.938241,[0.34853383898735046]
5,0.915830,[[0.06797201762150344]],[0.06797201931476593],0.887810,0.968565,[0.387809693813324]
6,0.988422,[[0.09056803476416987]],[0.09056803584098816],0.930266,0.988824,[0.43026551604270935]
7,0.723813,[[0.0597697656408076]],[0.05976976454257965],0.758482,0.815882,[0.2584817409515381]
8,0.928256,[[0.09046644390899503]],[0.0904664397239685],0.960177,0.996566,[0.46017709374427795]
9,0.804880,[[0.06704656942859288]],[0.06704656779766083],0.872650,0.958295,[0.37265029549598694]
