In [19]:
import re
import sys
import xml.etree.ElementTree as ET

def build_tree(dt, node_id, node_pos, parent_depth, parent_elementTree, scale):
    """Recursively append ``<Node>`` elements describing one decision tree.

    The node ``node_id`` of ``dt`` is written as a ``Node`` child of
    ``parent_elementTree``; for a split node its left and right children are
    then written (in that order) as children of the new element.

    Parameters
    ----------
    dt
        Object exposing ``tree_`` with the per-node sequences
        ``children_left``, ``children_right``, ``feature``, ``threshold`` and
        ``value``, all indexed by node id.
    node_id
        Id of the node to write. Id 0 is always written with ``pos="s"``.
    node_pos
        Value of the ``pos`` attribute for any node other than id 0
        (the recursion passes ``"l"`` / ``"r"``).
    parent_depth
        Depth of the parent; the node itself is written at ``parent_depth + 1``.
    parent_elementTree
        ElementTree element that receives the new ``Node``.
    scale
        Factor applied to the leaf value before it is stored in ``res``.

    Returns
    -------
    None
        The XML tree is modified in place.
    """
    tree = dt.tree_
    left_child = tree.children_left[node_id]
    right_child = tree.children_right[node_id]

    # Shared by split nodes and leaves.
    depth = str(parent_depth + 1)
    pos = "s" if node_id == 0 else node_pos

    if left_child != right_child:
        # Split node: only the variable index and the cut value carry
        # information, every other attribute is a fixed placeholder.
        node_elementTree = ET.SubElement(
            parent_elementTree, "Node", pos=pos, depth=depth, NCoef="0",
            IVar=str(tree.feature[node_id]), Cut=str(tree.threshold[node_id]),
            cType="1", res="0.0e+01", rms="0.0e+00", purity="0.0e+00", nType="0")
        build_tree(dt, left_child, "l", parent_depth + 1, node_elementTree, scale)
        build_tree(dt, right_child, "r", parent_depth + 1, node_elementTree, scale)
    else:
        # Leaf (both child ids are equal): the scaled node value is stored in
        # ``res``; purity is not computed and is written as a zero placeholder.
        # Assumes value[node_id][0][0] is the single response of the node -
        # TODO confirm for the models being converted.
        response = tree.value[node_id][0][0] * scale
        ET.SubElement(
            parent_elementTree, "Node", pos=pos, depth=depth, NCoef="0",
            IVar="-1", Cut="0.0e+00", cType="1", res=str(response),
            rms="0.0e+00", purity="0.0e+00", nType="-99")

def convert_model(sklearn_bdt_clf, input_var_list, tmva_outfile_xml):
    """Write a fitted gradient-boosting model as a TMVA-style BDT weights XML.

    Layout of the generated document::

        MethodSetup Method="BDT::BDT"
          GeneralInfo   Creator and AnalysisType ("Regression")
          Options       NodePurityLimit and BoostType ("Grad")
          Variables     one Variable element per entry of input_var_list
          Weights       one BinaryTree per boosting stage (see build_tree)

    Parameters
    ----------
    sklearn_bdt_clf
        Fitted model exposing ``estimators_`` (2-D array of fitted trees,
        shape ``[n_stages, trees_per_stage]``) and ``learning_rate``.
    input_var_list
        One entry per input variable, indexable as
        ``[name, type, min, max]`` with all four given as strings, e.g.
        ``['cl3d_abseta', 'F', '1.5', '3.0']``. ``"e+00"`` is appended to
        ``min`` / ``max`` to form the ``Min`` / ``Max`` attributes.
        The order must be exactly that of the columns used for training.
    tmva_outfile_xml
        File name (or writable binary file object) handed to
        ``ElementTree.write``.

    Raises
    ------
    SystemExit
        If the model holds more than one tree per boosting stage.
    """
    clf = sklearn_bdt_clf

    # estimators_ has shape [n_stages, K] where K is 1 for regression and
    # binary classification. Reading K from the array instead of
    # ``clf.loss_.K`` keeps the check working on scikit-learn releases that no
    # longer provide ``loss_`` (deprecated in 1.1).
    if clf.estimators_.shape[1] != 1:
        sys.exit("Error: Only binary classification is supported for regression trees.")

    # Count the trees that are actually written below. ``n_estimators`` is only
    # the requested number of stages and can be larger than the number fitted
    # (e.g. with early stopping), which would make NTrees disagree with the
    # BinaryTree elements.
    n_trees = clf.estimators_.shape[0]

    # <MethodSetup>
    method_setup = ET.Element("MethodSetup", Method="BDT::BDT")

    # <GeneralInfo>
    general_info = ET.SubElement(method_setup, "GeneralInfo")
    ET.SubElement(general_info, "Info", name="Creator", value="Jona Motta")
    ET.SubElement(general_info, "Info", name="AnalysisType", value="Regression")

    # <Options>
    options = ET.SubElement(method_setup, "Options")
    purity_option = ET.SubElement(options, "Option", name="NodePurityLimit", modified="No")
    purity_option.text = "5.00e-01"
    boost_option = ET.SubElement(options, "Option", name="BoostType", modified="Yes")
    boost_option.text = "Grad"

    # <Variables>
    variables = ET.SubElement(method_setup, "Variables", NVar=str(len(input_var_list)))
    for index, entry in enumerate(input_var_list):
        name = entry[0]
        ET.SubElement(variables, "Variable", VarIndex=str(index), Type=entry[1],
            Expression=name, Label=name, Title=name, Unit="", Internal=name,
            Min=entry[2]+"e+00", Max=entry[3]+"e+00")

    # <Weights>
    weights = ET.SubElement(method_setup, "Weights", NTrees=str(n_trees), AnalysisType="1")

    # NOTE(review): every leaf response is scaled by learning_rate / 2; the
    # factor 1/2 is not explained in this file - confirm it is what the
    # consumer of the XML expects for a regression model.
    # NOTE(review): only the per-stage trees are exported; if the model also
    # carries a constant initial prediction it is not part of this XML.
    leaf_scale = clf.learning_rate/2.
    for idx, dt in enumerate(clf.estimators_[:, 0]):
        # e.g. <BinaryTree type="DecisionTree" boostWeight="1.0e+00" itree="0">
        binary_tree = ET.SubElement(weights, "BinaryTree", type="DecisionTree",
            boostWeight="1.0e+00", itree=str(idx))
        build_tree(dt, 0, "s", -1, binary_tree, leaf_scale)

    # Serialise the assembled document.
    ET.ElementTree(method_setup).write(tmva_outfile_xml)

In [20]:
import pickle

def save_obj(obj, dest):
    """Pickle ``obj`` into the file ``dest`` using the highest pickle protocol.

    ``dest`` is opened in binary write mode, so an existing file is replaced.
    """
    with open(dest, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(source):
    """Return the object unpickled from the file ``source``.

    Unpickling can execute arbitrary code, so ``source`` must come from a
    trusted origin.
    """
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [21]:
# Location of the pickled C2 model. Kept in one named constant so the notebook
# can be pointed at another file without touching the loading code.
C2_MODEL_PATH = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1BDT/pklModels/calibration_C1skimC2C3/model_c2_th_PU200.pkl'

# load_obj unpickles the file; unpickling can execute arbitrary code, so only
# load model files from a trusted source.
C2model = load_obj(C2_MODEL_PATH)

In [22]:
# Input variables of the C2 model, one [name, type, min, max] entry per
# variable, all four fields given as strings.
C2InputVariables = [
    ['cl3d_abseta', 'F', '1.5', '3.0'],
    ['cl3d_coreshowerlength', 'F', '0.0', '36.0'],
    ['cl3d_meanz', 'F', '320.0', '515.0'],
    ['cl3d_showerlength', 'F', '0.0', '50.0'],
    ['cl3d_spptot', 'F', '0.0', '0.015'],
    ['cl3d_srrmean', 'F', '0.0', '0.01'],
]
# Earlier definition, kept for reference only: it lists the variables in a
# different order and has no min/max fields.
#C2InputVariables = [("cl3d_showerlength", 'F'), ("cl3d_coreshowerlength",'F'), ("cl3d_abseta",'F'), ("cl3d_spptot",'F'), ("cl3d_srrmean",'F'), ("cl3d_meanz",'F')]

In [23]:
# Export the loaded C2 model to a temporary, unformatted XML file.
convert_model(
    sklearn_bdt_clf=C2model,
    input_var_list=C2InputVariables,
    tmva_outfile_xml='tmp.xml',
)

In [24]:
!rm C2model_nonRscld.xml
!xmllint --format tmp.xml >> C2model_nonRscld.xml
!rm tmp.xml