In [1]:
import numpy as np
import time
import elpigraph
import matplotlib.pyplot as plt
import rpy2.robjects.packages as rpackages
import rpy2.robjects
import rpy2.robjects.numpy2ri
import rpy2.robjects.pandas2ri
# Load the reference R implementation so that its results can be compared with the Python port.
r_elpigraph = rpackages.importr("ElPiGraph.R")
# Turn on rpy2's NumPy / pandas converters so arrays can be handed to the R functions directly.
rpy2.robjects.numpy2ri.activate()
rpy2.robjects.pandas2ri.activate()
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later removed the
# old names; prefer the new name when it exists and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Fix the global NumPy seed so that randomly generated inputs are reproducible.
np.random.seed(0)

# I - Checking output

### Step 1 :  generate output for R and Python

In [2]:
# Load example data
X = np.genfromtxt('data/tree_data.csv', delimiter=',')

# Create desired list of inputs for R and Python.
# The comparison loop iterates over range(len(input_data)), so only the first
# len(input_data) == 3 entries of the longer (5-element) lists below are used.
input_data = [X, X, X]
epg_n_nodes = [10, 20, 30]
epg_lambda = [.1, .3, .7]
epg_mu = [.02, .07, .01]
epg_trimmingradius = [float('inf'), .3, .8]
epg_finalenergy = ['Penalized', 'Base', 'Penalized', 'Base', 'Base']
epg_alpha = [.1, .03, .05, .08, .04]
epg_beta = [.3, .02, .04, .07, .2]
epg_mode = [2, 1, 1, 2, 1]
epg_n_processes = [1, 1, 1, 1, 1]    # Change this if you have multiple cores
epg_collapse_mode = ['PointNumber', 'PointNumber_Extrema', 'PointNumber_Leaves', 'EdgesNumber', 'EdgesLength']
epg_collapse_par = [5, 7, 4, 6, 3]

# ExtendLeaves modes. Python uses 'WeightedCentroid' (corrected typo) whereas the R package
# keeps the original 'WeigthedCentroid' spelling, so the two lists differ at index 2.
epg_ext_mode = ['QuantDists', 'QuantCentroid', 'WeightedCentroid', 'QuantDists', 'QuantDists']
r_epg_ext_mode = ['QuantDists', 'QuantCentroid', 'WeigthedCentroid', 'QuantDists', 'QuantDists']
epg_ext_par = [.5, .6, .4, .7, .3]
epg_shift_mode = ['NodeDensity', 'NodePoints', 'NodeDensity', 'NodePoints', 'NodeDensity']
epg_shift_radius = [0.05, 0.07, 0.04, 0.08, 0.03]
epg_shift_max = [5, 7, 4, 8, 6]

# Accumulators filled by the comparison loop: one entry per tested configuration.
# Principal trees returned by the Python and R main functions.
res_py, res_R = [], []

# Outputs of the Python utility functions (collapse / shift / extend / fine-tune).
epg_obj_collapse, epg_obj_shift, epg_obj_extend, epg_obj_fineTune = [], [], [], []

# Outputs of the matching R utility functions.
r_epg_obj_collapse, r_epg_obj_shift, r_epg_obj_extend, r_epg_obj_fineTune = [], [], [], []

# For every configuration i, run the main tree construction and the utility functions
# (CollapseBranches, ShiftBranching, ExtendLeaves, fineTuneBR) with both the Python package
# and the R package, storing each result in the matching accumulator list.
#
# NOTE(review): `epg_maxsteps`, `init_nodes_pos` and `init_edges` are not defined in any
# visible cell; the fineTuneBR calls below raise NameError until they are defined.
for i in range(len(input_data)):
    ############################ Run main functions, Python version ###################################
    res_py.append(elpigraph.computeElasticPrincipalTree(X=input_data[i], NumNodes=epg_n_nodes[i],
                                                Lambda=epg_lambda[i], Mu=epg_mu[i],
                                                TrimmingRadius=epg_trimmingradius[i],
                                                FinalEnergy=epg_finalenergy[i],
                                                alpha=epg_alpha[i],
                                                beta=epg_beta[i],
                                                Do_PCA=False, CenterData=False,
                                                n_cores=epg_n_processes[i],
                                                nReps=1,
                                                EmbPointProb=1.0,
                                                drawPCAView=False,
                                                Mode=epg_mode[i])[0])

    ####### util functions
    # The tree just computed is the input of every Python utility function.
    epg_obj = res_py[i]

    epg_obj_collapse.append(elpigraph.CollapseBranches(X=input_data[i],
                                                       PG=epg_obj,
                                                       Mode=epg_collapse_mode[i],
                                                       ControlPar=epg_collapse_par[i]))

    epg_obj_shift.append(elpigraph.ShiftBranching(X=input_data[i],
                                           PG=epg_obj,
                                           TrimmingRadius=epg_trimmingradius[i],
                                           SelectionMode=epg_shift_mode[i],
                                           DensityRadius=epg_shift_radius[i],
                                           MaxShift=epg_shift_max[i]))

    epg_obj_extend.append(elpigraph.ExtendLeaves(X=input_data[i],
                                          PG=epg_obj,
                                          TrimmingRadius=epg_trimmingradius[i],
                                          Mode=epg_ext_mode[i],
                                          ControlPar=epg_ext_par[i],
                                          PlotSelected=False))

    # NOTE(review): this call indexes `init_nodes_pos[i]` while the R call below passes
    # `init_nodes_pos` unindexed - confirm which one is intended once the variable is defined.
    epg_obj_fineTune.append(elpigraph.fineTuneBR(X=input_data[i],
                                    MaxSteps=epg_maxsteps[i],
                                    Mode=2,
                                    NumNodes=epg_n_nodes[i],
                                    InitNodePositions=init_nodes_pos[i],
                                    InitEdges=init_edges + 1,
                                    Lambda=epg_lambda[i], Mu=epg_mu[i],
                                    TrimmingRadius=epg_trimmingradius[i],
                                    FinalEnergy=epg_finalenergy[i],
                                    alpha=epg_alpha[i],
                                    beta=epg_beta[i],
                                    Do_PCA=False, CenterData=False,
                                    drawAccuracyComplexity=False, drawEnergy=False, drawPCAView=False,
                                    n_cores=epg_n_processes[i],
                                    nReps=1,
                                    ProbPoint=1.0))

    #############################################################################################
    ############################ Run main functions, R version ###################################

    tmp = r_elpigraph.computeElasticPrincipalTree(X=input_data[i], NumNodes=epg_n_nodes[i],
                                                Lambda=epg_lambda[i], Mu=epg_mu[i],
                                                TrimmingRadius=epg_trimmingradius[i],
                                                FinalEnergy=epg_finalenergy[i],
                                                alpha=epg_alpha[i],
                                                beta=epg_beta[i],
                                                Do_PCA=False, CenterData=False,
                                                n_cores=epg_n_processes[i],
                                                nReps=1,
                                                ProbPoint=1.0,
                                                drawPCAView=False,
                                                Mode=epg_mode[i])[0]
    res_R.append(dict(zip(tmp.names, map(list, np.array(tmp)))))  # Convert R result to dict format

    ####### util functions
    # The R utility functions receive the unconverted R object, not the dict stored in res_R.
    r_epg_obj = tmp

    # Pass the i-th mode / control parameter (not the whole lists) so that the R call
    # mirrors the Python CollapseBranches call above.
    r_epg_obj_collapse.append(r_elpigraph.CollapseBranches(X=input_data[i],
                                                           TargetPG=r_epg_obj,
                                                           Mode=epg_collapse_mode[i],
                                                           ControlPar=epg_collapse_par[i]))

    r_epg_obj_shift.append(r_elpigraph.ShiftBranching(X=input_data[i],
                                       TargetPG=r_epg_obj,
                                       TrimmingRadius=epg_trimmingradius[i],
                                       SelectionMode=epg_shift_mode[i],
                                       DensityRadius=epg_shift_radius[i],
                                       MaxShift=epg_shift_max[i]))

    r_epg_obj_extend.append(r_elpigraph.ExtendLeaves(X=input_data[i],
                                          TargetPG=r_epg_obj,
                                          TrimmingRadius=epg_trimmingradius[i],
                                          Mode=r_epg_ext_mode[i],
                                          ControlPar=epg_ext_par[i],
                                          PlotSelected=False))

    r_epg_obj_fineTune.append(r_elpigraph.fineTuneBR(X=input_data[i],
                                    MaxSteps=epg_maxsteps[i],
                                    Mode=2,
                                    NumNodes=epg_n_nodes[i],
                                    InitNodePositions=init_nodes_pos,
                                    InitEdges=init_edges + 1,
                                    Lambda=epg_lambda[i], Mu=epg_mu[i],
                                    TrimmingRadius=epg_trimmingradius[i],
                                    FinalEnergy=epg_finalenergy[i],
                                    alpha=epg_alpha[i],
                                    beta=epg_beta[i],
                                    Do_PCA=False, CenterData=False,
                                    drawAccuracyComplexity=False, drawEnergy=False, drawPCAView=False,
                                    n_cores=epg_n_processes[i],
                                    nReps=1,
                                    ProbPoint=1.0))

Generating the initial configuration
Creating a chain in the 1st PC with 2 nodes
90% of the points have been used as initial conditions. Resetting.
Constructing tree 1 of 1 / Subset 1 of 1
The elastic matrix is being used. Edge configuration will be ignored
Computing EPG with  10  nodes on  492  points and  3  dimensions
Nodes =  2 3 4 5 6 7 8 9 

BARCODE	ENERGY	NNODES	NEDGES	NRIBS	NSTARS	NRAYS	NRAYS2	MSE	MSEP	FVE	FVEP	UE	UR	URN	URN2	URSD

0||10	0.1433	10	9	8	0	0	0	0.0768	0.0737	0.8575	0.8633	0.0644	0.002	0.0205	0.2049	0


MSDEnergyPlot not yet implemented
accuracyComplexityPlot not yet implemented
0.2048  seconds elapsed


NameError: name 'ConstructGraph' is not defined

In [None]:
# NOTE(review): draft cell - it has no execution count (`In [None]`), i.e. it was not run
# in the saved session. Points to resolve before running it:
#   * `kwargs`, `epg_maxsteps`, `init_nodes_pos` and `init_edges` are not defined in any
#     visible cell.
#   * Whole lists (`input_data`, `epg_collapse_mode`, `epg_trimmingradius`, ...) are passed
#     where the comparison loop passes the i-th element.
#   * The Python calls here use the keyword `TargetPG`, whereas the Python calls in the
#     comparison loop use `PG` (`TargetPG` is what the R calls use there).
#   * The assignments rebind `epg_obj_collapse`, `epg_obj_shift` and `epg_obj_extend`,
#     which the comparison loop uses as accumulator lists, and rebind `epg_obj`.
epg_obj_collapse = elpigraph.CollapseBranches(X = input_data, TargetPG = epg_obj[0], Mode = epg_collapse_mode, ControlPar = epg_collapse_par)
epg_obj_shift = elpigraph.ShiftBranching(X = input_data, 
                                       TargetPG = epg_obj[0], 
                                       TrimmingRadius = epg_trimmingradius,                       
                                       SelectionMode = epg_shift_mode, 
                                       DensityRadius = epg_shift_radius,
                                       MaxShift = epg_shift_max,
                                       **kwargs)
epg_obj_extend = elpigraph.ExtendLeaves(X = input_data, 
                                      TargetPG = epg_obj[0],
                                      TrimmingRadius = epg_trimmingradius,
                                      Mode = epg_ext_mode, 
                                      ControlPar = epg_ext_par,
                                      PlotSelected = False)
# Rebinds `epg_obj` to the fineTuneBR result; the CollapseBranches call further down
# therefore reads `epg_obj[0]` from this new value.
epg_obj = elpigraph.fineTuneBR(X=input_data,
                                MaxSteps = epg_maxsteps,
                                Mode = 2,
                                NumNodes = epg_n_nodes, 
                                InitNodePositions = init_nodes_pos,
                                InitEdges=init_edges + 1,
                                Lambda=epg_lambda, Mu=epg_mu,
                                TrimmingRadius= epg_trimmingradius,
                                FinalEnergy = epg_finalenergy,
                                alpha = epg_alpha,
                                beta = epg_beta,                                                    
                                Do_PCA=False,CenterData=False,
                                drawAccuracyComplexity = False, drawEnergy = False,drawPCAView = False,
                                n_cores = epg_n_processes,
                                nReps=1,
                                ProbPoint=1.0,
                                **kwargs)


# Same call as the first statement of this cell, issued again after `epg_obj` was rebound.
epg_obj_collapse = elpigraph.CollapseBranches(X = input_data, TargetPG = epg_obj[0], Mode = epg_collapse_mode, ControlPar = epg_collapse_par)



### Step 2 : check output NodePositions, Edges, ReportTable, FinalReport, ElasticMatrix
#### Prints key and iteration index if a difference is found in the result dictionary

In [5]:
# Compare, configuration by configuration, the Python result dict with the R result dict.
# Whenever a field differs (or the comparison itself fails), its key and the configuration
# index are printed; no output means that all compared fields agree.

# ReportTable values are rounded to 4 digits on the Python side, so the 4th digit may be
# rounded up or down with respect to R: allow one unit of that digit plus a small float slack.
allowed_error = 1e-4 + 1e-10

for i in range(len(input_data)):
    one_res_py = res_py[i]
    one_res_R = res_R[i]
    for key in one_res_py:
        if key == 'NodePositions':
            try:
                assert np.allclose(one_res_py[key], one_res_R[key])
            except Exception:
                print(key, i)

        elif key == 'Edges':
            try:
                assert all(np.all(same) for same in [
                    one_res_py[key][0] == (one_res_R[key][0] - 1),  # correcting R indexing that starts at one
                    one_res_py[key][1] == one_res_R[key][1],
                    one_res_py[key][2] == one_res_R[key][2],
                ])
            except Exception:
                print(key, i)

        elif key == 'ReportTable':
            # This messy loop procedure is because I round results to 4 digits in my version
            # R uses signif and not round. In FinalReport we will check full precision of the final results
            flattened_py = [j for e in list(one_res_py[key].values()) for j in e]
            report_matches = []
            for a, b in zip(flattened_py, one_res_R[key]):
                if len(b) <= 6:
                    report_matches.append(a.lower() == b.lower())  # .lower() prevents inf == Inf -> False
                else:
                    # Take the absolute value of the difference *before* comparing it with the
                    # tolerance, so that deviations in either direction are detected.
                    report_matches.append(abs(float(a) - np.round(float(b), 4)) <= allowed_error)

            try:
                assert np.all(np.array(report_matches))
            except Exception:
                print(key, i)

        elif key == 'FinalReport':
            # The first entry is skipped on both sides; the remaining ones are compared as floats.
            try:
                assert np.allclose(np.array(list(one_res_py[key].values()))[1:].astype(float),
                                   np.array(one_res_R[key]).flatten()[1:].astype(float))
            except Exception:
                print(key, i)

        elif key == 'ElasticMatrix':
            try:
                assert np.all(one_res_py[key] == one_res_R[key])
            except Exception:
                print(key, i)

# II - Speed comparison

In [4]:
### Python
# Time the Python implementation on uniformly random 10-dimensional data for every
# (number of points, number of nodes) pair.
# run_points[p][n] holds the elapsed seconds for num_points[p] points and num_nodes[n] nodes.
np.random.seed(0)
num_points = [1000,10000,100000]
num_nodes = [10,20,30,40,50,60]

run_points = []
for j in num_points:
    run_nodes = []
    for i in num_nodes:
        X = np.random.random(size=(j,10))
        # perf_counter() is monotonic and high resolution, which makes it a safer choice
        # than time.time() for measuring elapsed intervals.
        s = time.perf_counter()
        res = elpigraph.computeElasticPrincipalTree(X = X,NumNodes = i,drawPCAView=False)
        end = time.perf_counter() - s
        run_nodes.append(end)
    run_points.append(run_nodes)

Generating the initial configuration
Creating a chain in the 1st PC with 2 nodes
90% of the points have been used as initial conditions. Resetting.
Constructing tree 1 of 1 / Subset 1 of 1
Performing PCA
Using standard PCA
10 dimensions are being used
100.0 % of the original variance has been retained
The elastic matrix is being used. Edge configuration will be ignored
Computing EPG with  10  nodes on  1000  points and  10  dimensions
Nodes =  2 3 4 5 6 7 8 9 

BARCODE	ENERGY	NNODES	NEDGES	NRIBS	NSTARS	NRAYS	NRAYS2	MSE	MSEP	FVE	FVEP	UE	UR	URN	URN2	URSD

1|0|0|0|0|0|0||10	0.6063	10	9	0	0	0	0	0.5868	0.5821	0.3003	0.306	0.0194	0.0	0.0004	0.0036	0


MSDEnergyPlot not yet implemented
accuracyComplexityPlot not yet implemented
0.4939  seconds elapsed
Generating the initial configuration
Creating a chain in the 1st PC with 2 nodes
90% of the points have been used as initial conditions. Resetting.
Constructing tree 1 of 1 / Subset 1 of 1
Performing PCA
Using standard PCA
10 dimensions are bein

In [5]:
### R
# Time the R implementation on the same grid of (number of points, number of nodes) pairs;
# the seed is reset so that the random inputs match those of the Python benchmark.
# run_points_r[p][n] holds the elapsed seconds for num_points[p] points and num_nodes[n] nodes.
np.random.seed(0)
num_points = [1000,10000,100000]
num_nodes = [10,20,30,40,50,60]

run_points_r = []
for j in num_points:
    run_nodes = []
    for i in num_nodes:
        X = np.random.random(size=(j,10))
        # perf_counter() is monotonic and high resolution, which makes it a safer choice
        # than time.time() for measuring elapsed intervals.
        s = time.perf_counter()
        # drawPCAView=False mirrors the Python benchmark so that plotting is not part of the timing.
        res = r_elpigraph.computeElasticPrincipalTree(X = X,NumNodes = i,drawPCAView=False)
        end = time.perf_counter() - s
        run_nodes.append(end)
    run_points_r.append(run_nodes)

[1] "Generating the initial configuration"
[1] "Creating a chain in the 1st PC with 2 nodes"
[1] "Constructing tree 1 of 1 / Subset 1 of 1"
[1] "Performing PCA on the data"
[1] "Using standard PCA"
[1] "10 dimensions are being used"
[1] "100% of the original variance has been retained"
[1] "The elastic matrix is being used. Edge configuration will be ignored"
[1] "Computing EPG with 10 nodes on 1000 points and 10 dimensions"
[1] "Using a single core"
Nodes = 2 3 4 5 6 7 8 9 
BARCODE	ENERGY	NNODES	NEDGES	NRIBS	NSTARS	NRAYS	NRAYS2	MSE	MSEP	FVE	FVEP	UE	UR	URN	URN2	URSD
1|0|0|0|0|0|0||10	0.6063	10	9	0	0	0	0	0.5868	0.5821	0.3003	0.306	0.01943	3.598e-05	0.0003598	0.003598	0
1.678 sec elapsed
[[1]]

[1] "Generating the initial configuration"
[1] "Creating a chain in the 1st PC with 2 nodes"
[1] "Constructing tree 1 of 1 / Subset 1 of 1"
[1] "Performing PCA on the data"
[1] "Using standard PCA"
[1] "10 dimensions are being used"
[1] "100% of the original variance has been retained"
[1] "The el

In [6]:
### Plotting
# One figure per data-set size: run time (in minutes) against the number of nodes,
# for the Python and the R implementations.
for i in range(len(num_points)):

    # Each curve carries its own label so that the legend always matches the lines that are
    # actually drawn (a fixed list of three labels mislabels the two plotted curves).
    #plt.plot(num_nodes,np.array(run_points_colab_hybrid[i])/60,marker='.',label='Python_Hybrid_cpu_gpu') # run hybrid version if you have a gpu (or get results from colab)
    plt.plot(num_nodes,np.array(run_points[i])/60,marker='.',label='Python_one_cpu')
    plt.plot(num_nodes,np.array(run_points_r[i])/60,marker='.',label='R_one_cpu')

    plt.xlabel('Number of nodes',fontsize=16)
    plt.ylabel('Time (minutes)',fontsize=16)
    plt.legend(fontsize=13)
    plt.title('Number of points (10 dimensions) : '+str(num_points[i]),fontsize=16)
    plt.show()

NameError: name 'num_points' is not defined