In [33]:
import os
from cmath import inf
os.chdir('/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmarking')
print(os.getcwd())
import sys
sys.path.insert(1,'/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmarking')
#load pickle evaluated pipelines
from os.path import exists
import matplotlib.pyplot as plt
import numpy as np
import dill as pickle
import sys
import pandas as pd
from tpot.tpot import TPOTClassifier
import deap
from deap import creator

import pydot 
from IPython.display import Image, display

import networkx as nx
import random
import math

import networkx.algorithms as na

class TrieNode:
 
    def __init__(self, primitive):
        self.primitive = primitive
        self.path = 'root'
        self.traverse_count = 0
        self.total_cv_score = []
        self.generation = []
        self.children = {}
        self.parents = []
        self.depth = 0
        self.max_score = -inf
        self.min_score = inf
        self.diversity_score = 0
 
class PipelineTrie(object):
 
    def __init__(self):
        self.root = TrieNode("")
        self.graph = {}
        
    def insert(self, pipeline_str,pipeline_data,pset):

        def prim_to_list(prim, args):
            if isinstance(prim, deap.gp.Terminal):
                return None
            return [prim.name] + args
        def remove_none(obj):
            if isinstance(obj, (list, tuple, set)):
                return type(obj)(remove_none(x) for x in obj if x is not None)
            elif isinstance(obj, dict):
                return type(obj)((remove_none(k), remove_none(v))
                for k, v in obj.items() if k is not None and v is not None)
            else:
                return obj

        pipeline = creator.Individual.from_string(pipeline_str, pset)

        #convert pipeline into a list and change all hyperparameters to None
        tree = []
        stack = []
        for node in pipeline:
            stack.append((node, []))
            while len(stack[-1][1]) == stack[-1][0].arity:
                prim, args = stack.pop()
                tree = prim_to_list(prim, args)
                if len(stack) == 0:
                    break  # If stack is empty, all nodes should have been seen
                
                stack[-1][1].append(tree)
        
        #remove all Nones
        tree = remove_none(tree)
        
        #dfs through the tree and integrate into trie
        stack = []
        stack.append(tree)
        trie_stack = [self.root]

        while stack:
            s = stack.pop()
            node = trie_stack.pop()
            cur_depth = node.depth+1
            
            if (s[0]) not in node.children:
                node.children[(s[0])] = TrieNode(s[0])
                node.children[(s[0])].parents = np.append(node.parents,node)
                #add a value to the root diversity metric
                #self.root.diversity_score =  self.root.diversity_score + 1/cur_depth**2
                temp_depth = 1
                for tempnode in node.parents:
                    tempnode.diversity_score =  tempnode.diversity_score + 1/temp_depth**2
                    temp_depth = temp_depth + 1
            node.children[(s[0])].traverse_count = node.children[(s[0])].traverse_count + 1
            node.children[(s[0])].total_cv_score.append(pipeline_data["internal_cv_score"])
            node.children[(s[0])].generation.append(pipeline_data["generation"])
            node.children[(s[0])].depth = cur_depth
            if not math.isnan(pipeline_data["internal_cv_score"]) and not math.isinf(pipeline_data["internal_cv_score"]):
                node.children[(s[0])].min_score = min(node.children[(s[0])].min_score,pipeline_data["internal_cv_score"])
                node.children[(s[0])].max_score = max(node.children[(s[0])].max_score,pipeline_data["internal_cv_score"])
                self.root.min_score = min(self.root.min_score,pipeline_data["internal_cv_score"])
                self.root.max_score = max(self.root.max_score,pipeline_data["internal_cv_score"])
            if node.path != 'root':
                node.children[(s[0])].path = node.path + '-' + s[0]
            else:
                node.children[(s[0])].path = s[0]
            if len(s[1:]) > 0:
                stack.extend(s[1:])
                for i in range(len(s[1:])):
                    trie_stack.append(node.children[(s[0])])
                    
    def get_networkx_graph(self,depth=100):
        import networkx as nx
        from pyvis.network import Network
        import matplotlib as mpl

        def colorFader(c1,c2,mix=0): #fade (linear interpolate) from color c1 (at mix=0) to c2 (mix=1)
            c1=np.array(mpl.colors.to_rgb(c1))
            c2=np.array(mpl.colors.to_rgb(c2))
            return mpl.colors.to_hex((1-mix)*c1 + mix*c2)

        c1='red' #blue
        c2='green' #green

        graph = pydot.Dot(graph_type='graph') 
        stack = [self.root]
        parent_stack = []

        max_height = depth
        while stack:
            s = stack.pop()
            if s.depth >= max_height:
                continue
            for k in s.children.keys():
                stack.append(s.children[k])
                temp =  [v for v in s.total_cv_score if not math.isnan(v) and not math.isinf(v)]
                if len(temp) :
                    parentnodeaccuracy =(sum(temp)/len(temp))
                    if parentnodeaccuracy > self.root.max_score:
                        parentnodeaccuracy = self.root.max_score
                    parentnodecolor = colorFader(c1,c2,(parentnodeaccuracy-self.root.min_score)/(self.root.max_score-self.root.min_score))
                else:
                    parentnodeaccuracy = 'NA'
                    parentnodecolor = "#666666"
                    
                temp =  [v for v in s.children[k].total_cv_score if not math.isnan(v) and not math.isinf(v)]
                if len(temp) :
                    childaccuracy = (sum(temp)/len(temp))
                    #floating point 0.00...01 issue
                    if childaccuracy > self.root.max_score:
                        childaccuracy = self.root.max_score
                    childcolor = colorFader(c1,c2,(childaccuracy-self.root.min_score)/(self.root.max_score-self.root.min_score))
                    
                else:
                    childaccuracy = 'NA'
                    childcolor = "#666666"
                
                graph.add_node(pydot.Node(s.path,label=s.primitive+'\n'+str(parentnodeaccuracy),color=parentnodecolor,size=10*(math.tanh(-s.depth+4)+2)))
                graph.add_node(pydot.Node(s.children[k].path,label=s.children[k].primitive+'\n'+str(childaccuracy),color=childcolor,size=10*(math.tanh(-s.children[k].depth+4)+2)))
                
                edge = pydot.Edge(s.path, s.children[k].path,weight=1,color='#515ba3',value=math.log(s.children[k].traverse_count))
                graph.add_edge(edge)
        self.graph = nx.nx_pydot.from_pydot(graph)
        
                
    def display(self,filename):
        import networkx as nx
        from pyvis.network import Network
        import matplotlib as mpl
        nt = Network(height='100%', width='100%', bgcolor='#333333', font_color='white')
        nt.from_nx(self.graph)
        nt.show(filename+'.html')
        nx.write_edgelist(self.graph, filename+".edgelist")
 
    

from sklearn.utils import shuffle
from os import makedirs


def extract_labels(df, labelname):
    y = df[labelname].copy(deep=True)
    x = df.drop(labelname, axis=1)
    x, y = shuffle(x, y)
    x = x.to_numpy()
    y = y.to_numpy()
    return x, y



#directoryevs = ["/baseline_dynamic/baseline","/lexicase_dynamic/anges"]
#directoryevs = ["_baseline",""]

directoryevs = ["/baseline/baseline","/baseline_dynamic/baseline_dynamic","/lexicase/lexicase","/lexicase_dynamic/lexicase_dynamic"]
directoryevs = ["/baseline_dynamic/baseline_dynamic","/lexicase/lexicase","/lexicase_dynamic/lexicase_dynamic"]
#directoryevs = ["/lexicase_dynamic/lexicase_dynamic"]
directoryevs = ["/baseline/baseline","/lexicase/lexicase"]
upper_quantile_only = False

generation_num = 50
total_runs = 40

name_values = {
    "/baseline/baseline" : "baseline",
    "/baseline_dynamic/baseline_dynamic" : "baseline_dynamic",
    "/lexicase/lexicase" : "lexicase",
    "/lexicase_dynamic/lexicase_dynamic" : "lexicase_dynamic"
}

#directoryevs = ["lexicase_dynamic_final"]
result = {}

import pandas as pd
 # This is done based on the dataset ID.
#dataset = openml.datasets.get_dataset(1164)
#dataset = openml.datasets.get_dataset(1164)
dataset = pd.read_csv("/Users/matsumoton/Documents/anges_cad_1_train.csv",sep=",")
y_train = dataset['target']
X_train = dataset.drop(['target'],axis=1)

test_dataset = pd.read_csv("/Users/matsumoton/Documents/anges_cad_1_test.csv",sep=",")
y_test = test_dataset['target']
X_test = test_dataset.drop(['target'],axis=1)



tpot = TPOTClassifier(verbosity=2, population_size=1, generations=1)
tpot.fit(X_train, y_train)

diversity_scores= {}

for directoryev in directoryevs:
    temp_ev = []
    #pipeline_trie = PipelineTrie()
    diversity_scores[directoryev] = []
    print(directoryev)
    for i in range(0,total_runs):
        pipeline_trie = PipelineTrie()
        pklfile = f"/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges{directoryev}_{i}_evaluated_individuals.pkl"
        if not exists(pklfile):
                continue
        with open(pklfile, 'rb') as file:
        #with open(f"C:/Users/matsumoton/Box/tpot_benchmark_data/results_pop40_gen20_{directoryev}/pipelines/digen{j}_run_{i}_evaluated_individuals.pkl", 'rb') as file:
            unpickler = pickle.Unpickler(file)
            result = unpickler.load()
            for k , v in result.items():
                #print(k)
                pipeline_trie.insert(k,v,tpot._pset)
        
        pipeline_trie.get_networkx_graph(100)

        #pipeline_trie.display(f"/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges{directoryev}_run{i}_ds_{pipeline_trie.root.diversity_score}")
        print("global efficiency run " + str(i) + " : " + str(na.global_efficiency(pipeline_trie.graph)))
        A = nx.adjacency_matrix(pipeline_trie.graph)

        A.todense()

        exit
        #if i == 5:
        #    break

        
        #print(directoryev+' '+str(i) + ' ' + str([pipeline_trie.root.diversity_score,pipeline_trie.root.max_score]))
        #diversity_scores[directoryev].append([pipeline_trie.root.diversity_score,pipeline_trie.root.max_score])



        
    
        
    


/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmarking
                                                                          
Generation 1 - Current best internal CV score: 0.8160431654676259
                                                                          
Best pipeline: ExtraTreesClassifier(MultinomialNB(ZeroCount(input_matrix), alpha=10.0, fit_prior=False), bootstrap=True, criterion=gini, max_features=0.15000000000000002, min_samples_leaf=14, min_samples_split=20, n_estimators=100)
/baseline/baseline


TypeError: Unable to evaluate terminal: AdaBoostClassifier.

In [2]:




tpot = TPOTClassifier(verbosity=2, population_size=1, generations=1)
tpot.fit(X_train, y_train)

for directoryev in directoryevs:
    temp_ev = []
    for i in range(total_runs):

        pipeline_trie = PipelineTrie()
        
        #with open(f"C:/Users/matsumoton/Box/tpot_benchmark_data/results_pop40_gen20_{directoryev}/pipelines/digen{j}_run_{i}_evaluated_individuals.pkl", 'rb') as file:
        with open(f"/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges{directoryev}_{i}_evaluated_individuals.pkl", 'rb') as file:
            unpickler = pickle.Unpickler(file)
            result = unpickler.load()
            for k , v in result.items():
                #print(k)
                pipeline_trie.insert(k,v,tpot._pset)

        pipeline_trie.display()
        break
    break


Optimization Progress:   0%|          | 0/2 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7816238437821171

Best pipeline: LogisticRegression(input_matrix, C=0.001, dual=False, penalty=l2)


In [48]:
pipeline_trie.root.children.keys()


dict_keys(['GaussianNB1', 'RandomForestClassifier1', 'ExtraTreesClassifier1', 'LinearSVC1', 'DecisionTreeClassifier1', 'BernoulliNB1', 'XGBClassifier1', 'MLPClassifier1', 'LogisticRegression1', 'KNeighborsClassifier1', 'SGDClassifier1', 'GradientBoostingClassifier1'])

In [9]:
import os
for count, f in enumerate(os.listdir("/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges/autoepslexicase")):
    f_name, f_ext = os.path.splitext(f)
    #print(f_name)
    f_name = f_name.split("_")[2:]
    f_name = "autoepslexicase_"+"_".join(f_name)
    print(f_name)
    new_name = f'/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges/autoepslexicase/{f_name}{f_ext}'
    #os.rename(f'/Users/matsumoton/Library/CloudStorage/Box-Box/tpot_benchmark_data/anges/autoepslexicase/{f}', new_name)

autoepslexicase_16_fitness
autoepslexicase_33_resources.csv_memory
autoepslexicase_1_fitness
autoepslexicase_16_mutation_rates
autoepslexicase_25_pareto_fitness
autoepslexicase_6_evaluated_individuals
autoepslexicase_24_fitness
autoepslexicase_36_resources.csv_time
autoepslexicase_6_mutation_rates
autoepslexicase_35_pareto_fitness
autoepslexicase_8_fitness
autoepslexicase_5_pareto_fitness
autoepslexicase_8_pareto_fitness
autoepslexicase_36_mutation_rates
autoepslexicase_31_fitness
autoepslexicase_32_evaluated_individuals
autoepslexicase_15_pareto_fitness
autoepslexicase_31_resources.csv_time
autoepslexicase_14_evaluated_individuals
autoepslexicase_25_mutation_rates
autoepslexicase_16_pareto_fitness
autoepslexicase_34_fitness
autoepslexicase_13_evaluated_individuals
autoepslexicase_16_resources.csv_memory
autoepslexicase_21_resources.csv_memory
autoepslexicase_35_mutation_rates
autoepslexicase_6_pareto_fitness
autoepslexicase_35_evaluated_individuals
autoepslexicase_32_resources.csv_tim