In [11]:
from StackGP import *
import sympy as sym
import pandas as pd
import numpy as np

In [3]:
# Load the 195_auto_price CSV into a DataFrame.
# NOTE(review): the path is absolute and specific to one machine's Downloads folder.
data = pd.read_csv(
    filepath_or_buffer='/Users/nathanhaut/Downloads/pmlb/datasets/195_auto_price/195_auto_price.csv'
)

In [10]:
# Sizes for a 70/30 train/test split of `data`.
# np.floor returns a float, so both sizes are floats rather than ints.
trainSize = np.floor(0.7 * len(data))
testSize = len(data) - trainSize

In [None]:
def ComputeSymbolicHess(model,vars):
    """Build the symbolic Hessian of a GP model's printed expression.

    Parameters
    ----------
    model : model object accepted by `printGPModel`.
    vars : int
        Number of input variables; the Hessian is taken with respect to the
        symbols x0, x1, ..., x(vars-1).

    Returns
    -------
    A sympy matrix of shape (vars, vars). If the simplified expression is a
    plain Python float, an all-zero matrix is returned instead.
    """
    expression=sym.simplify(printGPModel(model))

    # A bare float has no symbols to differentiate, so short-circuit to zeros.
    if type(expression)==float:
        zeros=np.zeros((vars,vars))
        return sym.matrices.dense.MutableDenseMatrix(zeros)

    inputSymbols=[]
    for index in range(vars):
        inputSymbols.append(symbols('x'+str(index)))
    return sym.hessian(expression, inputSymbols)

def EvaluateHess(hess,vars,values):
    """Evaluate a symbolic Hessian at a point and return its numerical rank.

    Parameters
    ----------
    hess : sympy matrix whose entries are expressed in symbols x0..x(vars-1).
    vars : int
        Number of input variables to substitute.
    values : indexable
        values[j] is substituted for symbol xj.

    Returns
    -------
    The rank reported by np.linalg.matrix_rank using
    tol=0.0001*0.0001*10 (roughly 1e-7).
    """
    substitutions={}
    for index in range(vars):
        substitutions[symbols('x'+str(index))]=values[index]

    numericHess=np.array(hess.subs(substitutions)).astype(float)
    return np.linalg.matrix_rank(numericHess,tol=0.0001*0.0001*10)

def Approx2Deriv(model,values,diff1,diff2,positions):
    """Central finite-difference estimate of a second partial derivative.

    The model is evaluated at four points obtained by shifting the coordinate
    at positions[0] by +/-diff1 and the coordinate at positions[1] by
    +/-diff2; the difference of the two first-order central differences is
    then divided by 2*diff2.

    Parameters
    ----------
    model : model object accepted by `evaluateGPModel`.
    values : indexable
        Point at which the derivative is estimated.
    diff1, diff2 : step sizes for positions[0] and positions[1].
        NOTE(review): the original author suggested these might be better
        chosen relative to the variation of each feature.
    positions : indexable of two ints
        Indices of the two coordinates to differentiate with respect to.
    """
    def shiftedPoint(addFirst,addSecond):
        # Apply the diff1 shift first and the diff2 shift second so that the
        # arithmetic order is the same when both positions coincide.
        point=[]
        for index in range(len(values)):
            coordinate=values[index]
            if index == positions[0]:
                coordinate=coordinate+diff1 if addFirst else coordinate-diff1
            if index == positions[1]:
                coordinate=coordinate+diff2 if addSecond else coordinate-diff2
            point.append(coordinate)
        return point

    plusPlus=shiftedPoint(True,True)
    minusPlus=shiftedPoint(False,True)
    plusMinus=shiftedPoint(True,False)
    minusMinus=shiftedPoint(False,False)

    # First-order central differences in positions[0], taken at +diff2 and -diff2.
    upperSlope=(evaluateGPModel(model,plusPlus)-evaluateGPModel(model,minusPlus))/((2*diff1))
    lowerSlope=(evaluateGPModel(model,plusMinus)-evaluateGPModel(model,minusMinus))/((2*diff1))
    return (upperSlope-lowerSlope)/(2*diff2)

def ApproxHessRank(model,vars,values,diff1=0.001,diff2=0.001):
    """Numerical rank of a finite-difference Hessian of `model` at `values`.

    Parameters
    ----------
    model : model object forwarded to `Approx2Deriv`.
    vars : int
        Number of input variables; the Hessian is vars x vars.
    values : indexable
        Point at which the Hessian is approximated.
    diff1, diff2 : finite-difference step sizes forwarded to `Approx2Deriv`.

    Returns
    -------
    The rank reported by np.linalg.matrix_rank using
    tol=0.0001*0.0001*10 (roughly 1e-7).
    """
    rows=[]
    for j in range(vars):
        row=[]
        for i in range(vars):
            # Entry (j, i) differentiates with respect to coordinates i and j.
            row.append(Approx2Deriv(model,values,diff1,diff2,[i,j]))
        rows.append(row)

    numericHess=np.array(rows).astype(float)
    return np.linalg.matrix_rank(numericHess,tol=0.0001*0.0001*10)

In [None]:
def runExperiment(file,targetID,IDrange,name):
    """Run one train/test experiment comparing four evolution setups.

    The CSV at `file` is split 70/30 into randomly chosen train and test rows.
    The last column is treated as the response and all other columns as
    inputs. Four populations are evolved on the training rows (labelled 'ID',
    'ID3O', 'Complexity' and 'Tourney'); the first model of each population is
    aligned to the training data, scored on the test rows, and the summary
    table is written to 'Results/<name>.csv'.

    Parameters
    ----------
    file : path or buffer accepted by pandas.read_csv.
    targetID, IDrange : forwarded unchanged to `basisFunctionComplexityDiff`.
    name : str
        Stem of the output CSV file inside the 'Results' directory.

    Returns
    -------
    pandas.DataFrame
        One column per approach; the three rows are, in order, the printed
        model, its fitness on the test data, and its RMSE on the test data.
    """
    import os  # local import: only needed to create the output directory

    #Import data file
    data=pd.read_csv(file)

    #Split train and test data (70% train, sampled without replacement)
    trainSize=int(np.floor(len(data)*0.7))
    trainIndices=np.random.choice(len(data),trainSize,replace=False)
    testIndices=np.setdiff1d(np.arange(len(data)),trainIndices)

    #Extract input and response data (rows of the transposed frame are columns of the CSV)
    trainMatrix=np.array(data.iloc[trainIndices].T)
    testMatrix=np.array(data.iloc[testIndices].T)
    trainInput,trainResponse=trainMatrix[:-1],trainMatrix[-1]
    testInput,testResponse=testMatrix[:-1],testMatrix[-1]

    #Create target basis set function
    func=basisFunctionComplexityDiff(targetID,IDrange)

    #Evolve models using four approaches: ID-informed, ID plus complexity,
    #evolve's default metrics, and standard tournament on fitness only.
    #The dict literal is evaluated in order, so the runs happen in this sequence.
    shared=dict(generations=100,align=False,elitismRate=10,popSize=300)
    populations={
        'ID':evolve(trainInput,trainResponse,modelEvaluationMetrics=[fitness,func],tourneySize=20,**shared),
        'ID3O':evolve(trainInput,trainResponse,modelEvaluationMetrics=[fitness,stackGPModelComplexity,func],tourneySize=40,**shared),
        'Complexity':evolve(trainInput,trainResponse,tourneySize=20,**shared),
        'Tourney':evolve(trainInput,trainResponse,modelEvaluationMetrics=[fitness],tourneySize=5,**shared),
    }

    #Select the first model of each population and align it to the training data
    models={label:alignGPModel(population[0],trainInput,trainResponse)
            for label,population in populations.items()}

    #Evaluate models on test data
    testFitness={label:fitness(model,testInput,testResponse)
                 for label,model in models.items()}

    #Divide the residual norm by sqrt(n) so the stored value is a true
    #root-mean-square error rather than the raw L2 norm of the residuals.
    rootTestCount=np.sqrt(len(testResponse))
    testRMSE={label:np.linalg.norm(evaluateGPModel(model,testInput)-testResponse)/rootTestCount
              for label,model in models.items()}

    #Save results
    results=pd.DataFrame({label:[printGPModel(model),testFitness[label],testRMSE[label]]
                          for label,model in models.items()})
    os.makedirs('Results',exist_ok=True)  # to_csv does not create missing directories
    results.to_csv('Results/'+name+'.csv')

    #Return target models and fitnesses on test data
    return results


In [None]:
def runTrials(file, count, targetID=None, IDrange=None, name='trial'):
    """Run `count` experiments on `file` and collect their result tables.

    Parameters
    ----------
    file : forwarded to `runExperiment` as the data file.
    count : int
        Number of trials to run.
    targetID, IDrange : forwarded to `runExperiment`; both must be supplied
        when count > 0.
    name : str
        Prefix for each trial's output name; trial i uses name + '_' + str(i)
        so that trials do not share an output name.

    Returns
    -------
    list
        The value returned by `runExperiment` for each trial, in order.

    Raises
    ------
    TypeError
        If count > 0 and targetID or IDrange was not provided.
    """
    # runExperiment needs these arguments; fail early with a clear message.
    if count>0 and (targetID is None or IDrange is None):
        raise TypeError("runTrials requires targetID and IDrange to run experiments")

    #Create variables to store output
    trialResults=[]

    #Loop through trials
    for i in range(count):
        #Run experiments
        trialResults.append(runExperiment(file,targetID,IDrange,name+'_'+str(i)))

    #TODO: compute statistics across trials and save them to file

    #Return results
    return trialResults


def loadData():
    """Load statistics from a results file (not implemented yet).

    Raises
    ------
    NotImplementedError
        Always; the body is still a placeholder. A function body made only of
        comments is a syntax error, so an explicit statement is required.
    """
    #Load statistics from results file
    raise NotImplementedError("loadData is not implemented yet")


