In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random

In [2]:
df = pd.read_csv("vectorizedData.csv")

In [3]:
class Generation():
    
    def __init__(self, population, sln):
        self.population = population
        self.sln = sln
    
    #returns sorted population of all solutions from highest score to lowest
    def sortPop(self):
        return dict(sorted(self.population.items(),key=lambda x:x[1], reverse=True))
        
    #Select parents for mutation and child forming
    #we will pick the top X performering solutions
    #we will match them in order of performance(ex: (1,2), (2,3), ... (n, n+1))
    #returns a list o tuples representing the sln 
    def ParentSelection1(self, x):
        parents =list(self.sortPop())
        print(parents)
        parents = parents[:x]
        i=0
        res = []
        while i<x-1:
            res.append((i,i+1))
            i+=1
        return res
    
    #similar to ParentSelection1, but uses the top performing solution as parent for all new children
    #ex; (1,2),(1,3),...(1,n) 
    def ParentSelection2(self,x):
        parents = self.sortPop()
        parents = parents[:x]
        i=0
        res = []
        while i<x-1:
            res.append((0,i+1))
            i+=1
        return res

    #Return next generation of children,array of genes
    def ProducenewGen(self, parents):
        newsln = []
        for m,d in parents:
            geneM = self.sln[m]
            geneD = self.sln[d]
            #perform single point crossover
            child1,child2 = MultiplePointCrossoverFct(geneM,geneD, [250,500,750,1000])
            newsln.extend([list(child1),list(child2)])
        topPerf1,topPerf2 = list(self.sortPop())[0],list(self.sortPop())[1]
        newsln.extend([self.sln[topPerf1],self.sln[topPerf2]])
        return newsln
    
    #genetic mutation on children genes 
    #modify randomly 10 binary values in gene
    #return new generation
    def mutation(self):
        nextSln = np.copy(self.sln)
        for gene in nextSln:
            for _ in range(10):
                randomIndex = random.randint(0, len(gene)-1)  # Fixed indexing
                gene[randomIndex] = 1 - gene[randomIndex]  # Flipping 0 to 1 and vice versa
        return nextSln

In [4]:
def singlePointCrossoverFct(gene1,gene2,x):
    newA = np.append(gene1[:x], gene2[x:])
    newB = np.append(gene2[:x], gene1[x:])
    return newA, newB

In [5]:
def MultiplePointCrossoverFct(gene1,gene2,points):
    for x in points:
        print(x)
        newA = np.append(gene1[:x], gene2[x:])
        newB = np.append(gene2[:x], gene1[x:])
    return newA, newB

In [6]:
def splitData(em,tf=0.75): #tf = training fraction & em=embbeded dataframe
    X = em.iloc[:,1:]
    Xtrain = X[:int(tf*X.shape[0])]
    Xtest = X[int(tf*X.shape[0]):] 
    Y = em.iloc[:,1]
    Ytrain = Y[:int(tf*Y.shape[0])]
    Ytest = Y[int(tf*Y.shape[0]):] 
    return Xtrain, Xtest, Ytrain, Ytest

In [7]:
#create genetic representation of sln, binary representation
def generateGeneticSLN(df):
    l = df.shape[1]-1
    gene = []
    for i in range(l):
        gene.append(random.randint(0,1))
    return gene

In [8]:
def generateDFSolution(gene,X_train, X_test):
    sln_train= pd.DataFrame()
    sln_test= pd.DataFrame()
    for i in range(1,len(gene)):
        curr = gene[i]
        if curr!=0:
            #select feature column and add in solution df
            tr = X_train.iloc[:,i]
            te = X_test.iloc[:,i]
            sln_train = pd.concat([sln_train, tr], axis=1)
            sln_test = pd.concat([sln_test, te], axis=1)
            
    return sln_test, sln_train

In [9]:
def genHeuristicValue(gene,X_train, X_test,Y_train,Y_Test):
    sln_test, sln_train = generateDFSolution(gene,X_train, X_test)
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(sln_train, Y_train)
    #testing set and get accuracy
    y_pred = clf.predict(sln_test)
    return accuracy_score(Y_Test, y_pred)

In [10]:
def initialPopulation(df, size):
    Xtrain, Xtest, Ytrain, Ytest = splitData(df)
    #we will generate a dictionary where each key value pair represents a sln and its heuristic value 
    #initialize our genetic algorithm
    population = { }
    sln = []
    for i in range(size):
        sln.append(generateGeneticSLN(df))
        population[i] = genHeuristicValue(sln[i],Xtrain, Xtest, Ytrain, Ytest)
        print(population)
    return population, sln

In [11]:
population,sln = initialPopulation(df, 10)
gen0 = Generation(population,sln)
print(gen0.population)

{0: 0.9440057430007178}
{0: 0.9440057430007178, 1: 0.955491744436468}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687, 4: 0.9583632447954056}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687, 4: 0.9583632447954056, 5: 0.9519023689877961}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687, 4: 0.9583632447954056, 5: 0.9519023689877961, 6: 0.9511844938980617}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687, 4: 0.9583632447954056, 5: 0.9519023689877961, 6: 0.9511844938980617, 7: 0.955491744436468}
{0: 0.9440057430007178, 1: 0.955491744436468, 2: 0.9669777458722182, 3: 0.968413496051687, 4: 0.9583632447954056, 5: 0.9519023689877961, 6: 0.9511844938980617, 7: 0.955491744436

In [22]:
#perform genetic feature selection
def geneticFeatureSelection(df, initPop_size, nbGens):
    Xtrain, Xtest, Ytrain, Ytest = splitData(df)
    
    population,sln = initialPopulation(df, initPop_size)
    #define initial generation
    gen0 = Generation(population,sln)
    tree = [gen0]
    for i in range(nbGens):
        gen = tree[i]
        parents = gen.ParentSelection1(int(0.5*len(gen.sln)))
        newSln = gen.ProducenewGen(parents)
        newpop = {}
        Xtrain, Xtest, Ytrain, Ytest = splitData(df)
        for i in range(len(newSln)):
                newpop[i] = genHeuristicValue(newSln[i],Xtrain, Xtest, Ytrain, Ytest)
                print(newpop)
        newgen = Generation(newpop,newSln)
        tree.append(newgen)
    return tree[-1].population
                


In [27]:
Xtrain, Xtest, Ytrain, Ytest = splitData(df)
    
population,sln = initialPopulation(df, 10)
#define initial generation
gen0 = Generation(population,sln)
tree = [gen0]

{0: 0.949748743718593}
{0: 0.949748743718593, 1: 0.9483129935391242}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617, 4: 0.9526202440775305}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617, 4: 0.9526202440775305, 5: 0.955491744436468}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617, 4: 0.9526202440775305, 5: 0.955491744436468, 6: 0.9676956209619526}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617, 4: 0.9526202440775305, 5: 0.955491744436468, 6: 0.9676956209619526, 7: 0.9576453697056713}
{0: 0.949748743718593, 1: 0.9483129935391242, 2: 0.95908111988514, 3: 0.9511844938980617, 4: 0.9526202440775305, 5: 0.955491744436468, 6: 0.9676956209619526, 7: 0.9576453697056713, 8: 0.9

In [28]:
for i in range(5):
    gen = tree[i]
    parents = gen.ParentSelection1(int(0.5*len(gen.sln)))
    newSln = gen.ProducenewGen(parents)
    newpop = {}
    Xtrain, Xtest, Ytrain, Ytest = splitData(df)
    for i in range(len(newSln)):
            newpop[i] = genHeuristicValue(newSln[i],Xtrain, Xtest, Ytrain, Ytest)
            print(newpop)
    newgen = Generation(newpop,newSln)
    tree.append(newgen)

250
500
750
1000
250
500
750
1000
250
500
750
1000
250
500
750
1000
{0: 0.9583632447954056}
{0: 0.9583632447954056, 1: 0.955491744436468}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649, 4: 0.949748743718593}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649, 4: 0.949748743718593, 5: 0.968413496051687}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649, 4: 0.949748743718593, 5: 0.968413496051687, 6: 0.9533381191672649}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649, 4: 0.949748743718593, 5: 0.968413496051687, 6: 0.9533381191672649, 7: 0.9569274946159368}
{0: 0.9583632447954056, 1: 0.955491744436468, 2: 0.9676956209619526, 3: 0.9533381191672649, 4: 0.9497487437185