In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
# Seed NumPy's global RNG: every optimiser below draws its populations,
# velocities and mutations from np.random, so without a seed no run of this
# notebook can be reproduced.
SEED = 2
np.random.seed(SEED)
# Base learner that is re-fitted for every candidate feature subset.
clf = lgb.LGBMClassifier(random_state=SEED)
# Number of independent repetitions of each optimiser.
runs=50

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the training subset of the 20 Newsgroups corpus (downloaded on first use).
newsgroups_train = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Two-column frame: 'v1' holds newsgroups_train.target, 'v2' holds newsgroups_train.data.
df=pd.DataFrame({'v1':newsgroups_train.target,'v2':newsgroups_train.data})

In [None]:
#df=pd.read_csv('/content/spam.csv',encoding='latin1')
# Derive labels and features WITHOUT overwriting `df`: dropping 'v1' from `df`
# itself makes the cell fail with a KeyError as soon as it is run a second time.
df_target=df['v1']
# Only matters for the spam.csv variant above; other label values pass through unchanged.
df_target=df_target.replace('spam',1).replace('ham',0)
df_features=df[['v2']]
# Step 1: keep a stratified 20% sample of the corpus (the other 80% is discarded).
X_pool, X_rest, y_pool, y_rest = train_test_split(df_features,df_target, test_size=0.80, random_state=42,stratify=df_target)
# Step 2: split that sample 70/30 into the train and test sets used below.
X_train, X_test, y_train, y_test = train_test_split(X_pool,y_pool, test_size=0.30, random_state=42,stratify=y_pool)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vocabulary learned on the training texts only: lower-cased, English
# stop words removed, at most 300 terms, each present in at least 1% of documents.
tfidf_options = dict(lowercase=True, stop_words='english', max_features=300, min_df=0.01)
tfidf = TfidfVectorizer(**tfidf_options)
train_corpus = X_train['v2'].astype('str')
tfidf.fit(train_corpus)

def returnCV(df,tfidf):
  """Turn the text column of ``df`` into a dense TF-IDF feature frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a text column named 'v2'.
  tfidf : fitted vectorizer
      Object exposing ``transform`` and ``get_feature_names_out`` (or the
      legacy ``get_feature_names``).

  Returns
  -------
  pandas.DataFrame
      One row per input row (fresh 0..n-1 index) and one column per
      vocabulary term, named ``'feat_<term>'`` with spaces replaced by '_'.
  """
  # Cast to str exactly as is done when the vectorizer is fitted.
  text = tfidf.transform( df['v2'].astype('str').values )
  # get_feature_names() was removed in scikit-learn 1.2; use the replacement
  # when available and fall back to the old name on older versions.
  if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = tfidf.get_feature_names_out()
  else:
    vocabulary = tfidf.get_feature_names()
  txt_features_cols = ['feat_' + col.replace(' ','_') for col in vocabulary]
  # toarray() yields a plain ndarray instead of the discouraged np.matrix.
  txt_features = pd.DataFrame(text.toarray(), columns=txt_features_cols)
  return txt_features.reset_index(drop=True)

# Replace the raw-text frames by their TF-IDF feature frames ('feat_<term>' columns).
# NOTE: after this the frames no longer have a 'v2' column, so this cell cannot be
# re-run on its own; re-run the train/test split cell first.
X_train=returnCV(X_train,tfidf)
X_test=returnCV(X_test,tfidf)

In [None]:
# (number of training rows, number of TF-IDF feature columns)
X_train.shape

(1583, 300)

In [None]:
# df=pd.read_csv('/content/musk_csv.csv')
# df=df.copy()
# df_target=df['class']
# df=df.drop(columns=['ID','molecule_name','conformation_name','class'])
# X_train, X_test, y_train, y_test = train_test_split(df,df_target, test_size=0.30, random_state=42,stratify=df_target)
# name='Musk'
# X_train.shape

In [None]:
# Total number of feature columns; read as a global by score_model / score_model_gen
# to normalise the "fraction of features kept" term of the objective.
data_shape=X_train.shape[1]

In [None]:
import scipy
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling for the figures drawn later.
sns.set()

def score_model(model,X_train, y_train, X_valid, y_valid, alpha=0.01, n_total_features=None):
    """Fit ``model`` and return the feature-selection objective (lower is better).

    The objective is ``alpha * (k / n_total_features) + (1 - alpha) * P`` where
    ``k`` is the number of columns in ``X_valid`` and ``P`` is the
    misclassification rate of the fitted model on ``(X_valid, y_valid)``.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : data used to fit the model
    X_valid, y_valid : data used to measure the error rate
    alpha : float, default 0.01
        Weight of the subset-size penalty.
    n_total_features : int, optional
        Size of the full feature set. When omitted, the notebook-level global
        ``data_shape`` is used, which is the previous behaviour.

    Returns
    -------
    float
    """
    if n_total_features is None:
        # Backward-compatible default: the global defined from X_train.shape[1].
        n_total_features = data_shape
    model.fit(X_train,y_train)
    # Alternative objective kept for reference:
    # return log_loss(y_valid,model.predict_proba(X_valid))
    P = (model.predict(X_valid) != y_valid).mean()
    # Weighted sum of relative subset size and error rate.
    j = alpha*(X_valid.shape[1]/n_total_features)+(1-alpha)*P
    return j

In [None]:
class SwarmGenerator:
  """Binary particle swarm optimisation for wrapper feature selection.

  Each particle is a 0/1 vector with one entry per feature column. The
  objective function is minimised; it is called as
  ``objective_function(model, X_train_subset, y_train, X_valid_subset, y_valid)``.

  Parameters
  ----------
  objective_function : callable
      Returns the score to minimise for one feature subset.
  n_iteration : int
      Number of swarm updates.
  population_size : int
      Number of particles.
  c1 : float
      Weight of the attraction towards the global best position.
  c2 : float
      Weight of the attraction towards each particle's own best position.
  w : float
      Inertia weight applied to the previous velocity.
  """

  def __init__(self,objective_function,n_iteration=50,population_size=50,
               c1=2,c2=2,w=0.9):
    self.population_size = population_size
    self.n_iteration=n_iteration
    self.historical_score_record=[]
    self.objective_function=objective_function
    self.c1=c1
    self.c2=c2
    self.w=w

  def initialize_population(self,X):
    """Draw a uniformly random 0/1 matrix of shape (population_size, X.shape[1])."""
    self.individuals = np.random.randint(0,2,size=(self.population_size,X.shape[1]))

  def evaluate_fitness(self,model,X_train,y_train,X_valid,y_valid):
    """Score every particle and refresh the per-particle best memory.

    Stores the list of scores in ``self.fitness_scores``. Whenever a particle
    beats its own best score, both the score and the position are recorded.
    """
    scores = []
    for i,individual in enumerate(self.individuals):
      chosen_features = np.flatnonzero(individual==1)
      X_train_copy = X_train.iloc[:,chosen_features]
      X_valid_copy = X_valid.iloc[:,chosen_features]
      score = self.objective_function(model,X_train_copy,y_train,X_valid_copy,y_valid)
      if score < self.current_best_scores[i]:
        self.current_best_scores[i]=score
        self.current_best_individual_score_dimensions[i]=individual
      scores.append(score)
    self.fitness_scores = scores

  def sigmoid(self,x):
    """Logistic function, used to map velocities to bit probabilities."""
    return 1/(1+np.exp(-x))

  def fit(self,model,X_train,y_train,X_valid,y_valid):
    """Run the swarm and return the best feature mask found.

    Returns
    -------
    best_dim : numpy.ndarray
        0/1 mask of the best-scoring particle seen in any iteration.
    list_best_fitness : list
        Best score so far, one entry per iteration.
    list_mean_fitness : list
        Mean score of the swarm, one entry per iteration.
    list_min_fitness : list
        Minimum score of the swarm, one entry per iteration.
    """
    list_best_fitness=[]
    list_mean_fitness=[]
    list_min_fitness=[]

    n_features = X_train.shape[1]
    swarm_shape = (self.population_size, n_features)
    self.initialize_population(X_train)

    self.best_score=np.inf
    self.best_dim=np.ones(n_features)

    # Copy: the personal-best memory must not share storage with the live
    # population, otherwise in-place edits of one silently change the other.
    self.current_best_individual_score_dimensions=self.individuals.copy()
    self.current_best_scores = [np.inf]*self.population_size
    self.gbest_individual=self.best_dim
    self.v=np.zeros(swarm_shape)

    for _ in range(self.n_iteration):
      # Particles that select no feature at all are re-drawn at random.
      empty = self.individuals.sum(axis=1)==0
      if empty.any():
        print(empty.sum(),' individuals went zero')
        self.individuals[empty]=np.random.randint(0,2,(empty.sum(),n_features))
      self.evaluate_fitness(model,X_train,y_train,X_valid,y_valid)

      scores = np.asarray(self.fitness_scores)
      if scores.min()<self.best_score:
        # Copy so the stored global best is independent of any swarm array.
        self.best_dim=self.individuals[scores.argmin()].copy()
        self.best_score=scores.min()
      self.gbest_individual=self.best_dim

      r1=np.random.random(swarm_shape)
      r2=np.random.random(swarm_shape)

      # Inertia + pull towards global best (c1) + pull towards own best (c2).
      self.v=self.w*self.v+self.c1*r1*(self.gbest_individual-self.individuals)+\
             self.c2*r2*(self.current_best_individual_score_dimensions-self.individuals)
      # Clamp so the sigmoid below never saturates completely.
      self.v=np.clip(self.v,-6,6)

      list_best_fitness.append(self.best_score)
      list_mean_fitness.append(scores.mean())
      list_min_fitness.append(scores.min())

      # Each bit becomes 1 with probability sigmoid(velocity).
      self.s_v=self.sigmoid(self.v)
      self.individuals=np.where(np.random.uniform(size=swarm_shape)<self.s_v,1,0)

    return self.best_dim,   list_best_fitness,\
                list_mean_fitness,list_min_fitness

In [None]:
# Repeat the plain binary PSO `runs` times (100 iterations, 10 particles) and
# keep, per run, the best mask and the three per-iteration fitness curves.
list_res_org, list_best_fitness_org = [], []
list_mean_fitness_org, list_min_fitness_org = [], []
pso_outputs = (list_res_org, list_best_fitness_org,
               list_mean_fitness_org, list_min_fitness_org)
for i in range(runs):
  print(i)
  gaf = SwarmGenerator(score_model, n_iteration=100, population_size=10)
  run_result = gaf.fit(clf, X_train, y_train, X_test, y_test)
  for sink, value in zip(pso_outputs, run_result):
    sink.append(value)

0


In [None]:
class SwarmGenerator_nope:
  """Binary PSO variant whose velocity update has an extra, neighbourhood-based term.

  Same interface as SwarmGenerator: particles are 0/1 feature masks and
  ``objective_function`` is minimised. In addition to the global-best and
  personal-best attractions, each particle is pulled towards a population
  member chosen from a fitness-weighted average of nearby particles (see fit).
  """

  def __init__(self,objective_function,n_iteration=50,population_size=50,
               c1=2,c2=2,w=0.9):
    # objective_function(model, X_train_subset, y_train, X_valid_subset, y_valid) -> score to minimise
    self.population_size = population_size
    self.n_iteration=n_iteration
    # Never read or written elsewhere in this class.
    self.historical_score_record=[]
    self.objective_function=objective_function
    # c1 weights the global-best term and c2 the personal-best term in fit().
    self.c1=c1
    self.c2=c2
    # Inertia weight applied to the previous velocity.
    self.w=w

  def initialize_population(self,X):
        """Draw a random 0/1 matrix of shape (population_size, X.shape[1])."""
        self.individuals =  np.random.randint(0,2,size=(self.population_size,X.shape[1]))


  def evaluate_fitness(self,model,X_train,y_train,X_valid,y_valid):
        """Score every particle; update per-particle best scores/positions and self.fitness_scores."""
        scores =  []
        for i,individual in enumerate(self.individuals):
            # Column positions whose mask bit is 1.
            chosen_features = [index for index in range(X_train.shape[1]) if individual[index]==1]
            X_train_copy = X_train.iloc[:,chosen_features]
            X_valid_copy = X_valid.iloc[:,chosen_features]
            score = self.objective_function(model,X_train_copy,y_train,X_valid_copy,y_valid)
            # Lower is better: remember the particle's best score and position so far.
            if score< self.current_best_scores[i]:
              self.current_best_scores[i]=score
              self.current_best_individual_score_dimensions[i]=individual

            scores.append(score)
        self.fitness_scores = scores


  def sigmoid(self,x):
    """Logistic function, used to turn velocities into bit probabilities."""
    return 1/(1+np.exp(-x))

  def fit(self,model,X_train,y_train,X_valid,y_valid):
    """Run the swarm for n_iteration steps.

    Returns (best_dim, list_best_fitness, list_mean_fitness, list_min_fitness):
    the best mask found and, per iteration, the best-so-far score, the mean
    score of the swarm and the minimum score of the swarm.
    """
    list_best_fitness=[]
    list_mean_fitness=[]
    list_min_fitness=[]
    self.initialize_population(X_train)

    self.best_score=np.inf
    self.best_dim=np.ones(X_train.shape[1]) 
    # Collects best_score per iteration; never returned or read.
    values_list=[]

    # NOTE(review): no copy is taken, so the personal-best memory is the same array
    # object as the initial population until self.individuals is rebound below.
    self.current_best_individual_score_dimensions=self.individuals
    self.current_best_scores = [np.inf]*self.population_size 
    self.gbest_individual=self.best_dim
    self.v=np.zeros((self.population_size,X_train.shape[1]))

    for i in range(self.n_iteration):
      # Particles selecting no feature are re-drawn at random (in place).
      if (self.individuals.sum(axis=1)==0).sum()>0:
        print((self.individuals.sum(axis=1)==0).sum(),' individuals went zero')
        self.individuals[self.individuals.sum(axis=1)==0]=np.random.randint(0,2,(self.individuals[self.individuals.sum(axis=1)==0].shape[0],\
                                                                                self.individuals[self.individuals.sum(axis=1)==0].shape[1]))
      self.evaluate_fitness(model,X_train,y_train,X_valid,y_valid)

      

      # Track the best score/position seen in any iteration (best_dim is a view, not a copy).
      if np.array(self.fitness_scores).min()<self.best_score:
        self.best_dim=self.individuals[np.array(self.fitness_scores).argmin()]
        self.best_score=np.array(self.fitness_scores).min()
      self.gbest_individual=self.best_dim

      
      # Column names chosen by this iteration's best particle; overwritten below before any use.
      temp=list(X_train.columns[np.where( self.individuals[np.array(self.fitness_scores).argmin()] )])

       
      r1=np.random.random((self.population_size,X_train.shape[1]))
      r2=np.random.random((self.population_size,X_train.shape[1]))

      # Min-max normalise the scores so the lowest score maps to qi=1 and the highest to qi=0,
      # then rescale to Mi in [-1, 1]. If all scores are equal this is 0/0 (NaN).
      self.fitness_scores_numpy=np.array(self.fitness_scores)
      self.qi=np.array(self.fitness_scores_numpy-self.fitness_scores_numpy.max())/(self.fitness_scores_numpy.min()-self.fitness_scores_numpy.max())
      self.Mi=self.qi
      self.Mi=2*self.Mi-1
    

      temp=self.individuals
      # temp_2: squared element-wise differences for every ordered pair of particles,
      # flattened to (population**2, n_features).
      temp_2=(( (temp.reshape(temp.shape[0],1,temp.shape[1])-temp.reshape(1,temp.shape[0],temp.shape[1])).reshape(temp.shape[0]**2,temp.shape[1])**2) )
      # temp_3: drop the self-pairs (rows 0, P+1, 2(P+1), ...) and sum over features, giving a
      # (population, population-1) table of squared distances from each particle to the others.
      temp_3=np.delete(temp_2,tuple(np.arange(0,temp.shape[0]**2,temp.shape[0]+1)),axis=0 ).reshape(temp.shape[0],temp.shape[0]-1,temp.shape[1]).sum(axis=2)

      # (row, column) positions of pairs closer than half / a quarter of the feature count.
      temp_4=np.where(temp_3<int(X_train.shape[1]/2) )
      temp_4_R0=np.where(temp_3<int(X_train.shape[1]/4))
      
     
      self.acc=[]
      self.acc_R0=[]

      self.acc_individual=[]

      Mi=self.Mi
      for ind_index,individual in enumerate(temp):   
        # NOTE(review): temp_4[1] holds column positions of temp_3, i.e. positions in the
        # "all other particles" axis, yet it is used to index Mi (population axis) — these
        # differ by one for neighbours that come after ind_index. Likewise `indexs` is the
        # argsort of that subset (values 0..k-1) but is used below to index `temp` and `Mi`
        # directly. Confirm whether this indexing is intended before relying on it.
        indexs=np.argsort(self.Mi[temp_4[1][np.where(temp_4[0]==ind_index)]] )[:int(len(temp_4[1][np.where(temp_4[0]==ind_index)])/1)]
        indexs_R0=np.argsort(self.Mi[temp_4_R0[1][np.where(temp_4_R0[0]==ind_index)]] )[:int(len(temp_4_R0[1][np.where(temp_4_R0[0]==ind_index)])/1)] 
        
        # Per feature: sum of Mi over the selected rows that have the bit set, divided by
        # how many of those rows have the bit set; 0/0 (NaN) is replaced by 0.
        temp_5=temp[indexs]*np.repeat(Mi[indexs],temp.shape[1]).reshape(Mi[indexs].shape[0],temp.shape[1]) 
        temp_6=temp_5.sum(axis=0)/(temp[indexs].sum(axis=0))
        temp_6[np.isnan(temp_6)] = 0

        acc1=temp_6

   
        self.acc.append(list(temp_6))

        # Same computation for the tighter (quarter) radius.
        temp_5=temp[indexs_R0]*np.repeat(Mi[indexs_R0],temp.shape[1]).reshape(Mi[indexs_R0].shape[0],temp.shape[1]) 
        temp_6=temp_5.sum(axis=0)/(temp[indexs_R0].sum(axis=0))
        temp_6[np.isnan(temp_6)] = 0
        self.acc_R0.append(list(temp_6))

        # Blend the two radii: 80% wide neighbourhood, 20% tight neighbourhood.
        temp_6=(0.8*acc1+0.2*temp_6)

        # Attraction target: the population member closest (Euclidean) to the blended vector.
        self.acc_individual.append( list(self.individuals[np.sqrt(((self.individuals-temp_6)**2).sum(axis=1)).argmin()]))

      self.acc=np.array(self.acc)
      self.acc_R0=np.array(self.acc_R0)

      values_list.append(self.best_score )
      
      r3=np.random.random((self.population_size,X_train.shape[1]))
      # r4 is drawn but only referenced in the commented-out term below.
      r4=np.random.random((self.population_size,X_train.shape[1]))

      # Inertia + global-best pull (c1) + personal-best pull (c2) + neighbourhood pull (fixed weight 2).
      self.v=self.w*(self.v ) +self.c1*r1*(  (self.gbest_individual ) -self.individuals)+\
       self.c2*r2*( (self.current_best_individual_score_dimensions  ) -self.individuals)+2*r3*( np.array(self.acc_individual)-self.individuals )#+2*r4*( np.array(self.acc_individual2)-self.individuals )

      # Clamp velocities to [-6, 6].
      self.v=np.where(self.v>6,6,self.v)
      self.v=np.where(self.v<-6,-6,self.v)

      list_best_fitness.append(self.best_score)
      list_mean_fitness.append(np.array(self.fitness_scores).mean())
      list_min_fitness.append(np.array(self.fitness_scores).min())
      
      self.s_v=self.sigmoid(self.v)

      # Resample every bit: 1 with probability sigmoid(velocity). Rebinds self.individuals to a new array.
      rand_nums=np.random.uniform(size=(self.population_size,X_train.shape[1]))
      self.individuals=np.where(rand_nums<self.s_v,1,0)

    return self.best_dim,   list_best_fitness,\
                list_mean_fitness,list_min_fitness

In [None]:
import warnings
# Silence all warnings from here on.
warnings.filterwarnings("ignore")
# Repeat the neighbourhood-augmented PSO `runs` times (100 iterations, 10 particles)
# and keep, per run, the best mask and the three per-iteration fitness curves.
list_res_new, list_best_fitness_new = [], []
list_mean_fitness_new, list_min_fitness_new = [], []
proposed_outputs = (list_res_new, list_best_fitness_new,
                    list_mean_fitness_new, list_min_fitness_new)
for i in range(runs):
  print(i)
  gaf = SwarmGenerator_nope(score_model, n_iteration=100, population_size=10)
  run_result = gaf.fit(clf, X_train, y_train, X_test, y_test)
  for sink, value in zip(proposed_outputs, run_result):
    sink.append(value)

In [None]:
import scipy

class GAFeatureSelector:
    """Genetic algorithm for wrapper feature selection.

    Individuals are 0/1 masks with one gene per feature column. The objective
    is MAXIMISED; it is called as
    ``objective_function(model, X_train_subset, y_train, X_valid_subset, y_valid)``.

    Parameters
    ----------
    objective_function : callable
        Returns the fitness (higher is better) of one feature subset.
    selective_pressure : float
        Multiplier applied to the fitness ranks used for selection.
    elitism : int
        Number of top-ranked individuals always placed in the mating pool.
    mutation_rate : float
        Per-gene probability of flipping a bit in each child.
    n_generations : int
        Number of generations to run.
    population_size : int
        Number of individuals per generation.
    """

    def __init__(self,objective_function,selective_pressure=2,elitism=2,mutation_rate=0.05,
                                                     n_generations=50,population_size=50):
        self.n_generations = n_generations
        self.selective_pressure = selective_pressure
        self.population_size = population_size
        self.objective_function = objective_function
        self.elitism = elitism
        self.mutation_rate = mutation_rate

    def evaluate_fitness(self,model,X_train,y_train,X_valid,y_valid):
        """Score the population, update the best-so-far record and compute rank fitness."""
        # Local import: a bare `import scipy` does not guarantee that the
        # `scipy.stats` submodule has been loaded.
        from scipy.stats import rankdata

        scores = []
        for individual in self.individuals:
            chosen_features = np.flatnonzero(individual == 1)
            X_train_copy = X_train.iloc[:,chosen_features]
            X_valid_copy = X_valid.iloc[:,chosen_features]
            scores.append(self.objective_function(model,X_train_copy,y_train,X_valid_copy,y_valid))

        self.fitness_scores = scores
        current_best_score = np.max(self.fitness_scores)
        if current_best_score > self.best_score:
            self.best_score = current_best_score
            # Copy so the stored mask is independent of the population array.
            self.best_feature_set = self.individuals[np.argmax(self.fitness_scores),:].copy()

        # Higher score -> higher rank -> higher selection weight.
        ranks = rankdata(scores,method = 'average')
        self.fitness_ranks = self.selective_pressure * ranks

    def select_individuals(self,model, X_train, y_train, X_valid, y_valid):
        """Build the mating pool: the elite plus rank-proportional draws from the rest."""
        self.evaluate_fitness(model, X_train, y_train, X_valid, y_valid)

        # Stable sort of population indices, best rank first.
        order = sorted(range(len(self.individuals)),key=lambda k:self.fitness_ranks[k],reverse=True)
        ranked_individuals = self.individuals[order]
        ranked_fitness = self.fitness_ranks[order]

        # Slicing keeps the 2-D shape even when elitism is 0.
        elite_individuals = ranked_individuals[:self.elitism]

        #Selecting Non elite individuals with probability proportional to their fitness
        non_elite_individuals = ranked_individuals[self.elitism:]
        non_elite_individuals_fitness = ranked_fitness[self.elitism:]
        selection_probability = non_elite_individuals_fitness/np.sum(non_elite_individuals_fitness)

        selected_indices = np.random.choice(len(non_elite_individuals),self.population_size//2, p=selection_probability)
        selected_individuals = non_elite_individuals[selected_indices,:]
        self.fit_individuals = np.vstack((elite_individuals,selected_individuals))

    def mutate(self,array):
        """Return a copy of ``array`` in which each gene is flipped with probability mutation_rate.

        The input is left untouched. The previous version flipped genes in the
        input but returned the pre-mutation copy, which callers then wrote
        back, so mutation never had any effect.
        """
        mutated_array = np.copy(array)
        for idx, gene in enumerate(array):
            if np.random.uniform() < self.mutation_rate:
                mutated_array[idx] = 1 if gene == 0 else 0

        return mutated_array

    def produce_next_generation(self):
        """Replace the population by mutated single-point-crossover children of the mating pool."""
        new_population = np.empty(shape=(self.population_size,self.individuals.shape[1]),dtype=np.int32)
        for i in range(0,self.population_size,2):
            # Two distinct parents from the mating pool.
            parents = self.fit_individuals[np.random.choice(self.fit_individuals.shape[0], 2, replace=False), :]
            crossover_index = np.random.randint(0,len(self.individuals[0]))
            first_child = np.hstack((parents[0][:crossover_index],parents[1][crossover_index:]))
            second_child = np.hstack((parents[1][:crossover_index],parents[0][crossover_index:]))

            new_population[i] = self.mutate(first_child)
            # With an odd population size the last pair contributes one child only.
            if i+1 < self.population_size:
                new_population[i+1] = self.mutate(second_child)
        self.individuals = new_population

    def initialize_population(self,X):
        """Draw a random 0/1 matrix of shape (population_size, X.shape[1])."""
        self.individuals =  np.random.randint(0,2,size=(self.population_size,X.shape[1]))

    def fit(self,model,X_train,y_train,X_valid,y_valid):
        """Run the GA and return the best feature mask found.

        Returns
        -------
        best_feature_set : numpy.ndarray
            0/1 mask of the best-scoring individual of any evaluated generation.
        list_best_fitness : list
            Best score so far, one entry per generation.
        list_mean_fitness : list
            Mean score of the generation, one entry per generation.
        list_min_fitness : list
            Minimum score of the generation, one entry per generation.
        """
        list_best_fitness=[]
        list_mean_fitness=[]
        list_min_fitness=[]
        self.initialize_population(X_train)
        # np.float was removed in NumPy 1.24; -np.inf is already a float.
        self.best_score = -np.inf
        self.best_feature_set = None
        self.best_scores = []

        for i in range(self.n_generations):
            self.select_individuals(model,X_train,y_train,X_valid,y_valid)
            self.produce_next_generation()
            self.best_scores.append(self.best_score)

            list_best_fitness.append(self.best_score)
            list_mean_fitness.append(np.array(self.fitness_scores).mean())
            list_min_fitness.append(np.array(self.fitness_scores).min())

        return  self.best_feature_set,list_best_fitness,\
                list_mean_fitness,list_min_fitness



In [None]:
# Per-run collectors for the genetic algorithm: fitness curves and best masks.
list_best_fitness_n, list_mean_fitness_n = [], []
list_min_fitness_n, list_res_n = [], []
def score_model_gen(model,X_train, y_train, X_valid, y_valid):
    """Negated ``score_model`` objective, for optimisers that maximise.

    ``score_model`` returns a cost (lower is better); the genetic algorithm
    maximises, so it is given the same quantity with the sign flipped.
    Delegating keeps the two objectives from drifting apart.
    """
    return -score_model(model,X_train, y_train, X_valid, y_valid)
# Repeat the genetic algorithm `runs` times and keep, per run, the best mask
# and the three per-generation fitness curves (scores are negated costs).
ga_outputs = (list_res_n, list_best_fitness_n,
              list_mean_fitness_n, list_min_fitness_n)
for i in range(runs):
  print(i)
  gaf = GAFeatureSelector(score_model_gen, population_size=10,
                          n_generations=100, mutation_rate=0.08)
  run_result = gaf.fit(clf, X_train, y_train, X_test, y_test)
  for sink, value in zip(ga_outputs, run_result):
    sink.append(value)

In [None]:
import matplotlib.style as style
style.use('fivethirtyeight')

# The data loaded in this notebook is 20 Newsgroups (fetch_20newsgroups), not
# the spam corpus the figure title used to name.
DATASET_NAME = '20 Newsgroups'

# (runs x iterations) best-so-far curves; GA scores are negated costs, so flip
# the sign to put all three methods on the same "lower is better" scale.
convergence_curves = [
    (-np.array(list_best_fitness_n), {'color': 'orange'}, 'Genetic Algorithm'),
    (np.array(list_best_fitness_org), {}, 'Particle swarm optimization'),
    (np.array(list_best_fitness_new), {'color': 'red'}, 'Proposed Solution'),
]

f ,ax= plt.subplots(1,2,figsize=(13,5))
# Left panel: mean over runs. Right panel: minimum over runs.
panels = ((ax[0], np.mean, 'mean'), (ax[1], np.min, 'minimum'))
for axis, reducer, reducer_name in panels:
  for curves, line_style, label in convergence_curves:
    # x-axis length follows the data instead of a hard-coded 100.
    axis.plot(np.arange(curves.shape[1]), reducer(curves, axis=0), label=label, **line_style)
  axis.legend(prop={'weight':'bold'})
  axis.set_title('best fitness {} value across {} runs'.format(reducer_name, runs),{'fontsize':15},pad=5)
  axis.set_ylabel('Fitness scores')
  axis.set_xlabel('Number of Iterations')

f.suptitle('{} Data Set'.format(DATASET_NAME),y=1)

In [None]:
# Best-so-far fitness at the LAST iteration of every run ([:, -1] follows the
# configured iteration count instead of assuming exactly 100 iterations).
# GA scores are negated costs, hence the sign flip.
alld=[np.array(list_best_fitness_new)[:,-1],-np.array(list_best_fitness_n)[:,-1],np.array(list_best_fitness_org)[:,-1]]
f ,ax= plt.subplots(figsize=(10,5))
plt.boxplot(x=alld)
plt.xticks([1,2,3],['Proposed Solution','Genetic Algorithm','Particle swarm optimization'],rotation=15)
# The data loaded in this notebook is 20 Newsgroups, not a spam corpus.
f.suptitle('{} Data Set , Best fitness values across {} runs'.format('20 Newsgroups',runs),y=1)
plt.show()

In [None]:
def _masked_test_accuracy(mask):
  """Refit clf on the columns flagged by mask and return its accuracy on X_test/y_test."""
  selected = X_train.columns[np.where(mask)]
  clf.fit(X_train[selected],y_train)
  return (clf.predict(X_test[selected])==y_test).mean()

# One accuracy per run, for the best mask each optimiser returned.
org_accuracy=[_masked_test_accuracy(x) for x in list_res_org]
new_accuracy=[_masked_test_accuracy(x) for x in list_res_new]
n_accuracy=[_masked_test_accuracy(x) for x in list_res_n]

In [None]:
def _print_summary(final_fitness, accuracies):
  """Print min, max, mean and std of final_fitness (rounded to 5 decimals), then the mean accuracy."""
  lowest = np.round(final_fitness.min(),5)
  highest = np.round(final_fitness.max(),5)
  average = np.round(final_fitness.mean(),5)
  spread = np.round(final_fitness.std(),5)
  print( lowest,' ',highest,' ',average,' ',spread,' ',np.array(accuracies).mean()   )

# Rows: proposed method, genetic algorithm (sign flipped back to a cost), plain PSO.
_print_summary(np.array(list_best_fitness_new)[:,99], new_accuracy)
_print_summary(-np.array(list_best_fitness_n)[:,99], n_accuracy)
_print_summary(np.array(list_best_fitness_org)[:,99], org_accuracy)

In [None]:
from scipy import stats
# Unequal-variance t-test on the final best fitness: plain PSO vs proposed method.
pso_final = np.array(list_best_fitness_org)[:,99]
proposed_final = np.array(list_best_fitness_new)[:,99]
t,p=stats.ttest_ind(pso_final, proposed_final, equal_var=False)
t,p

In [None]:
from scipy import stats
# Unequal-variance t-test on the final best fitness: genetic algorithm
# (sign flipped back to a cost) vs proposed method.
ga_final = -np.array(list_best_fitness_n)[:,99]
proposed_final = np.array(list_best_fitness_new)[:,99]
t,p=stats.ttest_ind(ga_final, proposed_final, equal_var=False)
t,p