In [5]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
import sklearn.model_selection as skm
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import (Stepwise, sklearn_selected, sklearn_selection_path)
from l0bnb import fit_path
import matplotlib.pyplot as plt

In [6]:
# Steps in Evolutionary Strategy 
"""
    1. Parent Selection : Select Parents from the population
    2. Cross Over : Create Offsprings from parents : Uniform Cross Over p = 0.5
    3. Mutation : Flip the bits of the offsprings with probability p = 0.3
    4. Evaluation : Evaluate the fitness of the offsprings using the fitness function (Validation, NegCP, AIC, BIC)
    5. Selection : Select the best offsprings to be the parents of the next generation
"""

# Parent Selection
def parentSelection(population_size, total_parents) : 
    parent_feature_size = np.random.randint(1, population_size, total_parents)
    parents = []
    for size in parent_feature_size : 
        parent_vector = np.zeros(population_size, dtype = bool)
        selected_indices = np.random.choice(population_size, size, replace=False)
        parent_vector[selected_indices] = True
        parents.append(parent_vector)
    return np.array(parents)

# Uniform Cross Over
def uniformCrossOver(total_parents, probability, population_size, parents : np.ndarray) : 
    offspring = []
    offspring_total = int(total_parents / 2)
    
    for i in range(offspring_total) : 
        """
            Uniform Cross Over : 
            Select J bit from Parent 1 with probability p
            Select J bit from Parent 2 with probability 1-p
        """
        parent_1 = parents[i]
        parent_2 = parents[i+5]
        offspring_vector = np.zeros(population_size, dtype = bool)
    
        for j in range(population_size) : 
            if np.random.rand() < probability : 
                offspring_vector[j] = parent_1[j]
            else : 
                offspring_vector[j] = parent_2[j]
        offspring.append(offspring_vector)
    return np.array(offspring)

# Mutation
def mutation(probability, total_parents, population_size, offspring) : 
    offspring_total = int(total_parents/2)
    for i in range(offspring_total) : 
        mutation_vector = offspring[i]
        for j in range(population_size) : 
            if np.random.rand() < probability : 
                mutation_vector[j] = not mutation_vector[j]
        offspring[i] = mutation_vector
    return offspring


# Evaluation : Validation Set Approach
def evaluation(new_population, data, response, population_columns, training_size=0.8, random_state=42):
    fitness = []
    for i in range(len(new_population)):
        design = MS(population_columns[i])
        data_train, data_test = skm.train_test_split(data, train_size=0.8, random_state=42)
        y_train = data_train[response]
        y_test = data_test[response]
        x_train = design.fit_transform(data_train)
        x_test = design.transform(data_test)
        model = sm.OLS(y_train, x_train).fit()
        predicted = model.predict(x_test)
        rss = np.sum((y_test - predicted)**2)
        fitness.append(rss)
    best_idx = np.argmin(fitness)
    best_individual = new_population[best_idx]
    return best_individual, fitness[best_idx]

In [7]:
data = load_data("Hitters")
data = data.dropna()

In [8]:
# Initializing the Population
response = "Salary"
population = data.columns.drop(response)
population_size = len(population)
total_parents = 10
evolve = 10
final_candidates = []
final_rss = []

for i in range(evolve) :
    parents = parentSelection(population_size, total_parents)
    offspring = uniformCrossOver(total_parents, 0.5, population_size, parents)
    offspring_mutated = mutation(0.3, total_parents, population_size, offspring)
    new_population = np.vstack([parents, offspring_mutated])
    population_columns = [population[new_population[i]] for i in range(len(new_population))]
    good_individual = evaluation(new_population, data, response, population_columns=population_columns)
    individual, rss = good_individual
    final_candidates.append(individual)
    final_rss.append(rss)
    
best_idx = np.argmin(final_rss)
best_individual = final_candidates[best_idx]
print(f"After {10} generations, the best individual is {population[best_individual]} with RSS {final_rss[best_idx]}")



After 10 generations, the best individual is Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CHmRun',
       'CRuns', 'CRBI', 'CWalks', 'League', 'Division', 'PutOuts', 'Assists',
       'Errors', 'NewLeague'],
      dtype='object') with RSS 6602652.204253177
