In [9]:
import numpy as np
import statsmodels.api as sm
import sklearn.model_selection as skm
from ISLP import load_data
from ISLP.models import ModelSpec as MS

In [71]:
# Steps in Evolutionary Strategy 
"""
    1. Parent Selection : Select Parents from the population
    2. Cross Over : Create Offsprings from parents : Uniform Cross Over p = 0.5
    3. Mutation : Flip the bits of the offsprings with probability p = 0.3
    4. Evaluation : Evaluate the fitness of the offsprings using the fitness function (Validation, NegCP, AIC, BIC)
    5. Selection : Select the best offsprings to be the parents of the next generation
"""

# Parent Selection
def parentSelection(population_size, total_parents) : 
    parent_feature_size = np.random.randint(1, population_size, total_parents)
    parents = []
    for size in parent_feature_size : 
        parent_vector = np.zeros(population_size, dtype = bool)
        selected_indices = np.random.choice(population_size, size, replace=False)
        parent_vector[selected_indices] = True
        parents.append(parent_vector)
    return np.array(parents)

# Uniform Cross Over
def uniformCrossOver(total_parents, probability, population_size, parents : np.ndarray) : 
    offspring = []
    offspring_total = int(total_parents / 2)
    
    for i in range(offspring_total) : 
        """
            Uniform Cross Over : 
            Select J bit from Parent 1 with probability p
            Select J bit from Parent 2 with probability 1-p
        """
        parent_1 = parents[i]
        parent_2 = parents[i+5]
        offspring_vector = np.zeros(population_size, dtype = bool)
    
        for j in range(population_size) : 
            if np.random.rand() < probability : 
                offspring_vector[j] = parent_1[j]
            else : 
                offspring_vector[j] = parent_2[j]
        offspring.append(offspring_vector)
    return np.array(offspring)

# Mutation
def mutation(probability, total_parents, population_size, offspring) : 
    offspring_total = int(total_parents/2)
    for i in range(offspring_total) : 
        mutation_vector = offspring[i]
        for j in range(population_size) : 
            if np.random.rand() < probability : 
                mutation_vector[j] = not mutation_vector[j]
        offspring[i] = mutation_vector
    return offspring


# Evaluation : Validation Set Approach
def evaluation(new_population, data, response, population_columns, training_size=0.8, random_state=42):
    fitness = []
    for i in range(len(new_population)):
        design = MS(population_columns[i])
        data_train, data_test = skm.train_test_split(data, train_size=0.8, random_state=42)
        y_train = data_train[response]
        y_test = data_test[response]
        x_train = design.fit_transform(data_train)
        x_test = design.transform(data_test)
        model = sm.OLS(y_train, x_train).fit()
        predicted = model.predict(x_test)
        rss = model.aic
        fitness.append(rss)
    top_10 = np.argsort(fitness)
    survival_selection = new_population[top_10[:10]]
    return survival_selection
   
def evaluation_best(new_population, data, response, population_columns, training_size=0.8, random_state=42):
    fitness = []
    for i in range(len(new_population)):
        design = MS(population_columns[i])
        data_train, data_test = skm.train_test_split(data, train_size=0.8, random_state=42)
        y_train = data_train[response]
        y_test = data_test[response]
        x_train = design.fit_transform(data_train)
        x_test = design.transform(data_test)
        model = sm.OLS(y_train, x_train).fit()
        predicted = model.predict(x_test)
        rss = model.aic
        fitness.append(rss)
    best_idx = np.argmin(fitness)
    survival_selection = new_population[best_idx]
    return survival_selection, fitness[best_idx]


In [11]:
data = load_data("Hitters")
data = data.dropna()

In [72]:
# Initializing the Population
response = "Salary"
population = data.columns.drop(response)
population_size = len(population)
total_parents = 10
evolve = 50

for i in range(evolve) :
    parents = parentSelection(population_size, total_parents)
    offspring = uniformCrossOver(total_parents, 0.5, population_size, parents)
    offspring_mutated = mutation(0.3, total_parents, population_size, offspring)
    new_population = np.vstack([parents, offspring_mutated])
    population_columns = [population[new_population[i]] for i in range(len(new_population))]
    good_individual = evaluation(new_population, data, response, population_columns=population_columns)
    parents = good_individual
    if i % 10 == 0 :
        print("Generation : ", i)
    
best_individual, best_fitness = evaluation_best(parents, data, response, population_columns=population_columns)
print("Best Individual : ", population[best_individual])
print("Best Fitness : ", best_fitness)


Generation :  0
Generation :  10
Generation :  20
Generation :  30
Generation :  40
Best Individual :  Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'CAtBat', 'CHits',
       'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division', 'PutOuts',
       'Assists', 'Errors', 'NewLeague'],
      dtype='object')
Best Fitness :  3018.1186773189916


In [79]:
cols = population[best_individual]
design = MS(cols)
x = design.fit_transform(data)
y = data[response]
model = sm.OLS(y, x).fit()
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.512
Method:,Least Squares,F-statistic:,16.3
Date:,"Fri, 07 Mar 2025",Prob (F-statistic):,1.9800000000000002e-32
Time:,14:48:39,Log-Likelihood:,-1876.2
No. Observations:,263,AIC:,3790.0
Df Residuals:,244,BIC:,3858.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,148.2187,73.595,2.014,0.045,3.256,293.182
AtBat,-1.9509,0.624,-3.125,0.002,-3.181,-0.721
Hits,7.4395,2.363,3.148,0.002,2.785,12.094
HmRun,4.3449,6.190,0.702,0.483,-7.847,16.537
Runs,-2.3312,2.971,-0.785,0.433,-8.183,3.521
RBI,-1.0670,2.595,-0.411,0.681,-6.178,4.044
Walks,6.2196,1.825,3.409,0.001,2.626,9.813
CAtBat,-0.1887,0.120,-1.572,0.117,-0.425,0.048
CHits,0.1636,0.665,0.246,0.806,-1.146,1.474

0,1,2,3
Omnibus:,88.911,Durbin-Watson:,2.021
Prob(Omnibus):,0.0,Jarque-Bera (JB):,466.454
Skew:,1.257,Prob(JB):,5.14e-102
Kurtosis:,9.021,Cond. No.,20700.0
