### Пример реализации генетического алгоритма для отбора факторов

#### Импорт библиотек

In [1]:
import datetime as dt
import random
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass, field
from itertools import compress, count
from typing import Callable, List, Literal, Optional, TypeAlias, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

#### Особь и интерфейс модели

In [2]:
class ModelGeneric(ABC):
    """Interface of a model that can be trained and scored during selection.

    Both methods accept arbitrary positional and keyword arguments: the
    fitness routine in this file calls ``fit`` with the training data plus
    keyword options (``cat_features``, ``eval_set``, ``verbose``) and calls
    ``predict_proba`` with a feature frame, so a zero-argument signature
    would not describe how the interface is actually used.
    """

    @abstractmethod
    def fit(self, *args, **kwargs):
        """Train the model; concrete subclasses define the accepted arguments."""

    @abstractmethod
    def predict_proba(self, *args, **kwargs):
        """Return predictions for the given data.

        The fitness routine in this file indexes the result as ``[:, 1]``,
        i.e. it expects a two-dimensional, array-like result.
        """


@dataclass
class Individual:
    """One member of the population: a boolean mask plus its evaluation."""

    # genotype[i] tells whether the i-th candidate feature is switched on
    genotype: List[bool]
    # Evaluation results; they stay None until fitness has been calculated
    fitness_value: Optional[float] = None
    rank_value: Optional[int] = None
    selection_proba: Optional[float] = None
    # Sequential identifier taken from one counter shared by all instances
    _id: int = field(default_factory=lambda _ids=count(): next(_ids))

#### Прототип реализации селектора факторов

In [3]:
# Return type of GeneticSelector.select: a plain pandas DataFrame
SelectionStatistic: TypeAlias = pd.DataFrame

In [4]:
class GeneticSelector:
    """Prototype of a genetic-algorithm feature selector.

    Only the public surface is sketched: the constructor records the
    configuration, while ``select`` is still a stub without an algorithm.
    """

    def __init__(
        self,
        n_generations: int,
        n_individuals: int,
        selection_share: float,
        elitist_share: float,
        mutatuion_rate: float,
        random_seed: int,
        model: "ModelGeneric",
        fitness: Literal["GINI", "R_2", "MSE", "CUSTOM"] = "GINI",
        fitness_custom: Optional[Callable] = None,
    ) -> None:
        """Store the configuration of the selector.

        Args:
            n_generations: Number of generations to run.
            n_individuals: Number of individuals in the population.
            selection_share: Share of the population kept by selection.
            elitist_share: Share of the population kept as the elitist part.
            mutatuion_rate: Mutation rate. The misspelled parameter name is
                kept so that existing keyword calls continue to work.
            random_seed: Seed for the random number generators.
            model: Model that is fitted for every individual.
            fitness: Name of the fitness metric.
            fitness_custom: Callable used as the metric when ``fitness`` is
                ``"CUSTOM"`` (presumably; the prototype does not use it yet).
        """
        self.n_generations = n_generations
        self.n_individuals = n_individuals
        self.selection_share = selection_share
        self.elitist_share = elitist_share
        self.mutation_rate = mutatuion_rate
        self.random_seed = random_seed
        self.model = model
        self.fitness = fitness
        self.fitness_custom = fitness_custom
        # Initialised here so that `selected_features` is readable (as None)
        # before any selection has run instead of raising AttributeError.
        self._selected_feature: Optional[List[str]] = None

    def select(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        features_to_select: List[str],
        cat_features: List[str],
        verbose: int,
    ) -> "SelectionStatistic":
        """Run the selection (not implemented in this prototype).

        The stub performs no work and returns ``None``.
        """

    @property
    def selected_features(self):
        """Selected feature names, or ``None`` if no selection has run yet."""
        return self._selected_feature

#### Инициализация популяции

In [5]:
def get_set_of_random_individuals(
    n_individuals: int, genotype_length: int, random_seed: int
) -> List[Individual]:
    """Create a population of individuals with random genotypes.

    Args:
        n_individuals: Number of individuals to create.
        genotype_length: Number of genes in every genotype.
        random_seed: Base seed; the k-th individual (counting from zero) is
            generated after seeding the global generator with
            ``random_seed + k``.

    Returns:
        The list of newly created individuals.
    """

    def _random_genotype(seed: int) -> List[bool]:
        # The module-level generator is re-seeded for every individual
        random.seed(seed)
        return random.choices([False, True], k=genotype_length)

    return [
        Individual(genotype=_random_genotype(random_seed + offset))
        for offset in range(n_individuals)
    ]

#### Используемая модель для отбора факторов

In [6]:
# Seed passed to the classifier below
RANDOM_SEED = 42
# CatBoost classifier used as the model for feature selection.
# NOTE(review): task_type="GPU" presumably requires a GPU-enabled CatBoost
# installation — confirm for the target environment.
# NOTE(review): od_type / od_wait configure CatBoost's overfitting detector;
# it presumably only takes effect when an eval_set is passed to fit — confirm.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.01,
    depth=7,
    random_seed=RANDOM_SEED,
    loss_function="Logloss",
    task_type="GPU",
    nan_mode="Min",
    od_type="Iter",
    od_wait=100,
)

#### Fitness метрика для отбора

In [7]:
def gini(y_true, y_pred):
    """Return the Gini coefficient, computed as ``2 * ROC AUC - 1``."""
    roc_auc = roc_auc_score(y_true, y_pred)
    return 2 * roc_auc - 1

In [8]:
# Return type of calculate_fitness_info: a plain pandas DataFrame
FitnessStatistic: TypeAlias = pd.DataFrame

In [9]:
def get_probability_by_ranks(n_individuals: int, current_rank: int) -> float:
    """Convert a rank (1 is the best) into a selection probability.

    Rank ``r`` receives the weight ``n_individuals - (r - 1)``; the weight is
    divided by the sum ``1 + 2 + ... + n_individuals``, so the probabilities
    of ranks ``1..n_individuals`` add up to one.
    """
    rank_weight = n_individuals - (current_rank - 1)
    total_weight = sum(range(1, n_individuals + 1))
    return rank_weight / total_weight

In [10]:
def calculate_fitness_info(
    individuals: List[Individual],
    features_lst: List[str],
    cat_features_all: List[str],
    model: ModelGeneric,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_test: pd.DataFrame,
    verbose: int,
) -> FitnessStatistic:
    """Fit the model on every individual's feature subset and rank the results.

    For each individual the features switched on in its genotype are used to
    fit ``model`` (the same model object is re-fitted every time), Gini is
    measured on the train and test sets, and the individuals are ranked by
    test Gini in descending order. Each individual's ``fitness_value``,
    ``rank_value`` and ``selection_proba`` attributes are updated in place.

    Args:
        individuals: Population to evaluate.
        features_lst: Feature names; position ``i`` corresponds to gene ``i``.
        cat_features_all: Names of all categorical features.
        model: Model exposing ``fit`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features, also passed to ``fit`` as ``eval_set``.
        y_test: Test target.
        verbose: Print a progress line after every ``verbose`` individuals;
            ``0`` disables the progress output.

    Returns:
        A frame with the columns ``n`` (position in ``individuals``),
        ``gini_train``, ``gini_test``, ``_id``, ``rank`` and
        ``selection_proba``, sorted by ``gini_test`` in descending order.

    Raises:
        ValueError: If a genotype and ``features_lst`` differ in length.
    """
    cat_features_set = set(cat_features_all)

    records = []
    for n, individual in enumerate(individuals):
        # verbose == 0 means "silent"; it must not reach the modulo below
        if verbose and (n + 1) % verbose == 0:
            print(f"{n+1} individual calculation", "--- " * 3, dt.datetime.now())

        # Explicit exception instead of `assert`, which is stripped under -O
        if len(features_lst) != len(individual.genotype):
            raise ValueError("Length of genotype and features_lst should be equals")

        features_to_fit = list(compress(features_lst, individual.genotype))
        # Unique names in feature order, so the list handed to the model is
        # deterministic (a set intersection has no stable order)
        cat_features_to_fit = [
            feature
            for feature in dict.fromkeys(features_to_fit)
            if feature in cat_features_set
        ]

        model.fit(
            X=X_train[features_to_fit],
            y=y_train,
            cat_features=cat_features_to_fit,
            eval_set=(X_test[features_to_fit], y_test),
            verbose=False,
        )

        y_pred_train = model.predict_proba(X_train[features_to_fit])[:, 1]
        y_pred_test = model.predict_proba(X_test[features_to_fit])[:, 1]

        records.append(
            {
                "n": n,
                "gini_train": gini(y_train, y_pred_train),
                "gini_test": gini(y_test, y_pred_test),
                "_id": individual._id,
            }
        )

    # Ranking; explicit columns keep an empty population from failing in sort
    fitness = pd.DataFrame(records, columns=["n", "gini_train", "gini_test", "_id"])
    fitness = fitness.sort_values(by=["gini_test"], ascending=False)
    n_ranked = len(fitness)
    fitness["rank"] = list(range(1, n_ranked + 1))
    fitness["selection_proba"] = [
        get_probability_by_ranks(n_ranked, rank) for rank in fitness["rank"]
    ]

    # Save information: one pass over the table instead of filtering it
    # three times per individual
    for n, gini_test, rank, proba in zip(
        fitness["n"],
        fitness["gini_test"],
        fitness["rank"],
        fitness["selection_proba"],
    ):
        individual = individuals[n]
        individual.fitness_value = float(gini_test)
        individual.rank_value = int(rank)
        individual.selection_proba = float(proba)

    return fitness

#### Отбор особей (моделей с заданным набором факторов)

In [11]:
def select_individuals(
    individuals: List["Individual"],
    random_seed: int,
    fitness: "FitnessStatistic",
    share_result: float = 0.5,
    share_elitist: float = 0.25,
) -> List["Individual"]:
    """Select the individuals that survive into the next generation.

    A ``share_elitist`` part of the population is picked first; the rest of
    the ``share_result`` quota is sampled without replacement from the
    remaining individuals, weighted by their ``selection_proba``.

    Args:
        individuals: Current population.
        random_seed: Seed for both the ``random`` and ``numpy.random``
            generators (the global generators are re-seeded).
        fitness: Frame with the columns ``n`` (position of an individual in
            ``individuals``) and ``selection_proba``.
        share_result: Share of the population to keep overall.
        share_elitist: Share of the population kept by the first step.

    Returns:
        The selected individuals, in their original population order.

    Raises:
        ValueError: If a share is outside ``[0, 1]``, if ``share_elitist`` is
            not smaller than ``share_result``, if ``share_result`` equals 1,
            or if nothing was selected.
    """
    share_elitist = round(share_elitist, 2)
    share_result = round(share_result, 2)

    # The range is validated first so that an out-of-range share is reported
    # as such rather than as a side effect of the checks below
    if not (0 <= share_elitist <= 1 and 0 <= share_result <= 1):
        raise ValueError("Probability should be between 0 and 1.")

    if share_elitist >= share_result:
        raise ValueError(
            "All expected individuals won't be deleted by using probability."
        )

    if share_result == 1:
        raise ValueError(
            "Result population should be less than initialized number of individuals."
        )

    n_total = len(individuals)

    # Select share_elitist by using random.
    # NOTE(review): this part is drawn uniformly at random, not by fitness;
    # kept as is — confirm whether true elitism (top-ranked) was intended.
    random.seed(random_seed)
    k_to_select_elitist = int(round(n_total * share_elitist))
    idx_elitist = set(random.sample(range(n_total), k=k_to_select_elitist))

    # Candidates and their weights are built in the same order: `fitness` is
    # sorted by score, so taking its rows as they come would pair a candidate
    # with another individual's probability
    candidates = [idx for idx in range(n_total) if idx not in idx_elitist]
    proba_by_idx = dict(zip(fitness["n"], fitness["selection_proba"]))
    weights = [proba_by_idx[idx] for idx in candidates]

    # Re-normalise: the elitist part was removed, so the weights no longer sum to 1
    total_weight = sum(weights)
    weights = [weight / total_weight for weight in weights]

    np.random.seed(random_seed)
    k_to_select = int(round(n_total * (share_result - share_elitist)))
    idx_selected = set(
        np.random.choice(
            candidates, size=k_to_select, replace=False, p=weights
        ).tolist()
    )

    selected_individuals = [
        ind
        for n, ind in enumerate(individuals)
        if n in idx_elitist or n in idx_selected
    ]

    if not selected_individuals:
        raise ValueError(
            "Were selected 0 individuals. Try to increase number of initial individuals."
        )

    return selected_individuals

#### Создание потомков

In [12]:
def get_offspring_from_pair(
    ind1: Individual, ind2: Individual, random_seed: int
) -> Tuple[Individual, Individual]:
    """Produce two children from a pair of parents.

    For every gene position each child independently receives the gene of
    one of the two parents (drawn with replacement, so both children may
    inherit the same parent's gene).

    Args:
        ind1: First parent.
        ind2: Second parent.
        random_seed: Base seed; position ``n`` is drawn after seeding the
            global NumPy generator with ``random_seed + n``.

    Returns:
        A tuple with the two newly created children.

    Raises:
        ValueError: If the parents' genotypes differ in length.
    """
    if len(ind1.genotype) != len(ind2.genotype):
        raise ValueError("Length of genotypes should be equals")

    child_1_gen = []
    child_2_gen = []
    for n, (gen_1, gen_2) in enumerate(zip(ind1.genotype, ind2.genotype)):
        np.random.seed(random_seed + n)
        children_gen = np.random.choice([gen_1, gen_2], size=2)

        # np.random.choice yields numpy.bool_; store plain Python booleans
        child_1_gen.append(bool(children_gen[0]))
        child_2_gen.append(bool(children_gen[1]))

    # The dataclass field is `genotype`; passing `gen=` raised a TypeError
    return (
        Individual(genotype=child_1_gen),
        Individual(genotype=child_2_gen),
    )

#### Мутации

In [13]:
def mutate_offspring(
    offsprings: List["Individual"],
    random_seed: int,
    mutation_rate: Optional[float] = None,
) -> List["Individual"]:
    """Return mutated deep copies of the given individuals.

    Every gene is flipped independently with probability ``mutation_rate``.
    The input individuals are left untouched.

    Args:
        offsprings: Individuals to mutate (any iterable of individuals).
        random_seed: Seed of the private generator used for the draws.
        mutation_rate: Per-gene flip probability; defaults to
            ``1 / len(genotype)`` of the first individual.

    Returns:
        A list with one mutated copy per input individual, in input order.
    """
    individuals_new = deepcopy(list(offsprings))
    if not individuals_new:
        # Nothing to mutate; also avoids indexing an empty list below
        return individuals_new

    # Default MR is 1/len(genotype)
    if mutation_rate is None:
        genotype_length = len(individuals_new[0].genotype)
        mutation_rate = 1 / genotype_length if genotype_length else 0.0

    # One private stream per call. Seeding with `random_seed + n + i` for
    # every gene made different (individual, gene) pairs share a seed, so
    # the mutation draws were shifted copies of each other.
    rng = random.Random(random_seed)
    for individual in individuals_new:
        individual.genotype = [
            (not gen) if mutation_rate > rng.random() else gen
            for gen in individual.genotype
        ]

    return individuals_new

#### Скрещивание

In [14]:
def crossover_individuals(
    individuals: List[Individual],
    n_individuals_initial: int,
    random_seed: int,
    mutation_rate: Optional[float] = None,
) -> List[Individual]:
    """Refill the population by crossing and mutating the given individuals.

    The result starts as deep copies of ``individuals``; pairs of parents are
    then crossed (two mutated children per pair) until the population
    reaches ``n_individuals_initial``. A surplus is removed by a random draw.

    Args:
        individuals: Individuals that survived selection (the parents).
        n_individuals_initial: Required size of the returned population.
        random_seed: Base seed; cycle ``c`` uses ``random_seed + c``.
        mutation_rate: Per-gene mutation probability passed to
            ``mutate_offspring``; ``None`` keeps that function's default.

    Returns:
        A list of at most ``n_individuals_initial`` individuals.

    Raises:
        ValueError: If offspring are needed but ``individuals`` is empty, or
            if the genotypes of a chosen pair differ in length.
    """
    individuals_new = deepcopy(list(individuals))
    n_parents = len(individuals)

    if n_parents == 0 and len(individuals_new) < n_individuals_initial:
        raise ValueError("At least one individual is required to produce offspring.")

    cycle = 0

    # Taking a pair and crossover them until we get initial amount of individuals
    while len(individuals_new) < n_individuals_initial:
        random.seed(random_seed + cycle)
        if n_parents > 1:
            # Two distinct parents. Drawing each index after its own seed
            # (`seed + cycle` and `seed + cycle * 1_000_000`) used the same
            # seed at cycle 0 and therefore crossed an individual with itself.
            idx1, idx2 = random.sample(range(n_parents), k=2)
        else:
            idx1 = idx2 = 0

        ind1 = individuals[idx1]
        ind2 = individuals[idx2]

        if len(ind1.genotype) != len(ind2.genotype):
            raise ValueError("Length of genotypes should be equals")

        # Cross their genotypes to produce a pair of children
        individuals_generated = get_offspring_from_pair(
            ind1, ind2, random_seed=random_seed + cycle
        )

        individuals_generated_mutated = mutate_offspring(
            individuals_generated,
            random_seed=random_seed + cycle,
            mutation_rate=mutation_rate,
        )

        individuals_new += individuals_generated_mutated
        cycle += 1

    # Make generated generation the same volume as initial (if we produce more children than we need)
    if len(individuals_new) > n_individuals_initial:
        np.random.seed(random_seed)
        # Sample positions rather than the objects themselves, so NumPy never
        # has to convert the individuals into an array
        idx_kept = np.random.choice(
            len(individuals_new), size=n_individuals_initial, replace=False
        )
        individuals_new = [individuals_new[idx] for idx in idx_kept]

    return individuals_new