In [None]:
import random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Locations of the space-delimited training files, relative to the notebook.
x_train_path = "../data/x_train.txt"
y_train_path = "../data/y_train.txt"

# Read the feature matrix first, then the labels, with the same parser settings.
x_data, y_data = (
    np.loadtxt(path, delimiter=" ")
    for path in (x_train_path, y_train_path)
)

# Quick sanity check of what was loaded.
print("X shape:", x_data.shape)
print("Y shape:", y_data.shape)


In [None]:
# Inclusive bounds on how many feature columns a single individual may select.
MIN_COLS, MAX_COLS = 2, 6


In [None]:
# Hold out a fixed number of rows; every candidate feature subset is scored on them.
test_size = 1_000
train_size = x_data.shape[0] - test_size

# Fixed seed so the shuffled split -- and therefore every fitness value that is
# computed from it -- is identical on each Restart & Run All.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=test_size, shuffle=True, random_state=SPLIT_SEED
)

# An integer test_size is an absolute row count, so the remainder is the train set.
assert X_train.shape[0] == train_size
assert X_test.shape[0] == test_size


In [None]:
def get_model(n_estimators=100, random_state=None):
    """Build a fresh, unfitted gradient-boosting classifier.

    Called with no arguments it returns the same configuration as before
    (100 estimators, unseeded), so it can still be passed around as a
    zero-argument model factory.

    Args:
        n_estimators: Number of boosting stages.
        random_state: Seed forwarded to the classifier; ``None`` leaves the
            estimator unseeded.

    Returns:
        A new ``GradientBoostingClassifier`` instance that has not been fitted.
    """
    return GradientBoostingClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )


In [None]:
def create_individual(num_features, min_cols=None, max_cols=None):
    """Create a random 0/1 feature mask of length ``num_features``.

    The number of selected features (ones) is drawn uniformly from the
    inclusive range ``[min_cols, max_cols]``. Both bounds are capped at
    ``num_features`` so that a narrow dataset yields a smaller mask instead of
    making ``random.sample`` raise ``ValueError``.

    Args:
        num_features: Length of the mask (number of available columns).
        min_cols: Lower bound on selected features; defaults to ``MIN_COLS``.
        max_cols: Upper bound on selected features; defaults to ``MAX_COLS``.

    Returns:
        A new list of ``num_features`` ints, each 0 or 1.
    """
    lower = MIN_COLS if min_cols is None else min_cols
    upper = MAX_COLS if max_cols is None else max_cols

    # It is impossible to pick more distinct columns than exist.
    upper = min(upper, num_features)
    lower = min(lower, upper)

    num_selected_features = random.randint(lower, upper)
    individual = [0] * num_features
    for idx in random.sample(range(num_features), num_selected_features):
        individual[idx] = 1
    return individual


# def mutate(individual, mutation_rate):
#     num_features = len(individual)
#     for i in range(num_features):
#         if random.random() < mutation_rate:
#             individual[i] = 1 - individual[i]

#     # Make sure the number of selected columns stays within [MIN_COLS, MAX_COLS]
#     num_selected_features = sum(individual)
#     if num_selected_features < MIN_COLS:
#         additional_features = random.sample(
#             [i for i in range(num_features) if individual[i] == 0],
#             MIN_COLS - num_selected_features
#             )

#         for idx in additional_features:
#             individual[idx] = 1

#     elif num_selected_features > MAX_COLS:
#         excessive_features = random.sample(
#             [i for i in range(num_features) if individual[i] == 1],
#             num_selected_features - MAX_COLS
#             )

#         for idx in excessive_features:
#             individual[idx] = 0

#     return individual


def mutate(individual, mutation_rate, min_cols=None, max_cols=None):
    """Flip bits of a feature mask at random, then repair its feature count.

    Each position is flipped independently with probability ``mutation_rate``.
    Afterwards, if fewer than ``min_cols`` features are selected, randomly
    chosen zeros are switched on; if more than ``max_cols`` are selected,
    randomly chosen ones are switched off. The repair never asks
    ``random.sample`` for more positions than exist, so an individual shorter
    than ``min_cols`` ends up fully selected instead of raising ``ValueError``.

    Args:
        individual: Mutable sequence of 0/1 ints. It is modified in place.
        mutation_rate: Per-bit flip probability.
        min_cols: Lower bound on selected features; defaults to ``MIN_COLS``.
        max_cols: Upper bound on selected features; defaults to ``MAX_COLS``.

    Returns:
        The same ``individual`` object, after mutation and repair.
    """
    lower = MIN_COLS if min_cols is None else min_cols
    upper = MAX_COLS if max_cols is None else max_cols

    num_features = len(individual)
    for i in range(num_features):
        if random.random() < mutation_rate:
            individual[i] = 1 - individual[i]

    num_selected_features = sum(individual)
    if num_selected_features < lower:
        unselected = [i for i in range(num_features) if individual[i] == 0]
        # Cap at what is available: a short individual cannot reach `lower`.
        shortfall = min(lower - num_selected_features, len(unselected))
        for idx in random.sample(unselected, shortfall):
            individual[idx] = 1

    elif num_selected_features > upper:
        selected = [i for i in range(num_features) if individual[i] == 1]
        surplus = min(num_selected_features - upper, len(selected))
        for idx in random.sample(selected, surplus):
            individual[idx] = 0

    return individual


In [None]:
import hashlib


def calculate_hash(individual):
    """Return the SHA-256 hex digest identifying an individual.

    The elements are converted with ``str`` and concatenated without a
    separator; the resulting text is encoded and hashed.
    """
    encoded = "".join(str(gene) for gene in individual).encode()
    return hashlib.sha256(encoded).hexdigest()


In [None]:
def is_fitted(model):
    """Report whether ``model`` exposes an ``estimators_`` attribute.

    Returns ``True`` when the attribute can be read and ``False`` when the
    lookup raises ``AttributeError``.
    """
    try:
        model.estimators_
    except AttributeError:
        return False
    return True


def fitness(individual, get_model_func, X_train, X_test, y_train, y_test,
            reward_per_hit=20, cost_per_feature=200) -> float:
    """Score a feature mask by the profit of a model trained on its columns.

    A new model is obtained from ``get_model_func``, fitted on the selected
    columns of ``X_train`` and used to predict ``X_test``. The profit is
    ``reward_per_hit`` for every test row where both the label and the
    prediction equal 1, minus ``cost_per_feature`` for every selected column.

    Args:
        individual: Sequence of bits; positions equal to 1 are the selected
            column indices.
        get_model_func: Zero-argument callable returning an unfitted model
            with ``fit`` and ``predict`` methods.
        X_train, X_test: 2-D arrays indexed as ``X[:, columns]``.
        y_train, y_test: Label arrays; the positive class is the value 1.
        reward_per_hit: Profit per correctly predicted positive (default 20).
        cost_per_feature: Cost per selected column (default 200).

    Returns:
        The profit as a number, or ``-np.inf`` when no column is selected.
    """
    selected_features = [
        index
        for index, bit in enumerate(individual)
        if bit == 1
    ]

    # An empty mask cannot train a model; rank it below every real candidate.
    if not selected_features:
        return -np.inf

    X_train_subset = X_train[:, selected_features]
    X_test_subset = X_test[:, selected_features]

    model = get_model_func()
    # Each evaluation must start from scratch, never from a reused model.
    assert not is_fitted(model), "get_model_func must return an unfitted model"

    model.fit(X_train_subset, y_train)
    y_pred = model.predict(X_test_subset)

    # Count rows that are positive and were predicted positive.
    num_correct = np.sum((y_test == 1) & (y_pred == 1))
    profit = num_correct * reward_per_hit - len(selected_features) * cost_per_feature
    return profit


def select_winner(individual1, individual2, model, X_train, X_test, y_train, y_test):
    """Evaluate two individuals with ``fitness`` and return the better one.

    ``model`` is forwarded unchanged as the second argument of ``fitness``.
    ``individual1`` is evaluated first. It wins only when its profit is
    strictly greater; otherwise (including a tie) ``individual2`` is returned.

    Returns:
        A ``(winner, winner_profit)`` tuple.
    """
    datasets = (X_train, X_test, y_train, y_test)
    first_profit = fitness(individual1, model, *datasets)
    second_profit = fitness(individual2, model, *datasets)

    first_wins = first_profit > second_profit
    return (individual1, first_profit) if first_wins else (individual2, second_profit)


In [None]:
def tournament_selection(population, scores, k=3):
    """Pick one individual by tournament selection.

    ``k`` distinct contenders are drawn at random and the one with the highest
    score wins (the first drawn wins a tie). When the population holds fewer
    than ``k`` individuals, all of them compete instead of ``random.sample``
    raising ``ValueError``.

    Args:
        population: Sequence of individuals.
        scores: Sequence of scores, aligned with ``population`` by index.
        k: Tournament size; must be at least 1.

    Returns:
        The winning element of ``population`` (the object itself, not a copy).

    Raises:
        ValueError: If ``population`` is empty or ``k`` is smaller than 1.
    """
    population_size = len(population)
    if population_size == 0:
        raise ValueError("population must not be empty")
    if k < 1:
        raise ValueError("k must be at least 1")

    contenders = random.sample(range(population_size), min(k, population_size))
    winner = max(contenders, key=lambda index: scores[index])
    return population[winner]


def crossover(parent1, parent2):
    """Single-point crossover of two equally long parents.

    A cut point is drawn from ``1 .. len(parent1) - 1`` (inclusive) so that
    each child takes at least one gene from each parent. ``random.randint``
    includes both bounds, so ``len - 1`` is the last valid cut; this also
    makes length-2 parents work. Parents with fewer than two genes cannot be
    cut and are returned as copies.

    Args:
        parent1: First parent; a sequence supporting slicing and ``+``.
        parent2: Second parent of the same kind.

    Returns:
        A ``(child1, child2)`` tuple of new sequences; the parents are not
        modified.
    """
    length = len(parent1)
    if length < 2:
        # No interior cut point exists.
        return parent1[:], parent2[:]

    point = random.randint(1, length - 1)
    child1 = parent1[:point] + parent2[point:]
    child2 = parent2[:point] + parent1[point:]
    return child1, child2


def genetic_algorithm(n: int, population_size: int, mutation_rate: float, get_model_func, X_train, X_test, y_train, y_test):
    """Search for a profitable feature mask with a generational genetic algorithm.

    Starting from ``population_size`` random individuals, each of the ``n``
    generations scores the whole population with ``fitness``, updates the best
    individual seen so far, and then replaces the population with mutated
    children of tournament-selected parents. One progress line is printed per
    generation.

    Args:
        n: Number of generations to run.
        population_size: Number of individuals kept in every generation.
        mutation_rate: Per-bit flip probability passed to ``mutate``.
        get_model_func: Model factory forwarded to ``fitness``.
        X_train, X_test, y_train, y_test: Data forwarded to ``fitness``; the
            number of features is taken from ``X_train.shape[1]``.

    Returns:
        ``(best_individual, best_fitness)``. ``best_individual`` is ``None``
        and ``best_fitness`` is ``-np.inf`` when nothing was evaluated.
    """
    num_features = X_train.shape[1]
    population = [
        create_individual(num_features)
        for _ in range(population_size)
    ]

    best_individual = None
    best_fitness = -np.inf

    for generation in range(n):
        # Score every individual of the current population.
        fitness_scores = [
            fitness(individual, get_model_func, X_train, X_test, y_train, y_test)
            for individual in population
        ]

        # Keep the best individual seen in any generation so far.
        for individual, score in zip(population, fitness_scores):
            if score > best_fitness:
                best_fitness = score
                # Store a copy so later in-place edits cannot alter the record.
                best_individual = list(individual)

        # Breed the next generation two children at a time.
        new_population = []
        while len(new_population) < population_size:
            parent1 = tournament_selection(population, fitness_scores)
            parent2 = tournament_selection(population, fitness_scores)
            child1, child2 = crossover(parent1, parent2)
            new_child_1 = mutate(child1, mutation_rate)
            new_child_2 = mutate(child2, mutation_rate)
            new_population.extend([new_child_1, new_child_2])

        # No check that the population changed: a converged population may
        # legitimately reproduce itself. The slice drops the surplus child
        # when population_size is odd.
        population = new_population[:population_size]
        print(f"Generation {generation}, Best Fitness: {best_fitness}")

    return best_individual, best_fitness


# Genetic-algorithm settings.
N = 50                # number of generations
population_size = 50  # individuals per generation
mutation_rate = 0.1   # per-bit flip probability

# Seed Python's `random`, which drives individual creation, selection,
# crossover and mutation, so the search itself can be replayed.
# NOTE(review): the model and the train/test split have their own randomness
# and need their own seeds for a fully reproducible run.
GA_SEED = 42
random.seed(GA_SEED)

best_individual, best_fitness = genetic_algorithm(N, population_size, mutation_rate, get_model, X_train, X_test, y_train, y_test)

# Translate the winning bit mask into column indices.
selected_features = [
    index
    for index, bit in enumerate(best_individual)
    if bit == 1
    ]

print(f"Best individual (selected features): {selected_features}")
print(f"Best fitness (dollars): {best_fitness}")
