In [1]:
!pip3 install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [2]:
import warnings
import numpy as np
import pandas as pd
import pmlb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
import random
from sklearn.utils import parallel_backend
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.base import clone
from multiprocessing import Pool

# Suppress warnings
warnings.filterwarnings("ignore")

# Define grammar
grammar = {
    'preprocessing': ['StandardScaler', 'RobustScaler', 'MinMaxScaler', 'PCA'],
    'classifiers': ['DecisionTreeClassifier', 'LogisticRegression', 'KNeighborsClassifier', 'GaussianNB', 'RidgeClassifier', 'SGDClassifier', 'LinearSVC'],
    'combinations': ['VotingClassifier_hard', 'StackingClassifier']
}

# Genotype to phenotype mapping
def genotype_to_phenotype(genotype):
    if len(genotype) < 4:
        return None
    
    total_preprocessing = len(grammar['preprocessing'])
    total_classifiers = len(grammar['classifiers'])
    total_combinations = len(grammar['combinations'])

    preprocessing_idx = genotype[0] % total_preprocessing
    classifiers_idxs = [idx % total_classifiers for idx in genotype[1:-1]]
    combination_idx = genotype[-1] % total_combinations

    preprocessing = grammar['preprocessing'][preprocessing_idx]
    classifiers = [grammar['classifiers'][idx] for idx in classifiers_idxs]
    combination = grammar['combinations'][combination_idx]
    
    return preprocessing, classifiers, combination

# Fitness function with cross-validation
def evaluate_pipeline_cv(preprocessing, base_classifiers, combination_method, X, y):
    preprocessing = globals()[preprocessing]()
    X_transformed = preprocessing.fit_transform(X)
    
    base_classifier_instances = [globals()[classifier]() for classifier in base_classifiers]
    
    if combination_method == 'VotingClassifier_hard':
        ensemble = VotingClassifier(estimators=[(str(i), clf) for i, clf in enumerate(base_classifier_instances)], voting='hard')
    elif combination_method == 'StackingClassifier':
        ensemble = StackingClassifier(estimators=[(str(i), clf) for i, clf in enumerate(base_classifier_instances)], final_estimator=LogisticRegression(max_iter=10000))
    else:
        raise ValueError("Unsupported combination method.")
    
    scores = cross_val_score(ensemble, X_transformed, y, cv=5)  # 5-fold cross-validation
    accuracy = np.mean(scores)
    return accuracy

# Genetic Algorithm parameters
population_size = 50
num_generations = 15
mutation_rate = 0.3
crossover_rate = 0.5  # Adjust as needed

# Load dataset
results = []

for dataset_name in pmlb.classification_dataset_names[61:81]:
    if dataset_name in ['kddcup', 'mnist','krkopt','fars']:  # Skip the datasets named "kddcup" and "mnist"
        continue
    print(f"Processing dataset: {dataset_name}")
    X, y = pmlb.fetch_data(dataset_name, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    population = [
        [random.randint(0, len(grammar['preprocessing']) + len(grammar['classifiers']) + len(grammar['combinations']) - 1) for _ in range(random.randint(4, 10))]
        for _ in range(population_size)
    ]

    # Evolutionary loop
    fitness_scores_list = []
    for generation in range(num_generations):
        print(f"Generation {generation + 1}")
        # Convert genotypes to phenotypes
        phenotypes = [genotype_to_phenotype(genotype) for genotype in population]
        # print("Genotypes and Equivalent Phenotypes:")
        # for genotype, phenotype in zip(population, phenotypes):
        #     print(f"Genotype: {genotype}, Phenotype: {phenotype}")
        # Evaluate fitness
        fitness_scores = [
            evaluate_pipeline_cv(preprocessing, base_classifiers, combination_method, X_train, y_train)
            for preprocessing, base_classifiers, combination_method in phenotypes
        ]
        fitness_scores_list.append(fitness_scores)

        # Select parents based on fitness scores (roulette wheel selection)
        total_fitness = sum(fitness_scores)
        probabilities = [score / total_fitness if total_fitness != 0 else 1/population_size for score in fitness_scores]
        parent_indices = np.random.choice(range(population_size), size=population_size, p=probabilities)
        parents = [population[i] for i in parent_indices]

        # Apply crossover
        offspring = []
        for i in range(0, population_size, 2):
            if random.random() < crossover_rate:
                crossover_point = random.randint(1, len(parents[i]) - 1)
                offspring1 = parents[i][:crossover_point] + parents[i + 1][crossover_point:]
                offspring2 = parents[i + 1][:crossover_point] + parents[i][crossover_point:]
                offspring.append(offspring1)
                offspring.append(offspring2)
            else:
                offspring.append(parents[i])
                offspring.append(parents[i + 1])

        # Apply mutation
        for i in range(population_size):
            if random.random() < mutation_rate:
                # Randomly select one gene to mutate
                gene_to_mutate = random.randint(0, len(offspring[i]) - 1)
                offspring[i][gene_to_mutate] = random.randint(0, 100)

        # Replace old population with new population
        population = offspring

        # Introduce new random individuals to maintain diversity
        while len(population) < population_size:
            new_genotype = [random.randint(0, len(grammar['preprocessing']) + len(grammar['classifiers']) + len(grammar['combinations']) - 1) for _ in range(random.randint(4, 10))]
            population.append(new_genotype)

    # Find best pipeline
    best_index = np.argmax(np.mean(fitness_scores_list, axis=0))
    best_pipeline = phenotypes[best_index]
    best_accuracy = np.mean(fitness_scores_list, axis=0)[best_index]
    
    # Calculate accuracies for default Random Forest and Gradient Boost classifiers
    rf = RandomForestClassifier(random_state=42,n_jobs=-1)
    gb = GradientBoostingClassifier(random_state=42)
    rf.fit(X_train, y_train)
    gb.fit(X_train, y_train)
    rf_accuracy = np.mean(cross_val_score(rf, X_train, y_train, cv=5))
    gb_accuracy = np.mean(cross_val_score(gb, X_train, y_train, cv=5))

    # Calculate accuracies for Extra Trees classifier
    et = ExtraTreesClassifier(random_state=42,n_jobs=-1)
    et.fit(X_train, y_train)
    et_accuracy = np.mean(cross_val_score(et, X_train, y_train, cv=5))

    results.append([dataset_name, best_pipeline, best_accuracy, rf_accuracy, gb_accuracy, et_accuracy])
    print(f"Dataset: {dataset_name}, Best Pipeline: {best_pipeline}, Best Accuracy: {best_accuracy}, RF Accuracy: {rf_accuracy}, GB Accuracy: {gb_accuracy}, ET Accuracy: {et_accuracy}")
        
# Calculate mean accuracies
mean_pipeline_accuracy = np.mean([row[2] for row in results])
mean_rf_accuracy = np.mean([row[3] for row in results])
mean_gb_accuracy = np.mean([row[4] for row in results])
mean_et_accuracy = np.mean([row[5] for row in results])



print(f"Mean Pipeline Accuracy: {mean_pipeline_accuracy}")
print(f"Mean RF Accuracy: {mean_rf_accuracy}")
print(f"Mean GB Accuracy: {mean_gb_accuracy}")
print(f"Mean ET Accuracy: {mean_et_accuracy}")

# Store results in a CSV file
df = pd.DataFrame(results, columns=['Dataset', 'Best Pipeline', 'Pipeline Accuracy', 'RF Accuracy', 'GB Accuracy', 'ET Accuracy'])
df.to_csv('pipeline_results_optimized_with_crossover.csv', index=False)


Processing dataset: corral
Generation 1
Generation 2
Generation 3
Generation 4
Generation 5
Generation 6
Generation 7
Generation 8
Generation 9
Generation 10
Generation 11
Generation 12
Generation 13
Generation 14
Generation 15
Dataset: corral, Best Pipeline: ('PCA', ['KNeighborsClassifier', 'GaussianNB', 'GaussianNB', 'LogisticRegression', 'RidgeClassifier', 'GaussianNB', 'KNeighborsClassifier'], 'VotingClassifier_hard'), Best Accuracy: 0.9580102564102564, RF Accuracy: 0.992, GB Accuracy: 1.0, ET Accuracy: 0.992
Processing dataset: credit_a
Generation 1
Generation 2
Generation 3
Generation 4
Generation 5
Generation 6
Generation 7
Generation 8
Generation 9
Generation 10
Generation 11
Generation 12
Generation 13
Generation 14
Generation 15
Dataset: credit_a, Best Pipeline: ('MinMaxScaler', ['DecisionTreeClassifier', 'GaussianNB', 'KNeighborsClassifier', 'GaussianNB'], 'StackingClassifier'), Best Accuracy: 0.8753535353535352, RF Accuracy: 0.8767895167895168, GB Accuracy: 0.87313677313677