1. Diabetes Health Indicators (Balanced)

In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text

from tree import EvoParams, EvolutionaryForest
from sklearn.ensemble import RandomForestClassifier
from utils import prepare_data



# Read the balanced diabetes indicators CSV and pass it through the project's
# prepare_data helper. The returned object is indexed below with the keys
# 'X_train', 'y_train', 'X_test' and 'y_test'.
train = pd.read_csv('datasets/Diabetes/diabetes_binary_5050_(balanced).csv')
data = prepare_data(train, target_col="Diabetes_binary")

# Same ensemble size for the evolutionary forest and the sklearn baseline.
n_estimators = 10

# Collect the evolutionary settings first, then expand them into EvoParams.
evo_settings = dict(
    outcome_type='float',
    population_size=100,
    max_depth=10,
    crossover_rate=0.8,
    mutation_rate=0.4,
    n_elites=5,
    n_classifiers=n_estimators,
    early_stopping_patiance=100,  # (sic) keyword spelling as accepted by EvoParams
    # a quarter of the training row count
    training_sample_size=data['X_train'].shape[0] // 4,
    n_species=5,
)
params = EvoParams(**evo_settings)

# Build the evolutionary forest and run it with an argument of 100.
evo = EvolutionaryForest(data, params)
evo.evolve(100)

# Show the ensemble as a whole, then each member on its own line.
print(evo.ensemble)
for member in evo.ensemble:
    print(member)

# Baseline: sklearn random forest with the same number of estimators.
sk_forest = RandomForestClassifier(n_estimators=n_estimators)
sk_forest.fit(data['X_train'], data['y_train'])

# Score both models on the held-out split.
predictions = evo.predict(data['X_test'])
evo_accuracy = accuracy_score(data['y_test'], predictions)
print(f"Forest accuracy: {evo_accuracy}")
sk_accuracy = sk_forest.score(data['X_test'], data['y_test'])
print(f"Sklearn random forest accuracy: {sk_accuracy}")

Numerical features: Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Categorical features: Index([], dtype='object')
Generating initial population
-- Evolutionary forest --
Evaluating species
Evaluating species complete
Species 0 has 20 individuals with best fitness: 0.49
Species 1 has 20 individuals with best fitness: 0.51
Species 2 has 20 individuals with best fitness: 0.47
Species 3 has 20 individuals with best fitness: 0.51
Species 4 has 20 individuals with best fitness: 0.55
Generation 1/100 has 100 trees
  all-time best fitness: 0.55
  best fitness this generation: 0.55
  current temperature: 1.00
Selecting for all species
[[Node(10?HvyAlcoholConsump?<?4.13), Node(12?NoDocbcCost?!=?0.98), Node(20?Income?>?-0.0), Nod

2. Diabetes Health Indicators (Imbalanced Binary)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text

from tree import EvoParams, EvolutionaryForest
from sklearn.ensemble import RandomForestClassifier
from utils import prepare_data



# Load the unbalanced binary diabetes indicators CSV and prepare it via the
# project's prepare_data helper. Forward slashes are used so the path resolves on
# every OS and contains no backslash pairs ('\D', '\d') that Python would parse
# as invalid string escape sequences.
train = pd.read_csv('datasets/Diabetes/diabetes_binary_health_(not_balanced).csv')
data = prepare_data(train, target_col="Diabetes_binary")

# Ensemble size shared by the evolutionary forest and the sklearn baseline.
n_estimators = 10

params = EvoParams(
    outcome_type='float',
    population_size=500,
    max_depth=10,
    crossover_rate=0.5,
    mutation_rate=0.4,
    n_elites=50,
    n_classifiers=n_estimators,
    early_stopping_patiance=100,  # (sic) keyword spelling as accepted by EvoParams
    training_sample_size=data['X_train'].shape[0] // 5,  # one fifth of the training row count
    n_species=10
)

# Build the evolutionary forest and run it with an argument of 10.
evo = EvolutionaryForest(data, params)
evo.evolve(10)

# Print the ensemble as a whole, then each member separately.
print(evo.ensemble)

for tree in evo.ensemble:
    print(tree)

# Baseline: sklearn random forest with the same number of estimators.
sk_forest = RandomForestClassifier(n_estimators=n_estimators)
sk_forest.fit(data['X_train'], data['y_train'])

# NOTE(review): this target is imbalanced, so plain accuracy can look high for a
# majority-class predictor; consider reporting a class-aware metric as well.
predictions = evo.predict(data['X_test'])
print(f"Forest accuracy: {accuracy_score(data['y_test'], predictions)}")
print(f"Sklearn random forest accuracy: {sk_forest.score(data['X_test'], data['y_test'])}")

3. Diabetes Health Indicators (Multiclass Imbalanced)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text

from tree import EvoParams, EvolutionaryForest
from sklearn.ensemble import RandomForestClassifier
from utils import prepare_data



# Load the unbalanced three-class diabetes indicators CSV and prepare it via the
# project's prepare_data helper. Forward slashes are used so the path resolves on
# every OS and contains no backslash pairs ('\D', '\d') that Python would parse
# as invalid string escape sequences.
train = pd.read_csv('datasets/Diabetes/diabetes_012_health_(multiclass_not_balanced).csv')
data = prepare_data(train, target_col="Diabetes_012")

# Ensemble size shared by the evolutionary forest and the sklearn baseline.
n_estimators = 10

params = EvoParams(
    outcome_type='float',
    population_size=500,
    max_depth=10,
    crossover_rate=0.5,
    mutation_rate=0.4,
    n_elites=50,
    n_classifiers=n_estimators,
    early_stopping_patiance=100,  # (sic) keyword spelling as accepted by EvoParams
    training_sample_size=data['X_train'].shape[0] // 5,  # one fifth of the training row count
    n_species=10
)

# Build the evolutionary forest and run it with an argument of 50.
evo = EvolutionaryForest(data, params)
evo.evolve(50)

# Print the ensemble as a whole, then each member separately.
print(evo.ensemble)

for tree in evo.ensemble:
    print(tree)

# Baseline: sklearn random forest with the same number of estimators.
sk_forest = RandomForestClassifier(n_estimators=n_estimators)
sk_forest.fit(data['X_train'], data['y_train'])

# NOTE(review): this target is imbalanced, so plain accuracy can look high for a
# majority-class predictor; consider reporting a class-aware metric as well.
predictions = evo.predict(data['X_test'])
print(f"Forest accuracy: {accuracy_score(data['y_test'], predictions)}")
print(f"Sklearn random forest accuracy: {sk_forest.score(data['X_test'], data['y_test'])}")

4. DARWIN (Alzheimer's Disease Prediction)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text

from tree import EvoParams, EvolutionaryForest
from sklearn.ensemble import RandomForestClassifier
from utils import prepare_data



# Load the DARWIN CSV and prepare it via the project's prepare_data helper.
# Forward slashes are used so the path resolves on every OS and contains no
# backslash pair ('\D') that Python would parse as an invalid string escape.
train = pd.read_csv('datasets/DARWIN/DARWIN.csv')
data = prepare_data(train, target_col="class")

# Ensemble size shared by the evolutionary forest and the sklearn baseline.
n_estimators = 5

params = EvoParams(
    outcome_type='string',
    population_size=1000,
    max_depth=10,
    crossover_rate=0.8,
    mutation_rate=0.2,
    n_elites=10,
    n_classifiers=n_estimators,
    early_stopping_patiance=100,  # (sic) keyword spelling as accepted by EvoParams
    training_sample_size=data['X_train'].shape[0] // 1,  # divisor 1: the full training row count
    n_species=5
)

# Build the evolutionary forest and run it with an argument of 100.
evo = EvolutionaryForest(data, params)
evo.evolve(100)

# Print the ensemble as a whole, then each member separately.
print(evo.ensemble)

for tree in evo.ensemble:
    print(tree)

# Baseline: sklearn random forest with the same number of estimators.
sk_forest = RandomForestClassifier(n_estimators=n_estimators)
sk_forest.fit(data['X_train'], data['y_train'])

# Score both models on the held-out split.
predictions = evo.predict(data['X_test'])
print(f"Forest accuracy: {accuracy_score(data['y_test'], predictions)}")
print(f"Sklearn random forest accuracy: {sk_forest.score(data['X_test'], data['y_test'])}")

5. Toxicity

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text

from tree import EvoParams, EvolutionaryForest
from sklearn.ensemble import RandomForestClassifier
from utils import prepare_data



# Load the Toxicity CSV and prepare it via the project's prepare_data helper.
# Forward slashes are used so the path resolves on every OS and contains no
# backslash pairs ('\T', '\d') that Python would parse as invalid string escapes.
train = pd.read_csv('datasets/Toxicity/data.csv')
data = prepare_data(train, target_col="Class")

# Ensemble size shared by the evolutionary forest and the sklearn baseline.
n_estimators = 5

params = EvoParams(
    outcome_type='string',
    population_size=100,
    max_depth=10,
    crossover_rate=0.8,
    mutation_rate=0.2,
    n_elites=10,
    n_classifiers=n_estimators,
    early_stopping_patiance=100,  # (sic) keyword spelling as accepted by EvoParams
    training_sample_size=data['X_train'].shape[0] // 1,  # divisor 1: the full training row count
    n_species=5
)

# Build the evolutionary forest and run it with an argument of 100.
evo = EvolutionaryForest(data, params)
evo.evolve(100)

# Print the ensemble as a whole, then each member separately.
print(evo.ensemble)

for tree in evo.ensemble:
    print(tree)

# Baseline: sklearn random forest with the same number of estimators.
sk_forest = RandomForestClassifier(n_estimators=n_estimators)
sk_forest.fit(data['X_train'], data['y_train'])

# Score both models on the held-out split.
predictions = evo.predict(data['X_test'])
print(f"Forest accuracy: {accuracy_score(data['y_test'], predictions)}")
print(f"Sklearn random forest accuracy: {sk_forest.score(data['X_test'], data['y_test'])}")