In [1]:
from geneticalgorithm import geneticalgorithm as ga

from sklearn.tree import DecisionTreeClassifier as DTC, plot_tree, export_text
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate as cv
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import graphviz

from utils import get_data

%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.decomposition import PCA
from utils import get_data

%matplotlib inline

# https://pypi.org/project/geneticalgorithm/#1112-id

In [2]:
XY_train, X_test, inverse_target_map = get_data(min_size=None, min_size_test=None, fill_nan=None)
train_columns = list(XY_train.columns)
train_columns.remove("TARGET_NUM")

min_max_scaler = MinMaxScaler(feature_range=(2, 10))
X_train_minmax = min_max_scaler.fit(XY_train[train_columns])
x_train = X_train_minmax.transform(XY_train[train_columns])

imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=1)
imp_train = imp.fit(x_train)
x_train_full = imp_train.transform(x_train)

x_train_full_df = pd.DataFrame(x_train_full, columns=train_columns, index=XY_train.index)

min_size = 50

for c in x_train_full_df.columns:
    if c != "TARGET_NUM":
        x_train_full_df[c][x_train_full_df.groupby(c)[c].transform('size') <= min_size] = 0

In [3]:
pca = PCA(n_components=36)
pca_result = pca.fit_transform(x_train_full_df)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(pca_result, XY_train["TARGET_NUM"].values, test_size=0.3)

In [5]:
algorithm_param = {'max_num_iteration': 1000,\
                   'population_size':16,\
                   'mutation_probability':0.1,\
                   'elit_ratio': 0.01,\
                   'crossover_probability': 0.5,\
                   'parents_portion': 0.3,\
                   'crossover_type':'uniform',\
                   'max_iteration_without_improv':100}

In [6]:
def fitness(x):
    if x[0] == 0:
        criterion = "gini"
    elif x[0] == 1:
        criterion = "entropy"
    else:
        raise ValueError
    
    if x[1] == 0:
        max_depth = None
    else:
        max_depth = int(x[1])
        
    min_samples_split = int(x[2])
    min_samples_leaf = int(x[3])
    
    if x[4] == 0:
        max_features = "sqrt"
    elif x[4] == 1:
        max_features = "log2"
    elif x[4] == 2:
        max_features = None
    else:
        raise ValueError
    
    #if x[5] == 1:
    #    max_leaf_nodes = None
    #else:
    #    max_leaf_nodes = int(x[5])
    #
    #ccp_alpha = x[6]
    
    dtc = DTC(criterion=criterion,
              max_depth=max_depth,
              min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf,
              max_features=max_features,
             # max_leaf_nodes=max_leaf_nodes,
             # ccp_alpha=ccp_alpha,
              random_state=42)
    
    #cv_results = cv(dtc, pca_result, XY_train["TARGET_NUM"], cv=3)
    #print(cv_results['test_score'])
    #fitness_v = cv_results['test_score'].mean()
    
    dtc = dtc.fit(X_train, y_train)
    #print(dtc.get_params(deep=True))
    fitness_v = dtc.score(X_test, y_test)
    
    print(x.tolist())
    print(fitness_v)
    print()
    
    return 1 - fitness_v

In [7]:
varbound = np.array([[0, 1],  # criterion
                     [0, 20],  # max_depth
                     [2, 40],   # min_samples_split
                     [1, 20],  # min_samples_leaf
                     [0, 2]])  # max_features
                    # [1, 10],  # max_leaf_nodes
                    # [0., 1.]])  # ccp_alpha

vartype = np.array([['int'],  # criterion
                    ['int'],  # max_depth
                    ['int'],   # min_samples_split
                    ['int'],  # min_samples_leaf
                    ['int']])  # max_features
                   # ['int'],  # max_leaf_nodes
                   # ['real']])  # ccp_alpha

In [8]:
model = ga(function=fitness,
           dimension=5,
           variable_type_mixed=vartype,
           variable_boundaries=varbound,
           algorithm_parameters=algorithm_param,
           function_timeout=60.*60.)
model.run()

[0.0, 5.0, 4.0, 6.0, 0.0]
0.1956281777731284

[1.0, 6.0, 17.0, 11.0, 1.0]
0.21833499568030273

[0.0, 3.0, 35.0, 13.0, 2.0]
0.20550396167643933

[1.0, 5.0, 13.0, 20.0, 0.0]
0.21215665621161647

[0.0, 11.0, 8.0, 11.0, 0.0]
0.24109321708327547

[0.0, 11.0, 37.0, 3.0, 0.0]
0.24184893865308413

[1.0, 20.0, 28.0, 15.0, 1.0]
0.24496381013532648

[0.0, 15.0, 15.0, 20.0, 1.0]
0.2511385423411497

[1.0, 8.0, 3.0, 6.0, 1.0]
0.22631786839622897

[0.0, 17.0, 18.0, 8.0, 1.0]
0.2518482713094545

[1.0, 17.0, 22.0, 6.0, 1.0]
0.25241551339466883

[1.0, 6.0, 3.0, 8.0, 1.0]
0.21833499568030273

[0.0, 6.0, 22.0, 6.0, 2.0]
0.2315583195205226

[1.0, 13.0, 15.0, 4.0, 0.0]
0.2546357836868752

[0.0, 3.0, 33.0, 15.0, 2.0]
0.20550396167643933

[0.0, 3.0, 7.0, 7.0, 2.0]
0.20550396167643933

__________________________________________________ 0.1% GA is running...[1.0, 13.0, 15.0, 4.0, 0.0]
0.2546357836868752

[0.0, 15.0, 15.0, 20.0, 1.0]
0.2511385423411497

[1.0, 13.0, 15.0, 4.0, 0.0]
0.2546357836868752

[1.0, 13.0,

KeyboardInterrupt: 

## Vol 2

In [2]:
XY_train, X_test, inverse_target_map = get_data(min_size=None, min_size_test=None, fill_nan=None)
train_columns = list(XY_train.columns)
train_columns.remove("TARGET_NUM")

min_max_scaler = MinMaxScaler(feature_range=(1, 9))
X_train_minmax = min_max_scaler.fit(XY_train[train_columns])
x_train = X_train_minmax.transform(XY_train[train_columns])

imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imp_train = imp.fit(x_train)
x_train_full = imp_train.transform(x_train)

In [3]:
def fitness_2(x):
    print(x.tolist())
    min_size = int(x[0])

    x_train_full_df = pd.DataFrame(x_train_full, columns=train_columns, index=XY_train.index)
    for c in x_train_full_df.columns:
        if c != "TARGET_NUM":
            x_train_full_df[c][x_train_full_df.groupby(c)[c].transform('size') <= min_size] = 10
            
    if int(x[1]) == 44:
        pca_result = x_train_full_df.values
    else:
        pca = PCA(n_components=int(x[1]))
        pca_result = pca.fit_transform(x_train_full_df)
        del pca
        
    X_train, X_test, y_train, y_test = train_test_split(pca_result, XY_train["TARGET_NUM"].values, test_size=0.3)
    del x_train_full_df
   
    if x[2] == 8:
        max_depth = None
    else:
        max_depth = int(x[2])
    
    dtc = DTC(max_depth=max_depth, random_state=42)
    
    #cv_results = cv(dtc, pca_result, XY_train["TARGET_NUM"], cv=3)
    #print(cv_results['test_score'])
    #fitness_v = cv_results['test_score'].mean()
    
    dtc = dtc.fit(X_train, y_train)
    #print(dtc.get_params(deep=True))
    fitness_v = dtc.score(X_test, y_test)
    
    
    print(fitness_v)
    print()
    
    return 1 - fitness_v

In [None]:
algorithm_param = {'max_num_iteration': 1000,\
                   'population_size':16,\
                   'mutation_probability':0.1,\
                   'elit_ratio': 0.01,\
                   'crossover_probability': 0.5,\
                   'parents_portion': 0.3,\
                   'crossover_type':'uniform',\
                   'max_iteration_without_improv':100}

varbound = np.array([[0, 300],  # min_size
                     [20, 44],  # n_components
                     [8, 20]])   # max_depth

vartype = np.array([['int'],  # min_size
                    ['int'],  # n_components
                    ['int']])  # max_depth
        
model = ga(function=fitness_2,
           dimension=3,
           variable_type_mixed=vartype,
           variable_boundaries=varbound,
           algorithm_parameters=algorithm_param,
           function_timeout=60. * 60.)
model.run()

[108.0, 42.0, 9.0]
0.25734754354417183

[29.0, 30.0, 11.0]
0.26257446743272905

[222.0, 31.0, 9.0]
0.2542182430104773

[47.0, 25.0, 15.0]
0.26378380230756604

[134.0, 35.0, 15.0]
0.26386857298484767

[139.0, 37.0, 9.0]
0.2540613270759347

[216.0, 25.0, 20.0]
0.2535229430936247

[19.0, 20.0, 20.0]
0.2508328268135063

[194.0, 21.0, 12.0]
0.2609376719086208

[64.0, 42.0, 13.0]
0.2638315985405014

[214.0, 24.0, 10.0]
0.2557928132501979

[94.0, 41.0, 20.0]
0.2527645160766688

[223.0, 35.0, 15.0]
0.2642879172926771

[8.0, 33.0, 19.0]
0.2553960143352626

[169.0, 36.0, 14.0]
0.2649191882937106

[35.0, 26.0, 17.0]
0.2613281581135458

__________________________________________________ 0.1% GA is running...[169.0, 36.0, 14.0]
0.26442679691290444

[169.0, 36.0, 18.0]
0.2598203222367916

[223.0, 35.0, 14.0]
0.26450074580159694

[169.0, 36.0, 15.0]
0.2648542575621757

[169.0, 35.0, 14.0]
0.264731610624832

[223.0, 36.0, 15.0]
0.26455485474454266

[223.0, 35.0, 15.0]
0.2648912320065219

[223.0, 35.0,