In [43]:
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import pandas
from imblearn.pipeline import Pipeline
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential
from scikeras.wrappers import KerasClassifier
import math
from typing import Iterable
import warnings
import tensorflow
from sklearn.utils import class_weight
import numpy
import keras
import re
import os

import joblib

# Keep cell output quiet: ignore every Python warning and let TensorFlow's
# logger emit only messages at ERROR level or above.
warnings.filterwarnings(action='ignore')
tensorflow.get_logger().setLevel('ERROR')

In [44]:
# Columns of the input frame used as model features.
ATTRIBUTES = [
    "majority_distance",
    "majority_distance_squared",
    "shannon_entropy",
    "gini_impurity",
]
# Column of the input frame used as the prediction target.
CLASS = 'num_correct'
# Seed handed to the resamplers that accept a ``random_state``.
RANDOM_STATE = 42

In [45]:
class MLExploration:
    """Cross-validate estimator/sampler combinations and cache their mean scores.

    Each explored combination becomes one row of ``explored_models``, keyed by
    the string forms of the estimator and of the sampler. The whole table is
    rewritten to ``output_file_path`` as JSON lines after every new row, and is
    loaded back from that file on construction, so combinations that are
    already present are skipped.
    """

    def __init__(self, data_x, data_y, scoring, output_file_path):
        """Store the data set and load previously cached results, if any.

        Args:
            data_x: Feature matrix passed as ``X`` to ``cross_validate``.
            data_y: Target values passed as ``y`` to ``cross_validate``.
            scoring: Scoring specification passed through to ``cross_validate``.
            output_file_path: JSON-lines file used as the result cache.
        """
        self.data_x = data_x
        self.data_y = data_y
        self.scoring = scoring
        self.output_file_path = output_file_path
        self.explored_models = pandas.DataFrame()
        if os.path.exists(output_file_path):
            self.explored_models = pandas.read_json(output_file_path, lines=True)

    def explore_model(self, clf, sampler):
        """Cross-validate ``clf`` (optionally behind ``sampler``) and cache the result.

        Nothing is computed when the same estimator/sampler pair is already in
        ``explored_models``. Failures during evaluation or while writing the
        cache are reported on stdout and otherwise ignored, so one broken
        combination does not abort a whole exploration run.

        Args:
            clf: Estimator to evaluate.
            sampler: Resampler placed in front of ``clf`` in a pipeline, or
                ``None`` to evaluate ``clf`` on its own.

        Returns:
            None.
        """
        clf_hash = self.__hash_model(clf)
        sampler_hash = self.__hash_model(sampler)

        if self.__already_explored(clf_hash, sampler_hash):
            return

        if sampler is None:
            model = clf
        else:
            model = Pipeline([("sampler", sampler), ("clf", clf)])

        try:
            results = cross_validate(
                estimator=model, X=self.data_x, y=self.data_y, scoring=self.scoring
            )
            row = pandas.DataFrame(
                [
                    {
                        "clf": clf_hash,
                        "sampler": sampler_hash,
                        **self.__dict_mean(results),
                    }
                ]
            )
            # ignore_index keeps the row labels unique instead of all being 0.
            self.explored_models = pandas.concat(
                [self.explored_models, row], ignore_index=True
            )
            # The cache directory may not exist yet; without this the write
            # fails and the result is never persisted.
            cache_dir = os.path.dirname(self.output_file_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            self.explored_models.to_json(self.output_file_path, lines=True, orient='records')
        except Exception as exc:
            # Best effort: keep exploring, but say what was skipped. Only
            # ``Exception`` is caught so Ctrl-C / kernel interrupt still works.
            # print() is used because this notebook silences all warnings.
            print(f"MLExploration: skipped clf={clf_hash} sampler={sampler_hash}: {exc!r}")

    def __already_explored(self, clf_hash, sampler_hash):
        """Return True when a cached row matches both hash strings."""
        cached = self.explored_models
        if "clf" not in cached.columns or "sampler" not in cached.columns:
            return False
        matches = (cached[['clf', 'sampler']] == [clf_hash, sampler_hash]).all(axis=1)
        return bool(matches.any())

    def __dict_mean(self, obj):
        """Average the leaves of a (possibly nested) dict of sequences.

        Dict keys have every ``"test_"`` substring removed; any non-dict value
        is reduced to ``sum(obj) / len(obj)``.
        """
        if isinstance(obj, dict):
            return {
                key.replace("test_", ""): self.__dict_mean(value)
                for key, value in obj.items()
            }
        return sum(obj) / len(obj)

    def __hash_model(self, clf):
        """Return the string used as the cache key for ``clf``.

        For ``KerasClassifier`` instances the ``model=...`` line of the repr is
        dropped (presumably because the repr of the model object is not stable
        between runs - confirm) and replaced by a summary of the layers; every
        other object is keyed by ``str(clf)``.

        NOTE(review): other scikeras wrappers such as ``KerasRegressor`` are
        not matched by this check and fall through to ``str(clf)``.
        """
        if isinstance(clf, KerasClassifier):
            # The 4th positional argument of re.sub is ``count``, not ``flags``;
            # passing re.DOTALL there capped the substitutions at 16 and never
            # enabled the flag. The model line is removed without DOTALL so
            # that ``.*`` stops at the end of that line.
            cleaned = re.sub(r'\n\tmodel\=.*\n', '', str(clf))
            cleaned = cleaned.replace('\n', ',').replace('\t', '')
            return str((cleaned, self.__keras_model_info(clf)))
        return str(clf)

    def __keras_model_info(self, clf):
        """Describe each layer of ``clf.model`` as (type name, units, activation name).

        Layers without ``units`` or ``activation`` contribute ``None`` for the
        missing field instead of raising.
        """
        return str(
            [
                (
                    type(layer).__name__,
                    getattr(layer, "units", None),
                    getattr(getattr(layer, "activation", None), "__name__", None),
                )
                for layer in clf.model.layers
            ]
        )

In [46]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours, RandomUnderSampler, InstanceHardnessThreshold

from scikeras.wrappers import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense


In [47]:
# import sklearn
# sklearn.metrics.get_scorer_names()

In [48]:
# Scorer names handed to MLExploration as its ``scoring`` argument.
scoring = [
    'r2',
    'neg_mean_squared_error',
    'neg_median_absolute_error',
    'neg_mean_absolute_error',
    'max_error',
]

In [49]:
def create_model(n_features, n_layers, units, hidden_activation, output_activation, step_size=5):
    """Build and compile a stack of Dense layers that narrows at regular intervals.

    Args:
        n_features: Input dimensionality declared on the first hidden layer.
        n_layers: Number of hidden Dense layers.
        units: Width of the first hidden layer.
        hidden_activation: Activation used by every hidden layer.
        output_activation: Activation of the single-unit output layer.
        step_size: The width is halved (never below 1) at every hidden layer
            whose zero-based position is a positive multiple of this value.

    Returns:
        The compiled Sequential model (Adam with learning rate 0.001,
        'crossentropy' loss, 'accuracy' metric).
    """
    network = Sequential()
    width = units
    # Number of hidden layers added so far, modulo ``step_size``; a value of
    # zero means the next layer starts a new, narrower group.
    phase = None
    for depth in range(n_layers):
        if depth == 0:
            # Only the first layer declares the input dimensionality.
            hidden = Dense(width, input_dim=n_features, activation=hidden_activation)
        else:
            if phase == 0:
                width = max(1, width // 2)
            hidden = Dense(width, activation=hidden_activation)
        network.add(hidden)
        phase = (depth + 1) % step_size

    network.add(Dense(1, activation=output_activation))

    network.compile(
        loss='crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

In [50]:
# Evaluate every regressor (default hyper-parameters) behind every sampler for
# each (dataset, cache file) pair. Results are cached by MLExploration, so
# re-running this cell skips combinations already present in the cache file.
for file_path, cache_file_path in [
    ('data/draw/draw-T0.3.jsonl', 'cache/draw/draw-T0.3.jsonl'),
    # ('data/draw/draw-T0.5.jsonl', 'cache/draw/draw-T0.5.json'),
    # ('data/draw/draw-T0.7.jsonl', 'cache/draw/draw-T0.7.json'),
    # ('data/csqa/csqa-T0.3.jsonl', 'cache/csqa/csqa-T0.3.json'),
    # ('data/csqa/csqa-T0.5.jsonl', 'cache/csqa/csqa-T0.5.json'),
    # ('data/csqa/csqa-T0.7.jsonl', 'cache/csqa/csqa-T0.7.json'),
    # ('data/last_letters/last_letters-T0.3.jsonl', 'cache/last_letters/last_letters-T0.3.json'),
    # ('data/last_letters/last_letters-T0.5.jsonl', 'cache/last_letters/last_letters-T0.5.json'),
    # ('data/last_letters/last_letters-T0.7.jsonl', 'cache/last_letters/last_letters-T0.7.json'),
]:
    # The data set, the exploration cache and the class weights depend only on
    # the file, so they are prepared once per file rather than once per sampler.
    data = pandas.read_json(file_path, lines=True)
    data_x = data[ATTRIBUTES]
    data_y = data[CLASS]
    ml_exploration = MLExploration(
        data_x=data_x, data_y=data_y, output_file_path=cache_file_path, scoring=scoring
    )

    # Balanced weight per distinct target value. Not consumed by any active
    # line in this cell (only the commented-out Keras runs are nearby).
    class_labels = numpy.unique(data_y)
    class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=class_labels, y=data_y)
    class_weights = dict(zip(class_labels, class_weights))

    for sampler in [
        None,
        RandomOverSampler(random_state=RANDOM_STATE),
        ADASYN(random_state=RANDOM_STATE),
        SMOTE(random_state=RANDOM_STATE),
        # CondensedNearestNeighbour(random_state=RANDOM_STATE),
        # EditedNearestNeighbours(),
        RandomUnderSampler(random_state=RANDOM_STATE)
    ]:
        # A fresh, default-configured estimator is built for every sampler.
        for regressor_cls in (
            AdaBoostRegressor,
            RandomForestRegressor,
            ExtraTreesRegressor,
            GradientBoostingRegressor,
            KNeighborsRegressor,
            MLPRegressor,
            SVR,
            DecisionTreeRegressor,
        ):
            ml_exploration.explore_model(clf=regressor_cls(), sampler=sampler)

        # ml_exploration.explore_model(clf=KerasRegressor(model=create_model(len(ATTRIBUTES), 5, 100, 'relu', 'sigmoid', step_size=1), verbose=0, epochs=100, batch_size=50), sampler=RandomUnderSampler())
        # ml_exploration.explore_model(clf=KerasRegressor(model=create_model(len(ATTRIBUTES), 10, 100, 'relu', 'sigmoid', step_size=2), verbose=0, epochs=100, batch_size=50), sampler=RandomUnderSampler())
        # ml_exploration.explore_model(clf=KerasRegressor(model=create_model(len(ATTRIBUTES), 15, 100, 'relu', 'sigmoid', step_size=3), verbose=0, epochs=100, batch_size=50), sampler=RandomUnderSampler())
        # ml_exploration.explore_model(clf=KerasRegressor(model=create_model(len(ATTRIBUTES), 25, 100, 'relu', 'sigmoid', step_size=5), verbose=0, epochs=100, batch_size=50), sampler=RandomUnderSampler())
        # ml_exploration.explore_model(clf=KerasRegressor(model=create_model(len(ATTRIBUTES), 30, 100, 'relu', 'sigmoid', step_size=6), verbose=0, epochs=100, batch_size=50), sampler=RandomUnderSampler())

In [None]:
# Write the result table of the most recently created MLExploration to the
# file 'bonkus', one JSON record per line.
ml_exploration.explored_models.to_json(path_or_buf='bonkus', orient='records', lines=True)