### **Section 5.6 - Various Classifier Results**

In [39]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.pipeline import Pipeline
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from scikeras.wrappers import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
import warnings
import tensorflow
import keras
import re
import os


# Keep notebook output quiet: restrict the TensorFlow logger to ERROR and
# suppress all Python warnings.
tensorflow.get_logger().setLevel('ERROR')
warnings.filterwarnings('ignore')
# Seed TensorFlow's global random generator.
tensorflow.random.set_seed(0)

In [40]:
# Feature columns selected from each dataset as classifier inputs.
ATTRIBUTES = ["majority_distance", "majority_distance_squared", "shannon_entropy", "gini_impurity"]
# Target column selected from each dataset.
CLASS = 'majority_correct'
# NOTE(review): not referenced in the visible cells (they pass random_state=0
# directly) — confirm whether this constant is used elsewhere.
RANDOM_STATE = 42

In [41]:
class MLExploration:
    """Cross-validate classifier/sampler combinations and cache their mean scores.

    Every explored combination becomes one row of ``explored_models`` and the
    whole table is rewritten to ``output_file_path`` (JSON Lines) after each
    run. A combination whose id is already present in the table is skipped, so
    re-running the notebook only evaluates what is missing from the cache.
    """

    def __init__(self, data_x, data_y, scoring, output_file_path):
        """Store the data and scorers and load any previously cached results.

        Args:
            data_x: Feature matrix passed to ``cross_validate`` as ``X``.
            data_y: Target vector passed to ``cross_validate`` as ``y``.
            scoring: Scoring specification passed to ``cross_validate``.
            output_file_path: JSON Lines file used to persist the results.
        """
        self.data_x = data_x
        self.data_y = data_y
        self.scoring = scoring
        self.output_file_path = output_file_path
        if os.path.exists(output_file_path):
            self.explored_models = pd.read_json(output_file_path, lines=True)
        else:
            self.explored_models = pd.DataFrame()

    def explore_model(self, clf, sampler):
        """Cross-validate ``clf`` (optionally behind ``sampler``) and cache the result.

        Args:
            clf: Estimator to evaluate.
            sampler: Resampler placed before ``clf`` in a pipeline, or ``None``
                to evaluate ``clf`` on its own.

        Returns:
            None. Results are appended to ``explored_models`` and written to
            ``output_file_path``; nothing happens if the combination is
            already cached.
        """
        clf_hash = self.__hash_model(clf)
        sampler_hash = self.__hash_model(sampler)
        index = str((clf_hash, sampler_hash))

        already_explored = (
            "id" in self.explored_models.columns
            and self.explored_models["id"].eq(index).any()
        )
        if already_explored:
            return

        # Identity check: `== None` would defer to any custom __eq__ on the sampler.
        if sampler is None:
            model = clf
        else:
            model = Pipeline([("sampler", sampler), ("clf", clf)])

        results = cross_validate(
            estimator=model, X=self.data_x, y=self.data_y, scoring=self.scoring
        )

        row = pd.DataFrame(
            [
                {
                    "id": index,
                    "clf": clf_hash,
                    "sampler": sampler_hash,
                    **self.__dict_mean(results),
                }
            ]
        )
        # ignore_index keeps the in-memory index unique instead of repeating 0.
        self.explored_models = pd.concat(
            [self.explored_models, row], ignore_index=True
        )

        # Make sure the cache directory exists so the results of a long
        # cross-validation run are not lost to a missing folder.
        output_dir = os.path.dirname(self.output_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        self.explored_models.to_json(
            self.output_file_path, lines=True, orient="records"
        )

    def __dict_mean(self, obj):
        """Return the mean of a sequence, or apply that recursively to a dict.

        For dicts, the ``"test_"`` substring is stripped from every key and
        each value is reduced with this same method.
        """
        if isinstance(obj, dict):
            return {
                key.replace("test_", ""): self.__dict_mean(value)
                for key, value in obj.items()
            }
        return sum(obj) / len(obj)

    @staticmethod
    def _clean_keras_repr(text):
        """Flatten a KerasClassifier repr into a single-line identifier.

        Drops the ``model=...`` line, turns the remaining newlines into commas
        and removes tabs.

        The original code passed ``re.DOTALL`` as the fourth positional
        argument of ``re.sub``, which is ``count`` rather than ``flags``. That
        never enabled DOTALL and silently capped each call at 16 replacements.
        The cap is removed here; DOTALL is deliberately NOT enabled because a
        greedy ``.*`` would then swallow every parameter after ``model=``.
        NOTE(review): ids only differ from previously cached ones for reprs
        that contain more than 16 newlines or tabs.
        """
        without_model = re.sub(r'\n\tmodel\=.*\n', '', text)
        return without_model.replace('\n', ',').replace('\t', '')

    def __hash_model(self, clf):
        """Return the string that identifies ``clf`` in the results cache.

        KerasClassifier instances are identified by their cleaned repr plus a
        summary of the wrapped model's layers; everything else (including
        ``None``) by ``str(clf)``.
        """
        if isinstance(clf, KerasClassifier):
            cleaned = self._clean_keras_repr(str(clf))
            return str((cleaned, self.__keras_model_info(clf)))
        return str(clf)

    def __keras_model_info(self, clf):
        """Describe each layer of ``clf.model`` as (type name, units, activation name)."""
        return str(
            [
                (type(layer).__name__, layer.units, layer.activation.__name__)
                for layer in clf.model.layers
            ]
        )

In [42]:
# Accuracy plus precision/recall/F1 computed once per class: label 0 is
# reported as "Incorrect" and label 1 as "Correct".
scoring = {
    "accuracy": make_scorer(accuracy_score),
    **{
        f"{metric_name}-{outcome}": make_scorer(metric_fn, pos_label=label)
        for outcome, label in (("Incorrect", 0), ("Correct", 1))
        for metric_name, metric_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    },
}

In [43]:
def create_model(n_features, n_layers, units, hidden_activation, output_activation, step_size=5):
    """Build a stack of Dense layers that ends in a single output unit.

    Args:
        n_features: Input dimension given to the first hidden layer.
        n_layers: Number of hidden Dense layers.
        units: Width of the first hidden layer. The width is halved (never
            below 1) at every hidden layer whose index is a positive multiple
            of ``step_size``.
        hidden_activation: Activation used by every hidden layer.
        output_activation: Activation of the one-unit output layer.
        step_size: Number of consecutive hidden layers that share a width.

    Returns:
        The Sequential model. It is compiled with binary cross-entropy, Adam
        (learning rate 0.0001) and the accuracy metric when
        ``output_activation`` is "sigmoid" or "softmax"; otherwise it is
        returned uncompiled.
    """
    model = Sequential()
    for layer_idx in range(n_layers):
        # Evaluated on every iteration, including the first one.
        halve_here = layer_idx % step_size == 0
        if layer_idx == 0:
            model.add(Dense(units, input_dim=n_features, activation=hidden_activation))
            continue
        if halve_here:
            units = max(1, units // 2)
        model.add(Dense(units, activation=hidden_activation))

    model.add(Dense(1, activation=output_activation))
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)
    if output_activation in ("sigmoid", "softmax"):
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [44]:
# Zero-argument factories, listed in evaluation order. Each classifier is only
# instantiated right before it is explored, so a fresh estimator (and, for the
# Keras variants, a freshly built network) is created for every dataset/sampler.
classifier_factories = [
    lambda: XGBClassifier(random_state=0),
    lambda: AdaBoostClassifier(random_state=0),
    lambda: RandomForestClassifier(random_state=0),
    lambda: ExtraTreesClassifier(random_state=0),
    lambda: GradientBoostingClassifier(random_state=0),
    lambda: KNeighborsClassifier(),
    lambda: MLPClassifier(random_state=0),
    lambda: SVC(random_state=0),
    lambda: DecisionTreeClassifier(random_state=0),
    # Keras MLPs as (hidden layers, step_size) pairs.
    *[
        lambda n_layers=n_layers, step_size=step_size: KerasClassifier(
            model=create_model(len(ATTRIBUTES), n_layers, 100, 'relu', 'sigmoid', step_size=step_size),
            verbose=0,
            epochs=100,
            batch_size=128,
        )
        for n_layers, step_size in [(5, 1), (10, 2), (15, 3)]
    ],
]

for file_path, cache_file_path in [
    ('../../data/machine-learning/draw/draw-T0.7.jsonl', '../../cache/classifier/draw/draw-T0.7.json'),
    ('../../data/machine-learning/csqa/csqa-T0.7.jsonl', '../../cache/classifier/csqa/csqa-T0.7.json'),
    ('../../data/machine-learning/base-T0.7/svamp.jsonl', '../../cache/classifier/base-T0.7/svamp.json'),
    ('../../data/machine-learning/base-T0.7/arc.jsonl', '../../cache/classifier/base-T0.7/arc.json'),
]:
    print('FILE_PATH:', file_path)
    data = pd.read_json(file_path, lines=True)
    data_x, data_y = data[ATTRIBUTES], data[CLASS]
    ml_exploration = MLExploration(
        data_x=data_x, data_y=data_y, output_file_path=cache_file_path, scoring=scoring
    )
    # Only random over-sampling is evaluated in this section.
    for sampler in [RandomOverSampler(random_state=0)]:
        print('SAMPLER:', sampler)
        for make_clf in classifier_factories:
            ml_exploration.explore_model(clf=make_clf(), sampler=sampler)


FILE_PATH: ../../data/machine-learning/draw/draw-T0.7.jsonl
SAMPLER: RandomOverSampler(random_state=0)
FILE_PATH: ../../data/machine-learning/csqa/csqa-T0.7.jsonl
SAMPLER: RandomOverSampler(random_state=0)
FILE_PATH: ../../data/machine-learning/base-T0.7/svamp.jsonl
SAMPLER: RandomOverSampler(random_state=0)
FILE_PATH: ../../data/machine-learning/base-T0.7/arc.jsonl
SAMPLER: RandomOverSampler(random_state=0)


In [45]:
# (cached results file, display label) pairs for the results tables.
# NOTE(review): the last_letters cache is not written by the exploration loop
# visible in this section — presumably produced elsewhere; confirm it exists.
cache_file_paths = [
    ('../../cache/classifier/csqa/csqa-T0.7.json', 'CSQA'),
    ('../../cache/classifier/draw/draw-T0.7.json', 'DRAW-1K'),
    ('../../cache/classifier/last_letters/last_letters-T0.7.json', 'LAST LETTERS'),
    ('../../cache/classifier/base-T0.7/svamp.json', 'SVAMP'),
    ('../../cache/classifier/base-T0.7/arc.json', 'ARC EASY')
]

In [46]:
def _classifier_display_name(clf_repr):
    """Turn a cached ``clf`` identifier into a short label for the results table.

    Keras rows are labelled from their own content rather than their position
    in the table: the cached identifier lists one ``Dense`` entry per layer
    (hidden layers plus the output layer), so the label is ``MLP <hidden
    layers>``. Every other classifier is labelled with the text before the
    first ``(`` of its repr, i.e. the class name.
    """
    clf_repr = str(clf_repr)
    if 'KerasClassifier' in clf_repr:
        n_dense = clf_repr.count('Dense')
        if n_dense >= 1:
            return f'MLP {n_dense - 1}'
        return 'KerasClassifier'
    return clf_repr.split('(')[0]


# Score columns, rounded to three decimal places before display.
numerical_columns = ['accuracy', 'precision-Incorrect', 'recall-Incorrect', 'f1-Incorrect',
                     'precision-Correct', 'recall-Correct', 'f1-Correct']

# Columns shown in the table (the F1 columns are left out).
COLUMNS = ['classifier_name', 'accuracy',
           'precision-Incorrect', 'recall-Incorrect',
           'precision-Correct', 'recall-Correct']

# Display options do not depend on the dataset, so set them once.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

for file_path, dataset in cache_file_paths:
    data = pd.read_json(file_path, lines=True)

    data['classifier_name'] = data['clf'].apply(_classifier_display_name)
    data[numerical_columns] = data[numerical_columns].round(3)

    df = data[COLUMNS].rename(columns={
        'precision-Incorrect': 'Incorrect Precision',
        'recall-Incorrect': 'Incorrect Recall',
        'precision-Correct': 'Correct Precision',
        'recall-Correct': 'Correct Recall',
    })

    # Display the DataFrame
    print(f"Results for: {dataset}")
    display(df)
    print("\n" + "="*50 + "\n")



Results for: CSQA


Unnamed: 0,classifier_name,accuracy,Incorrect Precision,Incorrect Recall,Correct Precision,Correct Recall
0,XGBClassifier,0.627,0.524,0.457,0.68,0.735
1,AdaBoostClassifier,0.665,0.589,0.458,0.698,0.796
2,RandomForestClassifier,0.604,0.488,0.408,0.659,0.728
3,ExtraTreesClassifier,0.597,0.478,0.394,0.653,0.726
4,GradientBoostingClassifier,0.661,0.583,0.454,0.695,0.793
5,KNeighborsClassifier,0.591,0.477,0.52,0.676,0.636
6,MLPClassifier,0.667,0.592,0.461,0.7,0.798
7,SVC,0.667,0.592,0.461,0.7,0.798
8,DecisionTreeClassifier,0.598,0.48,0.387,0.653,0.732
9,MLP 5,0.666,0.591,0.462,0.699,0.797




Results for: DRAW-1K


Unnamed: 0,classifier_name,accuracy,Incorrect Precision,Incorrect Recall,Correct Precision,Correct Recall
0,XGBClassifier,0.764,0.369,0.397,0.862,0.847
1,AdaBoostClassifier,0.741,0.38,0.636,0.904,0.765
2,RandomForestClassifier,0.759,0.352,0.359,0.855,0.849
3,ExtraTreesClassifier,0.759,0.339,0.315,0.848,0.859
4,GradientBoostingClassifier,0.772,0.406,0.516,0.884,0.83
5,KNeighborsClassifier,0.694,0.32,0.587,0.885,0.718
6,MLPClassifier,0.769,0.422,0.674,0.916,0.791
7,SVC,0.756,0.406,0.69,0.918,0.771
8,DecisionTreeClassifier,0.772,0.372,0.342,0.854,0.869
9,MLP 5,0.753,0.405,0.701,0.92,0.765




Results for: LAST LETTERS


Unnamed: 0,classifier_name,accuracy,Incorrect Precision,Incorrect Recall,Correct Precision,Correct Recall
0,XGBClassifier,0.665,0.613,0.643,0.712,0.682
1,AdaBoostClassifier,0.678,0.614,0.734,0.756,0.635
2,RandomForestClassifier,0.656,0.607,0.613,0.697,0.689
3,ExtraTreesClassifier,0.652,0.606,0.588,0.687,0.701
4,GradientBoostingClassifier,0.678,0.62,0.7,0.74,0.661
5,KNeighborsClassifier,0.633,0.574,0.634,0.691,0.633
6,MLPClassifier,0.655,0.591,0.729,0.744,0.597
7,SVC,0.656,0.593,0.711,0.735,0.613
8,DecisionTreeClassifier,0.628,0.578,0.554,0.665,0.686
9,MLP 5,0.659,0.6,0.686,0.725,0.638




Results for: SVAMP


Unnamed: 0,classifier_name,accuracy,Incorrect Precision,Incorrect Recall,Correct Precision,Correct Recall
0,XGBClassifier,0.811,0.393,0.455,0.899,0.876
1,AdaBoostClassifier,0.77,0.385,0.805,0.956,0.764
2,RandomForestClassifier,0.806,0.394,0.481,0.902,0.865
3,ExtraTreesClassifier,0.81,0.372,0.344,0.882,0.895
4,GradientBoostingClassifier,0.8,0.408,0.663,0.931,0.825
5,KNeighborsClassifier,0.753,0.349,0.695,0.932,0.764
6,MLPClassifier,0.735,0.348,0.825,0.958,0.719
7,SVC,0.745,0.356,0.812,0.955,0.733
8,DecisionTreeClassifier,0.803,0.369,0.396,0.889,0.877
9,MLP 5,0.758,0.37,0.812,0.956,0.748




Results for: ARC EASY


Unnamed: 0,classifier_name,accuracy,Incorrect Precision,Incorrect Recall,Correct Precision,Correct Recall
0,XGBClassifier,0.769,0.177,0.404,0.929,0.806
1,AdaBoostClassifier,0.801,0.24,0.506,0.942,0.832
2,RandomForestClassifier,0.881,0.353,0.314,0.93,0.94
3,ExtraTreesClassifier,0.884,0.355,0.283,0.927,0.946
4,GradientBoostingClassifier,0.823,0.26,0.453,0.938,0.862
5,KNeighborsClassifier,0.804,0.236,0.48,0.94,0.837
6,MLPClassifier,0.907,0.515,0.426,0.941,0.957
7,SVC,0.908,0.522,0.421,0.941,0.958
8,DecisionTreeClassifier,0.882,0.328,0.247,0.924,0.948
9,MLP 5,0.91,0.534,0.426,0.942,0.96




