In [313]:
import os
import itertools
from functools import reduce

from sklearn.model_selection import *

from sklearn.metrics import classification_report, accuracy_score, r2_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

import numpy
import pandas
import seaborn

from lab_v2.io import read_file

In [314]:
# --- Input datasets (one .jsonl file per task / T setting) ---
DRAW_T03 = './data/draw/draw-T0.3.jsonl'
DRAW_T07 = './data/draw/draw-T0.7.jsonl'
CSQA_T07 = './data/csqa/csqa-T0.7.jsonl'
LAST_LETTERS_T07 = './data/last_letters/last_letters-T0.7.jsonl'

# Dataset loaded by the first data-loading cell.
FILE_PATH = CSQA_T07

# --- Modelling setup ---
# Feature columns fed to the models, and the target column.
ATTRIBUTES = [
    "majority_distance",
    "majority_distance_squared",
    "shannon_entropy",
    "gini_impurity",
]
CLASS = 'majority_correct'
K_FOLDS = 5
RANDOM_STATE = 0

# --- Result caches ---
# NOTE(review): these point at the csqa T0.3 cache while FILE_PATH selects
# the csqa T0.7 dataset - confirm that the mismatch is intended.
CACHE = 'cache/csqa/csqa-T0.3.json'
CACHE_ROS = 'cache/csqa/csqa-T0.3-ROS.json'

# Set to True to render the exploratory plots guarded by this flag.
DATA_EXPLORATION = False

In [315]:
# Load the dataset selected by FILE_PATH and preview its first row.
data = read_file(FILE_PATH)
data.head(1)

Unnamed: 0,majority_distance,majority_distance_squared,shannon_entropy,gini_impurity,majority_correct,num_correct
0,1.378e-07,0.0,0.0,0.0,True,20


In [316]:
if DATA_EXPLORATION:
    # plotly is only needed for this optional plot, so it is imported here.
    # Without this import the cell raises NameError on a fresh kernel, because
    # `px` is not imported anywhere else in the notebook.
    import plotly.express as px

    # 3D scatter of three of the features, coloured by the target column.
    palette = px.colors.sequential.Plasma_r
    fig = px.scatter_3d(
        data,
        x='majority_distance',
        y='shannon_entropy',
        z='gini_impurity',
        color=CLASS,
        color_discrete_sequence=[palette[3], palette[-1]],
    )
    # Small, semi-transparent markers so overlapping points stay readable.
    fig.update_traces(marker=dict(size=3, opacity=0.75), selector=dict(mode='markers'))
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.show()

## **Data exploration**

In [317]:
if DATA_EXPLORATION:
    # Pairwise plots of the columns in `data`, coloured by the target column.
    # The hue uses CLASS (rather than a repeated literal) so this cell follows
    # the configuration cell, like the 3D scatter cell does.
    seaborn.set_theme(style='ticks')
    seaborn.pairplot(data, hue=CLASS, plot_kws={'alpha': 0.4})

In [318]:
# Split the loaded frame into the feature columns (ATTRIBUTES) and the
# target column (CLASS).
data_x = data[ATTRIBUTES]
data_y = data[CLASS]

In [319]:
# Module-level log of the text reports produced by classification_report_scorer.
classification_reports = []


def classification_report_scorer(y_true, y_pred):
    """Record a classification report for this call and return the accuracy.

    Side effect: the text returned by ``classification_report(y_true, y_pred)``
    is appended to the module-level ``classification_reports`` list.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    report_text = classification_report(y_true, y_pred)
    classification_reports.append(report_text)
    return accuracy_score(y_true, y_pred)

In [320]:
def _parse_classification_report(report):
    """Parse one text classification report into a numeric DataFrame.

    The report is expected to consist of three blank-line-separated sections:
    the column header, one row per class, and the accuracy / macro avg /
    weighted avg summary rows. Class labels must be single whitespace-free
    tokens, otherwise the per-class section cannot be reshaped.

    Returns:
        A DataFrame with the header names as columns and positional integer
        rows: the per-class rows first, followed by accuracy, macro avg and
        weighted avg. In the accuracy row the first two columns are filled
        with 0 and the reported values occupy the last two columns.
    """
    # Collapse all runs of whitespace (including newlines) inside each section.
    sections = [' '.join(part.split()) for part in report.split('\n\n')]
    header = sections[0].split(' ')
    width = len(header)

    # Each class row is "<label> <one value per header column>"; drop the label.
    per_class = numpy.array(sections[1].split(' ')).reshape(-1, width + 1)
    per_class = numpy.delete(per_class, 0, 1).astype(float)

    # Tokens: accuracy <v> <n> macro avg <4 values> weighted avg <4 values>
    summary = sections[2].split(' ')
    accuracy = numpy.array([0.0, 0.0, float(summary[1]), float(summary[2])]).reshape(-1, width)
    macro_avg = numpy.array(summary[5:9]).astype(float).reshape(-1, width)
    weighted_avg = numpy.array(summary[11:]).astype(float).reshape(-1, width)

    return pandas.DataFrame(
        numpy.concatenate((per_class, accuracy, macro_avg, weighted_avg)),
        columns=header,
    )


def report_average(*args):
    """Average several text classification reports element-wise.

    Args:
        *args: One or more report strings in the layout described in
            ``_parse_classification_report``.

    Returns:
        A DataFrame holding the mean of every cell across the reports. Class
        rows keep their positional integer index (0, 1, ...); the last three
        rows are labelled 'accuracy', 'macro_avg' and 'weighted_avg'.

    Raises:
        ValueError: If no report is given.
    """
    if not args:
        raise ValueError('report_average() requires at least one classification report')

    frames = [_parse_classification_report(report) for report in args]
    averaged = reduce(lambda acc, frame: acc.add(frame, fill_value=0), frames) / len(frames)
    return averaged.rename(index={
        averaged.index[-3]: 'accuracy',
        averaged.index[-2]: 'macro_avg',
        averaged.index[-1]: 'weighted_avg',
    })

In [321]:
class ReportAverage:
    """Collects classification reports across scoring calls and averages them."""

    def __init__(self):
        # Text reports, one per call to classification_report_scorer.
        self.classification_reports = []

    def classification_report_scorer(self, y_true, y_pred):
        """Store the classification report for this call and return the accuracy."""
        self.classification_reports.append(classification_report(y_true, y_pred))
        return accuracy_score(y_true, y_pred)

    def average_report(self):
        """Summarise the average of all collected reports as a flat dict.

        Returns:
            A dict with per-class precision / recall / f1-score for the True
            and False classes, the mean of the two f1-scores, and the accuracy.
        """
        average = report_average(*self.classification_reports)
        # classification_report lists labels in sorted order, so for a boolean
        # target the first class row is False and the second is True.
        false_row = average.loc[0]
        true_row = average.loc[1]
        return {
            'precision_True': true_row['precision'],
            'recall_True': true_row['recall'],
            'f1-score_True': true_row['f1-score'],
            'precision_False': false_row['precision'],
            'recall_False': false_row['recall'],
            'f1-score_False': false_row['f1-score'],
            'f1-score_Average': (true_row['f1-score'] + false_row['f1-score']) / 2,
            # report_average stores the accuracy value in the 'f1-score' column.
            'accuracy': average.loc['accuracy']['f1-score']
        }

In [322]:
def my_cross_validate(model, sampler, data_x, data_y, cv=5):
    """Cross-validate ``model`` and return the mean of several regression metrics.

    Args:
        model: Estimator instance exposing ``fit`` and ``predict``; it is
            re-fitted on every fold.
        sampler: Optional resampler exposing ``fit_resample``. When given it is
            applied to the training part of each fold only. ``None`` disables
            resampling.
        data_x: Feature table, indexed positionally with ``.iloc``.
        data_y: Target values, indexed positionally with ``.iloc``.
        cv: Number of folds passed to ``KFold(n_splits=cv)``.

    Returns:
        A dict mapping each metric name to its mean over the folds, in the
        order: r2, mean_squared_error, median_absolute_error,
        mean_absolute_error, max_error. Callers that unpack the values
        positionally rely on this order.
    """
    # Insertion order defines the order of the returned dict.
    metrics = {
        'r2': r2_score,
        'mean_squared_error': mean_squared_error,
        'median_absolute_error': median_absolute_error,
        'mean_absolute_error': mean_absolute_error,
        'max_error': max_error,
    }
    fold_scores = {name: [] for name in metrics}

    for train_idx, test_idx in KFold(n_splits=cv).split(data_x):
        X_train, X_test = data_x.iloc[train_idx], data_x.iloc[test_idx]
        y_train, y_test = data_y.iloc[train_idx], data_y.iloc[test_idx]

        # Resample the training fold only; the test fold stays untouched.
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        for name, metric in metrics.items():
            fold_scores[name].append(metric(y_test, y_pred))

    return {name: sum(values) / len(values) for name, values in fold_scores.items()}

In [323]:
class MLExploration:
    """Evaluates models with cross-validation and caches the scores in a JSON file.

    Every evaluated (sampler, model, hyperparameters) combination becomes one
    row of ``explored_models``, keyed by ``MLExploration.hash(...)``. The table
    is written to ``output_file_path`` after each new evaluation and reloaded
    from it on construction, so already-explored combinations are not re-run.
    """

    # Column layout of the results table. The metric columns are filled
    # positionally from the values returned by my_cross_validate, so their
    # order here must match the order of that dict.
    TEMPLATE_DICT = {
        'sampler': '',
        'model': '', 
        'hyperparameters': '',
        'r2': [], 
        'mean_squared_error': [],
        'median_absolute_error': [],
        'mean_absolute_error': [],
        'max_error': []
    }
    RANDOM_STATE = 42

    def __init__(self, data_x, data_y, output_file_path):
        """Store the dataset and load the cached results table if it exists.

        Args:
            data_x: Feature table passed to my_cross_validate.
            data_y: Target values passed to my_cross_validate.
            output_file_path: JSON file (orient='split') used as the cache.
        """
        self.data_x = data_x
        self.data_y = data_y
        self.output_file_path = output_file_path

        # Without a cache file the table starts with a single placeholder row
        # (index 0) built from TEMPLATE_DICT, which fixes the column layout.
        self.explored_models = pandas.DataFrame([MLExploration.TEMPLATE_DICT])
        if os.path.exists(output_file_path):
            self.explored_models = pandas.read_json(output_file_path, orient='split')

    def grid_search(self, model, parameters, sampler=None):
        """Explore ``model`` for every combination in the ``parameters`` grid.

        Args:
            model: Estimator class (not an instance).
            parameters: Dict mapping each hyperparameter name to an iterable
                of candidate values.
            sampler: Optional resampler forwarded to explore_model.
        """
        for combination in self.__parameter_product(parameters):
            print(combination)
            self.explore_model(model, combination, sampler=sampler)

    def explore_model(self, model, hyperparameters, sampler=None):
        """Evaluate one configuration, or return its cached row if already known.

        Args:
            model: Estimator class; instantiated as ``model(**hyperparameters)``.
            hyperparameters: Keyword arguments for the estimator constructor.
            sampler: Optional resampler forwarded to my_cross_validate.

        Returns:
            The row of ``explored_models`` for this configuration.
        """
        index = MLExploration.hash(sampler, model, hyperparameters)
        if index in self.explored_models.index:
            return self.explored_models.loc[index]

        # Evaluate on the dataset held by this instance, not on module globals.
        scores = my_cross_validate(model(**hyperparameters), sampler, self.data_x, self.data_y)
        average_scores = scores.values()
        print(average_scores)

        self.explored_models.loc[index] = [
            MLExploration.hash_sampler(sampler),
            MLExploration.hash_model(model),
            MLExploration.hash_hyperparameters(hyperparameters),
            *average_scores
        ]
        self.__save()
        return self.explored_models.loc[index]

    def __save(self):
        """Write the results table to the cache file, creating its directory."""
        directory = os.path.dirname(self.output_file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.explored_models.to_json(self.output_file_path, orient='split')

    @staticmethod
    def hash_sampler(model):
        """Return the string form of a sampler, or '' when there is none."""
        if model is None:
            return ''
        return str(model)

    @staticmethod
    def hash_model(model):
        """Return the class name of the estimator class."""
        return str(model.__name__)

    @staticmethod
    def hash_hyperparameters(hyperparameters):
        """Return the hyperparameters as a string of name-sorted (name, value) pairs."""
        return str(sorted(hyperparameters.items(), key=lambda x: x[0]))

    @staticmethod
    def hash(sampler, model, hyperparameters):
        """Build the row key identifying a (sampler, model, hyperparameters) combination."""
        sampler = MLExploration.hash_sampler(sampler)
        model = MLExploration.hash_model(model)
        hyperparameters = MLExploration.hash_hyperparameters(hyperparameters)
        return str((sampler, model, hyperparameters))

    def __parameter_product(self, parameters):
        """Expand a parameter grid into a list of dicts, one per combination.

        An empty grid yields a single empty combination (``[{}]``).
        """
        keys = list(parameters.keys())
        return [dict(zip(keys, values)) for values in itertools.product(*parameters.values())]

In [324]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours, RandomUnderSampler, InstanceHardnessThreshold

In [325]:
def try_models(data_x, data_y, file_path, sampler):
    """Evaluate every candidate regressor with default hyperparameters.

    Args:
        data_x: Feature table handed to MLExploration.
        data_y: Target values handed to MLExploration.
        file_path: Cache file used by MLExploration for the results table.
        sampler: Resampler passed to each evaluation, or None for no resampling.
    """
    exploration = MLExploration(data_x, data_y, file_path)
    candidate_regressors = (
        AdaBoostRegressor,
        RandomForestRegressor,
        ExtraTreesRegressor,
        GradientBoostingRegressor,
        GaussianProcessRegressor,
        KNeighborsRegressor,
        MLPRegressor,
        DecisionTreeRegressor,
    )
    for regressor in candidate_regressors:
        exploration.explore_model(regressor, {}, sampler=sampler)

In [326]:
# Run every candidate model on every dataset, once without resampling and once
# per resampler. Each dataset has its own cache file; within a file the rows
# are told apart by the sampler that is part of the row key.
for file_path, cache_file_path in [
    ('data/draw/draw-T0.3.jsonl', 'cache/draw/draw-T0.3.json'),
    ('data/draw/draw-T0.5.jsonl', 'cache/draw/draw-T0.5.json'),
    ('data/draw/draw-T0.7.jsonl', 'cache/draw/draw-T0.7.json'),
    ('data/csqa/csqa-T0.3.jsonl', 'cache/csqa/csqa-T0.3.json'),
    ('data/csqa/csqa-T0.5.jsonl', 'cache/csqa/csqa-T0.5.json'),
    ('data/csqa/csqa-T0.7.jsonl', 'cache/csqa/csqa-T0.7.json'),
    # Fixed: this entry used 'data/csqa/' while every other last_letters file
    # lives under 'data/last_letters/'.
    ('data/last_letters/last_letters-T0.3.jsonl', 'cache/last_letters/last_letters-T0.3.json'),
    ('data/last_letters/last_letters-T0.5.jsonl', 'cache/last_letters/last_letters-T0.5.json'),
    ('data/last_letters/last_letters-T0.7.jsonl', 'cache/last_letters/last_letters-T0.7.json'),
]:
    data = read_file(file_path)
    data_x = data[ATTRIBUTES]
    data_y = data[CLASS]

    # The first tuple element is a label for the sampler; it is not used below.
    for _suffix, sampler in [
        ('', None),

        ('-ROS', RandomOverSampler(random_state=RANDOM_STATE)),
        ('-ADASYN', ADASYN(random_state=RANDOM_STATE)),
        ('-SMOTE', SMOTE(random_state=RANDOM_STATE)),

        # Disabled samplers:
        # ('-CNN', CondensedNearestNeighbour(random_state=RANDOM_STATE)),
        # ('-ENN', EditedNearestNeighbours()),
        ('-RUS', RandomUnderSampler(random_state=RANDOM_STATE))
    ]:
        try_models(data_x, data_y, cache_file_path, sampler)

dict_values([0.14946646063825603, 0.20791797745684004, 0.4177514273265139, 0.4199222405153183, 0.8862102719396396])


dict_values([0.07071391486812843, 0.22721153333333333, 0.339, 0.37836666666666663, 1.0])
dict_values([-0.026663886436217664, 0.2509298, 0.272, 0.3683133333333333, 1.0])
dict_values([0.18840397923771618, 0.1983580455908004, 0.3924773435251451, 0.3935580726139758, 1.0089960594178016])
dict_values([-0.18138922575595395, 0.28874959994405264, 0.38109779357910156, 0.388421751499176, 4.882826614379883])
dict_values([0.03655590451434243, 0.2356133333333333, 0.4, 0.39139999999999997, 1.0])
