In [1182]:
import os
import itertools
from functools import reduce
from collections import Counter

from sklearn.metrics import classification_report, accuracy_score, r2_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error
from sklearn.model_selection import cross_validate, KFold
from sklearn.svm import SVC

import pandas
import seaborn
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px

from lab_v2.io import read_file

In [1183]:
# FILE PATHS
# One JSONL results file per dataset; the "T0.7" suffix is part of each file name.
DRAW_T07 = './data/draw/draw-T0.7.jsonl' 
LAST_LETTERS_T07 = './data/last_letters/last_letters-T0.7.jsonl' 
CSQA_T07 = './data/csqa/csqa-T0.7.jsonl' 

# Feature columns fed to the models (selected from `data` further down).
ATTRIBUTES = ["majority_distance", "majority_distance_squared", "shannon_entropy", "gini_impurity"]
# Scorer names; in the visible code these are only used to build MLExploration.SCORES_DICT.
SCORES = ['r2', 'neg_root_mean_squared_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'max_error']
# Target column.
CLASS = 'num_correct'

# Set to True to run the plotting cells in the data-exploration section.
DATA_EXPLORATION = False

# Dataset actually analysed by this run, and the JSON file where explored models are cached.
# NOTE(review): FILE_PATH points at the T0.7 file while CACHE is named T0.3 — confirm
# that results for this dataset are meant to be read from / written to that cache file.
FILE_PATH = CSQA_T07
CACHE = 'cache/csqa/csqa-T0.3.json'

In [1184]:
# Load the selected dataset with the project-local reader.
data = read_file(FILE_PATH)
# NOTE(review): the lambda returns its argument unchanged, so this looks like a no-op
# left over from an earlier transformation — confirm before removing.
data['num_correct'] = data['num_correct'].apply(lambda row : row)
# Show the first row as a quick sanity check of the loaded columns.
data.head(1)

Unnamed: 0,majority_distance,majority_distance_squared,shannon_entropy,gini_impurity,majority_correct,num_correct
0,1.378e-07,0.0,0.0,0.0,True,20


## **Data exploration**

In [1185]:
if DATA_EXPLORATION:
    # Frequency of every distinct num_correct value, ordered by value.
    # Values are unique keys, so sorting the (value, count) pairs sorts by value.
    count = sorted(Counter(data['num_correct']).items())
    # Pass x and y explicitly: handing seaborn the raw list of (value, count)
    # tuples makes it treat every tuple as its own wide-form group and draw the
    # mean of (value, count) instead of the count itself.
    ax = seaborn.barplot(
        x=[value for value, _ in count],
        y=[occurrences for _, occurrences in count],
        errorbar=None,
    )
    ax.set(xlabel='num_correct', ylabel='count')

In [1186]:
if DATA_EXPLORATION:
    # Pairwise scatter plots of every attribute, coloured by the target column.
    seaborn.set_theme(style='ticks')
    seaborn.pairplot(
        data[ATTRIBUTES + [CLASS]],
        hue=CLASS,
        plot_kws={'alpha': 0.35},
    )
    # Printing as the last statement keeps the cell from echoing the plot object.
    print('')

In [1187]:
if DATA_EXPLORATION:
    # Interactive 3D view of three attributes, coloured by the target column.
    fig = px.scatter_3d(
        data,
        x='majority_distance',
        y='shannon_entropy',
        z='gini_impurity',
        color=CLASS,
    )
    # Small, slightly transparent markers; both settings apply to marker traces only.
    fig.update_traces(marker=dict(size=3, opacity=0.75), selector=dict(mode='markers'))
    # Remove the figure margins.
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.show()

In [1188]:
# Split the frame into the feature columns and the target column.
data_x = data[ATTRIBUTES]
data_y = data[CLASS]
# Show the first row of each as a sanity check.
display(data_x.head(1), data_y.head(1))

Unnamed: 0,majority_distance,majority_distance_squared,shannon_entropy,gini_impurity
0,1.378e-07,0.0,0.0,0.0


0    20
Name: num_correct, dtype: int64

In [1189]:
# Collects one text report per call of classification_report_scorer.
classification_reports = []


def classification_report_scorer(y_true, y_pred):
    """Record a classification report for the predictions and return their accuracy.

    The report is appended to the module-level ``classification_reports`` list
    as a side effect; the return value is the accuracy score of the same
    ``y_true`` / ``y_pred`` pair.
    """
    fold_report = classification_report(y_true, y_pred)
    classification_reports.append(fold_report)
    fold_accuracy = accuracy_score(y_true, y_pred)
    return fold_accuracy

In [1190]:
import math
def report_average(report):
    """Return a copy of ``report`` with every value replaced by its arithmetic mean.

    ``report`` maps a name to a non-empty collection of numbers. The input
    mapping itself is left untouched; an empty collection raises
    ``ZeroDivisionError``.
    """
    averaged = report.copy()
    for name, values in report.items():
        averaged[name] = sum(values) / len(values)
    return averaged

In [1191]:
from sklearn.metrics import classification_report, accuracy_score, r2_score, max_error, mean_absolute_error, mean_squared_error, median_absolute_error

In [1192]:
def my_cross_validate(model, sampler, data_x, data_y, cv=5):
    """Run K-fold cross-validation and return the mean of each regression metric.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            re-fitted on the training part of every fold.
        sampler: optional object exposing ``fit_resample(X, y)``. When given it
            is applied to the training part of each fold only; pass ``None``
            to train on the fold as-is.
        data_x: features, indexed positionally with ``.iloc``.
        data_y: targets, indexed positionally with ``.iloc``.
        cv: number of folds handed to ``KFold(n_splits=cv)``.

    Returns:
        dict mapping, in this order, ``'r2'``, ``'mean_squared_error'``,
        ``'median_absolute_error'``, ``'mean_absolute_error'`` and
        ``'max_error'`` to the metric averaged over all folds.
    """
    # Insertion order matters: callers consume the result positionally.
    metrics = {
        'r2': r2_score,
        'mean_squared_error': mean_squared_error,
        'median_absolute_error': median_absolute_error,
        'mean_absolute_error': mean_absolute_error,
        'max_error': max_error,
    }
    results = {name: [] for name in metrics}

    kfold = KFold(n_splits=cv)
    for train_idx, test_idx in kfold.split(data_x):
        X_train, X_test = data_x.iloc[train_idx], data_x.iloc[test_idx]
        y_train, y_test = data_y.iloc[train_idx], data_y.iloc[test_idx]

        # Identity check: `!= None` would invoke a sampler's own __ne__.
        # Only the training data is resampled; the test part stays untouched.
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        for name, metric in metrics.items():
            results[name].append(metric(y_test, y_pred))

    return {name: sum(values) / len(values) for name, values in results.items()}

In [1193]:
class MLExploration:
    """Cross-validate models on a fixed dataset and cache the averaged scores.

    Every explored (model, hyperparameters) pair becomes one row of
    ``explored_models``, indexed by ``MLExploration.hash(model, hyperparameters)``.
    The table is written to ``output_file_path`` as JSON (``orient='split'``)
    after every new row, and is loaded from that file on construction when the
    file already exists, so already-explored pairs are not recomputed.
    """

    SCORES_DICT = {s: '' for s in SCORES}
    # Column layout of `explored_models`; the metric columns follow the key
    # order of the dict returned by my_cross_validate.
    TEMPLATE_DICT = {
        'model': '', 
        'hyperparameters': '',
        'r2': [], 
        'mean_squared_error': [],
        'median_absolute_error': [],
        'mean_absolute_error': [],
        'max_error': []
    }
    RANDOM_STATE = 42

    def __init__(self, data_x, data_y, output_file_path):
        """Store the dataset and load the cached results table if one exists.

        Parameters:
            data_x: features passed to cross-validation.
            data_y: targets passed to cross-validation.
            output_file_path: JSON file used to persist ``explored_models``.
        """
        self.data_x = data_x
        self.data_y = data_y
        self.output_file_path = output_file_path

        # Start from a one-row frame built from the template (this is the blank
        # first row of the table) unless a cache file is already present.
        self.explored_models = pandas.DataFrame([MLExploration.TEMPLATE_DICT])
        if os.path.exists(output_file_path):
            self.explored_models = pandas.read_json(output_file_path, orient='split')

    def grid_search(self, model, parameters):
        """Explore ``model`` with every combination of the values in ``parameters``.

        ``parameters`` maps a hyperparameter name to an iterable of candidate
        values. Each combination is printed before it is explored.
        """
        for combination in self.__parameter_product(parameters):
            print(combination)
            self.explore_model(model, combination)

    def explore_model(self, model, hyperparameters, sampler=None):
        """Cross-validate ``model(**hyperparameters)`` and return its result row.

        Parameters:
            model: estimator class (not an instance).
            hyperparameters: dict of keyword arguments for ``model``.
            sampler: optional resampler forwarded to ``my_cross_validate``.

        Returns:
            The row of ``explored_models`` for this pair, either taken from the
            cache or freshly computed and persisted.
        """
        # NOTE: the cache key does not include `sampler`, so a pair explored
        # with one sampler is returned as-is for any other sampler.
        index = MLExploration.hash(model, hyperparameters)
        if index in self.explored_models.index:
            return self.explored_models.loc[index]

        # Use the dataset this instance was built with, not notebook globals.
        scores = my_cross_validate(model(**hyperparameters), sampler, self.data_x, self.data_y)

        # Pick the scores by name, in template column order, instead of
        # relying on the ordering of the returned dict.
        metric_names = [
            column for column in MLExploration.TEMPLATE_DICT
            if column not in ('model', 'hyperparameters')
        ]
        self.explored_models.loc[index] = [
            MLExploration.hash_model(model),
            MLExploration.hash_hyperparameters(hyperparameters),
            *(scores[name] for name in metric_names)
        ]

        # Make sure the cache directory exists before the first write.
        cache_dir = os.path.dirname(self.output_file_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        self.explored_models.to_json(self.output_file_path, orient='split')
        return self.explored_models.loc[index]

    @staticmethod
    def hash_model(model):
        """Return the class name of ``model`` as a string."""
        return str(model.__name__)

    @staticmethod
    def hash_hyperparameters(hyperparameters):
        """Return the string form of the hyperparameter items sorted by name."""
        return str(sorted(hyperparameters.items(), key=lambda x: x[0]))

    @staticmethod
    def hash(model, hyperparameters):
        """Return the index key identifying a (model, hyperparameters) pair."""
        model = MLExploration.hash_model(model)
        hyperparameters = MLExploration.hash_hyperparameters(hyperparameters)
        return str((model, hyperparameters))

    def __parameter_product(self, parameters):
        """Return one dict per combination of the candidate values in ``parameters``.

        An empty ``parameters`` mapping yields a single empty combination.
        """
        keys = list(parameters)
        candidate_values = [parameters[key] for key in keys]
        return [dict(zip(keys, combination)) for combination in itertools.product(*candidate_values)]

## **Regression**

In [1194]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [1195]:
# Exploration session for the loaded dataset; results are cached in the CACHE file.
csqa_reg = MLExploration(data_x, data_y, CACHE)

In [1196]:
# Explore each regressor once; only the MLP gets a non-default setting
# (a raised iteration cap).
for regressor, hyperparameters in [
    (AdaBoostRegressor, {}),
    (GradientBoostingRegressor, {}),
    (RandomForestRegressor, {}),
    (XGBRegressor, {}),
    (GaussianProcessRegressor, {}),
    (KNeighborsRegressor, {}),
    (DecisionTreeRegressor, {}),
    (MLPRegressor, {'max_iter': 10000}),
]:
    csqa_reg.explore_model(regressor, hyperparameters)
print('')

TypeError: my_cross_validate() missing 1 required positional argument: 'data_y'

In [None]:
# Render the results table without its index column (the index holds the hash keys).
csqa_reg.explored_models.style.hide(axis='index')

model,hyperparameters,r2,neg_root_mean_squared_error,neg_median_absolute_error,neg_mean_absolute_error,max_error
,,,,,,
AdaBoostRegressor,[],0.340786,-5.566115,-2.45265,-3.918998,-18.842168
GradientBoostingRegressor,[],0.332465,-5.602659,-1.670877,-3.493938,-19.497899
RandomForestRegressor,[],0.206292,-6.093592,-1.276,-3.626547,-19.962
XGBRegressor,[],0.099333,-6.475855,-1.007585,-3.75257,-19.918621
GaussianProcessRegressor,[],-2.317232,-10.590219,-1.695351,-4.322612,-104.583773
KNeighborsRegressor,[],0.297723,-5.739037,-2.24,-3.578497,-20.0
DecisionTreeRegressor,[],-0.243509,-7.599239,-0.4,-4.101612,-20.0
MLPRegressor,"[('max_iter', 10000)]",0.400695,-5.317205,-2.102771,-3.512536,-20.305056
