In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from LogisticRegressors import SGD, ADAM, IWLS
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score as bal_acc

import numpy as np

import json
import openml

from tqdm.notebook import tqdm

from utils import validate_datasets, preprocess_datasets, performance_test

import warnings

# Ignore every FutureWarning and RuntimeWarning raised from here on.
# NOTE(review): RuntimeWarning is a broad category; numerical problems reported
# through it during training will be hidden too — confirm that is intended.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

import random
# Seed Python's `random` module and NumPy's global RNG with the same fixed value.
# NOTE(review): whether the project-local models draw from these generators is
# not visible in this notebook — verify before relying on reproducibility.
random.seed(42)
np.random.seed(42)

import copy

In [2]:
# Load the dataset registry from datasets.json (read relative to the current
# working directory) into `datasets`.
with open("datasets.json", "r") as file:
    datasets = json.load(file)
# Run the project-local checks on the loaded registry; judging by the report it
# prints, they cover: binary target, dimensionality, missing values,
# non-numeric columns and rows vs. columns.
validate_datasets(datasets)

Dataset: banknote-authentication
Binary target - OK
Low dimensionality (4) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (1372) than columns (4) - OK
Dataset banknote-authentication is OK
----------------------------------------------------------------------------------------------------

Dataset: puma8NH
Binary target - OK
Low dimensionality (8) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (8192) than columns (8) - OK
Dataset puma8NH is OK
----------------------------------------------------------------------------------------------------

Dataset: phoneme
Binary target - OK
Low dimensionality (5) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (5404) than columns (5) - OK
Dataset phoneme is OK
----------------------------------------------------------------------------------------------------

Dataset: bank32nh
Binary target - OK
High dimensionality

In [3]:
# Replace `datasets` with its preprocessed version. Per the log printed by this
# cell, preprocessing reports columns with a single unique value and columns
# dropped "because of collinearity".
# NOTE(review): the name `datasets` is rebound, so re-running this cell feeds
# already-preprocessed data back in — restart the kernel for a clean run.
# NOTE(review): 0.7 is presumably a correlation cut-off — confirm in utils.
datasets = preprocess_datasets(datasets, collinear_threshold=0.7)

Preprocessing dataset banknote-authentication
Columns with only one unique value were dropped: set()
Columns to drop: ['V3'] because of collinearity

Preprocessing dataset puma8NH
Columns with only one unique value were dropped: set()
Columns to drop: [] because of collinearity

Preprocessing dataset phoneme
Columns with only one unique value were dropped: set()
Columns to drop: [] because of collinearity

Preprocessing dataset bank32nh
Columns with only one unique value were dropped: set()
Columns to drop: [] because of collinearity

Preprocessing dataset PizzaCutter1
Columns with only one unique value were dropped: set()
Columns to drop: ['f', 'g', 'i', 'k', 'm', 'n', 'p', 's', 'u', 'v', 'z', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj', 'ak', 'al', 'am', 'ao'] because of collinearity

Preprocessing dataset MegaWatt1
Columns with only one unique value were dropped: set()
Columns to drop: ['f', 'g', 'i', 'k', 'm', 'n', 'p', 's', 't', 'u', 'v', 'z', 'aa', 'ab', 'ac', 'ae', 'af'

In [4]:
# Re-run the same validation on the preprocessed `datasets` so the report
# reflects the column counts after preprocessing.
validate_datasets(datasets)

Dataset: banknote-authentication
Binary target - OK
Low dimensionality (3) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (1372) than columns (3) - OK
Dataset banknote-authentication is OK
----------------------------------------------------------------------------------------------------

Dataset: puma8NH
Binary target - OK
Low dimensionality (8) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (8192) than columns (8) - OK
Dataset puma8NH is OK
----------------------------------------------------------------------------------------------------

Dataset: phoneme
Binary target - OK
Low dimensionality (5) when assumed small (<=10) - OK
No missing values - OK
No non-numeric columns - OK
More rows (5404) than columns (5) - OK
Dataset phoneme is OK
----------------------------------------------------------------------------------------------------

Dataset: bank32nh
Binary target - OK
High dimensionality

# Batch 1

In [5]:
# Result containers for the training runs below.
#
# `reslut_dict` maps every dataset name to one empty result slot per optimiser
# ("SGD", "ADAM", "IWLS"). The comprehension builds fresh inner dicts for each
# dataset, so no two datasets share a mutable object. The previous
# `dict.fromkeys(..., dict())` call bound a single shared dict to every key,
# and a leftover debug entry for 'MegaWatt1' could add a dataset that does not
# exist in `datasets`.
reslut_dict = {
    key: {"SGD": dict(), "ADAM": dict(), "IWLS": dict()}
    for key in datasets.keys()
}

# All dataset names in registry order. Kept as a module-level name because it
# was defined here before and other cells may read it.
keys = list(reslut_dict.keys())

# `reslut_dict_interactions` has the same layout but only for datasets whose
# 'size' is not 'large'; the interaction experiments skip the large ones.
reslut_dict_interactions = {
    key: {"SGD": dict(), "ADAM": dict(), "IWLS": dict()}
    for key in keys
    if datasets[key]['size'] != 'large'
}


## All datasets, no interactions

In [6]:
# Run without interaction terms: for each dataset, evaluate five fresh models
# of each optimiser and store what `performance_test` returns.
for dataset, payload in tqdm(datasets.items(), desc="Datasets"):
    print(f"Dataset: {dataset}")
    X = payload["X"]
    y = payload["y"]

    # All candidates are constructed before any evaluation starts, in the same
    # SGD -> ADAM -> IWLS order in which they are evaluated.
    candidates = {
        "SGD": [SGD(batch_size=1, epochs=500, intercept=True) for _ in range(5)],
        "ADAM": [ADAM(batch_size=1, epochs=500, intercept=True) for _ in range(5)],
        "IWLS": [IWLS(epochs=500, intercept=True) for _ in range(5)],
    }

    dataset_results = reslut_dict[dataset]
    for model_name, models in tqdm(candidates.items(), desc="Models"):
        dataset_results[model_name] = performance_test(
            X, y, models, model_name, test_size=0.2
        )

    # The results file is rewritten in full after every dataset.
    with open("training_results.json", "w") as file:
        json.dump(reslut_dict, file)

Datasets:   0%|          | 0/9 [00:00<?, ?it/s]

Dataset: banknote-authentication


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: puma8NH


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: phoneme


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: bank32nh


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: PizzaCutter1


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: MegaWatt1


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: analcatdata_authorship


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: mc1


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: twonorm


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

## Small datasets with interactions

In [7]:
# Run with interaction terms enabled. Datasets whose 'size' is "large" are
# reported and skipped; the remaining ones get five fresh models per optimiser.
for dataset, payload in tqdm(datasets.items(), desc="Datasets"):
    if payload['size'] == "large":
        print(f"Skipping {dataset} because it is large.")
        continue

    print(f"Dataset: {dataset}")
    X = payload["X"]
    y = payload["y"]

    # All candidates are constructed before any evaluation starts, in the same
    # SGD -> ADAM -> IWLS order in which they are evaluated.
    candidates = {
        "SGD": [
            SGD(batch_size=1, epochs=500, intercept=True, include_interactions=True)
            for _ in range(5)
        ],
        "ADAM": [
            ADAM(batch_size=1, epochs=500, intercept=True, include_interactions=True)
            for _ in range(5)
        ],
        "IWLS": [
            IWLS(epochs=500, intercept=True, include_interactions=True)
            for _ in range(5)
        ],
    }

    dataset_results = reslut_dict_interactions[dataset]
    for model_name, models in tqdm(candidates.items(), desc="Models"):
        dataset_results[model_name] = performance_test(
            X, y, models, model_name, test_size=0.2
        )

    # The results file is rewritten in full after every processed dataset.
    with open("training_results_interactions.json", "w") as file:
        json.dump(reslut_dict_interactions, file)

Datasets:   0%|          | 0/9 [00:00<?, ?it/s]

Dataset: banknote-authentication


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: puma8NH


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Dataset: phoneme


Models:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Skipping bank32nh because it is large.
Skipping PizzaCutter1 because it is large.
Skipping MegaWatt1 because it is large.
Skipping analcatdata_authorship because it is large.
Skipping mc1 because it is large.
Skipping twonorm because it is large.
