In [28]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from copy import deepcopy
import random

import sys
import os

sys.path.append("..")

from src.configs import DataConfig, NetConfig, NetArchitecture, FilterConfig, CNNConfig, LC_SIZE
from src.configs import DataType as DT
from src.utils import train, get_wandb_logger
from src.configs import PACKAGE_PATH
from src.data.data_processor import DataProcessor
from src.sweeps.sweep import DATA_CONFIG
from src.module.lightning_module import LCModule
from src.configs import AugmentType as A
from src.configs import SplitStrategy as ST
from src.module.resnet import resnet20
from src.utils import get_wandb_logger

torch.manual_seed(42)
np.random.seed(42)

In [29]:
from xgboost import XGBClassifier

In [37]:

data_cfg = DataConfig(
        path=f"{PACKAGE_PATH}/Fall_2021_csv",
        output_path=f"{PACKAGE_PATH}/resources/datasets",
        validation_path = f"{PACKAGE_PATH}/resources/SDLCD.csv",
        # class_names=["cz_3", "falcon_9", "atlas_V",  "h2a" ],
        # regexes=[r'CZ-3B.*', r'FALCON_9.*', r'ATLAS_[5|V]_CENTAUR_R_B$',  r'H-2A.*'],
        class_names=["cz_3", "falcon_9", "atlas_V",  "h2a", "globalstar"],
        regexes=[r'CZ-3B.*', r'FALCON_9.*', r'ATLAS_[5|V]_CENTAUR_R_B$',  r'H-2A.*', r'GLOBALSTAR.*'],
        validation_split=0.2,
        split_strategy=ST.TRACK_ID,
        number_of_training_examples_per_class=10_000,
        filter_config=FilterConfig( n_bins= 30, n_gaps= 10, gap_size= 5, rms_ratio= 0., non_zero_ratio=0.8),
        data_types=[DT.FS, DT.AMPLITUDE],
        lc_shifts = 0,
        convert_to_mag=False,
        wavelet_scales= 10,
        wavelet_name= 'gaus1',
        train_augmentations=[],
)

dp = DataProcessor(data_cfg)

if os.path.exists(f'{dp.output_path}/{dp.hash}'):
    dp.load_data_from_file()
    dp.load_data_from_file(test=True)
else:
    dp.create_dataset_from_csv()
    dp.save_data()
    dp.save_data(test=True)


train_set, val_set, test_set = dp.get_pytorch_datasets()
X_train = train_set.data
y_train = train_set.labels
# ok = (y_train != 2) * (y_train != 3)
# X_train = X_train[ok]
# y_train = y_train[ok]
# y_train[y_train == 3] = 1

X_val = val_set.data
y_val = val_set.labels
# ok = (y_val != 2) * (y_val != 3)
# X_val = X_val[ok]
# y_val = y_val[ok]
# y_val[y_val == 3] = 1


X_test = test_set.data
y_test = test_set.labels
# ok = (y_test != 2) * (y_test != 3)
# X_test = X_test[ok]
# y_test = y_test[ok]
# y_test[y_test == 3] = 1

False False


In [38]:
bst = XGBClassifier(n_estimators=10, max_depth=4, learning_rate=1.,objective='binary:logistic')

In [39]:
bst.fit(X_train, y_train)

In [40]:
preds1 = bst.predict(X_val)
print(f"Validation accuracy: {np.mean(preds1 == y_val)}")
preds2 = bst.predict(X_test)
print(f"Test accuracy: {np.mean(preds2 == y_test)}")

Validation accuracy: 0.7518198362147407
Test accuracy: 0.5726495726495726


In [42]:
P = preds1
Y = y_val
N = 5
tab = []
for i in range(N):
    line = []
    c = np.sum(Y == i)
    for j in range(N):
        x = sum((i == P) & (j == Y))
        line.append(x)
    tab.append(line)

# prety print the confusion matrix
print("                  cz_3 falcon_9 atlas_V h2a globalstar")
for i, line in enumerate(tab):
    s = '| ' + ' | '.join(str(x) for x in line) + ' |'
    print(s)
    # print(f"{data_cfg.class_names[i]: <16}: {line}")
    
precision = np.diag(tab) / np.sum(tab, axis=0)
recall = np.diag(tab) / np.sum(tab, axis=1)

f1 = 2 * precision * recall / (precision + recall)

print(f"Precision: \n{precision}")
print(f"Recall: \n{recall}")
print(f"F1: \n{f1}")

a = '''
| predicted / actual | CZ-3 | Falcon 9 | Atlas V | H2-A | Globalstar |
| ------------------ | ---- | -------- | ------- | ---- | ---------- |
| CZ-3               | 1734 | 272      | 183     | 13   | 12         |
| Falcon 9           | 140  | 132      | 25      | 0    | 0          |
| Atlas V            | 110  | 24       | 258     | 12   | 3          |
| H2-A               | 41   | 11       | 25      | 448  | 5          |
| Globalstar         | 19   | 3        | 61      | 9    | 880        |
'''
# compute precision, recall and f1 score from a:
# Precision:
tab = np.array([[1734, 272, 183, 13, 12],[140, 132, 25, 0, 0],[110, 24, 258, 12, 3],[41, 11, 25, 448, 5],[19, 3, 61, 9, 880]])
precision = np.diag(tab) / np.sum(tab, axis=0)
recall = np.diag(tab) / np.sum(tab, axis=1)
f1 = 2 * precision * recall / (precision + recall)

#format as markdown table
print("| Precision |", " | ".join(f"{x:.2f}" for x in precision), "|")
print("| Recall |", " | ".join(f"{x:.2f}" for x in recall), "|")
print("| F1 |", " | ".join(f"{x:.2f}" for x in f1), "|")

                  cz_3 falcon_9 atlas_V h2a globalstar
| 1805 | 373 | 276 | 27 | 12 |
| 74 | 53 | 11 | 6 | 2 |
| 73 | 9 | 197 | 7 | 10 |
| 30 | 3 | 18 | 409 | 20 |
| 52 | 4 | 50 | 34 | 841 |
Precision: 
[0.88741396 0.1199095  0.35688406 0.84679089 0.95028249]
Recall: 
[0.72402728 0.3630137  0.66554054 0.85208333 0.85728848]
F1: 
[0.7974376  0.18027211 0.46462264 0.84942887 0.90139335]
| Precision | 0.85 | 0.30 | 0.47 | 0.93 | 0.98 |
| Recall | 0.78 | 0.44 | 0.63 | 0.85 | 0.91 |
| F1 | 0.81 | 0.36 | 0.54 | 0.89 | 0.94 |


# Best preformance on binary classification
1. Falcon 9  vs. CZ-3B
1. Atlas V Centaur vs. CZ-3B

In [4]:
FALCON_ID = 1
ATLAS_ID = 2

data_cfg.data_types = [DT.LC]
dp = DataProcessor(data_cfg)

if os.path.exists(f'{dp.output_path}/{dp.hash}'):
    dp.load_data_from_file()
    dp.load_data_from_file(test=True)
else:
    dp.create_dataset_from_csv()
    dp.save_data()
    dp.save_data(test=True)

train_set, val_set, test_set = dp.get_pytorch_datasets()

X_train = train_set.data
y_train = train_set.labels
cz_indices_train = y_train == 0
falcon_indices_train = y_train == FALCON_ID
atlas_indices_train = y_train == ATLAS_ID

X_val = val_set.data
y_val = val_set.labels
cz_indices_val = y_val == 0
falcon_indices_val = y_val == FALCON_ID
atlas_indices_val = y_val == ATLAS_ID

False False


In [5]:
sweep_cfg = {
    "name": "Binary classification_atlas",
    "method": "bayes", 
    "metric": {
        "goal": "maximize",
        "name": "f1_score",
    },
    "parameters": {
        "learning_rate": {
            "min": 0.0001,
            "max": 1.,
        },
        "max_depth": {
            "min": 3,
            "max": 20
        },
        "n_estimators":{
            "min": 10,
            "max": 100,
        },
    }
}

In [6]:
import wandb

train_X = X_train[np.logical_or(atlas_indices_train, cz_indices_train)]
train_y = y_train[np.logical_or(atlas_indices_train, cz_indices_train)]
val_X = X_val[np.logical_or(atlas_indices_val, cz_indices_val)]
val_y = y_val[np.logical_or(atlas_indices_val, cz_indices_val)]
train_y[train_y == ATLAS_ID] = 1
val_y[val_y == ATLAS_ID] = 1
# 1: Define objective/training function
def main():
    wandb.init(project="Preprocessing")
    config = wandb.config
    classifier = XGBClassifier(
        n_estimators=config.n_estimators,
        max_depth=config.max_depth,
        learning_rate=config.learning_rate,
        objective='binary:logistic'
    )
    classifier.fit(train_X, train_y)

    conf_matrix = np.zeros((2, 2))
    preds = classifier.predict(val_X)
    conf_matrix[0, 0] = np.sum((preds == 0) & (val_y == 0))
    conf_matrix[0, 1] = np.sum((preds == 0) & (val_y == 1))
    conf_matrix[1, 0] = np.sum((preds == 1) & (val_y == 0))
    conf_matrix[1, 1] = np.sum((preds == 1) & (val_y == 1))
    
    precision = np.diag(conf_matrix) / np.sum(conf_matrix, axis=0)
    recall = np.diag(conf_matrix) / np.sum(conf_matrix, axis=1)
    f1 = 2 * precision * recall / (precision + recall)
    
    wandb.log({"f1_score": f1[1]})

In [7]:
sweep_id = wandb.sweep(sweep=sweep_cfg, project="Preprocessing")

wandb.agent(sweep_id, function=main)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: tq59cfzj
Sweep URL: https://wandb.ai/dano-kyselica/Preprocessing/sweeps/tq59cfzj


[34m[1mwandb[0m: Agent Starting Run: tysfpsx0 with config:
[34m[1mwandb[0m: 	learning_rate: 0.7970669552382228
[34m[1mwandb[0m: 	max_depth: 18
[34m[1mwandb[0m: 	n_estimators: 58
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdano-kyselica[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
f1_score,▁

0,1
f1_score,0.35488


[34m[1mwandb[0m: Agent Starting Run: 4up6fjzq with config:
[34m[1mwandb[0m: 	learning_rate: 0.08989654867838368
[34m[1mwandb[0m: 	max_depth: 20
[34m[1mwandb[0m: 	n_estimators: 37
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
f1_score,▁

0,1
f1_score,0.24588


[34m[1mwandb[0m: Agent Starting Run: ii540t41 with config:
[34m[1mwandb[0m: 	learning_rate: 0.21029785745117816
[34m[1mwandb[0m: 	max_depth: 32
[34m[1mwandb[0m: 	n_estimators: 90
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
f1_score,▁

0,1
f1_score,0.2993


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ju6t28v4 with config:
[34m[1mwandb[0m: 	learning_rate: 0.7909047395671636
[34m[1mwandb[0m: 	max_depth: 20
[34m[1mwandb[0m: 	n_estimators: 67
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
f1_score,▁

0,1
f1_score,0.34606


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qb4z7u2e with config:
[34m[1mwandb[0m: 	learning_rate: 0.9931995538055236
[34m[1mwandb[0m: 	max_depth: 14
[34m[1mwandb[0m: 	n_estimators: 57
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


0,1
f1_score,▁

0,1
f1_score,0.38107
