In [1]:
import numpy as np
import pandas as pd
import re
import tqdm
from collections import defaultdict

In [2]:
PATH = 'D:\\work\\classification_of_light_curves\\resources\\Fall_2021_R_B_globalstar.csv'

In [3]:
df = pd.read_csv(PATH, index_col=0)

In [4]:
df.head()

Unnamed: 0,Object name,Object ID,Track ID,Phase,0,1,2,3,4,5,...,290,291,292,293,294,295,296,297,298,299
0,CZ-3B_R|B,5021,12475947.0,7.868446,3.061125,3.118444,3.0955,3.044583,3.04775,2.9625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.205143,3.186875,3.1854
1,CZ-3B_R|B,5021,12475947.0,6.956231,3.1664,2.9712,2.9812,2.84,2.987167,3.041,...,0.0,0.0,0.0,3.168429,3.3228,3.184,3.2942,3.254778,3.0976,3.043
2,CZ-3B_R|B,5021,12475947.0,6.244945,3.187429,3.157125,2.985625,3.102556,3.157667,3.0545,...,3.071333,3.253889,3.2143,3.279556,3.04275,2.960167,3.011667,3.102625,3.090333,2.992714
3,CZ-3B_R|B,5021,12475947.0,5.827657,2.891333,3.0676,2.957429,3.0219,2.980909,3.046917,...,0.0,0.0,0.0,3.3096,3.555333,3.1506,3.101667,3.166,3.340571,3.4905
4,CZ-3B_R|B,5021,12475947.0,5.750368,3.4905,3.314167,3.302556,3.068,3.272286,3.345714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3315,3.422,3.138


In [22]:
labels = ["cz_3", "falcon_9", "atlas",  "h2a", "globalstar"]
regexes = [r'CZ-3B.*', r'FALCON_9.*', r'ATLAS_[5|V]_CENTAUR_R\|B$',  r'H-2A.*', r'GLOBALSTAR.*']


In [23]:
r = re.search(regexes[2], df["Object name"].unique()[0], re.IGNORECASE) 

In [24]:
data = defaultdict(list)

for name in df["Object name"].unique():

    label = None

    for i in range(len(labels)-1):

        if re.search(regexes[i], name):
            label = labels[i]
            break
    if label is None:
        continue

    print(label, name)

    df_object = df[df["Object name"] == name]

    object_IDs = df_object["Object ID"].unique()

    for object_ID in object_IDs:
        df_object_ID = df_object[df_object["Object ID"] == object_ID]

        data[label].append(df_object_ID.to_numpy()[:, 4:])    
    

cz_3 CZ-3B_R|B
h2a H-2A_R|B
falcon_9 FALCON_9_R|B
h2a H-2A_R|B(2)
atlas ATLAS_V_CENTAUR_R|B
atlas ATLAS_5_CENTAUR_R|B


In [10]:
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from functools import partial
import sys

sys.path.append("D:\\work\\classification_of_light_curves")

from src.config import FilterConfig

def get_filter_continuous(data, n_bins=10, gap=0, continous_gap=3):
    N = 300 // n_bins

    x = np.resize(data, (data.shape[0], n_bins, N))
    bins = np.sum(x, axis=2) != 0
    bins_sum = np.sum(bins, axis=1)

    res = bins_sum >= (n_bins - gap)

    if continous_gap > 0:
        continous_gaps = sliding_window_view(bins, window_shape=continous_gap+1, axis=1)
        continous_gaps_ok = np.all(np.sum(continous_gaps, axis=2) != 0, axis=1)

        res = np.logical_and(res, continous_gaps_ok)

    return res

def get_filter_ratio(data, ratio=0.5):

    x = np.sum(data != 0, axis= 1) / 300
    return x >= ratio

def apply_filters(data, filters_f, operation="AND"):

    f_res = None

    for f in filters_f:
        if f_res is None:
            f_res = f(data)
        else:
            if operation == "AND":
                f_res = np.logical_and(f(data), f_res)
            else:
                print(":)")
                f_res = np.logical_or(f(data), f_res)
    
    return data[f_res]

def apply_sequential_filters(data, filters):

    for f in filters:
        ok = f(data)
        data = data[ok]

    return data

def filter_data_from_csv_format(data, cfg: FilterConfig):
    filters = []
    filters.append(partial(get_filter_continuous, n_bins=cfg.n_bins, 
                                                gap=cfg.n_gaps, 
                                                continous_gap=cfg.gap_size))
    filters.append(partial(get_filter_ratio, ratio=cfg.non_zero_ratio))

    app_filters_p = partial(apply_sequential_filters, filters=filters)

    filtered_data = {}
    for label in data:
        tmp = []
        for d in data[label]:
            r = app_filters_p(d)
            if len(r) > 0:
                tmp.append(r)
        filtered_data[label] = tmp
    
    return filtered_data

In [11]:
filter_cfg = FilterConfig(
            n_bins= 30,
            n_gaps= 10,
            gap_size= 5, 
            rms_ratio= 0.,
            non_zero_ratio= 0.8
        )

In [12]:
f_data = filter_data_from_csv_format(data,filter_cfg)

In [13]:
for k in f_data:
    s = sum([len(i) for i in f_data[k]])
    print(f"{k}: {s}, {len(data[k])}")

cz_3: 10169, 45
globalstar: 4420, 31
h2a: 2411, 15
falcon_9: 2205, 29
atlas: 2857, 36


In [45]:
def split_object_data_to_test_validation(data, label, k, split=0.1):

    sizes = [len(i) for i in data[label]]

    N = sum(sizes)

    indices = np.argsort(-np.array(sizes))
    
    total = 0
    train = np.empty((0, *data[label][0].shape[1:]))
    val = np.empty((0, *data[label][0].shape[1:]))
    
    for i in range(len(indices)):
        if (sizes[indices[i]] + total < k*1.1 and sizes[indices[i]] + total < N * (1-split)) or \
           (total == 0 and sizes[indices[i]] + total < N * (1-split)):
            total += sizes[indices[i]]
            train = np.concatenate((train, data[label][indices[i]]))
        else:
            val = np.concatenate((val, data[label][indices[i]]))

   
    return train, val

def split_data_to_test_validation_by_object(data, labels, k, split=0.1):
    X_train, X_val = None, None
    Y_train, Y_val = None, None
    for i, label in enumerate(labels):
        obj_train, obj_val = split_object_data_to_test_validation(data, label, k, split)
        print(f"{label:15}: {len(obj_train):5} training examples, {len(obj_val):5} validation examples")
        
        if X_train is None:
            X_train = obj_train
            X_val = obj_val
            Y_train = np.array([i]*len(obj_train))
            Y_val = np.array([i]*len(obj_val))
        else:
            X_train = np.concatenate((X_train, obj_train))
            X_val = np.concatenate((X_val, obj_val))
            Y_train = np.concatenate((Y_train, np.array([i]*len(obj_train))))
            Y_val = np.concatenate((Y_val, np.array([i]*len(obj_val))))

    id_train = np.random.permutation(len(X_train))
    id_val = np.random.permutation(len(X_val))

    X_train, Y_train = X_train[id_train], Y_train[id_train]
    X_val, Y_val = X_val[id_val], Y_val[id_val]

    return (X_train, Y_train), (X_val, Y_val)


In [46]:
train, val = split_data_to_test_validation_by_object(f_data, labels, 10000)

cz_3           :  9152 training examples,  1017 validation examples
falcon_9       :  1984 training examples,   221 validation examples
atlas          :  2571 training examples,   286 validation examples
h2a            :  2169 training examples,   242 validation examples
globalstar     :  3977 training examples,   443 validation examples


In [40]:
val[0].shape

(2209, 300)

In [26]:
a = np.arange(12).reshape(6,2)
a

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [27]:
rng = np.random.default_rng()
rng.shuffle(a, axis=0)

In [28]:
a

array([[ 8,  9],
       [ 2,  3],
       [ 6,  7],
       [10, 11],
       [ 0,  1],
       [ 4,  5]])

In [29]:
np.random.permutation(10)

array([7, 2, 0, 4, 9, 6, 8, 5, 3, 1])