In [1]:
import astrofeatures as AF

In [2]:
def get_features(filepath):
    """
    Run the astrofeatures extractor on a single file.

    filepath: path handed to AF.AstroDataFeatures
    return: whatever the extractor's INIT() call produces for that file
            (a NumPy array in this notebook's recorded outputs)
    """
    extractor = AF.AstroDataFeatures(filepath)
    return extractor.INIT()

In [3]:
%%time
# Run feature extraction on one sample .dat file; the %%time magic at the top
# of this cell reports how long the single call takes.
f = get_features(r"OGLE-SMC-LPV-11911.dat")

CPU times: user 11.4 s, sys: 4.9 s, total: 16.3 s
Wall time: 16.3 s


In [4]:
# Only the last expression of a cell is echoed, so a bare `f.shape` on an
# earlier line is silently discarded. Print it explicitly to see the shape
# alongside the array itself.
print(f.shape)
f

array([ 2.41090495e+02,  3.55824532e-02,  4.90000000e-02,  2.03243185e-04,
        2.03000000e-01,  2.27288883e-01,  2.69125683e+01,  5.53278689e-01,
        1.20931518e+00,  5.32786885e-02,  4.93449916e-01,  1.21475054e-01,
        2.00960645e-01,  3.03687636e-01,  4.77688255e-01,  7.49302758e-01,
        9.86326120e-01,  3.54824487e-01,  2.02721220e-02,  1.41512102e-01,
        4.48285133e-02,  2.04974322e-02,  5.48910606e-01,  7.52633435e-01,
        2.29242241e-01,  1.37500000e+00,  8.70730681e-06,  2.65365125e+00,
        1.08744698e-01,  8.44725996e-02,  1.57905757e-02,  3.53450924e-02,
        1.29014468e-01,  7.57856340e-02,  2.15776191e-02,  1.68375735e-02,
       -6.89712563e-01,  6.78349276e-01,  4.08746427e-01,  2.96274308e-01,
       -2.91307244e+00, -1.04472727e+00, -1.06059237e+00, -6.45395068e-02,
       -1.09692231e-01, -8.38010128e-01, -4.88078969e-01, -2.31051528e-01,
       -7.76634894e-01, -8.84358911e-01, -2.65807689e-03,  4.88815324e-02,
        2.99113541e-02, -

In [5]:
import os

In [6]:
def list_dir_of_path(path):
    """
    List the immediate sub-directories of a directory.

    path: the directory to inspect
    return: names (not full paths) of the entries in `path` that are
            directories, in the order os.listdir reports them
    """
    subdirs = []
    for entry in os.listdir(path):
        # Plain files are skipped; only directory entries are kept.
        if os.path.isdir(os.path.join(path, entry)):
            subdirs.append(entry)
    return subdirs


def read_ocvs_data(datasetpath):
    """
    Index the files found in each sub-class's "I" directory.

    datasetpath: the path of dataset
    return: a dict whose keys are sub-class directory names and whose values
            are lists of file paths (each prefixed with `datasetpath`) taken
            from that sub-class's "I" directory

    The dataset directory tree has four levels: class, sub-class, band
    ("I" / "V"), and the data files. Only the "I" directory is read here.
    """
    dataset = {}

    for class_name in list_dir_of_path(datasetpath):
        class_path = os.path.join(datasetpath, class_name)
        for sub_class in list_dir_of_path(class_path):
            band_dir = os.path.join(class_path, sub_class, "I")
            # Keyed by sub-class name only: a sub-class name that occurs under
            # more than one class keeps just the last one visited.
            dataset[sub_class] = [
                os.path.join(band_dir, file_name)
                for file_name in os.listdir(band_dir)
            ]

    return dataset




In [7]:
dataset = read_ocvs_data(r"datasets/OCVS")
# Report how many file paths were collected for each sub-class.
for sub_class, file_paths in dataset.items():
    print(sub_class, len(file_paths))

MIRA 1500
SRV 1500
OSARG 1500
RRD 1335
RRC 1500
RRE 1378
RRAB 1500
C 1237
ELL 1360
NC 800
CV 18
T2 603
T110 1500
T1F 1471
T1M 558
A 89
T120 97
DPV 137
M 78
S 1500


In [8]:
%%time
import threading as td
import time
import numpy as np

# Pre-allocated output buffer: one row per sample, 143 columns (the length of
# the feature vectors printed by this cell).
# NOTE(review): 60 rows are reserved but only five workers are launched below,
# so the remaining rows stay zero — confirm the intended size.
RRAB_data = np.zeros(shape=(60, 143), dtype=np.float64)
print(RRAB_data.shape)

# Counting semaphore that caps how many workers run get_data's body at once.
empty = td.Semaphore(5)

# Next index to hand out to a worker, and the lock that guards it.
index = 0
index_lock = td.Lock()
def get_data():
    """
    Worker body: claim the next index, extract that file's features, store them.

    Uses the module-level `empty` semaphore, `index` counter, `index_lock`,
    `dataset` and `RRAB_data`. Each call takes one index, runs get_features on
    dataset['RRAB'][index], writes the result into the matching row of
    RRAB_data and prints the feature vector's shape.

    The semaphore and the lock are held through `with` blocks so both are
    released even if feature extraction or the row assignment raises.
    """
    global index

    with empty:
        # Only the read-and-increment of the shared counter needs the lock;
        # the slow feature extraction runs outside it.
        with index_lock:
            current_index = index
            index += 1

        features = get_features(dataset['RRAB'][current_index])
        RRAB_data[current_index] = features
        print(features.shape)

# Build five worker threads that each run get_data once, start them all,
# then block until every one has finished.
threads = [td.Thread(target=get_data) for _ in range(5)]

for worker in threads:
    worker.start()

for worker in threads:
    worker.join()



(60, 143)
(143,)
(143,)
(143,)
(143,)
(143,)
CPU times: user 49.3 s, sys: 23.1 s, total: 1min 12s
Wall time: 20.7 s


In [9]:
%%time
import concurrent.futures as cf

def read_class_data(path_array, class_num, n_features=143, max_workers=32):
    """
    Extract features for every file of one class using a thread pool.

    path_array: sequence of file paths passed one by one to get_features
    class_num: label value assigned to every sample of this class
    n_features: length of the vector get_features returns (default 143,
                the value previously hard-coded here)
    max_workers: size of the thread pool (default 32, as before)
    return: (arr, class_label) where arr has shape
            (len(path_array), n_features) and dtype float64, and class_label
            is a 1-D array of length len(path_array) filled with class_num
    """
    n_samples = len(path_array)
    arr = np.zeros((n_samples, n_features), dtype=np.float64)

    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so row i of arr
        # corresponds to path_array[i].
        for i, result in enumerate(executor.map(get_features, path_array)):
            arr[i] = result

    # np.full keeps the label dtype derived from class_num even when
    # path_array is empty (np.array([]) would fall back to float64).
    class_label = np.full(n_samples, class_num)
    return arr, class_label

def save_npy(arr, class_label, class_name, out_dir="./npy_data"):
    """
    Save one class's feature matrix and label vector as .npy files.

    arr: feature array, written to <out_dir>/<class_name>_data.npy
    class_label: label array, written to <out_dir>/<class_name>_label.npy
    class_name: prefix used for both file names
    out_dir: target directory (default "./npy_data", the location that was
             previously hard-coded); created if it does not exist
    """
    # np.save does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, f"{class_name}_data.npy"), arr)
    np.save(os.path.join(out_dir, f"{class_name}_label.npy"), class_label)

CPU times: user 6 µs, sys: 3 µs, total: 9 µs
Wall time: 12.9 µs


In [None]:
%%time
dataset = read_ocvs_data(r"datasets/OCVS")
class_num = list(range(len(dataset)))
# Pair each sub-class with exactly one label. Nesting the two loops would
# re-extract every class once per label value and overwrite its saved files,
# leaving every class with the same (last) label.
for i, (class_name, path_arr) in zip(class_num, dataset.items()):
    arr, class_label = read_class_data(path_arr, i)
    print(arr.shape)
    save_npy(arr, class_label, class_name)
        

(1500, 143)
