In [1]:
import astrofeatures as AF


In [2]:
def get_features(filepath):
    """
    Run the astrofeatures extractor on one data file.

    filepath: path handed unchanged to AF.AstroDataFeatures
    return: whatever AstroDataFeatures(filepath).INIT() returns
            (the notebook output shows a 1-D array of 143 values for one file)
    """
    extractor = AF.AstroDataFeatures(filepath)
    return extractor.INIT()

In [3]:
%%time
# Forward slashes work on Windows as well as POSIX and match the "datasets/OCVS"
# paths used by the later cells; the backslash form only resolved on Windows.
f = get_features("datasets/OCVS/LPV/MIRA/I/OGLE-BLG-LPV-000113.dat")

CPU times: total: 547 ms
Wall time: 8.48 s


In [4]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly; a bare `f.shape` on its own line is silently discarded.
print(f.shape)
f

array([ 1.96050094e+02,  5.17428174e-01,  6.89000000e-01,  3.51440790e-03,
        1.71800000e+00,  2.78885094e-01,  3.44086022e+01,  3.01075269e-01,
        9.63267795e+00,  0.00000000e+00, -4.11968491e-01,  1.82544970e-01,
        2.91872085e-01,  4.59027315e-01,  5.94137242e-01,  7.98800799e-01,
       -4.07242833e-01, -3.61807258e-01,  1.06197369e-01,  5.00853695e-03,
       -5.71954682e-02, -8.92173164e-02,  7.50282547e-01,  8.23624712e-01,
        4.10174427e-01,  2.36389685e-01,  4.82159683e-03,  2.52724794e+00,
        3.22779253e+01,  1.87826745e+01,  7.24818004e+00,  1.65611852e+00,
        3.49060734e+01,  2.10660758e+01,  8.19385443e+00,  1.89646878e+00,
        1.00330616e+00, -9.50726399e-01, -2.97201013e+00,  1.44374551e+00,
       -2.57737964e+00,  1.27133026e+00, -1.21117912e+00,  2.68960639e+00,
       -2.35148179e-01, -6.48676640e-01, -1.28981420e+00, -2.19317356e-01,
       -6.29412754e-01, -1.26495530e+00, -7.95975989e-01,  4.98226092e-01,
        1.58690273e-01, -

In [5]:
import os

In [6]:
def list_dir_of_path(path):
    """
    List the immediate sub-directories of a directory.

    path: the path of directory
    return: names (not full paths) of the entries of `path` that are
            directories, sorted alphabetically

    os.listdir returns entries in arbitrary order, so the names are sorted to
    make everything derived from this listing reproducible across machines.
    """
    return sorted(
        name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))
    )


def read_ocvs_data(datasetpath):
    """
    Index the I-band files of an OCVS dataset tree by sub-class.

    The dataset directory has four levels: class / sub-class / band ("I" or
    "V") / data files.

    datasetpath: the path of dataset
    return: dict whose key is the sub-class directory name and whose value is
            the list of paths of the entries in that sub-class's "I"
            directory (each path is `datasetpath` joined with the
            class / sub-class / "I" / entry name), sorted by entry name

    Sub-classes that have no "I" directory are skipped instead of raising.
    Keys are bare sub-class names, so if two classes contain a sub-class with
    the same name the later one replaces the earlier one.
    """
    dataset = {}

    for class_name in list_dir_of_path(datasetpath):
        dir_path = os.path.join(datasetpath, class_name)
        for sub_class in list_dir_of_path(dir_path):
            I_data_path = os.path.join(dir_path, sub_class, "I")
            if not os.path.isdir(I_data_path):
                # No I-band data for this sub-class; nothing to index.
                continue
            # os.listdir order is arbitrary; sort so that row i of any array
            # built from this list always refers to the same file.
            dataset[sub_class] = [
                os.path.join(I_data_path, f) for f in sorted(os.listdir(I_data_path))
            ]

    return dataset




In [7]:
# Index the dataset, then report how many I-band files each sub-class has.
dataset = read_ocvs_data(r"datasets/OCVS")
for sub_class, file_paths in dataset.items():
    print(sub_class, len(file_paths))

A 89
T110 1500
T120 97
T1F 1471
T1M 558
T2 603
DPV 137
M 78
S 1500
C 1237
CV 18
ELL 1360
NC 800
MIRA 1500
OSARG 1500
SRV 1500
RRAB 1500
RRC 1500
RRD 1335
RRE 1378


In [8]:
%%time
import threading as td
import time
import numpy as np

# Counting semaphore: at most 5 workers may hold a slot at the same time.
empty = td.Semaphore(5)

# Next position in dataset['RRAB'] (and row of RRAB_data) to be claimed by a worker.
index = 0

# Guards the read-and-increment of `index` so no two workers claim the same position.
index_lock = td.Lock()

# Pre-allocated output: 60 rows x 143 columns of float64 zeros. Each worker call
# fills one row; rows that no worker claims stay zero.
RRAB_data = np.zeros((60, 143),dtype=np.float64)
print(RRAB_data.shape)
def get_data():
    """
    Worker: claim the next unprocessed RRAB file, extract its features and
    store them in the matching row of RRAB_data.

    Reads and updates the module-level `index` (under `index_lock`), reads
    dataset['RRAB'], writes one row of `RRAB_data`, and prints the shape of
    the extracted feature array. Concurrency is bounded by the `empty`
    semaphore.

    Any exception raised by get_features propagates to the caller; the
    semaphore slot is still released in that case.
    """
    global index

    # `with` releases the semaphore even when feature extraction raises; the
    # manual acquire()/release() pair leaked the slot on any exception.
    with empty:
        # Claim a unique position; the lock is held only for the counter update.
        with index_lock:
            current_index = index
            index += 1

        features = get_features(dataset['RRAB'][current_index])
        RRAB_data[current_index] = features
        print(features.shape)

# Start five worker threads, each running get_data once.
threads = [td.Thread(target=get_data) for _ in range(5)]
for worker in threads:
    worker.start()

# Wait until every worker has finished before the cell returns.
for worker in threads:
    worker.join()



(60, 143)


(143,)
(143,)
(143,)
(143,)
(143,)
CPU times: total: 2min 22s
Wall time: 25.4 s


In [1]:
%%time
import concurrent.futures as cf

def read_class_data(path_array, class_num, n_features=143, max_workers=30):
    """
    Extract the features of every file of one class with a thread pool.

    path_array: sequence of file paths; each one is passed to get_features
    class_num: label value repeated once per file
    n_features: number of columns of the feature matrix, i.e. the length of
                the array get_features returns (default 143, the value that
                was previously hard-coded)
    max_workers: size of the thread pool (default 30, the value that was
                 previously hard-coded)
    return: (arr, class_label) where arr is a float64 array of shape
            (len(path_array), n_features) whose row i holds the features of
            path_array[i], and class_label is a 1-D array containing
            class_num once per path

    An exception raised by get_features for any file propagates to the caller.
    """
    arr = np.zeros((len(path_array), n_features), dtype=np.float64)
    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in input order, so row i matches path i.
        for i, result in enumerate(executor.map(get_features, path_array)):
            arr[i] = result

    class_label = np.array([class_num] * len(path_array))
    return arr, class_label

def save_npy(arr, class_label, class_name):
    """
    Save one class's feature matrix and label vector as .npy files.

    arr: array written to ./npy_data/<class_name>_data.npy
    class_label: array written to ./npy_data/<class_name>_label.npy
    class_name: file-name prefix for both files

    The ./npy_data directory is created when it does not exist yet; np.save
    does not create missing parent directories and would otherwise raise
    FileNotFoundError on a fresh checkout.
    """
    import os  # local import keeps this cell independent of earlier cells

    os.makedirs("./npy_data", exist_ok=True)
    np.save(f"./npy_data/{class_name}_data.npy", arr)
    np.save(f"./npy_data/{class_name}_label.npy", class_label)

CPU times: total: 0 ns
Wall time: 0 ns


In [15]:
dataset = read_ocvs_data(r"datasets/OCVS")

# Give each sub-class its own integer label: its position in the dataset dict.
# The earlier nested loop ran the full extraction for every (label, class)
# pair and overwrote the files each time, so every class ended up saved with
# the last label and the work was repeated len(dataset) times.
for class_num, (class_name, path_arr) in enumerate(dataset.items()):
    arr, class_label = read_class_data(path_arr, class_num)
    save_npy(arr, class_label, class_name)