In [4]:
import astrofeatures as AF


In [5]:
def get_features(filepath):
    """
    Extract the feature vector for a single data file.

    filepath: path handed unchanged to ``AF.AstroDataFeatures``
    return: whatever ``AstroDataFeatures(filepath).INIT()`` returns
            (the outputs in this notebook show a 1-D array of 143 values
            -- TODO confirm this holds for every input file)
    """
    extractor = AF.AstroDataFeatures(filepath)
    return extractor.INIT()

In [13]:
%%time
# Forward slashes are accepted as separators on Windows and POSIX alike. The
# previous raw string held literal doubled backslashes, which only resolve to
# the nested directories on Windows.
f = get_features("datasets/OCVS/RRLYR/RRE/I/OGLE-LMC-RRLYR-06659.dat")

CPU times: total: 1.52 s
Wall time: 14.9 s


In [15]:
# Bare attribute access: since it is not the last expression of the cell,
# the shape itself is not displayed.
f.shape
# Cell output: the feature vector with its final 90 entries left out.
f[:-90]

array([ 1.25673843e+02,  1.14160371e-01,  1.54000000e-01,  1.22539421e-03,
        3.69000000e-01,  5.22725121e-02,  3.41590613e+01,  4.40677966e-01,
        1.79383923e+00,  2.60756193e-03,  1.61074305e-01,  2.04744070e-01,
        3.43476904e-01,  4.80649189e-01,  6.36548065e-01,  8.27091136e-01,
       -2.90575082e-01, -2.49418648e-01,  1.00460977e+00,  6.73686643e-02,
        4.05523555e-02, -1.91768258e-02,  4.74527546e-01,  8.27596448e-01,
        2.94591802e-01,  9.74025974e-01,  1.31842325e-04, -6.42669312e-01,
        2.24483868e-02,  2.73863821e-02,  3.23939354e-03,  7.23952923e-03,
        1.02206327e-01,  3.64699106e-02,  2.08763253e-03,  5.92086334e-03,
       -2.71302905e+00,  8.35632655e-01, -2.49854356e+00, -6.57670605e-01,
        9.87047618e-01, -2.77201413e+00, -1.02818375e+00, -9.99850308e-01,
        8.63495254e-02, -8.40721426e-01, -4.91474812e-01, -4.47543083e-01,
       -1.68982373e+00, -1.23709274e+00, -1.72849977e-01,  1.00906981e+00,
        1.00187281e+00])

In [9]:
import os

In [10]:
def list_dir_of_path(path):
    """
    List the immediate sub-directories of a directory.

    path: the directory to inspect
    return: names (not full paths) of the entries of ``path`` that are
            directories, in the order ``os.listdir`` yields them
    """
    subdirectories = []
    for entry in os.listdir(path):
        # Regular files and other non-directory entries are left out.
        if os.path.isdir(os.path.join(path, entry)):
            subdirectories.append(entry)
    return subdirectories


def read_ocvs_data(datasetpath):
    """
    Map every sub-class of the dataset to the paths of its I-band files.

    The dataset directory is expected to have four levels: the first is the
    class, the second the sub-class, the third the band ("I" or "V") and the
    fourth the data files. Only the "I" directory of each sub-class is read.

    datasetpath: the path of the dataset root
    return: dict whose keys are sub-class directory names and whose values
            are lists of file paths built by joining onto ``datasetpath``;
            a sub-class name occurring under several classes keeps only the
            list stored last
    """
    dataset = {}

    for class_name in list_dir_of_path(datasetpath):
        class_path = os.path.join(datasetpath, class_name)
        for sub_class in list_dir_of_path(class_path):
            band_path = os.path.join(class_path, sub_class, "I")
            # Every entry of the band directory is recorded, whatever its type.
            dataset[sub_class] = [
                os.path.join(band_path, file_name)
                for file_name in os.listdir(band_path)
            ]

    return dataset




In [12]:
# Index the I-band files of every sub-class found under the OCVS root.
dataset = read_ocvs_data(r"datasets/OCVS")

# One line per sub-class: its name followed by its number of files.
for class_name, paths in dataset.items():
    print(class_name, len(paths))

# Cell output: the path stored at position 314 of the "RRE" list.
dataset["RRE"][314]

A 89
T110 1500
T120 97
T1F 1471
T1M 558
T2 603
DPV 137
M 78
S 1500
C 1237
CV 18
ELL 1360
NC 800
MIRA 1500
OSARG 1500
SRV 1500
RRAB 1500
RRC 1500
RRD 1335
RRE 1378


'datasets/OCVS\\RRLYR\\RRE\\I\\OGLE-LMC-RRLYR-06659.dat'

In [None]:
%%time
import threading as td
import time
import numpy as np

# State shared by the threads that run get_data.
index_lock = td.Lock()      # serialises reads and increments of `index`
index = 0                   # next position of dataset['RRAB'] to be claimed
empty = td.Semaphore(5)     # at most five holders at any one time

# Zero-filled float64 buffer: 60 rows of 143 values, one row per processed file.
RRAB_data = np.zeros(shape=(60, 143), dtype=np.float64)
print(RRAB_data.shape)
def get_data():
    """
    Claim the next unprocessed RRAB file, extract its features and store them.

    Reads and advances the module-level ``index`` under ``index_lock``, calls
    ``get_features`` on ``dataset['RRAB'][current_index]``, writes the result
    into row ``current_index`` of ``RRAB_data`` and prints the result's shape.

    The semaphore and the lock are held through ``with`` blocks so that both
    are released even when feature extraction or the row assignment raises;
    with manual acquire/release an exception would leak a semaphore slot.
    """
    global index

    with empty:
        # Only the index hand-out is serialised; extraction runs outside the lock.
        with index_lock:
            current_index = index
            index += 1

        features = get_features(dataset['RRAB'][current_index])
        RRAB_data[current_index] = features
        print(features.shape)

# Build five workers that each run get_data once, launch them all, then
# block until every one of them has finished.
threads = [td.Thread(target=get_data) for _ in range(5)]

for worker in threads:
    worker.start()

for worker in threads:
    worker.join()



(60, 143)


(143,)
(143,)
(143,)
(143,)
(143,)
CPU times: total: 2min 22s
Wall time: 25.4 s


In [None]:
%%time
import concurrent.futures as cf

def read_class_data(path_array, class_num, n_features=143, max_workers=30,
                    feature_fn=None):
    """
    Extract the features of every file of one class using a thread pool.

    path_array: sequence of file paths; row ``i`` of the result belongs to
                ``path_array[i]`` (``Executor.map`` preserves input order)
    class_num: label value repeated once per path
    n_features: length of each feature vector (default 143, the width
                previously hard-coded here)
    max_workers: size of the thread pool (default 30, as before)
    feature_fn: callable mapping a path to its feature vector; when None the
                module-level ``get_features`` is used
    return: ``(arr, class_label)`` where ``arr`` is a float64 array of shape
            ``(len(path_array), n_features)`` and ``class_label`` is a 1-D
            array holding ``class_num`` once per path
    raises: whatever ``feature_fn`` raises for a path, or the NumPy error
            raised when a result cannot be stored in a row of ``n_features``
            values
    """
    if feature_fn is None:
        feature_fn = get_features

    arr = np.zeros((len(path_array), n_features), dtype=np.float64)
    with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Rows are filled as results become available, in input order.
        for row, result in enumerate(executor.map(feature_fn, path_array)):
            arr[row] = result

    # np.full keeps the label dtype tied to class_num even for an empty input.
    class_label = np.full(len(path_array), class_num)
    return arr, class_label

def save_npy(arr, class_label, class_name, out_dir="./npy_data"):
    """
    Write the feature array and label array of one class to ``.npy`` files.

    arr: array saved as ``<out_dir>/<class_name>_data.npy``
    class_label: array saved as ``<out_dir>/<class_name>_label.npy``
    class_name: prefix of both file names
    out_dir: destination directory (default ``./npy_data``, the location
             previously hard-coded here)
    """
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, f"{class_name}_data.npy"), arr)
    np.save(os.path.join(out_dir, f"{class_name}_label.npy"), class_label)

CPU times: total: 0 ns
Wall time: 0 ns


In [None]:
dataset = read_ocvs_data(r"datasets/OCVS")
class_num = list(range(len(dataset)))

# Pair each class with its own label index and process it exactly once.
# The previous nested loop re-extracted every class for every index and
# overwrote the saved files each time, so all classes ended up carrying the
# final index as their label.
for i, (class_name, path_arr) in zip(class_num, dataset.items()):
    arr, class_label = read_class_data(path_arr, i)
    save_npy(arr, class_label, class_name)