In [1]:
#!/usr/bin/env python3
"""
Neven Caplar 
Last updated: 2023-12-01

Goals: 
Fit the data

Each Section can/should run independently,
only these initial imports should be shared among all sections

Open questions:
None at the moment
"""

import os

import numpy as np
import pandas as pd
import pyarrow as pa

# from scipy.spatial import KDTree
import matplotlib.pyplot as plt

import JaxPeriodDrwFit


from tape.ensemble import Ensemble
from tape.utils import ColumnMapper

from tqdm import tqdm

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
import dask
# Alternative scheduler setup, currently disabled (original note: "many workers").
# dask.config.set(scheduler='threads') 

# Point Dask's temporary directory at the SSD scratch area. This is set before
# the cluster below is created; the workers' local directories then live under it.
# NOTE(review): hard-coded, user-specific absolute path — adjust when running elsewhere.
dask.config.set({"temporary-directory" :'/epyc/ssd/users/ncaplar/tmp'})

# Attempt to hand Dask an explicit thread pool; kept for reference only
# (original note: "does not work").
# from multiprocessing.pool import ThreadPool
# dask.config.set(pool=ThreadPool(20))

# Alternative scheduler setup, currently disabled (original note: "one worker").
# dask.config.set(scheduler='processes')  
from dask.distributed import Client, LocalCluster
# Start a local distributed cluster with 12 workers, one thread each, and
# connect a client to it; `client` is what the rest of the notebook uses.
cluster = LocalCluster(n_workers=12, threads_per_worker=1)
client = Client(cluster)
# Adaptive scaling between 10 and 40 workers, currently disabled.
# cluster.adapt(minimum=10, maximum=40) 

In [3]:
def pack_output_to_parquet(result, cols, output_dir, output_filename, drop_cols=None, full=False):
    """Pack per-object results into a dataframe and write it to a parquet file.

    Parameters
    ----------
    result : pandas.Series
        One entry per object, indexed by the object identifier. Entries are
        always accessed by position, so the index labels may be arbitrary.
        With ``full=True`` each entry must have a length and be accepted by
        ``pd.DataFrame(data=..., columns=cols)`` (presumably a 2D array with
        ``len(cols)`` columns -- TODO confirm against the fit routine).
        With ``full=False`` each entry supplies exactly one row of values.
    cols : list of str
        Column names of the produced dataframe.
    output_dir : str
        Base directory. The file is written to
        ``{output_dir}/data/{output_filename}.parquet``; the ``data``
        directory is not created here and must already exist.
    output_filename : str
        File name without the ``.parquet`` extension.
    drop_cols : list of str, optional
        Columns removed from the dataframe before it is written.
    full : bool, default False
        If True, every object contributes several rows (one dataframe per
        object, concatenated). If False, every object contributes one row.

    Returns
    -------
    pandas.DataFrame
        The dataframe that was written, returned for inspection.
    """
    # Imported locally: `import pyarrow` alone does not load the `parquet`
    # submodule, so `pa.parquet` only works if something else imported it first.
    import pyarrow.parquet as pq

    if full:
        # Build one dataframe per object, all rows labelled with that object's id.
        dfs = []
        for pos, obj_id in enumerate(result.index):
            obj_data = result.iloc[pos]
            obj_index = pd.Index(np.full(len(obj_data), obj_id), name=result.index.name)
            dfs.append(pd.DataFrame(data=obj_data, columns=cols, index=obj_index))

        if dfs:
            result_df = pd.concat(dfs)
        else:
            # pd.concat raises on an empty list; produce an empty frame with
            # the expected columns instead.
            result_df = pd.DataFrame(columns=cols, index=pd.Index([], name=result.index.name))
    else:
        # Each object only has a 1D array in the result series, so the constructed
        # dataframe has the same number of rows as `result`.
        result_df = pd.DataFrame(columns=cols, index=result.index)
        for pos in range(len(result)):
            # Positional access on both sides. `result[pos]` would be a *label*
            # lookup on an integer index and pick the wrong entry (or raise)
            # whenever the object ids are not exactly 0..n-1.
            result_df.iloc[pos] = result.iloc[pos]

    # Drop any columns if requested.
    if drop_cols:
        result_df = result_df.drop(columns=drop_cols)

    # Write the output to the parquet file
    pa_table = pa.Table.from_pandas(result_df)
    pq.write_table(pa_table, f"{output_dir}/data/{output_filename}.parquet")
    return result_df

# Column layout for results obtained with just the drw kernel: the two
# likelihood columns, the parameter columns, then the same parameter names
# prefixed with "init_".
param_cols = ['log_drw_scale', 'log_drw_amp']
init_param_cols = [f"init_{name}" for name in param_cols]
drw_columns = ['min_neg_log_lh', 'neg_log_lh', *param_cols, *init_param_cols]

# Same layout for results that combine the drw params with the periodic params.
# param_cols / init_param_cols are rebound here, so after this cell they hold
# the combined (four-parameter) lists.
param_cols = ['log_drw_scale', 'log_drw_amp', 'log_per_scale', 'log_per_amp']
init_param_cols = [f"init_{name}" for name in param_cols]
combined_columns = ['min_neg_log_lh', 'neg_log_lh', *param_cols, *init_param_cols]

In [4]:
# Create the TAPE Ensemble on top of the Dask client started earlier in the
# notebook, then display the client/cluster summary as a sanity check.
ens = Ensemble(client = client)  # initialize an ensemble object
ens.client_info()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 12
Total threads: 12,Total memory: 251.68 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34758,Workers: 12
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 251.68 GiB

0,1
Comm: tcp://127.0.0.1:43199,Total threads: 1
Dashboard: http://127.0.0.1:36999/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:43710,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-zgdcefr9,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-zgdcefr9
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:37777,Total threads: 1
Dashboard: http://127.0.0.1:39269/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40994,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-5agl75ne,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-5agl75ne
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:40767,Total threads: 1
Dashboard: http://127.0.0.1:43872/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:39295,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-gao785mn,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-gao785mn
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:38864,Total threads: 1
Dashboard: http://127.0.0.1:40025/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:41002,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-dqygo9dg,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-dqygo9dg
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:33051,Total threads: 1
Dashboard: http://127.0.0.1:39192/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:37744,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-wctsktwz,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-wctsktwz
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:40736,Total threads: 1
Dashboard: http://127.0.0.1:40503/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:37454,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-c80dhpn2,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-c80dhpn2
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:39209,Total threads: 1
Dashboard: http://127.0.0.1:46574/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:42037,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-yfekbsb4,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-yfekbsb4
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:38834,Total threads: 1
Dashboard: http://127.0.0.1:41406/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40926,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-olqsq74d,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-olqsq74d
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:32822,Total threads: 1
Dashboard: http://127.0.0.1:34516/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40381,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-3fs23l0v,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-3fs23l0v
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:44149,Total threads: 1
Dashboard: http://127.0.0.1:40762/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40831,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-4fz9kmzf,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-4fz9kmzf
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:44667,Total threads: 1
Dashboard: http://127.0.0.1:39915/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:36946,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-suzv98h2,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-suzv98h2
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:38326,Total threads: 1
Dashboard: http://127.0.0.1:42198/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:42083,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-4_59tzvj,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-4_59tzvj
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB


In [5]:
# Sanity check: show the temporary directory Dask is currently configured to use.
dask.config.get("temporary-directory")

'/epyc/ssd/users/ncaplar/tmp'

In [6]:
# Base directory (per user) under which output files are saved
username = "ncaplar"
basedir = f"/astro/users/{username}/data/"

# Simulations

In [10]:
# Location of the simulated light curves; this path is valid when running on
# baldur or epyc.
data_path = "/epyc/users/ncaplar/data/drw_per_sim/"

# Tell TAPE which dataset columns play the id / time / flux / error / band roles.
col_map = ColumnMapper(id_col="lc_nr", 
                       time_col="t",
                       flux_col="y", 
                       err_col="yerr",
                       band_col="band")

# Load the parquet data into the ensemble created earlier (modifies `ens`).
# The source table is read from the "sources" subdirectory (presumably relative
# to data_path -- TODO confirm). The remaining keyword options are passed
# straight through to TAPE; see its `from_parquet` documentation for their
# exact meaning.
ens.from_parquet(data_path,
                 source_subdir="sources",
                 column_mapper=col_map,
                 additional_cols=True,
                 sync_tables=True,
                 sorted=True
                 )

<tape.ensemble.Ensemble at 0x7f9cd4fd6380>

In [11]:
# Compute the ensemble's source table so it can be inspected directly.
# NOTE(review): `_source` is a private attribute of Ensemble; prefer a public
# accessor if the installed TAPE version provides one -- TODO confirm.
ens_source = ens._source.compute()

In [12]:
# Display the source table.
# NOTE(review): in the rendered output the rows shown for lc_nr 0 have NaN in
# y, yerr and y_per (with per_amp == 0) -- confirm this is expected before fitting.
ens_source

Unnamed: 0_level_0,drw_amp,drw_tau,per_amp,per_tau,t,y,yerr,y_drw,y_per,band
lc_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.116043,562.082417,0.000000,0.000000,0.000000,,,15.122011,,g
0,0.116043,562.082417,0.000000,0.000000,7.056726,,,15.136634,,g
0,0.116043,562.082417,0.000000,0.000000,9.997028,,,15.132266,,g
0,0.116043,562.082417,0.000000,0.000000,14.113451,,,15.123652,,g
0,0.116043,562.082417,0.000000,0.000000,17.053754,,,15.138068,,g
...,...,...,...,...,...,...,...,...,...,...
99,0.116043,562.082417,0.424354,3508.692826,1504.062670,14.617994,0.021970,15.018900,-0.424067,g
99,0.116043,562.082417,0.424354,3508.692826,1506.022871,14.604771,0.022042,15.013113,-0.424119,g
99,0.116043,562.082417,0.424354,3508.692826,1508.963174,14.619190,0.021882,15.026179,-0.424188,g
99,0.116043,562.082417,0.424354,3508.692826,1510.923375,14.574873,0.021840,15.029697,-0.424227,g


In [14]:
# A single fitter instance supplies both optimisation routines used below.
JaxPeriodDrwFit_instance = JaxPeriodDrwFit.JaxPeriodDrwFit()

# Both batch calls share the same column arguments and keyword options,
# so they are spelled out once here.
batch_cols = ('t', "y", "yerr")
batch_opts = dict(compute=True, meta=None, n_init=100, full=True)

# Run optimize_map over the ensemble (later saved as the combined drw + periodic results)
res_tsp_full = ens.batch(JaxPeriodDrwFit_instance.optimize_map, *batch_cols, **batch_opts)

# Run optimize_map_drw over the ensemble (later saved as the drw-only results)
res_tsp_drw_full = ens.batch(JaxPeriodDrwFit_instance.optimize_map_drw, *batch_cols, **batch_opts)

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


In [21]:
# Both result tables are saved under the same user directory;
# pack_output_to_parquet appends "/data/<filename>.parquet" to it.
output_root = f"/astro/users/{username}"

# Results from just the drw kernel
drw_df_full = pack_output_to_parquet(
    res_tsp_drw_full, drw_columns, output_root, "res_sim_run_g_0_drw_full", full=True
)

# Results from the combined drw and periodic kernel
combined_df_full = pack_output_to_parquet(
    res_tsp_full, combined_columns, output_root, "res_sim_run_g_0_full", full=True
)