In [1]:
#!/usr/bin/env python3
"""
Neven Caplar 
Last updated: 2023-12-01

Goals: 
Fit the data

Each Section can/should run independently,
only these initial imports should be shared among all sections

Open questions:
None at the moment
"""

import os

import numpy as np
import pandas as pd
import pyarrow as pa

# from scipy.spatial import KDTree
import matplotlib.pyplot as plt

import JaxPeriodDrwFit


from tape.ensemble import Ensemble
from tape.utils import ColumnMapper

from tqdm import tqdm

from warnings import simplefilter
# Ignore pandas PerformanceWarning messages so they do not clutter cell output.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
import dask

# Directory dask should use for its temporary files.
dask_tmp_dir = '/epyc/ssd/users/ncaplar/tmp'
dask.config.set({"temporary-directory": dask_tmp_dir})

# Scheduler alternatives that were tried, kept for reference:
#   dask.config.set(scheduler='threads')      # many workers
#   dask.config.set(scheduler='processes')    # one worker
#   from multiprocessing.pool import ThreadPool
#   dask.config.set(pool=ThreadPool(20))      # does not work

from dask.distributed import Client, LocalCluster

# Start a local cluster of 12 single-threaded workers and connect a client to it.
worker_count = 12
cluster = LocalCluster(n_workers=worker_count, threads_per_worker=1)
client = Client(cluster)
# Optional adaptive scaling (disabled):
#   cluster.adapt(minimum=10, maximum=40)

In [19]:
def pack_output_to_parquet(result, cols, output_dir, output_filename, drop_cols=None, full=False):
    """Pack fit results into a dataframe, write it as a parquet file, and return it.

    Parameters
    ----------
    result : pandas.Series
        One entry per object, indexed by the object id.  With ``full=True`` each
        entry is expected to be a 2D array-like (one row per fit, one column per
        name in ``cols``); otherwise each entry is expected to be a 1D array-like
        with one value per name in ``cols``.
    cols : list of str
        Column names of the resulting dataframe.
    output_dir : str
        Base output directory.  The file is written to
        ``{output_dir}/data/{output_filename}.parquet``; the ``data``
        sub-directory is created if it does not exist yet.
    output_filename : str
        File name without the ``.parquet`` extension.
    drop_cols : list of str, optional
        Columns to remove before writing.  ``None`` or empty drops nothing.
    full : bool, default False
        Selects which of the two ``result`` layouts described above is unpacked.

    Returns
    -------
    pandas.DataFrame
        The dataframe that was written, returned for inspection.
    """
    # Imported explicitly: `import pyarrow as pa` alone does not guarantee that
    # the `pyarrow.parquet` submodule is loaded, so `pa.parquet` can fail.
    import pyarrow.parquet as pq

    if full:
        # Build one dataframe per object; every row carries that object's id
        # as its index value.
        frames = []
        for pos in range(len(result)):
            obj_data = result.iloc[pos]
            obj_index = pd.Index(np.full(len(obj_data), result.index[pos]), name=result.index.name)
            frames.append(pd.DataFrame(data=obj_data, columns=cols, index=obj_index))

        if frames:
            result_df = pd.concat(frames)
        else:
            # pd.concat raises on an empty list; return an empty frame instead.
            result_df = pd.DataFrame(columns=cols, index=pd.Index([], name=result.index.name))
    else:
        # Each object has a single 1D array, so the output has exactly one row
        # per entry of `result`.
        result_df = pd.DataFrame(columns=cols, index=result.index)
        for pos in range(len(result)):
            # Positional access on both sides: `result[pos]` would be a *label*
            # lookup on an integer index and pick the wrong row (or raise) when
            # the object ids are not 0..n-1.
            result_df.iloc[pos] = result.iloc[pos]

    # Drop any columns if requested.
    if drop_cols:
        result_df = result_df.drop(columns=drop_cols)

    # Write the output to the parquet file, creating the target directory if needed.
    output_path = f"{output_dir}/data/{output_filename}.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pq.write_table(pa.Table.from_pandas(result_df), output_path)
    return result_df

# Column layout for fits that use only the drw kernel:
# likelihood summaries, then fitted parameters, then their initial values.
param_cols = ['log_drw_scale', 'log_drw_amp']
init_param_cols = [f"init_{name}" for name in param_cols]
drw_columns = ['min_neg_log_lh', 'neg_log_lh', *param_cols, *init_param_cols]

# Column layout for fits that combine the drw kernel with the periodic kernel:
# same structure, with the periodic parameters appended to the drw ones.
param_cols = param_cols + ['log_per_scale', 'log_per_amp']
init_param_cols = [f"init_{name}" for name in param_cols]
combined_columns = ['min_neg_log_lh', 'neg_log_lh', *param_cols, *init_param_cols]

In [3]:
# Create the ensemble object on top of the existing dask client,
# then display the client/cluster summary as the cell output.
ens = Ensemble(client=client)
ens.client_info()

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 12
Total threads: 12,Total memory: 251.68 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33551,Workers: 12
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 251.68 GiB

0,1
Comm: tcp://127.0.0.1:32872,Total threads: 1
Dashboard: http://127.0.0.1:37431/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:38533,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-28gxu7ak,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-28gxu7ak
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:37762,Total threads: 1
Dashboard: http://127.0.0.1:43914/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:46494,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-hoel0288,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-hoel0288
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:39359,Total threads: 1
Dashboard: http://127.0.0.1:32909/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40063,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-pxwz1g3o,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-pxwz1g3o
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:44898,Total threads: 1
Dashboard: http://127.0.0.1:32846/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40638,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-521f21mj,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-521f21mj
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:41674,Total threads: 1
Dashboard: http://127.0.0.1:40383/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:44334,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-1fbm1u_o,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-1fbm1u_o
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:40105,Total threads: 1
Dashboard: http://127.0.0.1:40575/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:32910,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-17ck97vu,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-17ck97vu
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:32902,Total threads: 1
Dashboard: http://127.0.0.1:33558/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40642,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-8qz0_x2_,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-8qz0_x2_
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:36867,Total threads: 1
Dashboard: http://127.0.0.1:45050/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40583,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-lzqi37p0,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-lzqi37p0
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:36364,Total threads: 1
Dashboard: http://127.0.0.1:42362/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:34133,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-m64j6z7l,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-m64j6z7l
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:34998,Total threads: 1
Dashboard: http://127.0.0.1:39076/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:35178,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-lqvrm7b8,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-lqvrm7b8
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:32986,Total threads: 1
Dashboard: http://127.0.0.1:41300/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:40218,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-la07oocp,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-la07oocp
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB

0,1
Comm: tcp://127.0.0.1:44155,Total threads: 1
Dashboard: http://127.0.0.1:39459/status,Memory: 20.97 GiB
Nanny: tcp://127.0.0.1:36911,
Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-xbkxs8ox,Local directory: /epyc/ssd/users/ncaplar/tmp/dask-scratch-space/worker-xbkxs8ox
GPU: NVIDIA GeForce RTX 2080 Ti,GPU memory: 11.00 GiB


In [4]:
# Show the value dask currently holds for its "temporary-directory" setting.
dask.config.get("temporary-directory")

'/epyc/ssd/users/ncaplar/tmp'

In [5]:
# Base directory under which output files are saved
username = "ncaplar"
basedir = "/astro/users/{}/data/".format(username)

# Simulations

In [6]:
# if running on baldur or epyc
data_path = "/epyc/users/ncaplar/data/drw_per_sim/"

col_map = ColumnMapper(id_col="lc_nr", 
                       time_col="t",
                       flux_col="y", 
                       err_col="yerr",
                       band_col="band")

ens.from_parquet(data_path,
                 source_subdir="sources",
                 column_mapper=col_map,
                 additional_cols=True,
                 sync_tables=True,
                 sorted=True
                 )

<tape.ensemble.Ensemble at 0x7f0530b81f30>

In [7]:
# Call .compute() on the ensemble's source table and keep the result.
# NOTE(review): `_source` is a private attribute of the ensemble — confirm there is no public accessor to use instead.
ens_source = ens._source.compute()

In [8]:
# Display the computed source table as the cell output.
ens_source

Unnamed: 0_level_0,drw_amp,drw_tau,per_amp,per_tau,t,y,yerr,y_drw,y_per,band,provenance
lc_nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.116043,562.082417,0.018065,300.210489,0.000000,15.138818,0.016477,15.153661,-0.008325,g,survey_1
0,0.116043,562.082417,0.018065,300.210489,7.056726,15.147453,0.016442,15.160518,-0.010593,g,survey_1
0,0.116043,562.082417,0.018065,300.210489,9.997028,15.148385,0.016477,15.156828,-0.011473,g,survey_1
0,0.116043,562.082417,0.018065,300.210489,14.113451,15.153356,0.016473,15.158536,-0.012631,g,survey_1
0,0.116043,562.082417,0.018065,300.210489,17.053754,15.161318,0.016515,15.153753,-0.013402,g,survey_1
...,...,...,...,...,...,...,...,...,...,...,...
99,0.116043,562.082417,0.424354,3508.692826,1504.062670,14.617994,0.021970,15.018900,-0.424067,g,survey_1
99,0.116043,562.082417,0.424354,3508.692826,1506.022871,14.604771,0.022042,15.013113,-0.424119,g,survey_1
99,0.116043,562.082417,0.424354,3508.692826,1508.963174,14.619190,0.021882,15.026179,-0.424188,g,survey_1
99,0.116043,562.082417,0.424354,3508.692826,1510.923375,14.574873,0.021840,15.029697,-0.424227,g,survey_1


In [14]:


# Fitter object whose optimisation routines are applied to the ensemble.
JaxPeriodDrwFit_instance = JaxPeriodDrwFit.JaxPeriodDrwFit()

# Both fits read the same three columns and share the same batch settings.
fit_input_cols = ("t", "y", "yerr")
fit_batch_kwargs = dict(compute=True, meta=None, n_init=100, full=True)

# Fit using `optimize_map`
res_tsp_full = ens.batch(
    JaxPeriodDrwFit_instance.optimize_map, *fit_input_cols, **fit_batch_kwargs
)

# Fit using `optimize_map_drw`
res_tsp_drw_full = ens.batch(
    JaxPeriodDrwFit_instance.optimize_map_drw, *fit_input_cols, **fit_batch_kwargs
)

An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.


In [20]:
# Save output for results from just the drw kernel
drw_df_full = pack_output_to_parquet(res_tsp_drw_full, drw_columns,
                       f"/astro/users/{username}", "res_tsp_run_g_0_drw_full", full=True)

# Save output for results from the combined drw and periodic kernel
combined_df_full = pack_output_to_parquet(res_tsp_full, combined_columns,
                       f"/astro/users/{username}", "res_tsp_run_g_0_full", full=True)