In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [123]:
import bochamm
import glob
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import numpy.matlib
import pandas as pd
import scipy.stats as stats
import time
import time as timer


from astropy.timeseries import LombScargle
from scipy.stats import qmc
from sloscillations import frequencies, mixed_modes_utils
#from tqdm import tqdm
#from taco.rotation import rotation_utils
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from turbo import Turbo1, TurboM
from turbo.utils import from_unit_cube, latin_hypercube, to_unit_cube
from typing import Optional

from joblib import Parallel, delayed
from tqdm import tqdm

In [15]:
def prep_data(kic):
    """Load the TACO intermediate products for one star and prepare them for the l=1 analysis.

    Parameters
    ----------
    kic : int or str
        KIC identifier; it is zero-padded to nine digits to form the name of the
        star's directory under ``../../TACO_benchmarking/data/intermediate``.

    Returns
    -------
    summary : pandas.DataFrame
        Contents of ``summary.csv`` (unfiltered).
    pds : pandas.DataFrame
        Rows of ``pds_bgr.csv`` whose frequency lies within 3 * sigmaEnv of numax.
    peaks : pandas.DataFrame
        Rows of ``peaksMLE.csv`` whose frequency lies within 3 * sigmaEnv of numax.
    pds_l023_removed : pandas.DataFrame
        Copy of ``pds`` whose ``power`` column has been divided by
        ``bochamm.utils.fit_model(pds, l023_peaks)``, i.e. by the model of the
        peaks already identified as l=0, 2 or 3.
    """
    star_dir = f'../../TACO_benchmarking/data/intermediate/{str(kic).zfill(9)}'
    summary = pd.read_csv(f'{star_dir}/summary.csv')
    pds = pd.read_csv(f'{star_dir}/pds_bgr.csv')
    peaks = pd.read_csv(f'{star_dir}/peaksMLE.csv')

    # Only keep the spectrum and the peaks around the oscillations
    # (within +/- 3 sigmaEnv of numax).
    numax = summary['numax'].values
    half_width = 3 * summary['sigmaEnv'].values
    pds = pds.loc[abs(pds['frequency'].values - numax) < half_width]
    peaks = peaks.loc[abs(peaks['frequency'].values - numax) < half_width]

    # Peaks already identified as l=0, 2 or 3; everything else (l=1 or no
    # identification) is what the later analysis works on.
    l023_peaks = peaks.loc[peaks['l'].isin([0, 2, 3])]

    # Divide the data through by the model of the l=0,2,3 modes
    pds_l023_removed = pds.assign(power=pds.power / bochamm.utils.fit_model(pds, l023_peaks))
    return summary, pds, peaks, pds_l023_removed

In [240]:
def initial_RGB_or_RC(pds_l023_removed, freqs):
    """Make a first guess of the period spacing and evolutionary state from two PSxPS runs.

    ``bochamm.utils.compute_PS_PS`` is run twice on the spectrum: once with
    RGB-like settings (initial period spacing from
    ``bochamm.utils.DeltaPi1_from_DeltaNu_RGB(freqs.delta_nu)``, coupling 0.15,
    tau limits 25-200) and once with RC-like settings (period spacing 300,
    coupling 0.3, tau limits 120-400). The run with the higher peak wins.

    Parameters
    ----------
    pds_l023_removed : object with ``frequency`` and ``power`` columns
        Spectrum passed to ``compute_PS_PS`` (their ``.values`` are used).
    freqs : object
        Passed through to ``compute_PS_PS``; its ``delta_nu`` attribute is read.

    Returns
    -------
    tuple
        ``(dpi1_guess, peak_height, state, harmonic)`` where ``state`` is
        ``"RGB"`` or ``"RC"``. For RGB, if the guess lies within 10% of one half
        (one third) of the expected RGB period spacing it is multiplied by 2 (3)
        and ``harmonic`` is ``"first"`` (``"second"``); otherwise ``harmonic``
        is ``None``. For RC ``harmonic`` is always ``None``.
    """
    frequency = pds_l023_removed.frequency.values
    power = pds_l023_removed.power.values

    # Expected RGB period spacing: evaluated once and reused for the run and
    # for the harmonic checks below.
    rgb_dpi1_expected = bochamm.utils.DeltaPi1_from_DeltaNu_RGB(freqs.delta_nu)

    # RGB run
    f, PSD_LS = bochamm.utils.compute_PS_PS(frequency, power,
                                            rgb_dpi1_expected,
                                            0.15, freqs,
                                            lower_tau_lim=25, upper_tau_lim=200)
    rgb_dpi1_guess = (1/f)[np.argmax(PSD_LS)]
    rgb_dpi1_max = np.max(PSD_LS)

    # RC run
    f, PSD_LS = bochamm.utils.compute_PS_PS(frequency, power,
                                            300,
                                            0.3, freqs,
                                            lower_tau_lim=120, upper_tau_lim=400)
    rc_dpi1_guess = (1/f)[np.argmax(PSD_LS)]
    rc_dpi1_max = np.max(PSD_LS)

    if rgb_dpi1_max > rc_dpi1_max:
        # Star is likely RGB. The PSxPS peak may sit on a harmonic of the true
        # period spacing; if so, scale the guess back up.
        harmonic = None
        for divisor, label in ((2, "first"), (3, "second")):
            harmonic_dpi1 = rgb_dpi1_expected / divisor
            if abs(rgb_dpi1_guess - harmonic_dpi1) / harmonic_dpi1 < 0.1:
                rgb_dpi1_guess *= divisor
                harmonic = label
                break
        return rgb_dpi1_guess, rgb_dpi1_max, "RGB", harmonic
    else:
        # Star is likely RC
        return rc_dpi1_guess, rc_dpi1_max, "RC", None

In [241]:
def _plot_cluster_members(ax, points, point_labels, title):
    """Draw the first two columns of ``points`` on ``ax``, one colour per non-negative label."""
    for klass in range(0, point_labels.max() + 1):
        members = points[point_labels == klass]
        # linestyle="none": members are individual samples, not a path to be
        # joined up. The colour index wraps so that every cluster is drawn.
        ax.plot(members[:, 0], members[:, 1], linestyle="none", marker=".",
                color=f"C{klass % 10}", alpha=0.3)
    noise = points[point_labels == -1]
    ax.plot(noise[:, 0], noise[:, 1], "k+", alpha=0.1)
    ax.set_title(title)


def cluster_threshold_results(X, fX, threshold, save=None):
    """Cluster the samples whose score exceeds ``threshold`` with OPTICS and plot a diagnostic figure.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Sampled points, one row per sample; the first two columns are plotted.
    fX : numpy.ndarray, 1-D
        Loss of each sample; a sample is kept when ``-fX > threshold``.
    threshold : float
        Lower limit on ``-fX``.
    save : str or None, optional
        If given, the figure is written to this path and closed; otherwise it
        is shown.

    Returns
    -------
    tuple of numpy.ndarray
        The kept samples that OPTICS assigned to a cluster (label >= 0), their
        ``fX`` values and their cluster labels. Samples labelled -1 are dropped.
    """
    # Set up for clustering
    above_threshold = -fX > threshold
    X_clust = X[above_threshold, :]
    fX_clust = fX[above_threshold]

    # Hyperparameters still need to be robustly decided, but these aren't too bad
    clust = OPTICS(min_samples=30, xi=0.5)

    # Run the fit
    clust.fit(X_clust)

    # DBSCAN-style labelling of the same OPTICS ordering at two epsilon cuts,
    # shown next to the automatic OPTICS labels for comparison.
    labels_050 = cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=0.5,
    )
    labels_200 = cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=2,
    )

    space = np.arange(len(X_clust))
    reachability = clust.reachability_[clust.ordering_]
    labels = clust.labels_[clust.ordering_]

    plt.figure(figsize=(10, 7))
    G = gridspec.GridSpec(2, 3)
    ax1 = plt.subplot(G[0, :])
    ax2 = plt.subplot(G[1, 0])
    ax3 = plt.subplot(G[1, 1])
    ax4 = plt.subplot(G[1, 2])

    # Reachability plot (samples in OPTICS ordering); the palette wraps so
    # clusters beyond the fifth are still drawn.
    reach_colors = ["g", "r", "b", "y", "c"]
    for klass in range(0, labels.max() + 1):
        in_class = labels == klass
        ax1.plot(space[in_class], reachability[in_class], linestyle="none", marker=".",
                 color=reach_colors[klass % len(reach_colors)], alpha=0.3)
    ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=0.3)
    ax1.plot(space, np.full_like(space, 2.0, dtype=float), "k-", alpha=0.5)
    ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)
    ax1.set_ylabel("Reachability (epsilon distance)")
    ax1.set_title("Reachability Plot")

    # OPTICS, DBSCAN at 0.5 and DBSCAN at 2.0
    _plot_cluster_members(ax2, X_clust, clust.labels_, "Automatic Clustering\nOPTICS")
    _plot_cluster_members(ax3, X_clust, labels_050, "Clustering at 0.5 epsilon cut\nDBSCAN")
    _plot_cluster_members(ax4, X_clust, labels_200, "Clustering at 2.0 epsilon cut\nDBSCAN")

    plt.tight_layout()
    if save is not None:
        plt.savefig(save, bbox_inches='tight')
        plt.close()
    else:
        plt.show()

    clustered = clust.labels_ >= 0
    return X_clust[clustered, :], fX_clust[clustered], clust.labels_[clustered]

In [242]:
def process_clusters(data, loss, labels):
    """Summarise each cluster by its median loss and percentiles of the first two data columns.

    Parameters
    ----------
    data : array-like, 2-D
        One row per sample; column 0 and column 1 are summarised (named DPi1
        and q in the outputs).
    loss : array-like, 1-D
        Loss of each sample.
    labels : array-like, 1-D
        Cluster label of each sample. Labels need not be contiguous or start
        at zero: one output row is produced per distinct label, in ascending
        label order.

    Returns
    -------
    med_loss : numpy.ndarray, shape (n_clusters,)
        Median loss of each cluster.
    percentiles_DPi1 : numpy.ndarray, shape (n_clusters, 5)
        2.5, 16, 50, 84 and 97.5 percentiles of column 0 for each cluster.
    percentiles_q : numpy.ndarray, shape (n_clusters, 5)
        The same percentiles of column 1.
    """
    data = np.asarray(data)
    loss = np.asarray(loss)
    labels = np.asarray(labels)

    # Iterate over the labels that actually occur rather than range(n): a
    # label set such as {-1, 0, 1} or {0, 2} would otherwise select empty groups.
    cluster_ids = np.unique(labels)
    n_clusters = len(cluster_ids)
    quantiles = [2.5, 16, 50, 84, 97.5]

    med_loss = np.zeros(n_clusters)
    percentiles_DPi1 = np.zeros([n_clusters, 5])
    percentiles_q = np.zeros([n_clusters, 5])

    for row, cluster_id in enumerate(cluster_ids):
        members = labels == cluster_id
        med_loss[row] = np.median(loss[members])
        percentiles_DPi1[row, :] = np.percentile(data[members, 0], q=quantiles)
        percentiles_q[row, :] = np.percentile(data[members, 1], q=quantiles)

    return med_loss, percentiles_DPi1, percentiles_q

In [261]:
def run_opt(kic, save=False, verbose=True):
    """Run the PSxPS optimisation for one star and summarise the solutions.

    Steps: load the data with ``prep_data``, build the ``Frequencies`` object,
    get an initial guess from ``initial_RGB_or_RC``, run
    ``bochamm.optimise.PSxPSOptimisation``, extract results by thresholding and
    by clustering, plot the comparison, cluster the thresholded samples with
    ``cluster_threshold_results`` and compute per-cluster statistics with
    ``process_clusters``.

    Parameters
    ----------
    kic : int or str
        KIC identifier of the star; also used as the prefix of output files.
    save : bool, optional
        If True, write ``{kic}_samples.txt``,
        ``{kic}_clustering_thresholding_comparison.png``,
        ``{kic}_cluster_thresholds.png``, ``{kic}_best.txt``,
        ``{kic}_cluster_results.txt`` and ``{kic}_threshold_results.txt`` to
        the working directory. If False, nothing is written and the figures
        are shown instead.
    verbose : bool, optional
        Passed to ``run_optimisation``.

    Returns
    -------
    None
    """
    print(kic)
    summary, pds, peaks, pds_l023_removed = prep_data(kic)

    # Create artificial frequencies for creation of stretched power spectrum using values determined from TACO for this star
    freqs = frequencies.Frequencies(frequency=pds_l023_removed.frequency.values,
                                    numax=summary.numax.values,
                                    delta_nu=summary.DeltaNu.values if np.isfinite(summary.DeltaNu.values) else None,
                                    epsilon_p=summary.eps_p.values if np.isfinite(summary.eps_p.values) else None,
                                    alpha=summary.alpha.values if np.isfinite(summary.alpha.values) else None)

    dpi1_guess, dpi1_guess_power, evo_state_guess, harmonic = initial_RGB_or_RC(pds_l023_removed, freqs)

    optim = bochamm.optimise.PSxPSOptimisation(pds_l023_removed, freqs)
    X, fX, turbo1 = optim.run_optimisation(init_dpi=dpi1_guess, harmonic=harmonic, verbose=verbose)

    fX = fX.ravel()
    # The optimiser minimises fX, so the best sample is the one with the lowest value.
    x_best = X[np.argmin(fX), :]

    if save:
        np.savetxt(f"{kic}_samples.txt", np.c_[X, fX])

    threshold_kwargs = dict(return_threshold = True, plot=False)
    threshold_results = bochamm.extract_results.extract_results(X, fX, bayes_opt_method=turbo1, extraction_method="thresholding", extraction_kwargs=threshold_kwargs)

    # NOTE(review): verbose is hard-coded to True here regardless of the
    # ``verbose`` argument - confirm whether that is intended.
    clustering_kwargs = dict(eps=None, min_samples=30, n_neighbours=3, verbose=True, plot=False)
    results = bochamm.extract_results.extract_results(X, fX, bayes_opt_method=turbo1, extraction_method="clustering", extraction_kwargs=clustering_kwargs)
    colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy']
    vectorizer = np.vectorize(lambda x: colors[x % len(colors)])

    threshold = threshold_results.threshold

    # Three panels: all samples, samples above the threshold, clustered samples.
    cond = (-fX > threshold)
    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 24))
    ax[0].scatter(X[:,0], X[:,1], c=fX)
    ax[1].scatter(X[cond,0], X[cond,1], c=fX[cond])
    ax[2].scatter(results.reduced_data[:,0], results.reduced_data[:,1], c=vectorizer(results.reduced_cluster_labels))
    for panel in ax:
        panel.set_xlabel(r'$\Delta\Pi_{1}$ (s)', fontsize=18)
        panel.set_ylabel(r'q', fontsize=18)
    if save:
        plt.savefig(f"{kic}_clustering_thresholding_comparison.png", bbox_inches="tight")
        plt.close()
    else:
        plt.show()

    # Only write the diagnostic figure to disk when saving was requested.
    cluster_fig_path = f"{kic}_cluster_thresholds.png" if save else None
    cluster_X, cluster_loss, cluster_thresh_labels = cluster_threshold_results(X, fX, threshold, save=cluster_fig_path)

    # Iterate over solutions and get uncertainties
    # Firstly from clustering
    cluster_med_loss, cluster_DPi1_perc, cluster_q_perc = process_clusters(results.reduced_data, results.reduced_loss, results.reduced_cluster_labels)
    # Secondly from clustering of thresholding
    thresh_med_loss, thresh_DPi1_perc, thresh_q_perc = process_clusters(cluster_X, cluster_loss, cluster_thresh_labels)
    if save:
        np.savetxt(f"{kic}_best.txt", np.r_[x_best, np.array([dpi1_guess, dpi1_guess_power])])
        np.savetxt(f"{kic}_cluster_results.txt", np.c_[np.r_[cluster_DPi1_perc, cluster_q_perc], np.tile(np.array([cluster_med_loss]), (np.shape(cluster_DPi1_perc)[0]*2, 1))])
        np.savetxt(f"{kic}_threshold_results.txt", np.c_[np.r_[thresh_DPi1_perc, thresh_q_perc], np.tile(np.array([thresh_med_loss]), (np.shape(thresh_DPi1_perc)[0]*2, 1))])

## Loading in the data

To start with, we're going to use a fast-rotating star, KIC 8564976 (a.k.a. KOI-3890). This may seem counterintuitive, since going for a complicated case straight away does not seem wise, but all will become clear a little later.

In [262]:
# Every star with a TACO summary file; the KIC number is the name of the
# directory that contains summary.csv. glob returns matches in arbitrary
# order, so sort to make the processing order reproducible.
summary_paths = glob.glob('../../TACO_benchmarking/data/intermediate/*/summary.csv')
kics = sorted(int(path.split('/')[-2]) for path in summary_paths)

In [265]:
#kics = [1569842, 1726291, 2998532, 3129312, 3222834, 3426673, 3432802,
       #3641504, 7619745]

In [270]:
# Run the optimisation for every star with four joblib workers, saving all
# outputs to disk. run_opt returns None, so the trailing semicolon keeps the
# cell from echoing a list of None values.
Parallel(n_jobs=4)(delayed(run_opt)(kic, save=True, verbose=False) for kic in tqdm(kics, total=len(kics)));