In [1]:
%load_ext autoreload
%autoreload 2
from IPython.display import clear_output
import os
import fnmatch
import numpy as np
import pickle
import matplotlib.pyplot as plt
# import umap
from sklearn.mixture import GaussianMixture
from scipy import stats
# from sklearn.cluster import OPTICS
from copy import deepcopy

from scipy.stats import zscore
from scipy.spatial import distance


plt.rcParams['figure.figsize'] = (5.0, 5.0)
plt.rcParams.update({'font.size': 12})
plt.rcParams.update(plt.rcParamsDefault)

np.random.seed(seed=11)


cwd = os.getcwd()

if cwd.split("/")[1] == "export":
    data_dir = "../../../files_from_snuffy"
else:
    data_dir = "../../../data_GRS1915"


In [2]:
def component_mahalanobis_distances(GMmodel):
    """
    input = Gaussian mixture model
    output = matrix of mahalanobis distances between Gaussian components
    """
    no_components = GMmodel.means_.shape[0]
    GM_comp_mahal_distances = np.zeros((no_components,no_components))
    for comp1_ind, comp1 in enumerate(GMmodel.means_):
        for comp2_ind, comp2 in enumerate(GMmodel.means_):
            GM_comp_mahal_distances[comp1_ind, comp2_ind] = distance.mahalanobis(comp1, comp2, np.linalg.inv(GMmodel.covariances_[comp2_ind]))
            print(comp1_ind, comp2_ind)
            clear_output(wait=True)
    return GM_comp_mahal_distances

In [3]:
"GMM_222comps_model_2020-12-24_13-14-02_segments_474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl"
"GMM_279comps_model_2020-12-21_20-11-39_segments_468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl"

'GMM_279comps_model_2020-12-21_20-11-39_segments_468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl'

In [4]:
# Load Gaussian mixture model which produced the minimum BIC with 1 second cadence data
with open("{}/GMM_222comps_model_2020-12-24_13-14-02_segments_474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl".format(data_dir), 'rb') as f:
    GMmodel_1s = pickle.load(f)



In [5]:
# load light curve segments
with open('{}/474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl'.format(data_dir), 'rb') as f:
    segments_counts = pickle.load(f)
    
# load latent variables for light curve segments
weights_dir = "../../../model_weights/model_2020-12-24_13-14-02.h5"
segments_dir = '../../../data_GRS1915/474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl'
segment_encoding_dir = '{}/segment_encoding_{}_segments_{}.pkl'.format(data_dir, weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])
with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)

# take latent variable means, i.e. 20 values per segment
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=0).astype(np.float32)  # standardize per feature

# calculate statistical moments for the segments
desc_stats = np.zeros((len(segments_counts), 4)) #mean, std, skew, kurt
desc_stats[:,0] = np.mean(segments_counts, axis=1).flatten()
desc_stats[:,1] = np.std(segments_counts, axis=1).flatten()
desc_stats[:,2] = stats.skew(segments_counts, axis=1).flatten()
desc_stats[:,3] = stats.kurtosis(segments_counts, axis=1).flatten()
zscore_desc_stats = zscore(desc_stats, axis=0)

# merge the two types of features; shape of shape_moments is [474471, 24]
shape_moments = np.hstack((segment_encoding_scaled_means, zscore_desc_stats)) # every column is standardized

In [6]:
GMmodel_1s_labels = GMmodel_1s.predict(shape_moments)

In [6]:
# calculate mahalanobis distances between components
GM_comp_mahal_distances_1s = component_mahalanobis_distances(GMmodel_1s)

221 221


In [62]:
# merge components within 3 sigma
couples = np.array(np.where(((np.triu(GM_comp_mahal_distances_1s)<3)&(np.triu(GM_comp_mahal_distances_1s)>0))
      &(np.triu(GM_comp_mahal_distances_1s.T)<3)&(np.triu(GM_comp_mahal_distances_1s.T)>0))).T

In [63]:
l = [str(c) for c in list(range(GMmodel_1s.means_.shape[0]))]
for couple in couples:
    l.append([str(c) for c in couple])
    
import networkx 
from networkx.algorithms.components.connected import connected_components


def to_graph(l):
    G = networkx.Graph()
    for part in l:
        # each sublist is a bunch of nodes
        G.add_nodes_from(part)
        # it also imlies a number of edges:
        G.add_edges_from(to_edges(part))
    return G

def to_edges(l):
    """ 
        treat `l` as a Graph and returns it's edges 
        to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
    """
    it = iter(l)
    last = next(it)

    for current in it:
        yield last, current
        last = current    

G = to_graph(l)
print(connected_components(G))

<generator object connected_components at 0x7f07eb8b0f20>


In [64]:
data_labels = GMmodel_1s_labels
# merge components
for n_connection, connection in enumerate(connected_components(G)):
    node_indices=np.array([int(node) for node in connection])
    data_labels = np.where(np.isin(data_labels, node_indices), n_connection+np.unique(GMmodel_1s_labels).shape[0], data_labels)

In [17]:
GMmodel_1s_labels

array([182,  76, 152, ...,   9, 152,  88])

In [60]:
data_labels

array([182, 225, 222, ..., 222, 222, 222])

In [7]:
with open('{}/474471_len128_stride10_1sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)
seg_ObIDs = [seg.split("_")[0] for seg in seg_ids] # get rid of the within-observation segment indices and create a degenerate list of observation IDs

In [8]:
clean_belloni = open('{}/1915Belloniclass_updated.dat'.format(data_dir))
lines = clean_belloni.readlines()
states = lines[0].split()
belloni_clean = {}
for h,l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()
    #state: obsID1, obsID2...
ob_state = {}
for state, obs in belloni_clean.items():
    if state == "chi1" or state == "chi2" or state == "chi3" or state == "chi4": state = "chi"
    for ob in obs:
        ob_state[ob] = state

        
# inverse the ob_state dictionary, so that inv_ob_state contains {"state name" : [list of observation IDs], ...}

inv_ob_state = {}
for k, v in ob_state.items():
    inv_ob_state[v] = inv_ob_state.get(v, [])
    inv_ob_state[v].append(k)

In [9]:
with open('{}/lightcurve1738_train70_val10_test20.pkl'.format(data_dir), 'rb') as f:
    split_ob_ids = pickle.load(f)

In [124]:
split_ob_ids

[array(['20402-01-22-00', '20187-02-01-01', '20187-02-01-00', ...,
        '94701-01-42-00', '30703-01-09-00', '90105-04-02-00'], dtype='<U15'),
 array(['20402-01-23-00', '40703-01-18-00', '40703-01-35-00',
        '20402-01-39-00', '20402-01-41-03', '10408-01-14-09',
        '10408-01-15-04', '20186-03-02-03', '40703-01-12-00',
        '20402-01-37-01', '10408-01-08-00', '10408-01-44-00',
        '40703-01-23-00', '20402-01-32-01', '10408-01-19-01',
        '30402-01-12-02', '20402-01-15-00', '30402-01-12-03',
        '30703-01-33-00', '20402-01-48-00', '20402-01-50-00',
        '40703-01-27-00', '70702-01-17-00', '80127-05-04-00',
        '90105-03-01-000', '93701-01-33-00', '95701-01-51-00',
        '93411-01-01-00', '50703-01-38-02', '91701-01-14-00',
        '70702-01-52-00', '50703-01-42-00', '90105-10-07-00',
        '80701-01-09-01', '80701-01-20-01', '90105-06-03-02',
        '91701-01-69-00', '60701-01-14-00', '50703-01-47-02',
        '50703-01-03-00', '90701-01-02-00', '901

In [10]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from  sklearn.ensemble import RandomForestClassifier

In [28]:
from  sklearn.ensemble import RandomForestClassifier

In [11]:
def test_frankGMM_in_classification(data_labels, data, train_set_ids, val_set_ids):
#     comp=GMmodel.means_.shape[0]
    
    # find GMM component labels for data
    data_GMMcomp_labels = data_labels
    
    # make a dict that groups indices of segments of the same observation 
    # i.e. where each observation id can be found in seg_ObIDs
    #i.e. ObID_SegIndices_dict == {'10258-01-01-00': [916, 949, 1046...467528, 467578], ....}
    ObID_SegIndices_dict = {key:[] for key in np.unique(seg_ObIDs)}
    for ID_index, ObID in enumerate(seg_ObIDs):
        ObID_SegIndices_dict.setdefault(ObID, []).append(ID_index)
    
    # make a dictionary of Gaussian component labels instead of segment indices  
    #i.e. ObID_GaussComps_dict_500 == {'10258-01-01-00': [401, 433, 382...101, 152], ....}
    ObID_GaussComps_dict_comp = {}
    for ObID, Indices in ObID_SegIndices_dict.items():
        ObID_GaussComps_dict_comp[ObID] = [data_GMMcomp_labels[ind] for ind in Indices]
        
    # make a data frame containing the counts of light curve segments in each of the Gaussian components, for each observation
    obs_component_counts_df_comp = pd.DataFrame(np.zeros((len(ObID_GaussComps_dict_comp),len(np.unique(data_GMMcomp_labels)))),
                                               index=np.unique(seg_ObIDs), columns=np.unique(data_GMMcomp_labels), dtype=int)
        
    # populate the data frame
    for ObID, GaussComps in ObID_GaussComps_dict_comp.items():
        for comp_id, comp_count in np.array(np.unique(GaussComps, return_counts=True)).T:
            obs_component_counts_df_comp.loc[ObID][comp_id] = comp_count
    
    
    obs_component_counts_df_comp = obs_component_counts_df_comp.iloc[:,:].div(np.sum(obs_component_counts_df_comp.iloc[:,:], axis=1), axis="rows") # normalise rows
    
    # add classification column
    obs_component_counts_df_comp["Class"] = "Unknown" 
    for k,v in ob_state.items():
        if v == "eta": v = "Unknown" ##################################### remove eta classifications, there are only two in the set of 1738 observations
        if str(k) in obs_component_counts_df_comp.index.values:
            obs_component_counts_df_comp.loc[str(k), "Class"] = v
            
    
    
    # training data
    train_data = obs_component_counts_df_comp.loc[train_set_ids].loc[obs_component_counts_df_comp.loc[train_set_ids].iloc[:,-1] != "Unknown"]  
    # validation data
    val_data = obs_component_counts_df_comp.loc[val_set_ids].loc[obs_component_counts_df_comp.loc[val_set_ids].iloc[:,-1] != "Unknown"]
            
    #random forest hyperparameters
    n_estimators_list = [50, 100, 300, 800] # 100
    max_depth_list = [None, 5, 8, 15, 25] # None
    min_samples_split_list = [2, 5, 10, 15] # 2
    min_samples_leaf_list = [1, 2, 5, 10] # 1   

    reports = []
    
    for n_estimators in n_estimators_list:
        for max_depth in max_depth_list:
            for min_samples_split in min_samples_split_list:
                for min_samples_leaf in min_samples_leaf_list:
    
                    RF_clf = RandomForestClassifier(random_state=0,
                                                    class_weight="balanced",
                                                    n_estimators=n_estimators,
                                                    max_depth=max_depth, 
                                                    min_samples_split=min_samples_split,
                                                    min_samples_leaf = min_samples_leaf
                                                   ).fit(train_data.iloc[:,:-1], train_data.iloc[:,-1])
                    preds = RF_clf.predict(val_data.iloc[:,:-1])

                    reports.append((precision_recall_fscore_support(val_data.iloc[:,-1], preds, zero_division=0, average="weighted")[2],
                                    accuracy_score(val_data.iloc[:,-1], preds),
                                   (n_estimators,max_depth,min_samples_split,min_samples_leaf)))
            

    return reports

In [71]:
reports = test_frankGMM_in_classification(data_labels, shape_moments, split_ob_ids[0], split_ob_ids[1])

1


In [72]:
reports

[(0.6367965367965367, 0.7272727272727273, (50, None, 2, 1))]

In [70]:
np.unique(data_labels).shape

(164,)

In [12]:
def merge_gaussian_component_labels(distance_matrix, observation_labels, sigma_threshold):
    """
    if mahalanobis distance between Gaussian components is smaller than the sigma_threshold, relabel the data within
    those components as belonging to single cluster
    """
    # merge components within 3 sigma
    couples = np.array(np.where(((np.triu(distance_matrix)<sigma_threshold)&(np.triu(distance_matrix)>0))
      &(np.triu(distance_matrix.T)<sigma_threshold)&(np.triu(distance_matrix.T)>0))).T
    
    # build a graph of connections
    l = [str(c) for c in list(range(distance_matrix.shape[0]))]
    for couple in couples:
        l.append([str(c) for c in couple])
    import networkx 
    from networkx.algorithms.components.connected import connected_components
    def to_graph(l):
        G = networkx.Graph()
        for part in l:
            # each sublist is a bunch of nodes
            G.add_nodes_from(part)
            # it also imlies a number of edges:
            G.add_edges_from(to_edges(part))
        return G
    def to_edges(l):
        """ 
            treat `l` as a Graph and returns it's edges 
            to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
        """
        it = iter(l)
        last = next(it)

        for current in it:
            yield last, current
            last = current    
    G = to_graph(l)

    data_labels = observation_labels
    # merge components
    for n_connection, connection in enumerate(connected_components(G)):
        node_indices=np.array([int(node) for node in connection])
        data_labels = np.where(np.isin(data_labels, node_indices), n_connection+np.unique(observation_labels).shape[0], data_labels)
        
    return data_labels

In [95]:
no_components_premerger = np.unique(GMmodel_1s_labels).shape[0]

sigma_reports_list = []

for sigma_threshold in np.linspace(1,6,50):
    data_labels = merge_gaussian_component_labels(GM_comp_mahal_distances_1s, GMmodel_1s_labels, sigma_threshold)
    if np.unique(data_labels).shape[0] < no_components_premerger:
        no_components_premerger = np.unique(data_labels).shape[0]
        reports = test_frankGMM_in_classification(data_labels, shape_moments, split_ob_ids[0], split_ob_ids[1])
        sigma_reports_list.append((sigma_threshold, reports))
        print(len(sigma_reports_list), sigma_threshold)
        clear_output(wait=True)

39 6.0


In [97]:
with open("sigma_reports_list_1s.pkl", 'wb') as f:
    pickle.dump(sigma_reports_list, f)

# 4 second data classification

In [99]:
# Load Gaussian mixture model which produced the minimum BIC with 1 second cadence data
with open("{}/GMM_279comps_model_2020-12-21_20-11-39_segments_468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl".format(data_dir), 'rb') as f:
    GMmodel_4s = pickle.load(f)



In [100]:
# load light curve segments
with open('{}/468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl'.format(data_dir), 'rb') as f:
    segments_counts = pickle.load(f)
    
# load latent variables for light curve segments
weights_dir = "../../../model_weights/model_2020-12-21_20-11-39.h5"
segments_dir = '../../../data_GRS1915/468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl'
segment_encoding_dir = '{}/segment_encoding_{}_segments_{}.pkl'.format(data_dir, weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])
with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)

# take latent variable means, i.e. 16 values per segment
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=0).astype(np.float32)  # standardize per feature

# calculate statistical moments for the segments
desc_stats = np.zeros((len(segments_counts), 4)) #mean, std, skew, kurt
desc_stats[:,0] = np.mean(segments_counts, axis=1).flatten()
desc_stats[:,1] = np.std(segments_counts, axis=1).flatten()
desc_stats[:,2] = stats.skew(segments_counts, axis=1).flatten()
desc_stats[:,3] = stats.kurtosis(segments_counts, axis=1).flatten()
zscore_desc_stats = zscore(desc_stats, axis=0)

# merge the two types of features; shape of shape_moments is [474471, 24]
shape_moments = np.hstack((segment_encoding_scaled_means, zscore_desc_stats)) # every column is standardized

In [101]:
GMmodel_4s_labels = GMmodel_4s.predict(shape_moments)

In [102]:
# calculate mahalanobis distances between components
GM_comp_mahal_distances_4s = component_mahalanobis_distances(GMmodel_4s)

278 278


In [None]:
with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)
seg_ObIDs = [seg.split("_")[0] for seg in seg_ids] # get rid of the within-observation segment indices and create a degenerate list of observation IDs

In [127]:
no_components_premerger = np.unique(GMmodel_4s_labels).shape[0]

sigma_reports_list = []

for sigma_threshold in np.linspace(1,6,50):
    data_labels = merge_gaussian_component_labels(GM_comp_mahal_distances_4s, GMmodel_4s_labels, sigma_threshold)
    if np.unique(data_labels).shape[0] < no_components_premerger:
        no_components_premerger = np.unique(data_labels).shape[0]
        reports = test_frankGMM_in_classification(data_labels, shape_moments, split_ob_ids[0], split_ob_ids[1])
        sigma_reports_list.append((sigma_threshold, reports))
        print(len(sigma_reports_list), sigma_threshold)
        clear_output(wait=True)

39 6.0


In [128]:
# with open("sigma_reports_list_4s.pkl", 'wb') as f:
#     pickle.dump(sigma_reports_list, f)

In [135]:
best_f1 = 0
best_result = []
for threshold, report_list in sigma_reports_list:
    for report in report_list:
        if report[0] > best_f1:
            best_f1 = report[0]
            best_result = [threshold, report]

In [136]:
best_result
# best results for the grid search (including sigma search) with unknown class excluded
# [3.6530612244897958, (0.782051282051282, 0.8181818181818182, (50, None, 2, 5))]

[3.6530612244897958, (0.782051282051282, 0.8181818181818182, (50, None, 2, 5))]

In [123]:
# results for the grid search without merging
# reports0 = test_frankGMM_in_classification(GMmodel_4s_labels, shape_moments, split_ob_ids[0], split_ob_ids[1])
# (0.7449035812672178, 0.7727272727272727, (50, 5, 2, 5))

In [133]:
best_f1 = 0
best_result = []
for report in reports0:
    if report[0] > best_f1:
        best_f1 = report[0]
        best_result = report

In [134]:
best_result

(0.7449035812672178, 0.7727272727272727, (50, 5, 2, 5))

In [138]:
data_labels = merge_gaussian_component_labels(GM_comp_mahal_distances_4s, GMmodel_4s_labels, 3.6530612244897958)
test_classification(data_labels, shape_moments, split_ob_ids[0], split_ob_ids[2],
                                                    n_estimators=50,
                                                    max_depth=None, 
                                                    min_samples_split=2,
                                                    min_samples_leaf = 5)
# (0.6790571547768044, 0.7021276595744681)

(0.6790571547768044, 0.7021276595744681)

In [137]:
def test_classification(data_labels, data, train_set_ids, test_set_ids,
                                                    n_estimators=50,
                                                    max_depth=5, 
                                                    min_samples_split=15,
                                                    min_samples_leaf = 1):
    """
    prepare representations of observations based on their make up in terms of Gaussian mixture component contributions.
    test the representation as the feature set for classification task
    """
    
#     comp=GMmodel.means_.shape[0]
    
    # find GMM component labels for data
    data_GMMcomp_labels = data_labels
    
    # make a dict that groups indices of segments of the same observation 
    # i.e. where each observation id can be found in seg_ObIDs
    #i.e. ObID_SegIndices_dict == {'10258-01-01-00': [916, 949, 1046...467528, 467578], ....}
    ObID_SegIndices_dict = {key:[] for key in np.unique(seg_ObIDs)}
    for ID_index, ObID in enumerate(seg_ObIDs):
        ObID_SegIndices_dict.setdefault(ObID, []).append(ID_index)
    
    # make a dictionary of Gaussian component labels instead of segment indices  
    #i.e. ObID_GaussComps_dict_comp == {'10258-01-01-00': [401, 433, 382...101, 152], ....}
    ObID_GaussComps_dict_comp = {}
    for ObID, Indices in ObID_SegIndices_dict.items():
        ObID_GaussComps_dict_comp[ObID] = [data_GMMcomp_labels[ind] for ind in Indices]
        
    # make a data frame containing the counts of light curve segments in each of the Gaussian components, for each observation
    obs_component_counts_df_comp = pd.DataFrame(np.zeros((len(ObID_GaussComps_dict_comp),len(np.unique(data_GMMcomp_labels)))),
                                               index=np.unique(seg_ObIDs), columns=np.unique(data_GMMcomp_labels), dtype=int)
        
    # populate the data frame
    for ObID, GaussComps in ObID_GaussComps_dict_comp.items():
        for comp_id, comp_count in np.array(np.unique(GaussComps, return_counts=True)).T:
            obs_component_counts_df_comp.loc[ObID][comp_id] = comp_count
    
    
    obs_component_counts_df_comp = obs_component_counts_df_comp.iloc[:,:].div(np.sum(obs_component_counts_df_comp.iloc[:,:], axis=1), axis="rows") # normalise rows
    
    # add classification column
    obs_component_counts_df_comp["Class"] = "Unknown" 
    for k,v in ob_state.items():
        if v == "eta": v = "Unknown" ##################################### remove eta classifications, there are only two in the set of 1738 observations
        if str(k) in obs_component_counts_df_comp.index.values:
            obs_component_counts_df_comp.loc[str(k), "Class"] = v
            
    
    
    # training data
    train_data = obs_component_counts_df_comp.loc[train_set_ids].loc[obs_component_counts_df_comp.loc[train_set_ids].iloc[:,-1] != "Unknown"]  
    # validation data
    test_data = obs_component_counts_df_comp.loc[test_set_ids].loc[obs_component_counts_df_comp.loc[test_set_ids].iloc[:,-1] != "Unknown"]
            

    RF_clf = RandomForestClassifier(random_state=0,
                                    class_weight="balanced",
                                    n_estimators=n_estimators,
                                    max_depth=max_depth, 
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf = min_samples_leaf
                                   ).fit(train_data.iloc[:,:-1], train_data.iloc[:,-1])
    preds = RF_clf.predict(test_data.iloc[:,:-1])

    return (precision_recall_fscore_support(test_data.iloc[:,-1], preds, zero_division=0, average="weighted")[2],
                    accuracy_score(test_data.iloc[:,-1], preds))

In [153]:
# merge components within 3 sigma
couples_4s = np.array(np.where(((np.triu(GM_comp_mahal_distances_4s)<3)&(np.triu(GM_comp_mahal_distances_4s)>0))
      &(np.triu(GM_comp_mahal_distances_4s.T)<3)&(np.triu(GM_comp_mahal_distances_4s.T)>0))).T

In [154]:
l = [str(c) for c in list(range(GMmodel_4s.means_.shape[0]))]
for couple in couples_4s:
    l.append([str(c) for c in couple])
    
import networkx 
from networkx.algorithms.components.connected import connected_components


def to_graph(l):
    G = networkx.Graph()
    for part in l:
        # each sublist is a bunch of nodes
        G.add_nodes_from(part)
        # it also imlies a number of edges:
        G.add_edges_from(to_edges(part))
    return G

def to_edges(l):
    """ 
        treat `l` as a Graph and returns it's edges 
        to_edges(['a','b','c','d']) -> [(a,b), (b,c),(c,d)]
    """
    it = iter(l)
    last = next(it)

    for current in it:
        yield last, current
        last = current    

G = to_graph(l)
print(connected_components(G))

<generator object connected_components at 0x7fc3b8ee77b0>


In [156]:
data_labels = GMmodel_4s_labels
# merge components
for n_connection, connection in enumerate(connected_components(G)):
    node_indices=np.array([int(node) for node in connection])
    data_labels = np.where(np.isin(data_labels, node_indices), n_connection+np.unique(GMmodel_4s_labels).shape[0], data_labels)

In [87]:
data_labels

array([182,  76, 222, ..., 222, 222, 222])

In [105]:
with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)
seg_ObIDs = [seg.split("_")[0] for seg in seg_ids] # get rid of the within-observation segment indices and create a degenerate list of observation IDs

In [123]:
with open('{}/lightcurve1738_train70_val10_test20.pkl'.format(data_dir), 'rb') as f:
    split_ob_ids = pickle.load(f)

In [124]:
split_ob_ids

[array(['20402-01-22-00', '20187-02-01-01', '20187-02-01-00', ...,
        '94701-01-42-00', '30703-01-09-00', '90105-04-02-00'], dtype='<U15'),
 array(['20402-01-23-00', '40703-01-18-00', '40703-01-35-00',
        '20402-01-39-00', '20402-01-41-03', '10408-01-14-09',
        '10408-01-15-04', '20186-03-02-03', '40703-01-12-00',
        '20402-01-37-01', '10408-01-08-00', '10408-01-44-00',
        '40703-01-23-00', '20402-01-32-01', '10408-01-19-01',
        '30402-01-12-02', '20402-01-15-00', '30402-01-12-03',
        '30703-01-33-00', '20402-01-48-00', '20402-01-50-00',
        '40703-01-27-00', '70702-01-17-00', '80127-05-04-00',
        '90105-03-01-000', '93701-01-33-00', '95701-01-51-00',
        '93411-01-01-00', '50703-01-38-02', '91701-01-14-00',
        '70702-01-52-00', '50703-01-42-00', '90105-10-07-00',
        '80701-01-09-01', '80701-01-20-01', '90105-06-03-02',
        '91701-01-69-00', '60701-01-14-00', '50703-01-47-02',
        '50703-01-03-00', '90701-01-02-00', '901

In [138]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [157]:
reports = test_frankGMM_in_classification(data_labels, shape_moments, split_ob_ids[0], split_ob_ids[1])

1


In [158]:
reports

[((0.779018005448143, 0.8793103448275862, 0.8247804302976717, None),
  0.8793103448275862,
  (50, None, 2, 1))]

In [128]:
from  sklearn.ensemble import RandomForestClassifier

In [None]:
RandomForestClassifier()

In [145]:
def test_frankGMM_in_classification(data_labels, data, train_set_ids, val_set_ids):
#     comp=GMmodel.means_.shape[0]
    
    # find GMM component labels for data
    data_GMMcomp_labels = data_labels
    
    # make a dict that groups indices of segments of the same observation 
    # i.e. where each observation id can be found in seg_ObIDs
    #i.e. ObID_SegIndices_dict == {'10258-01-01-00': [916, 949, 1046...467528, 467578], ....}
    ObID_SegIndices_dict = {key:[] for key in np.unique(seg_ObIDs)}
    for ID_index, ObID in enumerate(seg_ObIDs):
        ObID_SegIndices_dict.setdefault(ObID, []).append(ID_index)
    
    # make a dictionary of Gaussian component labels instead of segment indices  
    #i.e. ObID_GaussComps_dict_500 == {'10258-01-01-00': [401, 433, 382...101, 152], ....}
    ObID_GaussComps_dict_comp = {}
    for ObID, Indices in ObID_SegIndices_dict.items():
        ObID_GaussComps_dict_comp[ObID] = [data_GMMcomp_labels[ind] for ind in Indices]
        
    # make a data frame containing the counts of light curve segments in each of the Gaussian components, for each observation
    obs_component_counts_df_comp = pd.DataFrame(np.zeros((len(ObID_GaussComps_dict_comp),len(np.unique(data_GMMcomp_labels)))),
                                               index=np.unique(seg_ObIDs), columns=np.unique(data_GMMcomp_labels), dtype=int)
        
    # populate the data frame
    for ObID, GaussComps in ObID_GaussComps_dict_comp.items():
        for comp_id, comp_count in np.array(np.unique(GaussComps, return_counts=True)).T:
            obs_component_counts_df_comp.loc[ObID][comp_id] = comp_count
    
    
    obs_component_counts_df_comp = obs_component_counts_df_comp.iloc[:,:].div(np.sum(obs_component_counts_df_comp.iloc[:,:], axis=1), axis="rows") # normalise rows
    
    # add classification column
    obs_component_counts_df_comp["Class"] = "Unknown" 
    for k,v in ob_state.items():
        if v == "eta": v = "Unknown" ##################################### remove eta classifications, there are only two in the set of 1738 observations
        if str(k) in obs_component_counts_df_comp.index.values:
            obs_component_counts_df_comp.loc[str(k), "Class"] = v
            
    #random forest hyperparameters
    n_estimators_list = [50, 100, 300, 800, 1200] # 100
    max_depth_list = [None, 5, 8, 15, 25] # None
    min_samples_split_list = [2, 5, 10, 15] # 2
    min_samples_leaf_list = [1, 2, 5, 10] # 1   
            
    reports = []
    
    for n_estimators in n_estimators_list:
        for max_depth in max_depth_list:
            for min_samples_split in min_samples_split_list:
                for min_samples_leaf in min_samples_leaf_list:
    
                    RF_clf = RandomForestClassifier(random_state=0, class_weight="balanced").fit(obs_component_counts_df_comp.loc[train_set_ids].iloc[:,:-1], obs_component_counts_df_comp.loc[train_set_ids].iloc[:,-1])
                    preds = RF_clf.predict(obs_component_counts_df_comp.loc[val_set_ids].iloc[:,:-1])

                    reports.append((precision_recall_fscore_support(obs_component_counts_df_comp.loc[val_set_ids].iloc[:,-1], preds, zero_division=0, average="weighted"),
                                    accuracy_score(obs_component_counts_df_comp.loc[val_set_ids].iloc[:,-1], preds),
                                   (n_estimators,max_depth,min_samples_split,min_samples_leaf)))
                
                    print(len(reports))
                    clear_output(wait=True)
                    return reports
            

    return reports

1 second cadence data
validation
f1 0.833
accuracy 0.864

test
f1 0.822
accuracy 0.851

4 second cadence data
valdiation
f1 0.782
accuracy 0.818

test
f1 0.679
accuracy 0.702