In [None]:
%load_ext autoreload
%autoreload 2
from IPython.display import clear_output
import os
import fnmatch
import numpy as np
import pickle
import matplotlib.pyplot as plt
import umap
from sklearn.mixture import GaussianMixture
from scipy import stats
# from sklearn.cluster import OPTICS

# import tensorflow as tf
# from tensorflow.keras import layers
# from tensorflow.keras.backend import mean
# from tensorflow.keras.backend import square
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import CuDNNLSTM
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import RepeatVector
# from tensorflow.keras.layers import TimeDistributed
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.callbacks import ModelCheckpoint
# from tensorflow.keras.layers import Flatten

# from tensorflow.keras.utils import Sequence
# from tensorflow.keras import Input
# from tensorflow.keras import Model
# from tensorflow.keras.layers import BatchNormalization
# from tensorflow.keras.layers import Conv1D
from scipy.stats import zscore

# Reset matplotlib to its defaults FIRST, then apply the notebook-wide overrides.
# (Resetting after the overrides would silently discard them.)
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams['figure.figsize'] = (5.0, 5.0)
plt.rcParams.update({'font.size': 12})

# Seed numpy's global RNG so stochastic steps that draw from it are repeatable.
np.random.seed(seed=11)


cwd = os.getcwd()

# Working directories located under "/export" use a different data directory.
# The length check avoids an IndexError when the path contains no "/" at all.
cwd_parts = cwd.split("/")
if len(cwd_parts) > 1 and cwd_parts[1] == "export":
    data_dir = "../../../files_from_snuffy"
else:
    data_dir = "../../../data_GRS1915"


In [None]:
# !for pid in `pgrep -f jupyter`; do { renice 5 $pid; }; done

In [None]:
# !for pid in `pgrep -f jupyter`; do { ps -u -p $pid; }; done

In [None]:
# !stat -c '%y' OPTICS_shape16_moments4_max_eps4_min_samp500_euclidean_alldata.pkl
# 2020-06-22 01:46:51.500058368 +0100


# Load the model (shape)

In [None]:
# Path to the trained weights file (the variable name says "dir", but it is a file).
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"

# NOTE: this cell needs `tf` and `layers`; the TensorFlow imports at the top of
# the notebook are commented out and must be re-enabled before running it.


class Sampling(layers.Layer):
    """Sample a latent vector z from the Gaussian described by (z_mean, z_log_var).

    Computes z = z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn
    with `tf.keras.backend.random_normal` and has the same (batch, dim) shape
    as z_mean. Adapted from
    https://www.tensorflow.org/guide/keras/custom_layers_and_models#putting_it_all_together_an_end-to-end_example
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


original_dim = 128      # samples per input segment
intermediate_dim = 512  # LSTM units
latent_dim = 16         # size of the latent vector

# Define encoder model.
original_inputs = tf.keras.Input(shape=(original_dim,1), name='encoder_input')
# Second model input; it is listed in the VAE inputs below but no layer consumes it.
# NOTE(review): presumably only needed by the training loss - confirm.
input_err = tf.keras.Input(shape=(original_dim,1))
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=False)(original_inputs)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs=z, name='encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.RepeatVector(original_dim)(latent_inputs)
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=True)(x)
outputs = layers.TimeDistributed(layers.Dense(1))(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

# Define VAE model.
outputs = decoder(z)
vae = tf.keras.Model(inputs=[original_inputs, input_err], outputs=outputs, name='vae')

vae.load_weights(weights_dir)

# Load data (segments)

In [None]:
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
errors_dir = '../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl'

# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(segments_dir, 'rb') as segments_file:
    segments = pickle.load(segments_file)
with open(errors_dir, 'rb') as errors_file:
    errors = pickle.load(errors_file)


# Scale the errors by each segment's standard deviation. This has to happen
# before `segments` is overwritten with its standardised version.
per_segment_std = np.std(segments, axis=1, keepdims=True)
errors = (errors / per_segment_std).astype(np.float32)
# Standardise every segment independently (z-score along axis 1).
segments = zscore(segments, axis=1).astype(np.float32)

# Try reconstruction (shape)

In [None]:
# Reconstruct the first `try_first_x` segments with the VAE for visual inspection.
try_first_x = 20
reconstructions = np.zeros((try_first_x, segments.shape[1]))

# A single batched predict call replaces one predict() call per segment.
n_tried = min(try_first_x, len(segments))
if n_tried:
    reconstructions[:n_tried] = vae.predict(
        [segments[:n_tried], errors[:n_tried]]
    ).reshape(n_tried, -1)

# The plotting cell below starts at the last reconstructed segment and steps backwards.
plot_index = try_first_x - 1

In [None]:
plt.rcParams.update({'figure.figsize': (10.0, 5.0), 'font.size': 8})

# 128 samples spread over 0-512 on the time axis.
time_axis = np.linspace(0, 512, 128)

# Input segment with its error bars, and the VAE reconstruction on top.
plt.errorbar(time_axis, segments[plot_index], yerr=errors[plot_index], ecolor="magenta")
plt.plot(time_axis, reconstructions[plot_index], color="orange")
plt.ylabel("Normalized count rate", fontsize=12)
plt.xlabel("Time (seconds)", fontsize=12)
plt.show()

# Re-running this cell steps one segment back each time until index 0 is reached.
plot_index = plot_index - 1 if plot_index > 0 else plot_index

# Encode the data set (shape)

In [None]:
# Sub-model mapping the VAE inputs to the two latent-parameter layers.
trained_encoder = tf.keras.Model(inputs=vae.input, outputs=[vae.get_layer("z_mean").output, vae.get_layer("z_log_var").output])

# One batched predict call replaces a Python loop that called predict() once per
# segment and printed/cleared a counter on every iteration.
z_mean_all, z_log_var_all = trained_encoder.predict([segments, errors], batch_size=1024, verbose=1)

# Layout (n_segments, 2, latent_dim): index 0 on axis 1 holds z_mean, index 1 z_log_var.
# float64 matches the dtype of the zero-initialised array the loop used to fill.
segment_encoding = np.stack([z_mean_all, z_log_var_all], axis=1).astype(np.float64)
    

In [None]:
# Output file name is derived from the weights file name and the segments file name.
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

# Writing is opt-in so an existing encoding on disk is not overwritten by accident.
SAVE_SEGMENT_ENCODING = False
if SAVE_SEGMENT_ENCODING:
    with open(segment_encoding_dir, 'wb') as f:
        pickle.dump(segment_encoding, f)
    print("Encodings saved to: ", segment_encoding_dir)
else:
    # Report the truth: nothing was written in this run.
    print("Encodings NOT saved (SAVE_SEGMENT_ENCODING is False); target path: ", segment_encoding_dir)

# PCA the encoded data (shape)

In [None]:
# Rebuild the encoding file path so this section can run without the cells above.
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
weights_tag = weights_dir.split("/")[-1].split(".")[0]
segments_tag = segments_dir.split("/")[-1].split(".")[0]
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_tag, segments_tag)

# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(segment_encoding_dir, 'rb') as encoding_file:
    segment_encoding = pickle.load(encoding_file)

In [None]:
from sklearn.decomposition import IncrementalPCA, PCA

# None leaves the number of components to PCA; alternative: segment_encoding.shape[-1]
n_components = None
# Index 0 on axis 1 is fitted here (the z_mean slot when the encoding cell above produced the file).
latent_means = segment_encoding[:, 0, :]

# ipca = IncrementalPCA(n_components=n_components, batch_size=468202)
pca = PCA(n_components=n_components)

# X_ipca = ipca.fit(segment_encoding[:,0,:])
# fit() returns the fitted estimator, so X_pca refers to the same object as pca.
X_pca = pca.fit(latent_means)

In [None]:
# X_transformed = X_ipca.transform(segment_encoding[:,0,:])
# Project index 0 of axis 1 of the encoding onto the fitted principal components.
latent_means = segment_encoding[:, 0, :]
X_transformed = X_pca.transform(latent_means)

In [None]:
# Probe points: for each of the leading principal components, split the range of
# PC scores into equal-width bins and take the centroid of the segments in each bin.
N_PROBE_PCS = 5     # first 5 components give 78% of explained variance, next one is 4.3%
N_PROBE_BINS = 10

# Shape (PCs, bins, latent coordinates); bins that contain no segments stay NaN.
probe_points = np.full((N_PROBE_PCS, N_PROBE_BINS, pca.components_.shape[1]), np.nan)

for PC in range(min(N_PROBE_PCS, X_transformed.shape[1])):
    scores = X_transformed[:, PC]
    bins = np.linspace(np.min(scores), np.max(scores), N_PROBE_BINS + 1)
    # Digitising against the interior edges only yields indices 0..N_PROBE_BINS-1,
    # so the maximum score lands in the last bin instead of an extra overflow bin.
    binned_indices = np.digitize(scores, bins[1:-1])
    for bin_index in range(N_PROBE_BINS):
        bin_segment_indices = np.where(binned_indices == bin_index)[0]
        if bin_segment_indices.size == 0:
            continue
        centroid_pca = np.mean(X_transformed[bin_segment_indices], axis=0)
        # The decoder and the UMAP mapper used further down work on latent
        # coordinates, not PCA coordinates, so map the centroid back first.
        probe_points[PC, bin_index, :] = pca.inverse_transform(centroid_pca[np.newaxis, :])[0]
    

In [None]:
# NOTE: `umaped_data` temporarily refers to the PCA projection here.
umaped_data = X_transformed

# Figure size follows the data extent (plus a 0.5 margin on each side) of the first two columns.
x_extent = abs((np.min(umaped_data[:,0])-0.5) - (np.max(umaped_data[:,0])+0.5))
y_extent = abs((np.min(umaped_data[:,1])-0.5) - (np.max(umaped_data[:,1])+0.5))
plt.rcParams['figure.figsize'] = (x_extent, y_extent)
plt.scatter(X_transformed[:,0], X_transformed[:,1], s=0.05)

# UMAP the encoded data (shape)

In [None]:
# Reload the encoding from disk (pickle.load can execute arbitrary code; trusted files only).
with open(segment_encoding_dir, 'rb') as encoding_file:
    segment_encoding = pickle.load(encoding_file)

In [None]:
# Default UMAP settings; options tried: n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper = umap.UMAP(verbose=True)
# Fit on index 0 of axis 1 for the first 100000 segments only.
umap_training_subset = segment_encoding[:100000, 0, :]
UMAP_mapper.fit(umap_training_subset)


In [None]:
# with open('../../../data_GRS1915/fast_UMAPmapper_means_{}.pkl'.format(segment_encoding_dir.split("/")[-1]), 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

In [None]:
# Load a previously pickled mapper; this replaces any mapper fitted in the cells above.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
umap_mapper_path = '../../../data_GRS1915/fast_UMAPmapper_means_{}.pkl'.format(segment_encoding_dir.split("/")[-1])
with open(umap_mapper_path, 'rb') as mapper_file:
    UMAP_mapper = pickle.load(mapper_file)

In [None]:
# Embed index 0 of axis 1 of the encoding for every segment with the UMAP mapper.
umaped_data = UMAP_mapper.transform(segment_encoding[:,0,:])

In [None]:
# Embed the probe points of each principal component with the UMAP mapper.
# Resulting shape: (5 PCs, 10 points, 2 embedding coordinates), stored as float64.
encoded_probes = np.zeros((5,10,2))
encoded_probes[:] = [UMAP_mapper.transform(pc_probes) for pc_probes in probe_points]

In [None]:
# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

# Figure size equals the padded extent.
plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

In [None]:
# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05, c="magenta")
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])

# Draw the embedded probe points of the first four principal components as lines.
for pc in encoded_probes[:4]:
    plt.plot(pc[:,0], pc[:,1])

plt.show()

# Generate synthetic samples along principal components (shape)

In [None]:
# Path to the trained weights file (the variable name says "dir", but it is a file).
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"

# NOTE: this cell needs `tf` and `layers`; the TensorFlow imports at the top of
# the notebook are commented out and must be re-enabled before running it.


class Sampling(layers.Layer):
    """Sample a latent vector z from the Gaussian described by (z_mean, z_log_var).

    Computes z = z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn
    with `tf.keras.backend.random_normal` and has the same (batch, dim) shape
    as z_mean. Adapted from
    https://www.tensorflow.org/guide/keras/custom_layers_and_models#putting_it_all_together_an_end-to-end_example
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


original_dim = 128      # samples per input segment
intermediate_dim = 512  # LSTM units
latent_dim = 16         # size of the latent vector

# Define encoder model.
original_inputs = tf.keras.Input(shape=(original_dim,1), name='encoder_input')
# Second model input; it is listed in the VAE inputs below but no layer consumes it.
# NOTE(review): presumably only needed by the training loss - confirm.
input_err = tf.keras.Input(shape=(original_dim,1))
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=False)(original_inputs)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs=z, name='encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.RepeatVector(original_dim)(latent_inputs)
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=True)(x)
outputs = layers.TimeDistributed(layers.Dense(1))(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

# Define VAE model.
outputs = decoder(z)
vae = tf.keras.Model(inputs=[original_inputs, input_err], outputs=outputs, name='vae')

vae.load_weights(weights_dir)

In [None]:
# Show previously generated samples, if any. On a fresh kernel the name does not
# exist yet, and a bare reference would raise NameError; .get() displays nothing instead.
globals().get("generated_samples")

In [None]:
# Stand-alone decoder taken from the loaded VAE: latent vector in, series out.
decoder_layer = vae.get_layer("decoder")
trained_decoder = tf.keras.Model(inputs=decoder_layer.input, outputs=decoder_layer.output)

# Decode every probe point in one batched predict call instead of calling
# predict() (and printing/clearing a counter) once per point.
n_pcs, n_points, n_latent = probe_points.shape
decoded_probes = trained_decoder.predict(probe_points.reshape(n_pcs * n_points, n_latent))

# Shape (PCs, points, series length); float64 matches the array the loop used to fill.
generated_samples = decoded_probes.reshape(n_pcs, n_points, -1).astype(np.float64)


In [None]:
plt.rcParams.update({'figure.figsize': (10.0, 5.0), 'font.size': 8})
plot_index = 0
PC_ind = 4  # zero-based index of the principal component to plot
# plt.errorbar(np.linspace(0,512, 128), segments[plot_index], yerr=errors[plot_index], ecolor="magenta")

time_axis = np.linspace(0, 512, 128)

# Stack the generated series vertically: every series after the first is shifted
# by the running sum of the value ranges (max - min) of the series drawn so far,
# starting with the second one.
offset = 0
for n_lc, lc in enumerate(generated_samples[PC_ind]):
    if n_lc > 0:
        offset += np.max(lc) - np.min(lc)
    plt.plot(time_axis, lc + offset)
# plt.ylabel("Normalized count rate", fontsize=12)
plt.xlabel("Time (seconds)", fontsize=12)
plt.title("Series generated along Principal Component {}".format(PC_ind+1), fontsize=15)
plt.show()
# plot_index is 0 here, so there is nothing to step back to.

# Cluster the latent space (shape)

In [None]:
from sklearn.mixture import GaussianMixture

# 20-component, full-covariance Gaussian mixture fitted on index 0 of axis 1
# of the encoding, using the first 100000 segments only.
gmm_training_subset = segment_encoding[:100000, 0, :]
clf = GaussianMixture(n_components=20, covariance_type='full', verbose=1)
clf.fit(gmm_training_subset)
# aics[n_comps] = clf.aic(segment_encoding[:,0,:])
# print(n_comps)

In [None]:
# Assign every segment to a mixture component.
# NOTE: the name is misspelled ("custer"), but later cells read it under this exact name.
custer_labels = clf.predict(segment_encoding[:,0,:])

In [None]:
# Embed the mixture component means with the UMAP mapper so they can be drawn on the UMAP plot.
transformed_means = UMAP_mapper.transform(clf.means_)

In [None]:
# UMAP embedding coloured by mixture component, with selected component means on top.
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05, c=custer_labels)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])

# Only the means from index 15 onwards are drawn. Enumerating from that index
# makes each legend label equal to the component's real index (the value found
# in `custer_labels`) rather than restarting at 0.
first_shown_component = 15
for mean_n, mean in enumerate(transformed_means[first_shown_component:], start=first_shown_component):
    plt.scatter(mean[0], mean[1], s=300, label=mean_n, edgecolors="white")
plt.legend()
plt.show()

# Generate synthetic samples from Gaussian centroids (shape)

In [None]:
# Path to the trained weights file (the variable name says "dir", but it is a file).
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"

# NOTE: this cell needs `tf` and `layers`; the TensorFlow imports at the top of
# the notebook are commented out and must be re-enabled before running it.


class Sampling(layers.Layer):
    """Sample a latent vector z from the Gaussian described by (z_mean, z_log_var).

    Computes z = z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn
    with `tf.keras.backend.random_normal` and has the same (batch, dim) shape
    as z_mean. Adapted from
    https://www.tensorflow.org/guide/keras/custom_layers_and_models#putting_it_all_together_an_end-to-end_example
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


original_dim = 128      # samples per input segment
intermediate_dim = 512  # LSTM units
latent_dim = 16         # size of the latent vector

# Define encoder model.
original_inputs = tf.keras.Input(shape=(original_dim,1), name='encoder_input')
# Second model input; it is listed in the VAE inputs below but no layer consumes it.
# NOTE(review): presumably only needed by the training loss - confirm.
input_err = tf.keras.Input(shape=(original_dim,1))
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=False)(original_inputs)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs=z, name='encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.RepeatVector(original_dim)(latent_inputs)
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=True)(x)
outputs = layers.TimeDistributed(layers.Dense(1))(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

# Define VAE model.
outputs = decoder(z)
vae = tf.keras.Model(inputs=[original_inputs, input_err], outputs=outputs, name='vae')

vae.load_weights(weights_dir)

In [None]:
# Stand-alone model built from the input and output tensors of the VAE's "decoder" layer.
trained_decoder = tf.keras.Model(inputs=vae.get_layer("decoder").input, outputs=vae.get_layer("decoder").output)

In [None]:
# Decode the mixture means from index 15 onwards into synthetic series.
gmm_means_to_decode = clf.means_[15:]
# Derived from the slice, so it can no longer drift out of sync with it.
no_samples = len(gmm_means_to_decode)

# One batched predict call instead of one call (plus print/clear) per mean.
# Shape (no_samples, series length); float64 matches the array the loop used to fill.
generated_samples = trained_decoder.predict(gmm_means_to_decode).reshape(no_samples, -1).astype(np.float64)


In [None]:
plt.rcParams.update({'figure.figsize': (10.0, 5.0), 'font.size': 8})
plot_index = 0
PC_ind = 4
# plt.errorbar(np.linspace(0,512, 128), segments[plot_index], yerr=errors[plot_index], ecolor="magenta")

time_axis = np.linspace(0, 512, 128)

# Stack the series vertically: each series after the first raises the running
# offset by twice its own value range (max - min).
offset = 0
for n_lc, lc in enumerate(generated_samples):
    if n_lc > 0:
        lc_range = np.max(lc) - np.min(lc)
        offset += lc_range + lc_range
    plt.plot(time_axis, lc + offset)
# plt.ylabel("Normalized count rate", fontsize=12)
plt.xlabel("Time (seconds)", fontsize=12)
plt.title("Series generated from Gaussian mixture means", fontsize=15)
plt.show()
# plot_index is 0 here, so there is nothing to step back to.

# Overplot UMAP with the classified data (shape)

In [None]:
# Load observation classifications from Huppenkothen 2017.
# %matplotlib inline

# import matplotlib.pyplot as plt


# First line of the file: state names. Each following line: the observation IDs
# of the state at the same position. `with` closes the file handle after reading.
with open('../../../data_GRS1915/1915Belloniclass_updated.dat') as clean_belloni:
    lines = clean_belloni.readlines()
states = lines[0].split()
# state: obsID1, obsID2...
belloni_clean = {h: l.split() for h, l in zip(states, lines[1:])}

# Invert to observation ID -> state, merging chi1..chi4 into a single "chi" class.
ob_state = {}
for state, obs in belloni_clean.items():
    if state in ("chi1", "chi2", "chi3", "chi4"):
        state = "chi"
    for ob in obs:
        ob_state[ob] = state

# load segmented light curves
# NOTE: this rebinds `segments` to the data as stored on disk (no standardisation applied here).
# NOTE: pickle.load can execute arbitrary code; only open files you trust.

import pickle
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    seg_ids = pickle.load(f)

# # HF QPO observation ids
# paper_obIDs = np.loadtxt("../../../data_GRS1915/Belloni_Altamirano_obsIDs.txt", dtype=str)

# qpo_colours = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_colours.append("red")
#     else:
#         qpo_colours.append("grey")

# qpo_labels = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_labels.append("QPO")
#     else:
#         qpo_labels.append("other")


# qpo_scales = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_scales.append("QPO")
#     else:
#         qpo_scales.append("other")


# Observation ID of every segment: the part of the segment ID before the first "_".
xxx = [seg.split("_")[0] for seg in seg_ids]

classes = np.array(["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"])
# Dictionary lookup instead of one np.where scan of `classes` per segment.
class_position = {class_name: position for position, class_name in enumerate(classes)}

# Per segment: class index (15 = unclassified), class name and marker scale.
class_colour = []
scales = []
segment_class = []
for ob in xxx:
    if ob in ob_state:
        class_colour.append(class_position[ob_state[ob]])
        segment_class.append(ob_state[ob])
        scales.append(5)
    else:
        class_colour.append(15)
        segment_class.append("Unknown")
        scales.append(0.1)


from matplotlib import cm
# A discarded `cm.get_cmap(plt.get_cmap("Set1"))` call was removed here: its
# result was never used, and `matplotlib.cm.get_cmap` no longer exists in
# Matplotlib >= 3.9, where the call raises AttributeError.


colours = ['#ffd8b1', '#000075', '#808080', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#000000']

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# sns.set_style("white")
plt.rcParams['figure.figsize'] = (30.0, 30.0)
plt.rcParams.update({'font.size': 0})

embeddings_lap = umaped_data

# fig, ax = plt.subplots()

# One panel per class on a 4x4 grid (the last two panels stay empty for 14 classes).
fig, axs = plt.subplots(4, 4)
axs = axs.flatten()

# Build the label array and the "Unknown" background once, not on every loop pass.
segment_class_array = np.array(segment_class)
unknown_data = embeddings_lap[segment_class_array == "Unknown"]

for plot_class_ind, plot_class in enumerate(classes):
    ax = axs[plot_class_ind]
    # Grey background: segments without a classification.
    ax.scatter(unknown_data[:,0], unknown_data[:,1], s = 0.2, c="grey", label="Unknown")

    # Red foreground: segments belonging to this panel's class.
    class_data = embeddings_lap[segment_class_array == plot_class]
    ax.scatter(class_data[:,0], class_data[:,1], s = 25, c='red', label=plot_class)

# plt.legend()
    ax.set_title("{}".format(plot_class), fontsize=42)
# plt.savefig("classes_separate.png")

# plt.savefig("UMAP_embedding_separate_classes_model_2020-02-09_10-36-06.png")
plt.show()


# redint = np.where(np.array(qpo_colours) == "red")
# greyint= np.where(np.array(qpo_colours) != "red")
# plt.scatter(embeddings_lap[:,0][greyint], embeddings_lap[:,1][greyint], s=1, c="grey", label= "other")
# plt.scatter(embeddings_lap[:,0][redint], embeddings_lap[:,1][redint], s=1, c="red", label= "HF QPO")
# plt.title("UMAP embedding of the encoded GRS1915 segments, neighbors=50, min_dist=0.0, components=2", fontsize=12)
# plt.legend()
# plt.show()

# Histogram reconstruction

In [None]:
# Weights file of the histogram model and the pickled histogram data it is applied to.
# NOTE: both names end in "_dir" but hold file paths.
weights_dir = "../../../model_weights/model_2020-04-29_13-04-35.h5" #24>16>24
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'

In [None]:
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(segments_dir, 'rb') as f:
    segments = pickle.load(f)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)

# errors = ((errors)/np.std(segments)).astype(np.float32)
# axis=None: one mean and one standard deviation for the whole array.
segments = zscore(segments, axis=None).astype(np.float32)  # standardize

# NOTE: the model below needs `tf` and `layers`; the TensorFlow imports at the top
# of the notebook are commented out and must be re-enabled before running it.


class Sampling(layers.Layer):
    """Sample a latent vector z from the Gaussian described by (z_mean, z_log_var).

    Computes z = z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn
    with `tf.keras.backend.random_normal` and has the same (batch, dim) shape
    as z_mean. Adapted from
    https://www.tensorflow.org/guide/keras/custom_layers_and_models#putting_it_all_together_an_end-to-end_example
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

original_dim = 24      # histogram bins per segment
intermediate_dim = 64  # LSTM units
latent_dim = 16        # size of the latent vector

# Define encoder model.
original_inputs = tf.keras.Input(shape=(original_dim,1), name='encoder_input')
# Defined for parity with the shape model, but not part of this VAE's inputs.
input_err = tf.keras.Input(shape=(original_dim,1))
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=False)(original_inputs)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs=z, name='encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.RepeatVector(original_dim)(latent_inputs)
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=True)(x)
outputs = layers.TimeDistributed(layers.Dense(1))(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

# Define VAE model (single input: the histogram).
outputs = decoder(z)
vae = tf.keras.Model(inputs=original_inputs, outputs=outputs, name='vae')


vae.load_weights(weights_dir)

In [None]:
# Reconstruct the first `try_first_x` histograms with the VAE for visual inspection.
try_first_x = 200
hist_reconstructions = np.zeros((try_first_x, segments.shape[1]))

# A single batched predict call replaces one predict() call per histogram.
n_tried = min(try_first_x, len(segments))
if n_tried:
    hist_reconstructions[:n_tried] = vae.predict(segments[:n_tried]).reshape(n_tried, -1)

# The plotting cell below starts at the last reconstructed histogram and steps backwards.
plot_index = try_first_x - 1

In [None]:
# Input histogram and its reconstruction on the same axes.
for series in (segments[plot_index], hist_reconstructions[plot_index]):
    plt.plot(series)
plt.show()

# Re-running this cell steps one histogram back each time until index 0 is reached.
plot_index = plot_index - 1 if plot_index > 0 else plot_index

# Encode the data set (histograms)

In [None]:
# Sub-model mapping the VAE input to the two latent-parameter layers.
trained_encoder = tf.keras.Model(inputs=vae.input, outputs=[vae.get_layer("z_mean").output, vae.get_layer("z_log_var").output])

# One batched predict call replaces a Python loop that called predict() once per
# histogram and printed/cleared a counter on every iteration.
z_mean_all, z_log_var_all = trained_encoder.predict(segments, batch_size=1024, verbose=1)

# Layout (n_segments, 2, latent_dim): index 0 on axis 1 holds z_mean, index 1 z_log_var.
# float64 matches the dtype of the zero-initialised array the loop used to fill.
segment_encoding = np.stack([z_mean_all, z_log_var_all], axis=1).astype(np.float64)
    

In [None]:
weights_dir = "../../../model_weights/model_2020-04-29_13-04-35.h5" #24>16>24

segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
# Output file name is derived from the weights file name and the histogram file name.
segment_encoding_dir = '../../../data_GRS1915/histogram_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

# Writing is opt-in so an existing encoding on disk is not overwritten by accident.
SAVE_HISTOGRAM_ENCODING = False
if SAVE_HISTOGRAM_ENCODING:
    with open(segment_encoding_dir, 'wb') as f:
        pickle.dump(segment_encoding, f)
    print("Encodings saved to: ", segment_encoding_dir)
else:
    # Report the truth: nothing was written in this run.
    print("Encodings NOT saved (SAVE_HISTOGRAM_ENCODING is False); target path: ", segment_encoding_dir)

# UMAP the encoded data (histograms)

In [None]:
# Load the stored encoding (pickle.load can execute arbitrary code; trusted files only).
with open(segment_encoding_dir, 'rb') as encoding_file:
    segment_encoding = pickle.load(encoding_file)

In [None]:
# Reload the histograms (pickle.load can execute arbitrary code; trusted files only).
with open(segments_dir, 'rb') as histogram_file:
    segments = pickle.load(histogram_file)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)

# errors = ((errors)/np.std(segments)).astype(np.float32)
# Standardise with a single mean and standard deviation for the whole array (axis=None).
segments_standardised = zscore(segments, axis=None)
segments = segments_standardised.astype(np.float32)

In [None]:
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0
UMAP_mapper.fit(segment_encoding[:100000,:,0])

In [None]:
umaped_data = UMAP_mapper.transform(segment_encoding[:,:,0])

In [None]:
# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

# Figure size equals the padded extent.
plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

# Cluster the latent space (histograms)

In [None]:
from sklearn.mixture import GaussianMixture

# 100-component, full-covariance Gaussian mixture fitted on index 0 of axis 1
# of the encoding, using the first 100000 segments only.
gmm_training_subset = segment_encoding[:100000, 0, :]
clf = GaussianMixture(n_components=100, covariance_type='full', verbose=1)
clf.fit(gmm_training_subset)
# aics[n_comps] = clf.aic(segment_encoding[:,0,:])
# print(n_comps)

In [None]:
# Assign every segment to a mixture component.
# NOTE: the name is misspelled ("custer"), but later cells read it under this exact name.
custer_labels = clf.predict(segment_encoding[:,0,:])

In [None]:
# Embed the mixture means with the UMAP mapper.
# NOTE: this only works if the mapper was fitted on data with the same number of columns as clf.means_.
transformed_means = UMAP_mapper.transform(clf.means_)

In [None]:
# Colours of matplotlib's current property cycle.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
# Components 0-9 get the cycle colour at their index; all other components are drawn grey.
color_matched_9 = [colors[x] if not x>9 else "#a8a495" for x in custer_labels]
# color_matched_19= ["#a8a495" if x<=9 else colors[x-10] for x in custer_labels]

In [None]:
# Colours of matplotlib's current property cycle.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# Components 0-9 get the cycle colour at their index; all other components are drawn grey.
color_matched_9 = [colors[x] if not x>9 else "#a8a495" for x in custer_labels]
# color_matched_19= ["#a8a495" if x<=9 else colors[x-10] for x in custer_labels]

# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05, c=color_matched_9)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
# Large markers for the embedded means of the first ten components, labelled by index.
for mean_n, mean in enumerate(transformed_means[:10]):
    plt.scatter(mean[0], mean[1], s=300, label=mean_n, edgecolors="white")
plt.legend()
plt.show()

# Generate synthetic samples (histograms)

In [None]:
# Path to the trained weights file (the variable name says "dir", but it is a file).
weights_dir = "../../../model_weights/model_2020-04-29_13-04-35.h5" #24>16>24

# NOTE: this cell needs `tf` and `layers`; the TensorFlow imports at the top of
# the notebook are commented out and must be re-enabled before running it.


class Sampling(layers.Layer):
    """Sample a latent vector z from the Gaussian described by (z_mean, z_log_var).

    Computes z = z_mean + exp(0.5 * z_log_var) * epsilon, where epsilon is drawn
    with `tf.keras.backend.random_normal` and has the same (batch, dim) shape
    as z_mean. Adapted from
    https://www.tensorflow.org/guide/keras/custom_layers_and_models#putting_it_all_together_an_end-to-end_example
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon


original_dim = 24      # histogram bins per segment
intermediate_dim = 64  # LSTM units
latent_dim = 16        # size of the latent vector

# Define encoder model.
original_inputs = tf.keras.Input(shape=(original_dim,1), name='encoder_input')
# Second model input; it is listed in the VAE inputs below but no layer consumes it.
input_err = tf.keras.Input(shape=(original_dim,1))
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=False)(original_inputs)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
z = Sampling()((z_mean, z_log_var))
encoder = tf.keras.Model(inputs=original_inputs, outputs=z, name='encoder')

# Define decoder model.
latent_inputs = tf.keras.Input(shape=(latent_dim,), name='z_sampling')
x = layers.RepeatVector(original_dim)(latent_inputs)
x = layers.CuDNNLSTM(intermediate_dim, return_sequences=True)(x)
outputs = layers.TimeDistributed(layers.Dense(1))(x)
decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

# Define VAE model.
# NOTE(review): the histogram VAE built in the "Histogram reconstruction" section
# takes `original_inputs` only; here `input_err` is an additional input - confirm
# which variant matches the saved weights.
outputs = decoder(z)
vae = tf.keras.Model(inputs=[original_inputs, input_err], outputs=outputs, name='vae')

vae.load_weights(weights_dir)

In [None]:
# Stand-alone model built from the input and output tensors of the VAE's "decoder" layer.
trained_decoder = tf.keras.Model(inputs=vae.get_layer("decoder").input, outputs=vae.get_layer("decoder").output)

In [None]:
# Decode the first ten mixture means into synthetic histograms.
gmm_means_to_decode = clf.means_[:10]
# Derived from the slice, so it can no longer drift out of sync with it.
no_samples = len(gmm_means_to_decode)

# One batched predict call instead of one call (plus print/clear) per mean.
# Shape (no_samples, histogram length); float64 matches the array the loop used to fill.
generated_samples = trained_decoder.predict(gmm_means_to_decode).reshape(no_samples, -1).astype(np.float64)


In [None]:
plt.rcParams.update({'figure.figsize': (10.0, 5.0), 'font.size': 8})
plot_index = 0
PC_ind = 4
# plt.errorbar(np.linspace(0,512, 128), segments[plot_index], yerr=errors[plot_index], ecolor="magenta")

# 24 points spread over 0-13100 on the x axis.
bin_axis = np.linspace(0, 13100, num=24)

# Stack the series vertically: each one sits above the previous series by that
# previous series' value range (max - min) plus 1.
offset = 0
for n_lc, lc in enumerate(generated_samples):
    if n_lc > 0:
        previous = generated_samples[n_lc-1]
        offset += np.max(previous) - np.min(previous) + 1
    plt.plot(bin_axis, lc + offset)
# plt.ylabel("Normalized count rate", fontsize=12)
plt.xlabel("Count rate bins", fontsize=12)
plt.title("Series generated from Gaussian mixture means", fontsize=15)
plt.show()
# plot_index is 0 here, so there is nothing to step back to.

# UMAP the raw histogram data

In [None]:
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(segments_dir, 'rb') as histogram_file:
    segments = pickle.load(histogram_file)
# Standardise with a single mean and standard deviation for the whole array (axis=None).
segments_standardised = zscore(segments, axis=None)
segments = segments_standardised.astype(np.float32)

In [None]:
# Default UMAP settings; options tried: n_neighbors=50, min_dist=0.0
UMAP_mapper = umap.UMAP(verbose=True)
# Index 0 on the last axis turns the 3-D array into a 2-D (segments, bins) matrix.
raw_histograms_2d = segments[:, :, 0]
UMAP_mapper.fit(raw_histograms_2d)

In [None]:
# with open('../../../data_GRS1915/UMAPmapper_raw_histograms_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

In [None]:
# Embed every raw histogram (index 0 on the last axis) with the UMAP mapper.
umaped_data = UMAP_mapper.transform(segments[:,:,0])

In [None]:
# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

# Figure size equals the padded extent.
plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

# Cluster the raw histograms

In [None]:
from sklearn.mixture import GaussianMixture

# 200-component, full-covariance Gaussian mixture fitted on the first 10000 raw
# histograms (index 0 on the last axis).
gmm_training_subset = segments[:10000, :, 0]
clf = GaussianMixture(n_components=200, covariance_type='full', verbose=1)
clf.fit(gmm_training_subset)
# aics[n_comps] = clf.aic(segment_encoding[:,0,:])
# print(n_comps)

In [None]:
# Display the dimensions of the histogram array as a sanity check.
segments.shape

In [None]:
# Assign every raw histogram to a mixture component (the next cell repeats this call).
# NOTE: the name is misspelled ("custer"), but later cells read it under this exact name.
custer_labels = clf.predict(segments[:,:,0])

In [None]:
# Component label for every raw histogram, and the mixture means embedded with UMAP.
raw_histograms_2d = segments[:, :, 0]
custer_labels = clf.predict(raw_histograms_2d)
transformed_means = UMAP_mapper.transform(clf.means_)

In [None]:
# Colours of matplotlib's current property cycle.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
# Components 0-9 get the cycle colour at their index; all other components are drawn grey.
color_matched_9 = [colors[x] if not x>9 else "#a8a495" for x in custer_labels]
# color_matched_19= ["#a8a495" if x<=9 else colors[x-10] for x in custer_labels]

In [None]:
# Colours of matplotlib's current property cycle.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
# Point colours come from `color_matched_9`, computed in the previous cell.
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05, c=color_matched_9)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
# Large markers for the embedded means of the first ten components, labelled by index.
for mean_n, mean in enumerate(transformed_means[:10]):
    plt.scatter(mean[0], mean[1], s=300, label=mean_n, edgecolors="white")
plt.legend()
plt.show()

In [None]:
plt.rcParams.update({'figure.figsize': (10.0, 5.0), 'font.size': 8})
plot_index = 0
PC_ind = 4
# plt.errorbar(np.linspace(0,512, 128), segments[plot_index], yerr=errors[plot_index], ecolor="magenta")

# 24 points spread over 0-13100 on the x axis.
bin_axis = np.linspace(0, 13100, num=24)

# Plot the first ten mixture means, stacked vertically: each one sits above the
# previous mean by that previous mean's value range (max - min) plus 1.
offset = 0
for n_lc, lc in enumerate(clf.means_[:10]):
    if n_lc > 0:
        previous = clf.means_[n_lc-1]
        offset += np.max(previous) - np.min(previous) + 1
    plt.plot(bin_axis, lc + offset)
# plt.ylabel("Normalized count rate", fontsize=12)
plt.xlabel("Count rate bins", fontsize=12)
plt.title("Series generated from Gaussian mixture means", fontsize=15)
plt.show()
# plot_index is 0 here, so there is nothing to step back to.

# UMAP Shape and intensity combination (16 shape latent variables and 24 histogram bin populations) 

In [None]:
# Intensity features: the histograms, standardised with one global mean/std.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
hist_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
with open(hist_dir, 'rb') as histogram_file:
    hists = pickle.load(histogram_file)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)

# errors = ((errors)/np.std(segments)).astype(np.float32)
hists_2d = hists[:, :, 0]
hists = zscore(hists_2d, axis=None).astype(np.float32)  # standardize


# Shape features: index 0 on axis 1 of the stored segment encoding, standardised the same way.
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
weights_tag = weights_dir.split("/")[-1].split(".")[0]
segments_tag = segments_dir.split("/")[-1].split(".")[0]
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_tag, segments_tag)

with open(segment_encoding_dir, 'rb') as encoding_file:
    segment_encoding = pickle.load(encoding_file)

latent_means = segment_encoding[:, 0, :]
segment_encoding_scaled_means = zscore(latent_means, axis=None).astype(np.float32)  # standardize


In [None]:
# Shape-and-intensity matrix: shape features followed by histogram features, column-wise.
SAI_data = np.concatenate((segment_encoding_scaled_means, hists), axis=1)
SAI_data.shape

In [None]:
# Default UMAP settings; options tried: n_neighbors=50, min_dist=0.0
UMAP_mapper = umap.UMAP(verbose=True)
# Fit on the first 50000 rows only, then embed every row.
umap_training_subset = SAI_data[:50000, :]
UMAP_mapper.fit(umap_training_subset)
umaped_data = UMAP_mapper.transform(SAI_data)

In [None]:
# Axis limits: data extent of the embedding padded by 0.5 on every side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

# Figure size equals the padded extent.
plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

# Cluster the shape and intensity features

In [None]:
from sklearn.mixture import GaussianMixture

# Row 0 holds AIC and row 1 holds BIC; one column per candidate component count.
criteria111_114 = np.zeros((2, 4))
gmm_fit_subset = SAI_data[:50000, :]  # models are fitted on the first 50k rows only

for ind, n_components in enumerate(range(111, 115)):
    clf = GaussianMixture(n_components=n_components, covariance_type='full', verbose=1).fit(gmm_fit_subset)

#     with open('../../../data_GRS1915/SAI_GM_components{}_fitto50k_v2.pkl'.format(n_components), 'wb') as f:
#         pickle.dump(clf, f)

    # The criteria are evaluated on the full data set, not just the fitted subset.
    criteria111_114[:, ind] = (clf.aic(SAI_data), clf.bic(SAI_data))
    print(n_components, criteria111_114[:, ind])

In [None]:
plt.rcParams['figure.figsize'] = [20, 10]

# Component counts on the x axis; criteria4 is expected to hold AIC in row 0
# and BIC in row 1, one column per count.
component_counts = [5,10,20,30,50,75,100,110,120,130,140,150,160,170,180,190,200,225,250,300,500]
for criterion_row, criterion_label in enumerate(("AIC", "BIC")):
    plt.plot(component_counts, criteria4[criterion_row, :], label=criterion_label)

plt.title("Information criteria for GMM trained on 50k samples of shape and intensity data", pad=20, fontsize=24)
plt.ylabel("Information criterion value", fontsize=24)
plt.xlabel("Number of Gaussian components", fontsize=24)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams['figure.figsize'] =[20,10]

plt.plot([111, 112, 113, 114], criteria111_114[0,:], label="AIC") #125
plt.plot([111, 112, 113, 114], criteria111_114[1,:], label="BIC") #125
plt.title("Information criteria for GMM trained on 50k samples of shape and intensity data", pad=20, fontsize=24)
plt.ylabel("Information criterion value", fontsize=24)
plt.xlabel("Number of Gaussian components", fontsize=24)
plt.legend()
plt.tight_layout()
plt.show()

# Classification

In [None]:
# Parse the Belloni classification table: the first line lists the state names
# and each following line lists the observation IDs of the corresponding state.
# Reading inside a `with` block closes the file handle as soon as it is consumed.
with open('../../../data_GRS1915/1915Belloniclass_updated.dat') as clean_belloni:
    lines = clean_belloni.readlines()
states = lines[0].split()

# state -> [obsID1, obsID2, ...]
belloni_clean = {}
for h, l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()

# obsID -> state, with the four chi sub-classes merged into a single "chi" class
ob_state = {}
for state, obs in belloni_clean.items():
    if state in ("chi1", "chi2", "chi3", "chi4"):
        state = "chi"
    for ob in obs:
        ob_state[ob] = state

# Inverse map: (merged) state -> list of observation IDs.
inv_ob_state = {}
for k, v in ob_state.items():
    inv_ob_state.setdefault(v, []).append(k)

In [None]:
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    seg_ids = pickle.load(f)
    
xxx = [seg.split("_")[0] for seg in seg_ids]

In [None]:
classes = np.array(["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"])

# One class label per segment, looked up via the segment's observation ID;
# segments of unclassified observations are labelled "Unknown".
segment_class = [ob_state.get(ob, "Unknown") for ob in xxx]
# Companion per-segment value: 5 for classified observations, 0.1 otherwise.
scales = [5 if ob in ob_state else 0.1 for ob in xxx]

In [None]:
# Observation-level train/test split, stratified by class: for every class about
# one third of its observations (rounded up) is held out for testing, and all
# segments of an observation end up on the same side of the split.
class_names = ["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"]
observed_ids = set(xxx)  # set membership instead of scanning the full segment list

test_set = []
train_set = []

for class_name in class_names:

    # keep only classified observations that actually contribute segments
    class_obs = [ob for ob in inv_ob_state[class_name] if ob in observed_ids]

    # replace=False: np.random.choice samples WITH replacement by default, which
    # yields duplicate IDs and therefore fewer distinct test observations than
    # the intended third.
    test_obs = np.random.choice(class_obs, size=int(np.ceil(len(class_obs)/3)), replace=False)

    if len(test_obs) == 0:
        print(class_name)  # class has no observation available for testing

    held_out = set(test_obs)
    train_obs = [ob for ob in class_obs if ob not in held_out]
    test_set.append(test_obs)
    train_set.append(train_obs)

test_set=np.hstack(test_set)
train_set=np.hstack(train_set)

# Map the observation-level split onto segment indices; segments of observations
# that belong to neither set (e.g. unclassified ones) are left out.
test_obs_lookup = set(test_set)
train_obs_lookup = set(train_set)

test_seg_ids=[]
train_seg_ids=[]

for ind_ob, ob in enumerate(xxx):
    if ob in test_obs_lookup:
        test_seg_ids.append(ind_ob)
    elif ob in train_obs_lookup:
        train_seg_ids.append(ind_ob)

In [None]:
from sklearn.svm import SVC
SVC_clf = SVC().fit(SAI_data[train_seg_ids], np.array(segment_class)[train_seg_ids])
SVC_clf_score = SVC_clf.score(SAI_data[test_seg_ids], np.array(segment_class)[test_seg_ids])
print(SVC_clf_score)

In [None]:
from sklearn.ensemble import RandomForestClassifier

RF_clf = RandomForestClassifier().fit(SAI_data[train_seg_ids], np.array(segment_class)[train_seg_ids])
RF_clf_score = RF_clf.score(SAI_data[test_seg_ids], np.array(segment_class)[test_seg_ids])
print(RF_clf_score)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KNN_clf = KNeighborsClassifier().fit(SAI_data[train_seg_ids], np.array(segment_class)[train_seg_ids])
KNN_clf_score = KNN_clf.score(SAI_data[test_seg_ids], np.array(segment_class)[test_seg_ids])
print(KNN_clf_score)

In [None]:
preds = SVC_clf.predict(SAI_data[test_seg_ids])

In [None]:
# NOTE(review): the original cell was an unfinished loop (a bare `for` with no
# body) and raised a SyntaxError. This completion prints, for every class, the
# fraction of its test segments that `preds` labels correctly -- a guess at the
# intent; confirm or replace.
test_labels = np.array(segment_class)[test_seg_ids]

for class_name in ["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"]:
    class_mask = test_labels == class_name
    if class_mask.any():
        print(class_name, np.mean(preds[class_mask] == class_name))
    else:
        print(class_name, "no test segments")

In [None]:
from sklearn.metrics import classification_report
print(classification_report(np.array(segment_class)[test_seg_ids], preds))

# Shape data and 8 descriptive stats

In [None]:
from scipy import stats
# import umap
from sklearn.mixture import GaussianMixture
from scipy.stats import zscore


In [None]:
# Load the per-segment count rates, their errors and the segment identifiers.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments_counts = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl', 'rb') as f:
    segments_errors = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    id_per_seg = pickle.load(f)

# The encoding file name is built from the stems of the model-weights file and
# of the segment file.
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)

# Index 0 on axis 1 is presumably the latent mean -- TODO confirm.
# axis=None standardizes with one global mean/std over the whole array.
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=None).astype(np.float32)  # standardize

# The path has no `{}` placeholder, so the former `.format(...)` call on it was
# a no-op and has been dropped.
with open('../../../data_GRS1915/468202_segment_GMM_bic_1-3components_zscored.pkl', 'rb') as f:
    GMM_bics = pickle.load(f)

In [None]:
# Per-segment summary statistics, each computed along axis 1 of segments_counts:
# median, mean, std, skew, kurtosis (the GMM BIC columns are appended below).
stat_functions = (np.median, np.mean, np.std, stats.skew, stats.kurtosis)
desc_stats = np.zeros((len(segments_counts), 5))
for stat_column, stat_function in enumerate(stat_functions):
    desc_stats[:, stat_column] = stat_function(segments_counts, axis=1).flatten()

# Column-wise standardized copy. NOTE(review): it is not used below -- desc_GM
# is built from the raw statistics; confirm that this is intended.
zscore_desc_stats = zscore(desc_stats, axis=0)

# desc_GM = np.hstack((zscore(desc_stats, axis=0), GMM_bics))
desc_GM = np.hstack((desc_stats, GMM_bics))

# Final feature matrix: latent encoding columns first, then statistics and BICs.
shape_desc_GM = np.hstack((segment_encoding_scaled_means, desc_GM))

In [None]:
SVC_clf2 = SVC().fit(shape_desc_GM[train_seg_ids], np.array(segment_class)[train_seg_ids])
SVC_clf2_score = SVC_clf2.score(shape_desc_GM[test_seg_ids], np.array(segment_class)[test_seg_ids])
print(SVC_clf2_score)

In [None]:
from sklearn.metrics import classification_report

# Report for SVC_clf2 (trained on shape_desc_GM). The predictions are computed
# here explicitly: the existing `preds` variable holds the output of a different
# classifier (SVC_clf on SAI_data), so reusing it would just reprint that
# model's report.
preds_clf2 = SVC_clf2.predict(shape_desc_GM[test_seg_ids])
print(classification_report(np.array(segment_class)[test_seg_ids], preds_clf2))

In [None]:
shape_desc_GM.shape

In [None]:
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper.fit(shape_desc_GM)
    
# with open('../../../data_GRS1915/UMAPmapper_shape16latent_8desc_stats_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

In [None]:
with open('../../../data_GRS1915/UMAPmapper_shape16latent_8desc_stats_trainedonall.pkl', 'rb') as f:
    UMAP_mapper = pickle.load(f)

In [None]:
umaped_data = UMAP_mapper.transform(shape_desc_GM)

In [None]:
plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])
plt.show()

In [None]:
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)

# Comparison of histogram vs descriptive statistics in classification

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC


In [None]:
# Repeated observation-level train/test splits for comparing the two feature
# sets. Row 0 of compar_results is meant for the SAI_data score and row 1 for
# the shape_desc_GM score (see the commented-out classifier code at the bottom).
compar_results = np.zeros((2,1))

class_names = ["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"]
observed_ids = set(xxx)  # set membership instead of scanning the full segment list

for test_iter in range(1):
    test_set = []
    train_set = []

    for class_name in class_names:

        # keep only classified observations that actually contribute segments
        class_obs = [ob for ob in inv_ob_state[class_name] if ob in observed_ids]

        # replace=False: np.random.choice samples WITH replacement by default,
        # which yields duplicate IDs and fewer distinct test observations than
        # the intended third.
        test_obs = np.random.choice(class_obs, size=int(np.ceil(len(class_obs)/3)), replace=False)

        if len(test_obs) == 0:
            print(class_name)  # class has no observation available for testing

        held_out = set(test_obs)
        train_obs = [ob for ob in class_obs if ob not in held_out]
        test_set.append(test_obs)
        train_set.append(train_obs)

    test_set=np.hstack(test_set)
    train_set=np.hstack(train_set)

    # Map the observation-level split onto segment indices.
    test_obs_lookup = set(test_set)
    train_obs_lookup = set(train_set)

    test_seg_ids=[]
    train_seg_ids=[]

    for ind_ob, ob in enumerate(xxx):
        if ob in test_obs_lookup:
            test_seg_ids.append(ind_ob)
        elif ob in train_obs_lookup:
            train_seg_ids.append(ind_ob)


#     SVC_clf = SVC(gamma="auto").fit(SAI_data[train_seg_ids], np.array(segment_class)[train_seg_ids])
#     SVC_clf_score = SVC_clf.score(SAI_data[test_seg_ids], np.array(segment_class)[test_seg_ids])
#     compar_results[0, test_iter] = SVC_clf_score


#     SVC_clf2 = SVC(gamma="auto").fit(shape_desc_GM[train_seg_ids], np.array(segment_class)[train_seg_ids])
#     SVC_clf2_score = SVC_clf2.score(shape_desc_GM[test_seg_ids], np.array(segment_class)[test_seg_ids])
#     compar_results[1, test_iter] = SVC_clf2_score
#     print(test_iter)

In [None]:
np.where(np.array(segment_class))

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 15.0)
plt.rcParams.update({'font.size': 12})
# plt.rcParams.update(plt.rcParamsDefault)

In [None]:
#Using Pearson Correlation
import pandas as pd
import seaborn as sns
data_df  = pd.DataFrame(shape_6moments[:,16:], columns=["mean", "std", "skew", "kurt", "5th", "6th"])

plt.figure(figsize=(12,10))
cor = data_df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
np.sqrt(60)/60

In [None]:
plt.scatter(shape_desc_GM[:,-3], shape_desc_GM[:,-1])

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(shape_desc_GM[:,-3], shape_desc_GM[:,-2], shape_desc_GM[:,-1])
ax.set_xlabel('1comp', fontsize=20, rotation=150)
ax.set_ylabel('2comp', fontsize=20)
ax.set_zlabel("3comp+", fontsize=20, rotation=60)

In [None]:
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper.fit(zscore(shape_desc4, axis=0))
    
with open('../../../data_GRS1915/UMAPmapper_shape16_4moments_trainedonall.pkl', 'wb') as f:
    pickle.dump(UMAP_mapper, f)

In [None]:
# with open('../../../data_GRS1915/UMAPmapper_shape16latent_8desc_stats_trainedonall.pkl', 'rb') as f:
#     UMAP_mapper = pickle.load(f)

In [None]:
# NOTE(review): the fitting cell above trains UMAP_mapper on zscore(shape_desc4),
# whereas this cell transforms shape_desc_GM. If the two arrays differ in column
# count or scaling the embedding is invalid -- confirm which mapper/data pairing
# was intended.
umaped_data = UMAP_mapper.transform(shape_desc_GM)

In [None]:
plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])
plt.show()

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(shape_desc_GM[:,-3:])

In [None]:
Gaus_PCA = pca.transform(shape_desc_GM[:,-3:])

In [None]:
plt.scatter(zscore(Gaus_PCA[:,0]), zscore(Gaus_PCA[:,1]))

In [None]:
shape_desc4_GMPCA2.shape

In [None]:
# shape_desc_normGM = shape_desc_GM
# shape_desc_normGM[:,-3:] = zscore(shape_desc_GM[:,-3:])

# shape_desc4 = np.hstack((shape_desc_GM[:,:16], shape_desc_GM[:,17:-3]))

# shape_desc4_GMPCA2 = np.hstack((zscore(shape_desc4), zscore(Gaus_PCA)))

plt.rcParams['figure.figsize'] = (10,10)

# Each entry is [feature matrix, title]. A title that ends in "balanced" makes
# the loop below train the SVC with class_weight="balanced".
experiment_list = [
#     [SAI_data, "40d: 16 shape and 24 histogram features"],
#     [shape_desc_GM[:, :-2], "24d: 16 shape features and 8 descriptive statistics"], ### 22 with the indexing
#     [shape_desc_normGM, "24d with zscored GM BICs"],
#     [shape_desc_GM[:,:-3], "21d: 16 shape features and 5 descriptive statistics"],
#     [shape_desc4, "20d: 16 shape features and 4 descriptive statistics"],
#     [zscore(shape_desc4, axis=0), "20d: 16 shape features and 4 descriptive statistics, standardized"],
    [shape_moments, "20d: 16 shape features and 4 moments, standardized"],
    [shape_moments, "20d: 16 shape features and 4 moments, standardized, balanced"],
    [shape_6moments,"22d: 16 shape features and 6 moments, standardized" ]
#     [shape_desc4_GMPCA2, "22d: 16 shape, 4 desc. stats and 2 PCs of BIC"]
#     [shape_desc_GM[:,16:], "8 descriptive statistics"],
#     [SAI_data[:,:16],"16 shape features"],
#     [SAI_data[:,16:], "24 histogram features"]
]

# The label arrays are the same for every experiment, so build them once.
train_labels = np.array(segment_class)[train_seg_ids]
test_labels = np.array(segment_class)[test_seg_ids]

for dataset, title in experiment_list:
    # Fit exactly one model per experiment. Previously the "balanced" experiment
    # first trained an unweighted SVC and then discarded it.
    # class_weight=None is the SVC default, so unweighted runs are unchanged.
    class_weight = "balanced" if title.endswith("balanced") else None
    SVC_clf = SVC(gamma="auto", class_weight=class_weight).fit(dataset[train_seg_ids], train_labels)

    preds = SVC_clf.predict(dataset[test_seg_ids])
    print("{}".format(title))
    print(classification_report(test_labels, preds))

    # NOTE(review): plot_confusion_matrix is not imported in this part of the
    # notebook (presumably imported elsewhere). It was removed in scikit-learn
    # 1.2 in favour of ConfusionMatrixDisplay.from_estimator.
    disp = plot_confusion_matrix(SVC_clf, dataset[test_seg_ids], test_labels,
                             cmap=plt.cm.Blues,
                             normalize='true')
    disp.ax_.set_title("Normalized confusion matrix for {}".format(title))
    plt.show()
    
    

In [None]:
print("22d: 16 shape features and 6 moments, standardized")
print(classification_report(np.array(segment_class)[test_seg_ids], preds))

In [None]:
res1 = np.array([[0.77866676, 0.73311277, 0.78083213, 0.69198966, 0.72730377,
        0.78456467, 0.78693837, 0.74564189, 0.71476161, 0.80871077],
       [0.79372295, 0.83236835, 0.8409853 , 0.71007752, 0.78622103,
        0.83191778, 0.82478704, 0.81088997, 0.78396835, 0.83451901]])

res2 = np.array([[0.74569551, 0.72505262, 0.83017905, 0.79743465, 0.79211854,
        0.84074226, 0.82077626, 0.80905131, 0.77454224, 0.72485089],
       [0.80997834, 0.73009652, 0.85212816, 0.86321069, 0.82912774,
        0.85724932, 0.84044608, 0.85375467, 0.82216909, 0.79736129]])

res3 = np.array([[0.74975602, 0.76254434, 0.755848  , 0.77953619, 0.7278387 ,
        0.7519164 , 0.75126418, 0.77643904, 0.792922  , 0.75279727,
        0.72323293, 0.76381347, 0.72353266, 0.77342608, 0.80767374,
        0.71281754, 0.76253514, 0.76736953, 0.85906312, 0.79865008,
        0.75978855, 0.67200531, 0.83241201, 0.80688174, 0.76097234,
        0.71648287, 0.79924554, 0.76408826, 0.73696223, 0.75044426],
       [0.78048124, 0.82040156, 0.81124087, 0.83567979, 0.77840704,
        0.81282178, 0.80083615, 0.82939717, 0.83040996, 0.81371566,
        0.77192067, 0.80505023, 0.77363956, 0.8242435 , 0.83905954,
        0.78059637, 0.77894025, 0.79149336, 0.86587611, 0.84104353,
        0.77852716, 0.71145472, 0.85347952, 0.83835766, 0.81526955,
        0.7709217 , 0.85668724, 0.83868636, 0.79672106, 0.79592084]])

res4 = np.array([[0.79191534, 0.76087685, 0.67046855, 0.72114209, 0.76189944,
        0.77667217, 0.7330542 , 0.73618209, 0.81619439, 0.75384945,
        0.71386682, 0.80245486, 0.79031112, 0.72723923, 0.74515297,
        0.72560147, 0.72812745, 0.77691827, 0.67197249, 0.81058563,
        0.75088462, 0.75157571, 0.81092084, 0.76765467, 0.70972944,
        0.80203729, 0.79307808, 0.758981  , 0.76432133, 0.70835706,
        0.78316553, 0.65833264, 0.79104252, 0.80694916, 0.79571977,
        0.76556444, 0.76959578, 0.76056712, 0.72495242, 0.79219349,
        0.79374297, 0.73663961, 0.71930101, 0.78722207, 0.78305448,
        0.75806628, 0.7187707 , 0.71505897, 0.72505291, 0.84645117],
       [0.83263955, 0.81029424, 0.74589675, 0.79941799, 0.82089385,
        0.77814498, 0.79755573, 0.76267984, 0.85024375, 0.81226108,
        0.78982217, 0.85099239, 0.83496526, 0.75015659, 0.80638563,
        0.80097514, 0.77870636, 0.83239569, 0.73674135, 0.84918495,
        0.79053734, 0.8257413 , 0.84715791, 0.78475336, 0.78390969,
        0.85366135, 0.84370507, 0.80469095, 0.82173239, 0.72859475,
        0.76969203, 0.74902724, 0.82775535, 0.84975974, 0.83397338,
        0.81577569, 0.80737807, 0.81309559, 0.76012779, 0.86020742,
        0.82233046, 0.84990381, 0.83276921, 0.84491979, 0.85557161,
        0.81210947, 0.76720978, 0.78806509, 0.8159287 , 0.88692258]])

In [None]:
# Pool the four result batches column-wise. Going by the labels in the histogram
# cell below, row 0 is the histogram-feature run and row 1 the descriptive-stats
# run, each value being a test-set accuracy.
res1234 = np.concatenate((res1, res2, res3, res4), axis=1)

In [None]:
from scipy.stats import iqr

# Overlaid histograms of the pooled test accuracies for the two feature sets,
# with mean / std / min / max / IQR of each written above the axes.
hist_accuracies, desc_accuracies = res1234[0, :], res1234[1, :]
summary_values = [
    summary_function(accuracies)
    for accuracies in (hist_accuracies, desc_accuracies)
    for summary_function in (np.mean, np.std, np.min, np.max, iqr)
]
summary_text = "hist.data: mean={:.3f}, std={:.3f}, min={:.3f}, max={:.3f}, IQR={:.3f}\ndesc.stats: mean={:.3f}, std={:.3f}, min={:.3f}, max={:.3f}, IQR={:.3f}".format(*summary_values)

fig, axs = plt.subplots(1, 1)

plt.hist(hist_accuracies, label="hist. data")
plt.hist(desc_accuracies, alpha=0.5, label="desc. stats")
fig.text(0.5, 0.94, summary_text, ha='center', va="top")
plt.ylabel("Population")
plt.xlabel("Test set accuracy")
plt.legend()
plt.show()

# Cluster the shape and descriptive stats features
https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html

In [None]:
distances47.shape# = np.copy(distances)
distances100.shape # = np.copy(distances)
distances200.shape # = np.copy(distances)
distances70.shape # = np.copy(distances)
distances300# = np.copy(distances)

In [None]:
distances47

In [None]:
# with open('all_dat_kneigh_dists.pkl', 'wb') as f:
#     pickle.dump(distances47, f)

In [None]:
# Collect the stored k-NN distance vectors into one array, one column per k.
k_values = [30, 47, 70, 100, 200, 500, 1000]
distances = np.zeros((468202, 7))

for n_k, k in enumerate(k_values):
    distances_path = 'k-NN_distances_all_data_24D_468202samples_k{}.pkl'.format(k)
    with open(distances_path, 'rb') as f:
        distances_k = pickle.load(f)

    distances[:, n_k] = distances_k

In [None]:
distances0_30

In [None]:
plt.rcParams['figure.figsize'] = (20.0, 20.0)
plt.rcParams.update({'font.size': 12})

plt.plot(distances0_30[:,5], label="k=5")
plt.plot(distances0_30[:,10], label="k=10")
plt.plot(distances0_30[:,20], label="k=20")

plt.plot(distances[:, 0], label="k=30")
plt.plot(distances[:, 1], label="k=47")
plt.plot(distances[:, 2], label="k=70")
plt.plot(distances[:, 3], label="k=100")
plt.plot(distances[:, 4], label="k=200")
plt.plot(distances[:, 5], label="k=500")
plt.plot(distances[:, 6], label="k=1000")

plt.title("Sorted k-distance for 468202 samples of 24d data")
plt.xlabel("k-order")
plt.ylabel("distance (Euclidean)")
plt.ylim((0,3))
plt.legend()
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors
# X = shape_desc_GM[:50000,:]

# For each k: query the k nearest neighbours of every sample, sort each
# neighbour-rank column independently in ascending order, and store the result.
for k in [30]:
    neigh = NearestNeighbors(n_neighbors=k, n_jobs=7, metric='euclidean')
    nbrs = neigh.fit(shape_desc_GM)
    distances, indices = nbrs.kneighbors(shape_desc_GM)
    distances.sort(axis=0)  # in-place, column by column
    output_path = 'k-NN_distances_all_data_24D_468202samples_k0to{}.pkl'.format(k)
    with open(output_path, 'wb') as f:
        pickle.dump(distances, f)
    print(k)

In [None]:
with open('k-NN_distances_all_data_24D_468202samples_k0to30.pkl', 'rb') as f:
    distances0_30 = pickle.load(f)

In [None]:
shape_desc_GM.shape

In [None]:
with open('shape_desc_stat_24D_GMM_aic_bic_5-500comps_100k.pkl', 'wb') as f:
    pickle.dump(criteria5_500, f)

In [None]:
# Load previously computed AIC/BIC values. The file name has no `{}` placeholder,
# so the former `.format(segment_encoding_dir...)` call was a no-op that only made
# this cell depend on a variable defined in another cell; it has been removed.
with open('shape_desc_stat_24D_GMM_aic_bic_127_130_135comps_alldata.pkl', 'rb') as f:
    criteria127_130 = pickle.load(f)

In [None]:
from sklearn.mixture import GaussianMixture

comp_no_list = [151, 152, 153]

# Row 0 holds AIC, row 1 holds BIC; one column per entry of comp_no_list.
criteria = np.zeros((2, len(comp_no_list)))

for ind, n_components in enumerate(comp_no_list):

    # Fit on the full data set and score the criteria on the same data.
    clf = GaussianMixture(n_components=n_components, covariance_type='full', verbose=1).fit(shape_desc_GM)

    criteria[:, ind] = (clf.aic(shape_desc_GM), clf.bic(shape_desc_GM))
    print(n_components, criteria[:, ind])

    # Persist every fitted mixture model.
    model_path = '../../../data_GRS1915/shape16_stats8_GM_components{}_alldata.pkl'.format(n_components)
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)

#     with open('shape_desc_stat_24D_GMM_aic_bic_range145-160-2comps_50K.pkl', 'wb') as f:
#         pickle.dump(criteria, f)

In [None]:
list(range(145, 160, 2))

In [None]:
# Plot AIC and BIC against the number of GMM components: the first 7 points come
# from criteria135_165 (column 0 = AIC, column 1 = BIC), the rest from `criteria`
# (row 0 = AIC, row 1 = BIC).
# NOTE(review): criteria135_165 is only defined in a later cell, so that cell has
# to be run first.
# NOTE(review): the x axis has 7 + 14 = 21 component counts, so `criteria` needs
# 14 columns (170..235 in steps of 5). The GMM cell earlier in this section builds
# `criteria` with len(comp_no_list) columns (3 as written), which would not
# match -- confirm which run produced the array plotted here.
plt.rcParams['figure.figsize'] =[20,10]

plt.plot([135,140,145,150,155,160,165]+list(range(170, 240, 5)), np.hstack((criteria135_165[:,0],criteria[0,:])), label="AIC") #125
plt.plot([135,140,145,150,155,160,165]+list(range(170, 240, 5)), np.hstack((criteria135_165[:,1],criteria[1,:])), label="BIC") #125
plt.title("Information criteria for GMM trained on 50k samples of shape data and descriptive statistics", pad=20, fontsize=24)
plt.ylabel("Information criterion value", fontsize=24)
plt.xlabel("Number of Gaussian components", fontsize=24)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
criteria135_165[:,0]

In [None]:
# AIC/BIC values for GMMs with 135, 140, ..., 165 components (one row per count),
# matching the "50k samples" printout recorded in the notes cell that follows.
# Column 0 is AIC and column 1 is BIC, which is how the plotting cell labels them.
criteria135_165 = np.array([[-7446053.41718781, -6960953.73107995],
[-7426157.10488937, -6923090.35423473],
[-7426586.17351449, -6905552.35831308],
[-7510014.68999931, -6971013.81025113],
[-7512876.35162247, -6955908.40732751],
[-7400180.39300913, -6825245.38416739],
[-7496718.9980214,  -6903816.92463288]])

In [None]:
# Recorded printouts of "<n_components> [AIC BIC]" from earlier GMM fits, kept for
# reference. The lines were pasted as bare text, which is not valid Python and
# made this cell raise a SyntaxError, so they are stored as comments.
# "###" marks the lowest BIC within each group.

# all data
# 115 [-8073146.27283041 -7659914.84490966]
# 120 [-8140824.45935922 -7709625.96689169]
# 125 [-8192828.92915021 -7743663.3721359 ]
# 127 [-8249915.1942408  -7793562.81140779]
# 130 [-8263993.33624558, -7796860.7146845 ]
# 135 [-8328600.2442305 , -7843500.55812264]
# 140 [-8412351.18572903 -7909284.43507439]
# 151 [-8527567.38468881 -7984973.09203127] ###
# 152 [-8528492.50677821 -7982304.80121131]

# 50k samples
# 135 [-7446053.41718781 -6960953.73107995]
# 140 [-7426157.10488937 -6923090.35423473]
# 145 [-7426586.17351449 -6905552.35831308]
# 150 [-7510014.68999931 -6971013.81025113]###
# 155 [-7512876.35162247 -6955908.40732751]
# 160 [-7400180.39300913 -6825245.38416739]
# 165 [-7496718.9980214  -6903816.92463288]


# 145 [-7506767.23430007 -6985733.41909866]
# 147 [-7505868.92420308 -6977648.28318296]
# 149 [-7487498.22538117 -6952090.75854234]
# 151 [-7509885.08825046 -6967290.79559292]
# 153 [-7551318.88243436 -7001537.7639581 ] ###
# 155 [-7490368.82190454 -6933400.87760957]
# 157 [-7487697.41874471 -6923542.64863104]
# 159 [-7508364.59994809 -6937023.00401571]

In [None]:
np.where(np.array(xxx) == "10408-01-22-00") #64, 74,4,11,113,44,38,85

In [None]:
# Euclidean distances between every segment of observation 10408-01-22-00 (rows)
# and 128 randomly drawn segments (columns); a fresh random draw is made per row.
# NOTE(review): the 128x128 size assumes the observation has at most 128
# segments -- confirm.
dist_mat = np.zeros((128, 128))
observation_seg_inds = np.where(np.array(xxx) == "10408-01-22-00")[0]

for n1, seg1_ind in enumerate(observation_seg_inds):
    random_seg_inds = np.random.randint(len(xxx), size=128)
    reference_point = shape_desc_GM[seg1_ind]
    for n2, seg2_ind in enumerate(random_seg_inds):
        dist_mat[n1, n2] = np.linalg.norm(reference_point - shape_desc_GM[seg2_ind])

In [None]:
len(np.where(np.array(xxx) == "10408-01-22-00")[0])

In [None]:
np.min(dist_mat)

In [None]:
plt.hist(dist_mat.flatten())
# plt.title("Distribution of euclidean distances between 128 segments of light curve 10408-01-22-00 (chi) in 24D latent space (16shape 8stats). Both triags and diagonal of distance matrix")
plt.title("Distribution of euclidean distances between 128 segments of light cur ve 10408-01-22-00 (chi) in 24D latent space (16shape 8stats) and random segments. Both triags and diagonal of distance matrix")


In [None]:
from sklearn.cluster import DBSCAN

# Grid search over DBSCAN's eps and min_samples on the first 50k samples.
# Every record is ((unique labels, label counts), eps, min_samples).
eps_values = np.arange(1.5, 2.2, 0.05)
min_samples_values = range(40, 120, 5)

dbscan_grid = []

for n_eps, eps in enumerate(eps_values):
    for n_min, min_samples in enumerate(min_samples_values):

        clf = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=7).fit(shape_desc_GM[:50000, :])

        label_counts = np.unique(clf.labels_, return_counts=True)
        dbscan_grid.append((label_counts, eps, min_samples))

        # progress indicator, overwritten on each iteration
        print(n_eps, n_min)
        clear_output(wait=True)

# with open('dbscan_grid_search_50k.pkl', 'wb') as f:
#     pickle.dump(dbscan_grid, f)

In [None]:
with open('dbscan_grid_search_50k_eps15-22_min40-120.pkl', 'wb') as f:
    pickle.dump(dbscan_grid, f)

In [None]:
from sklearn.cluster import OPTICS

# Scan OPTICS over min_samples on the first 50k samples with a fixed max_eps.
# Every record is ((unique labels, label counts), max_eps, min_samples).
optics_max_eps = 3
dbscan_grid_op = []

for n_min, min_samples in enumerate(range(47, 120, 5)):

    clf = OPTICS(max_eps=optics_max_eps, min_samples=min_samples, metric="euclidean", n_jobs=7)
    clf.fit(shape_desc_GM[:50000,:])

    # Record the max_eps actually used. The previous code stored `eps` and
    # printed `n_eps`, both stale leftovers from the DBSCAN grid-search cell
    # (a NameError on a fresh kernel, a wrong value otherwise).
    dbscan_grid_op.append((np.unique(clf.labels_, return_counts=True), optics_max_eps, min_samples))

    # progress indicator, overwritten on each iteration
    print(n_min)
    clear_output(wait=True)

# with open('dbscan_grid_search_50k.pkl', 'wb') as f:
#     pickle.dump(dbscan_grid, f)

In [None]:
clf = OPTICS(max_eps=2,min_samples=500, metric="euclidean", n_jobs=7)
clf.fit(shape_desc_GM)

In [None]:
np.unique(clf.labels_, return_counts=1)# 

In [None]:
np.unique(clf.labels_, return_counts=1)# 2,300

In [None]:
# Recorded clustering printouts, kept for reference: (unique labels, label counts)
# followed by what are presumably the eps and min_samples values used.
# The lines were pasted as bare text, which is not valid Python and made this
# cell raise a SyntaxError, so they are stored as comments.
# (array([-1,  0,  1,  2]), array([11890, 32158,  5569,   383])) 1.8, 90
# (array([-1,  0,  1,  2,  3]), array([11351, 32433,  5619,   436,   161])) 1.8, 80
# (array([-1,  0,  1,  2,  3]), array([13242, 29944,  5302,  1215,   297])) 1.7 80

In [None]:
clf = DBSCAN(eps=1.7,min_samples=3, n_jobs=7)
clf.fit(shape_desc_GM[:50000,:])

np.unique(clf.labels_, return_counts=1)

In [None]:
with open('../../../data_GRS1915/UMAPmapper_shape16latent_8desc_stats_trainedonall.pkl', 'rb') as f:
    UMAP_mapper = pickle.load(f)

In [None]:
umaped_data = UMAP_mapper.transform(shape_desc_GM[:50000,:])

In [None]:
# One colour per sample: grey for noise (label -1), otherwise the matplotlib
# colour-cycle entry of the sample's cluster label.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
# Wrap around the colour cycle so that a clustering with more clusters than
# cycle colours does not raise an IndexError; labels below len(colors) keep
# exactly the colour they had before.
color_matched = ["#a8a495" if x==-1 else colors[x % len(colors)] for x in clf.labels_]

In [None]:
plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=1, c=color_matched)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])
plt.show()

In [None]:
np.unique(clf.labels_, return_counts=1)

In [None]:
# Count the grid-search records whose first label group has fewer than 1000
# members. Records are ((labels, counts), eps, min_samples); counts[0] belongs to
# the lowest label value, which is the noise label -1 whenever any noise was
# flagged.
count = sum(
    1
    for (_labels, label_counts), _eps, _min_samples in dbscan_grid
    if label_counts[0] < 1000
)
print(count)

In [None]:
dbscan_grid

In [None]:
np.unique(clf.labels_, return_counts=1)

In [None]:
plt.hist(np.unique(xxx, return_counts=1)[1])

In [None]:
[5,10,20,30,50,75,100,120,140,160,180,200,225,250,300,500][np.argmin(criteria5_500[1,:])]

In [None]:
plt.rcParams['figure.figsize'] =[20,10]

plt.plot([130,140,150,160,170,180,190,200], criteria130_200[0,:], label="AIC") #125
plt.plot([130,140,150,160,170,180,190,200], criteria130_200[1,:], label="BIC") #125
plt.title("Information criteria for GMM trained on 50k samples of shape and descriptive statistics data", pad=20, fontsize=24)
plt.ylabel("Information criterion value", fontsize=24)
plt.xlabel("Number of Gaussian components", fontsize=24)
plt.legend()
plt.tight_layout()
plt.show()

# Analysis of 20d space; 16 shape features and 4 moments

In [None]:
hist_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
with open(hist_dir, 'rb') as f:
    hists = pickle.load(f)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)
    
# errors = ((errors)/np.std(segments)).astype(np.float32)
hists = zscore(hists[:,:,0], axis=None).astype(np.float32)  # standardize


weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)
    
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=None).astype(np.float32)  # standardize


In [None]:
SAI_data = np.hstack((segment_encoding_scaled_means, hists))
SAI_data.shape

In [None]:
with open('{}/468202_len128_s2_4cad_counts_errorfix.pkl'.format(data_dir), 'rb') as f:
    segments_counts = pickle.load(f)
# with open('../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl', 'rb') as f:
#     segments_errors = pickle.load(f)
# with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
#     id_per_seg = pickle.load(f)

weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
segment_encoding_dir = '{}/segment_encoding_{}_segments_{}.pkl'.format(data_dir, weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)
    
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=0).astype(np.float32)  # standardize per feature

# with open('../../../data_GRS1915/468202_segment_GMM_bic_1-3components_zscored.pkl'.format(segment_encoding_dir.split("/")[-1]), 'rb') as f:
#     GMM_bics = pickle.load(f)

In [None]:
# Per-segment moments along axis 1 of segments_counts:
# mean, std, skew and kurtosis (scipy's default, i.e. Fisher / excess kurtosis).
moment_functions = (np.mean, np.std, stats.skew, stats.kurtosis)
desc_stats = np.zeros((len(segments_counts), 4))
for moment_column, moment_function in enumerate(moment_functions):
    desc_stats[:, moment_column] = moment_function(segments_counts, axis=1).flatten()

# Standardize each statistic column separately.
zscore_desc_stats = zscore(desc_stats, axis=0)

# Latent shape encoding followed by the 4 standardized moments.
shape_moments = np.hstack((segment_encoding_scaled_means, zscore_desc_stats)) # every column is standardized

In [None]:
# Per-segment statistics, 6 columns: mean, std, and the 3rd to 6th standardized
# moments, mean(((x - mean) / std) ** order) taken along axis 1.
# Column 3 is therefore the plain (non-excess) kurtosis, unlike stats.kurtosis,
# whose default subtracts 3.
# The broadcasting below presumably relies on segments_counts being
# (n_segments, length, 1) -- TODO confirm.
segment_means = np.mean(segments_counts, axis=1)
segment_stds = np.std(segments_counts, axis=1)
# Standardize once and reuse it for every moment order; the original recomputed
# the mean, the std and this whole array separately for each of the four columns.
standardized_counts = (np.squeeze(segments_counts) - segment_means) / segment_stds

desc_stats = np.zeros((len(segments_counts), 6))
desc_stats[:,0] = segment_means.flatten()
desc_stats[:,1] = segment_stds.flatten()
for moment_column, moment_order in enumerate(range(3, 7), start=2):
    desc_stats[:, moment_column] = np.mean(standardized_counts**moment_order, axis=1)

# Standardize each statistic column separately.
zscore_desc_stats = zscore(desc_stats, axis=0)

# desc_GM = np.hstack((zscore(desc_stats, axis=0), GMM_bics))

shape_6moments = np.hstack((segment_encoding_scaled_means, zscore_desc_stats)) # every column is standardized

In [None]:
# Same 4-column layout as shape_moments, but with the variance in place of the
# standard deviation: mean, var, skew and kurtosis along axis 1.
moment_functions = (np.mean, np.var, stats.skew, stats.kurtosis)
desc_stats = np.zeros((len(segments_counts), 4))
for moment_column, moment_function in enumerate(moment_functions):
    desc_stats[:, moment_column] = moment_function(segments_counts, axis=1).flatten()

# Standardize each statistic column separately.
zscore_desc_stats = zscore(desc_stats, axis=0)

shape_moments_var = np.hstack((segment_encoding_scaled_means, zscore_desc_stats)) # every column is standardized

In [None]:
(segments_counts.squeeze()-np.mean(segments_counts, axis=1))/np.std(segments_counts, axis=1)

In [None]:
np.mean(segments_counts, axis=1).shape

In [None]:
stats.kurtosis(X, fisher=0)

In [None]:
X = segments_counts[0]
stand_counts = (segments_counts[0]-np.mean(segments_counts[0]))/np.var(segments_counts[0])

stats.moment(stand_counts, moment=4)
np.mean(((X-np.mean(X))/np.var(X))**4)
# these two give the same value




(stats.moment(X, moment=4)/(np.var(X)**2))-3 # gives the same value as Fisher kurtosis

In [None]:
# stand_counts = (segments_counts[0]-np.mean(segments_counts[0]))/np.var(segments_counts[0])

stats.moment(X, moment=3)/(np.var(X)**2)

In [None]:
np.mean(((X-np.mean(X))/np.var(X))**4)#/(np.var(X)**2)

In [None]:
np.mean(((X-np.mean(X))/np.std(X))**4)

In [None]:
stats.skew(X)

In [None]:
stats.moment(X, moment=3)

In [None]:
(stats.moment(X, moment=4)/(np.var(X)**2))-3

In [None]:
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper.fit(shape_moments)
    
# with open('../../../data_GRS1915/UMAPmapper_shape16_4moments_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

In [None]:
# with open('../../../data_GRS1915/UMAPmapper_20d_shape16_moments4_trainedonall_468202.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

with open('../../../data_GRS1915/UMAPmapper_moments4standardized_trainedonall_468202.pkl', 'rb') as f:
    UMAP_mapper = pickle.load(f)

In [None]:
umaped_data = UMAP_mapper.transform(shape_moments[:,16:])

In [None]:
plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])
plt.show()

In [None]:
# load observation classifications from Huppenkothen 2017
# %matplotlib inline

# import matplotlib.pyplot as plt


clean_belloni = open('../../../data_GRS1915/1915Belloniclass_updated.dat')
lines = clean_belloni.readlines()
states = lines[0].split()
belloni_clean = {}
for h,l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()
    #state: obsID1, obsID2...
ob_state = {}
for state, obs in belloni_clean.items():
    if state == "chi1" or state == "chi2" or state == "chi3" or state == "chi4": state = "chi"
    for ob in obs:
        ob_state[ob] = state
        
# load segmented light curves

import pickle
# Segment counts and their IDs come from two parallel pickles.
# NOTE(review): presumably seg_ids[i] identifies segments[i] — confirm.
# pickle.load can execute arbitrary code; only load pickles written by this project.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    seg_ids = pickle.load(f)

# # HF QPO observation ids
# paper_obIDs = np.loadtxt("../../../data_GRS1915/Belloni_Altamirano_obsIDs.txt", dtype=str)

# qpo_colours = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_colours.append("red")
#     else:
#         qpo_colours.append("grey")
        
# qpo_labels = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_labels.append("QPO")
#     else:
#         qpo_labels.append("other")
        
        
# qpo_scales = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_scales.append("QPO")
#     else:
#         qpo_scales.append("other")
        
        
# Observation ID of every segment: the part of the segment ID before the first underscore.
xxx = [seg.split("_")[0] for seg in seg_ids]

classes = np.array(["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"])

# Position of each segment's class within `classes`; 15 marks an unclassified observation.
class_colour = [
    np.where(classes == ob_state[ob])[0][0] if ob in ob_state else 15
    for ob in xxx
]

# Class name per segment ("Unknown" when unclassified) and a matching marker size.
segment_class = [ob_state.get(ob, "Unknown") for ob in xxx]
scales = [5 if ob in ob_state else 0.1 for ob in xxx]
        
        
from matplotlib import cm
# The former `cm.get_cmap(plt.get_cmap("Set1"))` call was dropped: its result was discarded,
# and `matplotlib.cm.get_cmap` no longer exists in recent Matplotlib releases.


colours = ['#ffd8b1', '#000075', '#808080', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#000000']

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# sns.set_style("white")
# Global defaults for the figures created below. Panel titles pass their own fontsize.
# NOTE(review): font size 0 presumably hides the tick labels on purpose — confirm.
plt.rcParams['figure.figsize'] = (30.0, 30.0)
plt.rcParams.update({'font.size': 0})

embeddings_lap = umaped_data

# fig, ax = plt.subplots()

# One panel per class: unclassified segments as a grey background, the class in red.
fig, axs = plt.subplots(4, 4)
axs = axs.flatten()

# Convert the label list once and select the "Unknown" background once; both are
# identical for every panel, so there is no need to rebuild them inside the loop.
segment_class_arr = np.array(segment_class)
unknown_data = embeddings_lap[np.where(segment_class_arr == "Unknown")[0]]

for plot_class_ind, plot_class in enumerate(classes):
    axs[plot_class_ind].scatter(unknown_data[:,0], unknown_data[:,1], s = 0.2, c="grey", label="Unknown")

    class_indices = np.where(segment_class_arr == plot_class)[0]
    class_data = embeddings_lap[class_indices]

    axs[plot_class_ind].scatter(class_data[:,0], class_data[:,1], s = 25, c='red', label=plot_class)

    # plt.legend()
    axs[plot_class_ind].set_title("{}".format(plot_class), fontsize=42)
# (The former `axs.reshape((4,4))` was dropped: its result was discarded.)
# plt.savefig("classes_separate.png")

# plt.savefig("UMAP_embedding_separate_classes_model_2020-02-09_10-36-06.png")
plt.show()


# redint = np.where(np.array(qpo_colours) == "red")
# greyint= np.where(np.array(qpo_colours) != "red")
# plt.scatter(embeddings_lap[:,0][greyint], embeddings_lap[:,1][greyint], s=1, c="grey", label= "other")
# plt.scatter(embeddings_lap[:,0][redint], embeddings_lap[:,1][redint], s=1, c="red", label= "HF QPO")
# plt.title("UMAP embedding of the encoded GRS1915 segments, neighbors=50, min_dist=0.0, components=2", fontsize=12)
# plt.legend()
# plt.show()

In [None]:
# Fit a UMAP with library-default hyper-parameters on the first 50,000 histograms only
# (index 0 of the last axis).
# NOTE(review): histogram_data is loaded in a cell further down the notebook; on a fresh
# kernel that cell has to be run before this one.
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper.fit(histogram_data[:50000,:,0])

In [None]:
umaped_data = UMAP_mapper.transform(histogram_data[:,:,0])

In [None]:
# Scatter of the first two embedding columns. Axis limits are the data range padded by
# 0.5 on each side, and the figure size (inches) is set to that same padded extent.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (abs(x_lo - x_hi), abs(y_lo - y_hi))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

In [None]:
# Load the per-segment histograms and standardise them.
histograms_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
with open(histograms_dir, 'rb') as f:
    histogram_data = pickle.load(f)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)
    
# errors = ((errors)/np.std(segments)).astype(np.float32)
# axis=None: a single mean/std computed over the whole array is used for every element.
# NOTE(review): a later cell standardises the same file with axis=0 instead — confirm
# which scaling the saved mappers were trained with.
histogram_data = zscore(histogram_data, axis=None).astype(np.float32)  # standardize

In [None]:
# with open('../../../data_GRS1915/UMAPmapper_raw_histograms_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

with open('../../../data_GRS1915/UMAPmapper_20d_shape16_moments4_trainedonall_468202.pkl', 'rb') as f:
    UMAP_mapper = pickle.load(f)

In [None]:
umaped_data = UMAP_mapper.transform(shape_moments)

In [None]:
histogram_data.shape

In [None]:
# load observation classifications from Huppenkothen 2017
# %matplotlib inline

# import matplotlib.pyplot as plt


# Parse the class table: the first line lists the state names, and each following line
# holds the observation IDs of the state in the same position.
# The with-block closes the file as soon as its lines have been read.
with open('../../../data_GRS1915/1915Belloniclass_updated.dat') as clean_belloni:
    lines = clean_belloni.readlines()
states = lines[0].split()
belloni_clean = {}
for h,l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()
    #state: obsID1, obsID2...
# Invert to observation ID -> state, merging the four chi sub-classes into "chi".
ob_state = {}
for state, obs in belloni_clean.items():
    if state in ("chi1", "chi2", "chi3", "chi4"): state = "chi"
    for ob in obs:
        ob_state[ob] = state
        
# load segmented light curves

import pickle
# Segment counts and their IDs come from two parallel pickles.
# NOTE(review): presumably seg_ids[i] identifies segments[i] — confirm.
# pickle.load can execute arbitrary code; only load pickles written by this project.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    seg_ids = pickle.load(f)

# # HF QPO observation ids
# paper_obIDs = np.loadtxt("../../../data_GRS1915/Belloni_Altamirano_obsIDs.txt", dtype=str)

# qpo_colours = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_colours.append("red")
#     else:
#         qpo_colours.append("grey")
        
# qpo_labels = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_labels.append("QPO")
#     else:
#         qpo_labels.append("other")
        
        
# qpo_scales = []

# for seg_id in seg_ids:
#     if seg_id.split("_")[0] in paper_obIDs:
#         qpo_scales.append("QPO")
#     else:
#         qpo_scales.append("other")
        
        
# Observation ID of every segment: the part of the segment ID before the first underscore.
xxx = [seg.split("_")[0] for seg in seg_ids]

classes = np.array(["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"])

# Position of each segment's class within `classes`; 15 marks an unclassified observation.
class_colour = [
    np.where(classes == ob_state[ob])[0][0] if ob in ob_state else 15
    for ob in xxx
]

# Class name per segment ("Unknown" when unclassified) and a matching marker size.
segment_class = [ob_state.get(ob, "Unknown") for ob in xxx]
scales = [5 if ob in ob_state else 0.1 for ob in xxx]
        
        
from matplotlib import cm
# The former `cm.get_cmap(plt.get_cmap("Set1"))` call was dropped: its result was discarded,
# and `matplotlib.cm.get_cmap` no longer exists in recent Matplotlib releases.


colours = ['#ffd8b1', '#000075', '#808080', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#000000']

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# sns.set_style("white")
# Global defaults for the figures created below. Panel titles pass their own fontsize.
# NOTE(review): font size 0 presumably hides the tick labels on purpose — confirm.
plt.rcParams['figure.figsize'] = (30.0, 30.0)
plt.rcParams.update({'font.size': 0})

embeddings_lap = umaped_data

# fig, ax = plt.subplots()

# One panel per class: unclassified segments as a grey background, the class in red.
fig, axs = plt.subplots(4, 4)
axs = axs.flatten()

# Convert the label list once and select the "Unknown" background once; both are
# identical for every panel, so there is no need to rebuild them inside the loop.
segment_class_arr = np.array(segment_class)
unknown_data = embeddings_lap[np.where(segment_class_arr == "Unknown")[0]]

for plot_class_ind, plot_class in enumerate(classes):
    axs[plot_class_ind].scatter(unknown_data[:,0], unknown_data[:,1], s = 0.2, c="grey", label="Unknown")

    class_indices = np.where(segment_class_arr == plot_class)[0]
    class_data = embeddings_lap[class_indices]

    axs[plot_class_ind].scatter(class_data[:,0], class_data[:,1], s = 25, c='red', label=plot_class)

    # plt.legend()
    axs[plot_class_ind].set_title("{}".format(plot_class), fontsize=42)
# (The former `axs.reshape((4,4))` was dropped: its result was discarded.)
# plt.savefig("classes_separate.png")

# plt.savefig("UMAP_embedding_separate_classes_model_2020-02-09_10-36-06.png")
plt.show()


# redint = np.where(np.array(qpo_colours) == "red")
# greyint= np.where(np.array(qpo_colours) != "red")
# plt.scatter(embeddings_lap[:,0][greyint], embeddings_lap[:,1][greyint], s=1, c="grey", label= "other")
# plt.scatter(embeddings_lap[:,0][redint], embeddings_lap[:,1][redint], s=1, c="red", label= "HF QPO")
# plt.title("UMAP embedding of the encoded GRS1915 segments, neighbors=50, min_dist=0.0, components=2", fontsize=12)
# plt.legend()
# plt.show()

In [None]:
# Load the histograms, standardise them, fit a default UMAP on the first 50,000 and
# embed all of them.
histograms_dir = '../../../data_GRS1915/468202_len128_s2_4cad_histograms_24bin_0-13k_errorfix.pkl'
with open(histograms_dir, 'rb') as f:
    histogram_data = pickle.load(f)
# with open('../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'rb') as f:
#     errors = pickle.load(f)
    
# errors = ((errors)/np.std(segments)).astype(np.float32)
# axis=0: each position along the remaining axes is standardised across segments.
# NOTE(review): an earlier cell standardises the same file with axis=None — confirm which
# is intended. With axis=0, any bin that is constant across segments has zero standard
# deviation and would become NaN; verify none are before fitting.
histogram_data = zscore(histogram_data, axis=0).astype(np.float32)  # standardize

UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0, local_connectivity, repulsion_strength, negative_sample_rate
UMAP_mapper.fit(histogram_data[:50000,:,0])
    
# with open('../../../data_GRS1915/UMAPmapper_shape16_4moments_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

umaped_data = UMAP_mapper.transform(histogram_data[:,:,0])

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix

In [None]:
np.std(shape_desc_GM, axis=0)

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 15.0)
plt.rcParams.update({'font.size': 12})
# plt.rcParams.update(plt.rcParamsDefault)

In [None]:
# Density histograms of the last four columns of shape_moments: all segments first, then
# the "phi" class overlaid on the same axes.
fig, axs = plt.subplots(2, 2)
axs = axs.flatten()

moment_names = ["mean", "st.dev.", "skew", "kurt"]

# The class selection does not depend on the panel, so do it once instead of converting
# the label list to an array on every iteration.
phi_rows = shape_moments[np.where(np.array(segment_class) == "phi")[0]]

for plot_ind, moment_index in enumerate([-4, -3, -2, -1]):
    axs[plot_ind].hist(shape_moments[:,moment_index], density=True)
    axs[plot_ind].hist(phi_rows[:,moment_index], alpha=0.8, density=True)
    axs[plot_ind].set_title("{}".format(moment_names[plot_ind]), fontsize=42)
# (The former trailing `axs.reshape((2,2))` was dropped: it changed nothing and only made
# the cell print the array of Axes objects.)

# plt.title("Normed distribution of chi class moments")

In [None]:
shape_moments[np.where(np.array(segment_class) == "chi")[0]]

In [None]:
# Column -4 for all segments, with the "chi" segments overlaid.
chi_rows = np.array(segment_class) == "chi"
plt.hist(shape_moments[:,-4])
plt.hist(shape_moments[chi_rows, -4], alpha=0.8)
plt.show()

In [None]:
# Column -3 for all segments, with the "chi" segments overlaid.
chi_rows = np.array(segment_class) == "chi"
plt.hist(shape_moments[:,-3])
plt.hist(shape_moments[chi_rows, -3], alpha=0.8)
plt.show()

In [None]:
# Column -2 for all segments, with the "chi" segments overlaid.
chi_rows = np.array(segment_class) == "chi"
plt.hist(shape_moments[:,-2])
plt.hist(shape_moments[chi_rows, -2], alpha=0.8)
plt.show()

In [None]:
from sklearn.mixture import GaussianMixture

comp_no_list = [5, 10, 30, 50, 70, 90, 100, 110, 120, 150, 170, 200]

criteria = np.zeros((2,len(comp_no_list)))


# For each candidate component count: fit a full-covariance GMM on the first 50,000 rows,
# then evaluate AIC (row 0 of `criteria`) and BIC (row 1) on all of shape_moments.
for ind, n_components in enumerate(comp_no_list):

    clf = GaussianMixture(n_components=n_components, covariance_type='full', verbose=1)
    clf.fit(shape_moments[:50000,:])
    
    criteria[0,ind] = clf.aic(shape_moments)
    criteria[1,ind] = clf.bic(shape_moments)
    print(n_components, criteria[:,ind])
    
#     with open('shape16_moments4_criteria_search_50k.pkl', 'wb') as f:
#         pickle.dump(clf, f)
    
#     with open('../../../data_GRS1915/shape16_moments4_components{}_alldata.pkl'.format(n_components), 'wb') as f:
#         pickle.dump(clf, f)
    
    # The criteria table is rewritten after every fit, so partial results survive an
    # interrupted run; columns not reached yet still hold their initial zeros.
    with open('shape16_moments4_criteria_search_50k.pkl', 'wb') as f:
        pickle.dump(criteria, f)

In [None]:
from sklearn.mixture import GaussianMixture

comp_no_list = np.arange(113,116, 1)

criteria = np.zeros((2,len(comp_no_list)))


# For each component count: fit a full-covariance GMM on all of shape_moments, record
# AIC/BIC, and pickle the fitted model under a name containing the component count.
# `criteria` itself is only printed here; its dump at the end of the cell is commented out.
for ind, n_components in enumerate(comp_no_list):

    clf = GaussianMixture(n_components=n_components, covariance_type='full', verbose=1)
    clf.fit(shape_moments[:,:])
    
    criteria[0,ind] = clf.aic(shape_moments)
    criteria[1,ind] = clf.bic(shape_moments)
    print(n_components, criteria[:,ind])
    
#     with open('shape16_moments4_criteria_search_arange110-150-2_50k.pkl', 'wb') as f:
#         pickle.dump(clf, f)
    
    with open('../../../data_GRS1915/shape16_moments4_components{}_alldata.pkl'.format(n_components), 'wb') as f:
        pickle.dump(clf, f)
    
#     with open('shape_desc_stat_24D_GMM_aic_bic_range145-160-2comps_50K.pkl', 'wb') as f:
#         pickle.dump(criteria, f)

In [None]:
# Cluster all of shape_moments with DBSCAN (eps=1.5, min_samples=5) and pickle the fitted
# estimator. The label counts of earlier eps/min_samples trials are logged as comments in
# the next cell.
from sklearn.cluster import DBSCAN
clf_dbscan = DBSCAN(eps=1.5,min_samples=5, n_jobs=7)
clf_dbscan.fit(shape_moments)
with open('../../../data_GRS1915/DBSCAN_shape16_moments4_eps1-5_min_samp5_alldata.pkl', 'wb') as f:
    pickle.dump(clf_dbscan, f)

In [None]:
# eps = 1.5, min_samp15

# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
#         33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]),
#  array([189574, 224915,  19939,  25596,   3785,    390,    172,   2230,
#            361,    348,     26,    115,     47,     20,     32,     39,
#             45,     25,     12,     45,     16,     15,     74,     20,
#             23,     44,     20,     31,     12,     26,     15,     12,
#              8,     14,     20,     16,     10,     13,     23,      8,
#             16,      9,     15,     11,     15]))



# eps = 1.3, min_samp40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17]),
#  array([318569,  34645,   4110,  78114,   9796,  18123,    846,   2249,
#            402,    420,     55,     85,    473,     42,     44,    103,
#             40,     44,     42]))

# eps = 1.5, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18]),
#  array([219747, 197332,   7912,   2613,  11103,  23748,   2352,    820,
#            413,   1126,    121,    116,    141,    235,     93,    113,
#             42,    104,     46,     25]))

# eps = 1.6, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20]),
#  array([186871, 225196,   8361,   3542,  36848,     71,   3290,    154,
#            186,   1787,    225,    512,    261,    313,     41,     59,
#            116,    204,     52,     28,     40,     45]))

# eps = 1.7, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
#  array([159151, 253519,  47322,   3951,    414,    162,   2297,    308,
#            574,     67,     58,    351,     28]))

# eps = 1.8, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]),
#  array([135415, 274741,  49058,   2690,   4388,    649,    171,    368,
#             89,     67,    409,     40,     63,     34,     20]))

# eps = 1.9, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
#  array([113161, 294350,  50515,   4692,   2978,   1071,    411,    177,
#            459,    124,     72,     45,     32,     34,     41,     40]))

# eps = 1.9, min_samp=30
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19]),
#  array([106113, 300526,  54158,   4788,   1266,    431,    179,     49,
#            189,     84,     37,     39,     96,     18,     35,     40,
#             30,     17,     47,     26,     34]))

# eps = 1.9, min_samp=25
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20, 21, 22]),
#  array([101173, 304979,  54720,   4893,   1379,    277,    112,     65,
#             55,     20,    199,     25,     28,     26,     26,     19,
#             48,     25,     31,     25,     13,     22,     23,     19]))

# eps = 1.9, min_samp=20
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]),
#  array([ 94687, 309998,  56803,   4987,    412,    169,     16,    316,
#             46,     20,     71,     87,     46,     60,     58,     43,
#             54,     72,     35,     11,     27,     21,     23,     16,
#             31,     18,      8,     18,     30,     19]))

# eps = 1.9, min_samp=15
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
#         33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
#         50, 51, 52, 53]),
#  array([ 85861, 374589,   5108,     57,    537,    146,    222,     25,
#             28,     74,    440,     12,     50,     10,     12,     67,
#             14,     50,    122,     94,     60,     31,     15,     51,
#             78,     15,     33,     25,     19,     11,     15,      9,
#              8,     12,     21,     15,     15,     31,     14,     13,
#             16,     12,      9,     13,      5,     13,     10,     15,
#             13,     20,      8,     15,     16,     11,     17]))

# eps = 2.0, min_samp=20
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
#         16, 17, 18, 19, 20, 21, 22, 23, 24, 25]),
#  array([ 71817, 388777,   5265,    189,    625,    591,     60,     12,
#             34,    155,    130,     98,     16,     36,     19,     76,
#             56,     23,     23,     64,     20,     19,     24,     16,
#             21,     16,     20]))


# eps = 2.0, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6]),
#  array([ 91131, 314434,  56881,   4938,    460,    193,    125,     40]))

#eps = 2.1, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8]),
#  array([ 69832, 391938,   5250,    509,    370,     78,     68,     45,
#             71,     41]))

# eps = 2.2, min_samp=40
# (array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9]),
#  array([ 51761, 414550,    744,    619,     71,    158,     57,    101,
#             41,     39,     61]))

np.unique(clf_dbscan.labels_, return_counts=1)

In [None]:
with open('../../../data_GRS1915/UMAPmapper_20d_shape16_moments4_trainedonall_468202.pkl', 'rb') as f:
    UMAP_mapper = pickle.load(f)
    
umaped_data = UMAP_mapper.transform(shape_moments[:,:])

# clf = GaussianMixture(n_components=50, covariance_type='full', verbose=0)
# clf.fit(shape_moments[:50000,:])
# custer_labels = clf.predict(shape_moments[:,:])

# NUM_COLORS = 50
# cm = plt.get_cmap("winter")#('gist_rainbow')
# colors = [cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)]
# mapped_colors = [colors[i] for i in custer_labels]


# rcParams only apply to figures created after they are set, so the size has to be
# configured before plt.subplots() for this figure to actually be 10 x 10 inches.
plt.rcParams['figure.figsize'] = (10,10)

fig, axes = plt.subplots(nrows=1, ncols=1)

# Each point is coloured by its DBSCAN label.
axes.scatter(umaped_data[:,0], umaped_data[:,1], s=0.1, c=clf_dbscan.labels_, marker=".")

axes.set_xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
axes.set_ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])


axes.set_yticks([])
axes.set_xticks([])

# plt.savefig('figures/classes/shape_moments_50clusters_alldata_winter_10x10.png', dpi=600)

plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (20.0, 20.0)
plt.rcParams.update({'font.size': 12})

# One curve per neighbour rank: column k of `distances`, which is computed (and sorted
# column by column) in the nearest-neighbour cell that follows this one.
for k_rank in (5, 10, 20, 39, 50, 100, 200, 499):
    plt.plot(distances[:,k_rank], label="k={}".format(k_rank))

plt.title("Sorted k-distance for 468202 samples of 20d data")
plt.xlabel("k-order")
plt.ylabel("distance (Euclidean)")
plt.ylim((0,4))
plt.legend()
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors
# X = shape_desc_GM[:50000,:]

# for k in [30]:
# Euclidean distances from every row of shape_moments to its 500 nearest neighbours,
# searched within shape_moments itself.
neigh = NearestNeighbors(n_neighbors=500, n_jobs=7, metric='euclidean')
nbrs = neigh.fit(shape_moments)
distances, indices = nbrs.kneighbors(shape_moments)
# Sorts every column independently: afterwards column k holds the k-th-neighbour distances
# of all points in ascending order, and rows no longer line up with samples or `indices`.
distances = np.sort(distances, axis=0)
#distances = distances[:,-1]
# with open('../../../data_GRS1915/k-NN_distances_all_data_20D_468202samples_k0to500.pkl', 'wb') as f:
#     pickle.dump(distances, f)
#     clear_output(wait=True)
# plt.plot(distances)
# plt.ylim((0,4))

# Gaussian mixture model analysis (20-dimensional data, 114 Gaussian components)

In [None]:
with open("{}/shape16_moments4_components114_alldata.pkl".format(data_dir), 'rb') as f:
    clf_GM114 = pickle.load(f)

# v, w = np.linalg.eig(clf_GM114.covariances_)


In [None]:
means = clf_GM114.means_
covs = clf_GM114.covariances_

In [None]:
from numpy.random import multivariate_normal
from scipy.spatial.distance import mahalanobis
from scipy.spatial.distance import euclidean


In [None]:
# dists[i, s, j]: Mahalanobis distance of the s-th sample drawn from component i to the
# mean of component j, measured with component j's covariance.
dists=np.zeros((114,100,114))
# Invert every covariance once, rather than once per (sample, component) pair.
inv_covs = np.linalg.inv(covs)
for comp1 in range(114):
    #for comp in range(114):
    # A frozen SciPy distribution is needed for .rvs(); the bare name `multivariate_normal`
    # imported in the cell above is the NumPy sampler, which returns an array instead.
    multivar_dist = stats.multivariate_normal(mean=means[comp1], cov=covs[comp1])
    samples = multivar_dist.rvs(size=100)
    for ns, sample in enumerate(samples):
        for comp2 in range(114):
            dists[comp1,ns,comp2]=mahalanobis(sample, means[comp2], inv_covs[comp2])

In [None]:
# dists[i, s, j]: Mahalanobis distance of the s-th sample drawn from component i to the
# mean of component j, measured with component j's covariance.
dists=np.zeros((114,100,114))
# Invert every covariance once, rather than once per (sample, component) pair.
inv_covs = np.linalg.inv(covs)
for comp1 in range(114):
    #for comp in range(114):
    # A frozen SciPy distribution is needed for .rvs(); the bare name `multivariate_normal`
    # imported in an earlier cell is the NumPy sampler, which returns an array instead.
    multivar_dist = stats.multivariate_normal(mean=means[comp1], cov=covs[comp1])
    samples = multivar_dist.rvs(size=100)
    for ns, sample in enumerate(samples):
        for comp2 in range(114):
            dists[comp1,ns,comp2]=mahalanobis(sample, means[comp2], inv_covs[comp2])

In [None]:
# Same measurement as the cells above, restricted to 10,000 samples of component 0:
# dists[0, s, j] is the Mahalanobis distance of sample s to the mean of component j.
dists=np.zeros((1,10000,114))
# Invert every covariance once, rather than once per sample.
inv_covs = np.linalg.inv(covs)
for comp1 in range(1):
    #for comp in range(114):
    # A frozen SciPy distribution is needed for .rvs(); the bare name `multivariate_normal`
    # imported in an earlier cell is the NumPy sampler, which returns an array instead.
    multivar_dist = stats.multivariate_normal(mean=means[comp1], cov=covs[comp1])
    samples = multivar_dist.rvs(size=10000)
    for comp2 in range(114):
        dists[comp1,:,comp2]=[mahalanobis(sample, means[comp2], inv_covs[comp2]) for sample in samples]
        # progress indicator, overwritten in place
        print([comp1, comp2])
        clear_output(wait=True)

In [None]:
# Draw 100,000 samples from every component.
# Despite its name, `dists` holds the raw samples here (component, draw, feature), not
# distances; the distance computation below is commented out.
# `multivariate_normal(..., size=...)` is the NumPy sampler imported in an earlier cell.
dists=np.zeros((114,100000,20))
for comp1 in range(114):
    #for comp in range(114):
    samples = multivariate_normal(mean=means[comp1], cov=covs[comp1], size=100000)
    dists[comp1,:,:]=np.array(samples)
#     samples = multivar_dist.rvs(size=10000)
#     for ns, sample in enumerate(samples):
#     for comp2 in range(114):
#         mahalanobis_l = lambda sample: mahalanobis(sample, means[comp2], np.linalg.inv(covs[comp2]))
#         mahal_mapper = map(mahalanobis_l,samples)
#         dists[comp1,:,comp2]=np.array(list(mahal_mapper))
    # progress indicator, overwritten in place
    print([comp1])
    clear_output(wait=True)

In [None]:
samples.shape

In [None]:
multivar_dist = multivariate_normal(mean=means[comp1], cov=covs[comp1])
multivar_dist

In [None]:
# Compare Euclidean and Mahalanobis distances from a component's mean for 10,000 samples
# drawn from that component.
# NOTE(review): `comp1` is whatever value an earlier cell left behind — confirm which
# component is meant.
samples = np.random.multivariate_normal(mean=means[comp1], cov=covs[comp1], size=10000)
# samples = multivar_dist.rvs(size=10000)
euclidean_l = lambda sample: euclidean(sample, means[comp1])
euclid_mapper = map(euclidean_l,samples)
euclid_dists=np.array(list(euclid_mapper))

# Invert the covariance once; it used to be re-inverted for each of the 10,000 samples.
inv_cov = np.linalg.inv(covs[comp1])
mahalanobis_l = lambda sample: mahalanobis(sample, means[comp1], inv_cov)
mahal_mapper = map(mahalanobis_l,samples)
mahal_dists=np.array(list(mahal_mapper))

plt.hist(mahal_dists, label="Mahalanobis")
plt.hist(euclid_dists, label="Euclidean", alpha=0.5)
plt.legend()
plt.xlabel("distance")
plt.show()

In [None]:
from scipy.spatial.distance import mahalanobis
from scipy.spatial.distance import euclidean

# Sanity check on a standard normal: with an identity covariance the Mahalanobis and
# Euclidean distances from the origin coincide, so the two histograms should overlap.
dimensions = 20
mean = np.zeros(dimensions) # sampling from standard normal distribution
cov = np.identity(dimensions)
samples = np.random.multivariate_normal(mean=mean, cov=cov, size=100000)

euclidean_l = lambda sample: euclidean(sample, mean)
euclid_mapper = map(euclidean_l,samples) # euclidean distance calculation
euclid_dists=np.array(list(euclid_mapper))

# Invert the covariance once; it used to be re-inverted for each of the 100,000 samples.
inv_cov = np.linalg.inv(cov)
mahalanobis_l = lambda sample: mahalanobis(sample, mean, inv_cov)
mahal_mapper = map(mahalanobis_l,samples) # mahalanobis distance calculation
mahal_dists=np.array(list(mahal_mapper))

plt.hist(mahal_dists, label="Mahalanobis")
plt.hist(euclid_dists, label="Euclidean", alpha=0.5)
plt.legend()
plt.xlabel("distance from the mean/origin in {}d".format(dimensions))
plt.show()

In [None]:
np.sqrt(20)

In [None]:
np.mean(euclid_dists)

In [None]:
np.mean(samples, axis=0)

In [None]:
# Chi-squared distribution with 20 degrees of freedom: analytic pdf against a histogram of
# 10,000 random draws.
from scipy.stats import chi2
df=20
rv = chi2(df)
r = chi2.rvs(df, size=10000)
x = np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
plt.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
# `density=True` replaces the `normed=True` keyword, which Matplotlib no longer accepts;
# it is the spelling the other histograms in this notebook already use.
plt.hist(r, density=True, histtype='stepfilled', alpha=0.2)
plt.xlim([0,50])
# plt.legend(loc='best', frameon=False)
print(np.min(r))
plt.show()

In [None]:
np.min(r)

In [None]:
# with open('{}/GMM114_1000samples_mahalanobis_dists.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(dists, f)

In [None]:
# Collect (i, j) whenever some sample of component i lies closer to the mean of component j
# than the farthest sample of i lies from its own mean.
# NOTE(review): (i, i) always satisfies this whenever the samples of i are not all
# equidistant from their own mean, so `pairs` includes the self-pairs — confirm whether
# they should be counted.
pairs=[]
# The row variable is not called `comp1`: other cells read `comp1` as an integer
# component index, and reusing the name here would replace it with a 2-D array.
for comp1_ind, comp1_dists in enumerate(dists):
    dists_by_target = comp1_dists.T
    max_self = np.max(dists_by_target[comp1_ind])
    for comp12_ind, comp12 in enumerate(dists_by_target):
        if np.min(comp12) < max_self:
            pairs.append((comp1_ind,comp12_ind))

In [None]:
len(pairs)

In [None]:
import math
math.factorial(114)

In [None]:
mahalanobis_l = lambda sample, dist_ind: mahalanobis(sample, means[dist_ind], np.linalg.inv(covs[dist_ind]))

In [None]:
# map() with two iterables walks them in lock-step: row i of `samples` is paired with
# component i, and iteration stops at the shorter of the two (at most 114 pairs).
# NOTE(review): this is not every-sample-to-every-component — confirm that is intended.
dists_l = map(mahalanobis_l,samples, list(range(114)))

In [None]:
np.array(list(dists_l)).shape

In [None]:
# Largest distance between each component's own samples and its own mean.
maxes = [np.max(dists[comp_ind, :, comp_ind]) for comp_ind in range(114)]

In [None]:
plt.hist(maxes)
plt.show()

In [None]:
n=0
plt.hist(dists[n,:,n])
plt.xlabel("Mahalanobis distance between mean of cluster 1 and points sampled from it")
plt.show()

In [None]:
def mahalanobis_ML(x=None, data=None, cov=None):
    """Squared Mahalanobis distance between each row of ``x`` and ``data``.

    x    : vector (p,) or matrix (n, p) of observations.
    data : reference point subtracted from ``x`` using NumPy broadcasting.
    cov  : covariance matrix (p x p) of the distribution. Required: it is not
           estimated from ``data``.

    Returns a 1-D array with one value per row of ``x``, equal to
    ``(x_i - data) @ inv(cov) @ (x_i - data)``. No square root is taken, so the
    values are *squared* distances.

    Raises ValueError if any of the three arguments is None.
    """
    if x is None or data is None or cov is None:
        raise ValueError("x, data and cov must all be provided")
    # atleast_2d lets a single 1-D observation go through the same row-wise code path.
    x_minus_mu = np.atleast_2d(np.asarray(x) - np.asarray(data))
    inv_covmat = np.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    # Row-wise dot product. This equals the diagonal of left_term @ x_minus_mu.T without
    # materialising the n x n matrix (about 800 MB of float64 for n = 10,000).
    return np.einsum('ij,ij->i', left_term, x_minus_mu)

In [None]:
(samples-means[comp])[2] - (samples[2]-means[comp])

In [None]:
mahalanobis_ML(samples, means[comp], covs[comp])

In [None]:
from scipy.special import gamma
import pandas as pd

# Per-component "3-sigma" ellipsoid volume and member count.
# NOTE(review): `v` and `preds` are defined elsewhere; `v[comp]` is presumably the
# eigenvalues of component comp's covariance (see the commented-out np.linalg.eig line in
# the cell that loads clf_GM114) and `preds` the per-segment component labels — confirm.
n=20
component_vol_members = np.zeros((114, 2))

# Volume of the unit n-ball; identical for every component.
unit_ball_vol = np.pi**(n/2)/gamma(n/2+1)
# Count the members once instead of re-running np.unique on every iteration.
# NOTE(review): counts are indexed by position among the labels that actually occur, so
# they line up with component numbers only if every component has at least one member.
member_counts = np.unique(preds, return_counts=True)[1]

for comp in range(114):
    # Semi-axes are 3 * sqrt(v[comp]). np.prod replaces np.product, which NumPy 2.0 removed.
    component_vol_members[comp,0] = unit_ball_vol*np.prod(np.sqrt(v[comp])*3)
    component_vol_members[comp,1] = member_counts[comp]

component_vol_members_df = pd.DataFrame(component_vol_members, columns=("volume", "members"))
component_vol_members_df["volume_per_member"] = component_vol_members_df.volume/component_vol_members_df.members
component_vol_members_df

In [None]:
component_vol_members_df.sort_values(by=['volume'])

In [None]:
# "3-sigma" ellipsoid volume of the whole data set: semi-axes are 3 * sqrt(explained
# variance) along each principal component of shape_moments.
# `gamma` (scipy.special) is imported in the component-volume cell above.
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(shape_moments)
n=20
# np.prod replaces np.product, which NumPy 2.0 removed.
dataset_vol = (np.pi**(n/2)/gamma(n/2+1))*np.prod(np.sqrt(pca.explained_variance_)*3)

In [None]:
dataset_vol/(component_vol_members_df.volume.sum())

# $\frac{\pi^{\frac{n}{2}}}{\Gamma({\frac{n}{2}+1})} \prod_{a=1}^{n} c_{a}$

This is the volume of an $n$-dimensional ellipsoid, where $n$ is the number of dimensions and $c_a$ is the length of the semi-axis in dimension $a$.


In [None]:
np.cumsum(component_vol_members_df.sort_values(by=['volume']).volume.values)

In [None]:
# Running totals of volume and member count, with components ordered by ascending volume.
components_by_volume = component_vol_members_df.sort_values(by=['volume'])
cum_sort_comp_vols = np.cumsum(components_by_volume.volume.values)
cum_sort_comp_members = np.cumsum(components_by_volume.members.values)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scratch cell: synthetic curves (an exponential and a sine) used only to try out the
# two-y-axis layout below; they are not part of the analysis.
t = np.arange(0.01, 10.0, 0.01)
data1 = np.exp(t)
data2 = np.sin(2 * np.pi * t)

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('time (s)')
ax1.set_ylabel('exp', color=color)
ax1.plot(t, data1, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('sin', color=color)  # we already handled the x-label with ax1
ax2.plot(t, data2, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

In [None]:
# https://matplotlib.org/examples/axes_grid/demo_parasite_axes2.html

fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax3 = ax1.twinx()

# offset = 60
# new_fixed_axis = ax3.get_grid_helper().new_fixed_axis
# ax3.axis["right"] = new_fixed_axis(loc="right",
#                                     axes=ax3,
#                                     offset=(offset, 0))

# ax3.axis["right"].toggle(all=True)



# ax1 (log scale): cumulative component volume, with the data-set volume drawn as a
# horizontal reference line across component positions 0..113.
ax1.plot([0,113], [dataset_vol]*2, label="3-sigma data set vol", color = "black")
ax1.plot(cum_sort_comp_vols, label="3-sigma component vol", color = "tab:blue")
# ax1.tick_params(axis='y', color="tab:blue")
ax1.set_yscale('log')
ax1.set_ylabel("Cumulative volume", color="tab:blue")


# ax2 (linear scale): cumulative member count in the same volume-sorted component order.
ax2.plot(cum_sort_comp_members, label="number of members", color = "tab:red")
# ax2.tick_params(axis='y', color="tab:red")
ax2.set_ylabel("Cumulative number of members", color="tab:red")

# ax3 (log scale): volume per member of each component, not cumulative.
ax3.plot(component_vol_members_df.sort_values(by=['volume']).volume_per_member.values, label="volume per member", color = "tab:green")
ax3.set_yscale('log')
ax3.set_ylabel("Volume per member", color="tab:green")

# Push the third y-axis outwards so it does not sit on top of ax2's axis.
ax3.spines["right"].set_position(("axes", 1.2))

fig.legend(loc='lower right', bbox_to_anchor=(0.8, 0.1))
# plt.yscale("log")
# plt.xlabel("Gaussian components")
# plt.ylabel("Cumulative volume")
plt.draw()
plt.show()

In [None]:
# Log-volumes, shifted for use as scatter marker sizes in the plot two cells below.
log_vols = np.log(component_vol_members_df.volume.values)
# NOTE(review): adding |min| moves the smallest value to 0 only when the minimum is
# negative; that component then gets marker size 0 and is invisible — confirm intended.
log_vols+=abs(np.min(log_vols))

In [None]:
umap_mapper_GM114 = umap.UMAP()
umaped_data = umap_mapper_GM114.fit_transform(clf_GM114.means_)

In [None]:
# Embedded component means, with marker area taken from the shifted log-volumes.
# Axis limits are the data range padded by 0.5 on each side.
x_lo, x_hi = np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5
y_lo, y_hi = np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5

plt.rcParams.update({'font.size': 12})
plt.rcParams['figure.figsize'] = (10,10)
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=log_vols)
plt.xlim([x_lo, x_hi])
plt.ylim([y_lo, y_hi])
plt.show()

In [None]:
from sklearn.cluster import OPTICS

In [None]:
# Run OPTICS on the full `shape_moments` matrix and pickle the fitted estimator
# under `data_dir`; the file name records the settings used
optics_shape_moments = OPTICS(
    max_eps=4,
    min_samples=500,
    metric="euclidean",
    n_jobs=30,
).fit(shape_moments)

optics_pickle_path = '{}/OPTICS_shape16_moments4_max_eps4_min_samp500_euclidean_alldata.pkl'.format(data_dir)
with open(optics_pickle_path, 'wb') as optics_file:
    pickle.dump(optics_shape_moments, optics_file)

In [None]:
# Distinct OPTICS labels and how many points carry each one
# (scikit-learn uses -1 for noise)
np.unique(optics_shape_moments.labels_, return_counts=1)

In [None]:
# Rebind `optics_shape_moments` to a previously pickled run. Going by the file
# name this run used max_eps=2.5 / min_samples=300, i.e. NOT the settings of the
# model fitted and saved above (max_eps=4 / min_samples=500).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('{}/OPTICS_shape16_moments4_max_eps2-5_min_samp300_euclidean_alldata.pkl'.format(data_dir), 'rb') as f:
    optics_shape_moments = pickle.load(f)

In [None]:
# Reachability plot of the OPTICS result
clust = optics_shape_moments

# Very wide canvas with a large font
plt.rcParams.update({'font.size': 42})
plt.rcParams['figure.figsize'] = (100,10)

# x positions 0..N-1; reachability distances and labels rearranged into cluster ordering
space = np.arange(len(shape_moments))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# Hard-coded offsets into the label-0 points; every consecutive pair of offsets
# is drawn with its own scatter call
breaks = [0,21430,199990,242305,303065,309196,371590,383575,391430,395315, 414530, 468201]
in_cluster = labels == 0
space_in_cluster = space[in_cluster]
reachability_in_cluster = reachability[in_cluster]
for segment_start, segment_stop in zip(breaks[:-1], breaks[1:]):
    Xk = space_in_cluster[segment_start:segment_stop]
    Rk = reachability_in_cluster[segment_start:segment_stop]
    plt.scatter(Xk, Rk, alpha=1)

# Points labelled -1 are drawn as a faint white line
is_noise = labels == -1
plt.plot(space[is_noise], reachability[is_noise], 'white', alpha=0.3)
plt.show()

In [None]:
# Scratch arithmetic: 144 / 7
(12**2)/7

In [None]:
# Scratch arithmetic: (12 / 7) * 1
12/7*1

In [None]:
# Reachability plot
# NOTE(review): this cell repeats the previous reachability-plot cell statement
# for statement; one of the two can probably be removed.
clust = optics_shape_moments

# Very wide canvas with a large font
plt.rcParams['figure.figsize'] = (100,10)
plt.rcParams.update({'font.size': 42})



# labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
#                                    core_distances=clust.core_distances_,
#                                    ordering=clust.ordering_, eps=0.5)
# labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
#                                    core_distances=clust.core_distances_,
#                                    ordering=clust.ordering_, eps=2)

# x positions 0..N-1; reachability distances and labels rearranged into cluster ordering
space = np.arange(len(shape_moments))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# Hard-coded offsets into the label-0 points; every consecutive pair of offsets
# is drawn with its own scatter call
breaks = [0,21430,199990,242305,303065,309196,371590,383575,391430,395315, 414530, 468201]#371590
for nb, brejk in enumerate(breaks[:-1]):
    Xk = space[labels == 0][brejk:breaks[nb+1]]
    Rk = reachability[labels == 0][brejk:breaks[nb+1]]
    plt.scatter(Xk, Rk, alpha=1)
    

# Points labelled -1 are drawn as a faint white line
plt.plot(space[labels == -1], reachability[labels == -1], 'white', alpha=0.3)
# plt.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
# plt.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
# plt.set_ylabel('Reachability (epsilon distance)')
# plt.set_title('Reachability Plot')
plt.show()

In [None]:
# Positions along the OPTICS ordering of the points labelled 0
space[labels == 0]

In [None]:
# with open('../../../data_GRS1915/UMAPmapper_raw_histograms_trainedonall.pkl', 'wb') as f:
#     pickle.dump(UMAP_mapper, f)

# with open('{}/UMAPmapper_20d_shape16_moments4_trainedonall_468202.pkl'.format(data_dir), 'rb') as f:
#     UMAP_mapper = pickle.load(f)
    
# Rebind `umaped_data` to a pickled, precomputed UMAP embedding (the name was
# previously used for the embedding of the GM114 component means).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('{}/UMAP_transformed_20d_shape16_moments4_trainedonall_468202.pkl'.format(data_dir), 'rb') as f:
    umaped_data = pickle.load(f)


In [None]:
# Embedding rows rearranged into OPTICS cluster ordering (displays the array)
umaped_data[clust.ordering_]

In [None]:
# Scatter the embedding of the points labelled 0, one scatter call per
# segment of the hard-coded `breaks` offsets.
plt.rcParams.update({'font.size': 12})

# Axis limits: data range padded by 0.5 on each side
x_limits = [np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5]
y_limits = [np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5]
# Figure size (inches) is set to the padded data extent along each axis
plt.rcParams['figure.figsize'] = (abs(x_limits[0] - x_limits[1]), abs(y_limits[0] - y_limits[1]))

# Reorder and mask ONCE: doing this inside the loop fancy-indexed (and so
# copied) the whole embedding twice per segment
ordered_label0_embedding = umaped_data[clust.ordering_][labels == 0]
for nb, brejk in enumerate(breaks[:-1]):
    segment = ordered_label0_embedding[brejk:breaks[nb+1]]
    plt.scatter(segment[:,0], segment[:,1], s=0.1)
plt.xlim(x_limits)
plt.ylim(y_limits)
plt.show()

In [None]:

# from matplotlib import cm
# cm.get_cmap(plt.get_cmap("Set1"))


colours = ['#ffd8b1', '#000075', '#808080', '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#000000']

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# sns.set_style("white")
plt.rcParams['figure.figsize'] = (30.0, 30.0)
# NOTE(review): default font size 0 presumably hides the tick labels; the
# panel titles below set their own size explicitly
plt.rcParams.update({'font.size': 0})

# Embedding in OPTICS cluster ordering, split into the points labelled 0 and
# the points labelled -1 (noise)
ordered_embedding = umaped_data[clust.ordering_]
class_data = ordered_embedding[labels == 0]
noise_data = ordered_embedding[labels == -1]

# One panel per `breaks` segment plus a final panel for the noise points
fig, axs = plt.subplots(4, 3)
axs = axs.flatten()

for plot_class_ind, plot_class in enumerate(breaks[:-1]):
    panel = axs[plot_class_ind]
    segment = class_data[plot_class:breaks[plot_class_ind+1]]
    # Grey backdrop of all label-0 points, current segment highlighted in red
    panel.scatter(class_data[:,0], class_data[:,1], s = 0.2, c="grey", label="Unknown")
    panel.scatter(segment[:,0], segment[:,1], s = 1, c='red', label=plot_class_ind)
    panel.set_title("{}".format(plot_class_ind), fontsize=42)

# Last panel: the noise points. This used to re-plot the label-0 points in red
# (hiding the backdrop and showing no noise at all) and took its legend label
# from the leftover loop variable.
axs[-1].scatter(class_data[:,0], class_data[:,1], s = 0.2, c="grey", label="Unknown")
axs[-1].scatter(noise_data[:,0], noise_data[:,1], s = 1, c='red', label="noise")
axs[-1].set_title("{}".format("noise"), fontsize=42)
# plt.savefig("classes_separate.png")

# plt.savefig("UMAP_embedding_separate_classes_model_2020-02-09_10-36-06.png")
plt.show()


# redint = np.where(np.array(qpo_colours) == "red")
# greyint= np.where(np.array(qpo_colours) != "red")
# plt.scatter(embeddings_lap[:,0][greyint], embeddings_lap[:,1][greyint], s=1, c="grey", label= "other")
# plt.scatter(embeddings_lap[:,0][redint], embeddings_lap[:,1][redint], s=1, c="red", label= "HF QPO")
# plt.title("UMAP embedding of the encoded GRS1915 segments, neighbors=50, min_dist=0.0, components=2", fontsize=12)
# plt.legend()
# plt.show()

In [None]:
from sklearn.cluster import cluster_optics_dbscan

In [None]:
# Default parameters of sklearn.cluster.OPTICS, kept for reference:
#   min_samples=5, max_eps=inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05,
#   predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None

In [None]:
# Shape of the component-mean matrix of `clf_GM114`
clf_GM114.means_.shape

In [None]:
# Volume of the axis-aligned bounding box of the data over the first 20
# features: the product of (max - min) per feature. Features are the COLUMNS of
# `shape_moments`, so each extent is taken over a column; `shape_moments[dim]`
# selected the dim-th sample (a row) and measured the spread across features.
space_vol = 1
for dim in range(20):
    feature_values = shape_moments[:, dim]
    space_vol *= np.abs(np.max(feature_values) - np.min(feature_values))
print(space_vol)

In [None]:
# Scratch arithmetic: ratio of two hard-coded numbers
43483574/71396378333

In [None]:

# Load the pickled 114-component model (per its file name) into `clf`, then
# report its AIC and BIC on `shape_moments`
gmm_pickle_path = "../../../data_GRS1915/shape16_moments4_components114_alldata.pkl"
with open(gmm_pickle_path, 'rb') as gmm_file:
    clf = pickle.load(gmm_file)

aic_value = clf.aic(shape_moments)
bic_value = clf.bic(shape_moments)
print(aic_value, bic_value)

In [None]:
# Per-sample membership probabilities from `clf` (presumably one column per
# mixture component - confirm against the loaded model)
scores = clf.predict_proba(shape_moments)

In [None]:
# Hard assignment of every sample by `clf`
preds = clf.predict(shape_moments)

In [None]:
# Histogram of how many samples each predicted label received
# (labels that were never predicted do not appear)
plt.hist(np.unique(preds, return_counts=1)[1])

In [None]:
# Shape of the eigenvector array from eigendecomposing `clf.covariances_`
np.linalg.eigh(clf.covariances_)[1].shape

In [None]:
# Eigendecomposition of the covariances: v = eigenvalues, w = eigenvectors
v, w = np.linalg.eigh(clf.covariances_)
# NOTE(review): this looks like the single-covariance ellipse recipe. If
# clf.covariances_ is a stack of per-component matrices, w[0] is the whole
# eigenvector matrix of the first component and the norm below is that
# matrix's Frobenius norm, not the length of one eigenvector - confirm the
# intended indexing.
u = w[0] / np.linalg.norm(w[0])
angle = np.arctan2(u[1], u[0])
angle = 180 * angle / np.pi  # convert to degrees
# Square roots of the eigenvalues scaled by 2*sqrt(2)
v = 2. * np.sqrt(2.) * np.sqrt(v)

In [None]:
from sklearn.decomposition import PCA
# PCA keeping all components, fitted on `shape_moments`
pca = PCA()
pca.fit(shape_moments)

In [None]:
# Scratch arithmetic: 4 / 3
4/3

In [None]:
from scipy.special import gamma

# Volume of the unit n-ball, pi^(n/2) / Gamma(n/2 + 1); for n = 3 this is 4*pi/3.
# Note that `n` stays defined as a notebook-level variable.
n=3
(np.pi**(n/2)/gamma(n/2+1))

In [None]:
# Volume of the ellipsoid whose semi-axes are 2 standard deviations along each
# principal component: (unit n-ball volume) * product of the semi-axes.
# The ball dimension is taken from the PCA itself so it always matches the
# number of semi-axes in the product (it used to come from the global `n`,
# left at 3 by an earlier cell), and np.prod replaces np.product, which
# NumPy 2.0 removed.
semi_axes = np.sqrt(pca.explained_variance_)*2
n_dims = len(semi_axes)
(np.pi**(n_dims/2)/gamma(n_dims/2+1))*np.prod(semi_axes)

In [None]:
# Three standard deviations along each principal component
np.sqrt(pca.explained_variance_)*3

In [None]:
from matplotlib import pyplot as plt
from shapely.geometry.point import Point
from shapely import affinity
from matplotlib.patches import Polygon
import numpy as np

def create_ellipse(center, lengths, angle=0):
    """
    Create a shapely ellipse. Adapted from
    https://gis.stackexchange.com/a/243462

    Parameters
    ----------
    center : coordinates passed to ``shapely.geometry.Point``.
    lengths : pair ``(x_semi_axis, y_semi_axis)``; fractional values are kept
        (they used to be truncated to integers).
    angle : rotation passed to ``shapely.affinity.rotate`` (default 0).

    Returns
    -------
    The rotated, scaled geometry.
    """
    # Unit circle around the centre, stretched along x/y, then rotated
    unit_circle = Point(center).buffer(1)
    stretched = affinity.scale(unit_circle, float(lengths[0]), float(lengths[1]))
    return affinity.rotate(stretched, angle)

fig,ax = plt.subplots()

## Fixed limits and an equal aspect ratio: without them the ellipses may be
## clipped or look distorted
ax.set_aspect('equal')
ax.set_xlim([-5,5])
ax.set_ylim([-5,5])

## Build both ellipses and their overlap first ...
ellipse1 = create_ellipse((0,0),(2,4),10)
ellipse2 = create_ellipse((1,-1),(3,2),50)
intersect = ellipse1.intersection(ellipse2)

## ... then turn each exterior ring into a matplotlib polygon:
## ellipse 1 in blue, ellipse 2 in red, the overlap outlined in black
verts1 = np.array(ellipse1.exterior.coords.xy)
verts2 = np.array(ellipse2.exterior.coords.xy)
verts3 = np.array(intersect.exterior.coords.xy)

patch1 = Polygon(verts1.T, color = 'blue', alpha = 0.5)
patch2 = Polygon(verts2.T,color = 'red', alpha = 0.5)
patch3 = Polygon(verts3.T, facecolor = 'none', edgecolor = 'black')
for patch in (patch1, patch2, patch3):
    ax.add_patch(patch)

## Areas and overlap ratios
area1 = ellipse1.area
area2 = ellipse2.area
overlap_area = intersect.area
print('area of ellipse 1:',area1)
print('area of ellipse 2:',area2)
print('area of intersect:',overlap_area)
print('intersect/ellipse1:', overlap_area/area1)
print('intersect/ellipse2:', overlap_area/area2)


plt.show()

In [None]:
# Histogram of the mixing weights of `clf`
plt.hist(clf.weights_)

In [None]:
# Display the `scores` array
scores

In [None]:
from sklearn.decomposition import PCA
# Rebinds `pca`: a 114-component PCA fitted on `scores`
pca = PCA(n_components=114)
pca.fit(scores)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Histogram of the pairwise Euclidean distances between the component means.
pairwise_mean_distances = euclidean_distances(clf.means_)
# Strictly-lower triangle (k=-1): every unordered pair once, without the zero
# self-distances on the diagonal that used to be counted as well. The matrix
# size comes from the model instead of a hard-coded 114.
pair_rows, pair_cols = np.tril_indices(len(clf.means_), k=-1)
plt.hist(pairwise_mean_distances[pair_rows, pair_cols])
plt.xlabel("Euclidean distance")
# The y-axis of a histogram is a count; the old label ("Ratio of explained
# variance") was copied from an unrelated plot
plt.ylabel("Number of component pairs")

In [None]:
# Fraction of the variance explained by the first 60 principal components
np.sum(pca.explained_variance_ratio_[:60])

In [None]:
# Explained-variance ratio of each principal component, in component order
# (the cumulative version is commented out below)
plt.plot(pca.explained_variance_ratio_)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))

plt.xlabel("No. of principal components")
plt.ylabel("Ratio of explained variance")

In [None]:
from sklearn.cluster import DBSCAN
# DBSCAN on the full `shape_moments` matrix with eps=1.9 and min_samples=200
clf_dbscan = DBSCAN(eps=1.9,min_samples=200, n_jobs=7)
clf_dbscan.fit(shape_moments)


In [None]:
# Distinct DBSCAN labels and how many points carry each one
# (scikit-learn uses -1 for noise)
np.unique(clf_dbscan.labels_, return_counts=1)

In [None]:
from sklearn.cluster import DBSCAN

# Grid search over DBSCAN's eps / min_samples on the first 50000 rows of
# `shape_desc_GM`; each entry stores (label counts, eps, min_samples)
dbscan_grid = []

eps_values = np.arange(1.5, 2.2, 0.05)
min_samples_values = range(40, 120, 5)
training_subset = shape_desc_GM[:50000,:]

for n_eps, eps in enumerate(eps_values):
    for n_min, min_samples in enumerate(min_samples_values):
        # Note: this rebinds the notebook-level name `clf` on every iteration
        clf = DBSCAN(eps=eps,min_samples=min_samples, n_jobs=7).fit(training_subset)

        label_counts = np.unique(clf.labels_, return_counts=1)
        dbscan_grid.append((label_counts, eps, min_samples))

        # Progress indicator, overwritten in place
        print(n_eps, n_min)
        clear_output(wait=True)

# with open('dbscan_grid_search_50k.pkl', 'wb') as f:
#     pickle.dump(dbscan_grid, f)

In [None]:



# AIC (row 0 of `criteria`) and BIC (row 1) against the number of components,
# for the coarse grid of component counts
plt.rcParams['figure.figsize'] =[20,10]

component_counts = [5, 10, 30, 50, 70, 90, 100, 110, 120, 150, 170, 200]
for criterion_row, criterion_name in enumerate(("AIC", "BIC")):
    plt.plot(component_counts, criteria[criterion_row,:], label=criterion_name)

label_size = 24
plt.title("Information criteria for GMM trained on 50k samples of shape and descriptive statistics data", pad=20, fontsize=label_size)
plt.ylabel("Information criterion value", fontsize=label_size)
plt.xlabel("Number of Gaussian components", fontsize=label_size)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# AIC (row 0 of `criteria`) and BIC (row 1) against the number of components,
# for the fine grid 110, 112, ..., 148
plt.rcParams['figure.figsize'] =[20,10]

aic_values = criteria[0,:]
bic_values = criteria[1,:]
plt.plot(np.arange(110,150, 2), aic_values, label="AIC")
plt.plot(np.arange(110,150, 2), bic_values, label="BIC")

text_kwargs = {"fontsize": 24}
plt.title("Information criteria for GMM trained on 50k samples of shape and descriptive statistics data", pad=20, **text_kwargs)
plt.ylabel("Information criterion value", **text_kwargs)
plt.xlabel("Number of Gaussian components", **text_kwargs)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Load two pickles from the hard-coded data_GRS1915 directory: the "counts"
# file into `segments` and the "ids" file into `seg_ids`.
# NOTE(review): other cells build their paths from `data_dir`; these two do not.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    seg_ids = pickle.load(f)

In [None]:
from sklearn.mixture import GaussianMixture

# Rebinds `clf`: a single full-covariance Gaussian fitted to the first entry of
# `segments` only
clf = GaussianMixture(n_components=1, covariance_type='full', verbose=1)
clf.fit(segments[0])

In [None]:
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit ``gmm`` to ``X`` and draw the data plus one ellipse per component.

    Adapted from
    https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html

    Parameters
    ----------
    gmm : mixture model exposing ``fit``, ``predict``, ``means_``, ``weights_``
        and either ``covariances_`` (``sklearn.mixture.GaussianMixture``) or
        the legacy ``covars_``. It is refitted on ``X`` as a side effect.
    X : 2-D array; only columns 0 and 1 are plotted.
    label : if true, colour the points by their predicted component.
    ax : axes to scatter on; defaults to ``plt.gca()``.
    """
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)

    scatter_kwargs = dict(s=40, zorder=2)
    if label:
        scatter_kwargs.update(c=labels, cmap='viridis')
    ax.scatter(X[:, 0], X[:, 1], **scatter_kwargs)
    ax.axis('equal')

    # GaussianMixture stores the covariances in `covariances_`; the `covars_`
    # name only exists on the long-removed `GMM` class, so reading it
    # unconditionally raised AttributeError with current scikit-learn
    if hasattr(gmm, "covariances_"):
        covariances = gmm.covariances_
    else:
        covariances = gmm.covars_

    # Ellipse opacity is proportional to the component weight, capped at 0.2
    # for the heaviest component
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, covariances, gmm.weights_):
        draw_ellipse(pos, covar, alpha=w * w_factor)

In [None]:
# Histogram of the values in the first entry of `segments`
plt.hist(segments[0])

In [None]:
# Raw count segments: input to the descriptive statistics computed below
counts_path = '{}/468202_len128_s2_4cad_counts_errorfix.pkl'.format(data_dir)
with open(counts_path, 'rb') as counts_file:
    segments_counts = pickle.load(counts_file)

# The encoding file name is assembled from the base names (without extension)
# of the weights file and the segments file
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
weights_name = weights_dir.split("/")[-1].split(".")[0]
segments_name = segments_dir.split("/")[-1].split(".")[0]
segment_encoding_dir = '{}/segment_encoding_{}_segments_{}.pkl'.format(data_dir, weights_name, segments_name)

with open(segment_encoding_dir, 'rb') as encoding_file:
    segment_encoding = pickle.load(encoding_file)

# Slice 0 along the second axis of the encoding (presumably the latent means -
# confirm), standardised per feature and stored as float32
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=0).astype(np.float32)

# Descriptive statistics per segment, one column each: mean, std, skew, kurtosis
moment_functions = (np.mean, np.std, stats.skew, stats.kurtosis)
desc_stats = np.zeros((len(segments_counts), len(moment_functions)))
for column, moment in enumerate(moment_functions):
    desc_stats[:, column] = moment(segments_counts, axis=1).flatten()
zscore_desc_stats = zscore(desc_stats, axis=0)

# Final feature matrix: encoding features followed by the four statistics;
# every column is standardised
shape_moments = np.hstack((segment_encoding_scaled_means, zscore_desc_stats))

In [None]:
# Rebinds `clf`: a 500-component, full-covariance Gaussian mixture fitted on
# the whole `shape_moments` matrix
clf = GaussianMixture(n_components=500, covariance_type='full', verbose=1)
clf.fit(shape_moments)

In [None]:
# with open('{}/GMM_shape16_moments4_components500_alldata.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(clf, f)

In [None]:
# Display the current `clf` estimator
clf