In [None]:
%load_ext autoreload
%autoreload 2
from IPython.display import clear_output
import os
import fnmatch
import numpy as np
import pickle
import matplotlib.pyplot as plt

cwd = os.getcwd()

# When the first directory of the working path is "export", read the data from
# files_from_snuffy; otherwise use the data_GRS1915 directory.
data_dir = "../../../files_from_snuffy" if cwd.split("/")[1] == "export" else "../../../data_GRS1915"

# Load light curves from txt files

In [None]:
lcs=[] # light curves (time stamps, count rate, uncertainty); one transposed np.loadtxt array per file
lc_ids=[] # observation ids, taken from the file name up to the first underscore

# walk <data_dir>/std1 recursively and read every file named "*_std1_lc.txt"
for root, dirnames, filenames in os.walk("{}/std1".format(data_dir)): #Std1_PCU2
    for filename in fnmatch.filter(filenames, "*_std1_lc.txt"):
        lc = os.path.join(root, filename)
        lc_ids.append(filename.split("_")[0])
        f=np.loadtxt(lc)
        f=np.transpose(f) # file columns become rows, so f[0], f[1], f[2] are whole series
        lcs.append(f)
        print("Loaded {} lightcurves".format(len(lcs)))
        clear_output(wait=True) # the next progress line replaces this one

In [None]:
def binning(times, counts, errors, output_cadence, input_cadence=0.125):
    """
    Bin the input time series to a coarser cadence.

    Consecutive groups of round(output_cadence/input_cadence) samples are merged into one bin:
    the time stamp of a bin is the mean of its time stamps and its count is the sum of its counts.
    Trailing samples that do not fill a whole bin are dropped.
    Make sure that count rates are transformed to count values before calling: the uncertainty
    of every bin is computed as the square root of its summed count.

    After binning, whenever two consecutive bins are more than output_cadence apart, the later
    bin of the pair is removed, and the removed bin is not compared against the bin after it.

    times = 1D array of bin time stamps
    counts = 1D array of counts per bin
    errors = 1D array of uncertainty of counts (kept for interface compatibility; not used,
             because the output uncertainty is recomputed from the binned counts)
    output_cadence = desired cadence in seconds
    input_cadence = input cadence in seconds

    returns an array of shape [3, M] holding binned time stamps, counts and uncertainties
    raises ValueError when output_cadence/input_cadence rounds to less than one sample
    """
    # round before converting: a quotient such as 0.3/0.1 evaluates to 2.9999999999999996,
    # which int() alone would truncate to 2
    binned_stamps = int(round(output_cadence/input_cadence)) # how many data points to bin
    if binned_stamps < 1:
        raise ValueError(
            "output_cadence ({}) must cover at least one sample of input_cadence ({})".format(
                output_cadence, input_cadence))

    def _whole_bins(values):
        # drop the trailing samples that do not fill a bin, then put one bin on each row
        return values[:(len(values)//binned_stamps)*binned_stamps].reshape(-1, binned_stamps)

    times = np.mean(_whole_bins(times), axis=1)
    counts = np.sum(_whole_bins(counts), axis=1)
    errors = np.sqrt(counts)

    # find the bins that follow a jump in time larger than the output cadence
    rm_points = []
    skip=False
    for i in range(len(times)-1):
        if skip:
            # bin i was marked for removal in the previous iteration; do not use it as a reference
            skip=False
            continue
        delta = times[i+1]-times[i]
        if delta > output_cadence:
            rm_points.append(i+1)
            skip=True

    times=np.delete(times,rm_points)
    counts=np.delete(counts,rm_points)
    errors=np.delete(errors,rm_points)
    return np.stack((times,counts, errors))

def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, input_cadence=1):
    """
    Cut a time series into overlapping, temporally continuous segments.

    A window of seg_len samples is moved along the series in steps of stride samples. A window is
    kept only if its first and last time stamps are (seg_len-1)*input_cadence apart, i.e. no
    samples are missing inside it. Windows that would run past the end of the series are not produced.

    time_series = 2D array to be segmented; row 0 holds the time stamps
    seg_len = length of a segment in data points
    stride = step size; difference in the starting position of the consecutive segments
    keep_time_stamps = when True every segment keeps all rows, when False row 0 is dropped
    experimental = unused; kept so that existing calls keep working
    input_cadence = time difference between consecutive data points of time_series

    returns an array of shape [number of segments, rows, seg_len] (an empty array if none qualify)
    """
    times = time_series[0]
    first_row = 0 if keep_time_stamps else 1
    # the span is measured between the first and the last sample INSIDE the window; reading
    # times[start+seg_len] instead would reject windows that end at a gap or at the end of the series
    expected_span = (seg_len-1)*input_cadence
    # float time stamps rarely subtract to an exact value, so compare with a tolerance that is
    # far smaller than one cadence step
    tolerance = 1e-6*abs(input_cadence)

    segments=[]
    for start in range(0, len(times)-seg_len+1, stride):
        end=start+seg_len
        if abs((times[end-1]-times[start])-expected_span) > tolerance: #don't allow segments outside of good time intervals
            continue
        segments.append(time_series[first_row:,start:end])
    return np.array(segments)

# Re-bin light curves to 4 second bins and segment

In [None]:
output_cadence=4 # in seconds
input_cadence=0.125

binned_lcs = []
for lc in lcs:
    lc_rate = np.copy(lc) # work on a copy so the arrays stored in lcs are not modified
    # transform count rate values to counts per bin
    lc_rate[1] *=input_cadence # photon count values
    lc_rate[2] *=input_cadence # uncertainty values
    # bin to output_cadence (4 seconds in this cell)
    binned_lc = binning(lc_rate[0],lc_rate[1],lc_rate[2], output_cadence=output_cadence, input_cadence=input_cadence)
    # transform back to count rate
    binned_lc[1] /=output_cadence # photon count values
    binned_lc[2] /=output_cadence # uncertainty values
    binned_lcs.append(binned_lc)
    print("Binned {} light curves".format(len(binned_lcs)))
    clear_output(wait=True)

In [None]:
cadence=4 # seconds between data points of binned_lcs
seg_len_s=512 # segment length in seconds
stride_s=8 # step between the starts of consecutive segments, in seconds

segments_counts=[] # one array of count segments per observation
segments_times = [] # matching time stamps
segments_errors=[] # matching uncertainties
seg_ids=[] # observation id of every entry in the three lists above


seg_len = seg_len_s//cadence # segment length and stride size in data points
stride = stride_s//cadence



for lc_index, lc in enumerate(binned_lcs):
    # light curves with fewer data points than one segment are skipped
    if len(lc[1]) >= seg_len: 
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental = False, input_cadence=cadence)
    else:
        continue
    # observations that produced no segment are left out, so seg_ids can be shorter than lc_ids
    if len(segments) > 0:
        segments_times.append(segments[:,0,:])
        segments_counts.append(segments[:,1,:])
        segments_errors.append(segments[:,2,:])
        seg_ids.append(lc_ids[lc_index])
        print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
        clear_output(wait=True)
        

In [None]:
print("Stacking the segments and creating segment IDs, shuffling.")
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        # segment id = "<observation id>_<index of the segment within the observation>"
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

# merge the per-observation arrays into one array and append a trailing axis of length 1
# (segments_times is neither stacked nor shuffled in this cell)
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

# the saved RNG state is restored before the second and third shuffle, so all three
# in-place shuffles start from the same random state
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

print("Done")

In [None]:
# with open('{}/468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('{}/468202_len128_stride8_4sec_cad_errors_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(id_per_seg, f)

# Re-bin light curves to 1 second bins and segment

In [None]:
output_cadence=1 # in seconds
input_cadence=0.125

# NOTE: this overwrites the binned_lcs list produced by the 4 second cell
binned_lcs = []
for lc in lcs:
    lc_rate = np.copy(lc) # work on a copy so the arrays stored in lcs are not modified
    # transform count rate values to counts per bin
    lc_rate[1] *=input_cadence # photon count values
    lc_rate[2] *=input_cadence # uncertainty values
    # bin to 1 second cadence
    binned_lc = binning(lc_rate[0],lc_rate[1],lc_rate[2], output_cadence=output_cadence, input_cadence=input_cadence)
    # transform back to count rate
    binned_lc[1] /=output_cadence # photon count values
    binned_lc[2] /=output_cadence # uncertainty values
    binned_lcs.append(binned_lc)
    print("Binned {} light curves".format(len(binned_lcs)))
    clear_output(wait=True)

In [None]:
# observations missing from the 4 second dataset must be filtered out.
# prepare a list of observation IDs
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself
with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids_4s = pickle.load(f)
# strip the "_<segment index>" suffix and keep every observation ID once
ob_IDs_4s = np.unique([seg.split("_")[0] for seg in seg_ids_4s]) 

In [None]:
cadence=1 # seconds between data points of binned_lcs
seg_len_s=128 # segment length in seconds
stride_s=10 # step between the starts of consecutive segments, in seconds

segments_counts=[] # one array of count segments per observation
segments_times = [] # matching time stamps
segments_errors=[] # matching uncertainties
seg_ids=[] # observation id of every entry in the three lists above


seg_len = seg_len_s//cadence # segment length and stride size in data points
stride = stride_s//cadence



for lc_index, lc in enumerate(binned_lcs):
    if lc_ids[lc_index] in ob_IDs_4s: # filter out observations missing from the 4 second dataset
        # light curves with fewer data points than one segment are skipped
        if len(lc[1]) >= seg_len: 
            segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental = False, input_cadence=cadence)
        else:
            continue
        # observations that produced no segment are left out
        if len(segments) > 0:
            segments_times.append(segments[:,0,:])
            segments_counts.append(segments[:,1,:])
            segments_errors.append(segments[:,2,:])
            seg_ids.append(lc_ids[lc_index])
            print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
            clear_output(wait=True)


In [None]:
print("Stacking the segments and creating segment IDs, shuffling.")
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        # segment id = "<observation id>_<index of the segment within the observation>"
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

# merge the per-observation arrays into one array and append a trailing axis of length 1
# (segments_times is neither stacked nor shuffled in this cell)
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

# the saved RNG state is restored before the second and third shuffle, so all three
# in-place shuffles start from the same random state
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

print("Done")

In [None]:
segments_counts.shape

In [None]:
# with open('{}/474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('{}/474471_len128_stride10_1sec_cad_errors_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('{}/474471_len128_stride10_1sec_cad_ids_sum_bin.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(id_per_seg, f)

# Make the training/validation/testing split
## Load classifications from Huppenkothen+2017

In [None]:
# The first line of the file lists the state names; every following line is paired, in order,
# with one state name and holds the observation IDs of that state.
# The with-block closes the file as soon as it has been read.
with open('{}/1915Belloniclass_updated.dat'.format(data_dir)) as clean_belloni:
    lines = clean_belloni.readlines()
states = lines[0].split()
belloni_clean = {}
for h,l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()
    #state: obsID1, obsID2...
ob_state = {} # observation ID -> state name
for state, obs in belloni_clean.items():
    # the four chi sub-classes are merged into a single "chi" class
    if state in ("chi1", "chi2", "chi3", "chi4"): state = "chi"
    for ob in obs:
        ob_state[ob] = state


# inverse the ob_state dictionary, so that inv_ob_state contains {"state name" : [list of observation IDs], ...}

inv_ob_state = {}
for k, v in ob_state.items():
    inv_ob_state.setdefault(v, []).append(k)

## Split observation IDs 
- according to the 7/1/2 ratio w.r.t. the number of data points in the light curves
- stratify the split for the classified subset of data to ensure training/testing completeness
- account for the fact that the 4-second data is missing some of the observations; the split must therefore be based on the observations present in the 4-second data

In [None]:
# segment IDs ("<observation id>_<segment index>") of the 4 second and the 1 second data sets
with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids_4s = pickle.load(f)
# NOTE(review): the commented-out dump cell for the 1 second data writes files starting with
# "474471_", while this cell reads "475765_" — confirm which file is intended.
# seg_ids_1s is not read again in the visible part of the notebook.
with open('{}/475765_len128_stride10_1sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids_1s = pickle.load(f)

In [None]:
# get rid of the within-observation segment indices and create a de-duplicated list of observation IDs
seg_ob_IDs = np.unique([seg.split("_")[0] for seg in seg_ids_4s]) 

# class label of every observation ID in seg_ob_IDs; "Unknown" when the ID has no entry in ob_state
# (classes is not referenced again in the visible part of the notebook)
classes = np.array(["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "eta", "omega"])
seg_class = []
for seg in seg_ob_IDs:
    if seg in ob_state:
        seg_class.append(ob_state[seg])
    else:
        seg_class.append("Unknown")
        
print(np.unique(seg_class, return_counts=True)) # class eta doesn't have enough observations for stratification, 
# all of them will be included in the training set and the class won't be included in the test set

In [None]:
len([ob for ob in np.unique(seg_ob_IDs) if ob in ob_state.keys()])

In [None]:
# use only observations present in both datasets
# same file and same result as the earlier cell that builds ob_IDs_4s; repeated here so the
# split section can be run without the segmentation section
with open('{}/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl'.format(data_dir), 'rb') as f:
    seg_ids_4s = pickle.load(f)
# strip the "_<segment index>" suffix and keep every observation ID once
ob_IDs_4s = np.unique([seg.split("_")[0] for seg in seg_ids_4s]) 

In [None]:
np.random.seed(seed=11)

# calculate total number of data points in 1738 observations
# observation split will be common to all data sets
total_data_volume = 0
lc_length = {} # observation ID -> number of data points in its light curve
for lc_index, lc in enumerate(lcs):
    # setdefault keeps the first light curve of an ID, which is what the
    # np.where(np.array(lc_ids) == ob)[0][0] lookup used previously returned
    lc_length.setdefault(lc_ids[lc_index], len(lc[0]))
    if lc_ids[lc_index] in ob_IDs_4s:
        total_data_volume += len(lc[0])
needed_validation_data = total_data_volume*0.1
needed_testing_data = total_data_volume*0.2

# number of light curve data points assigned to each subset so far
test_set_size = 0
val_set_size = 0
train_set_size = 0

test_set = []
val_set = []
train_set = []

# 1738 observations found in the 4second data set. this is the subset that will be class-stratified
obs_to_split = ob_IDs_4s

# split is stratified, so done separately for each class
# eta class has only 2 classified observations, so both will be in the training set
for class_name in ["alpha", "beta", "gamma", "delta", "theta", "kappa", "lambda", "mu", "nu", "rho", "phi", "chi", "omega"]:

    class_obs_all = inv_ob_state[class_name] # all labeled observation IDs of this class
    # exclude observations which did not produce any light curve segments
    class_obs = [ob for ob in class_obs_all if ob in obs_to_split]

    # pick 20% of observations for the test set, at least 1 observation
    test_obs = np.random.choice(class_obs, size=int(np.ceil(len(class_obs)*0.2)), replace=False) 
    
    for ob in test_obs:
        test_set_size += lc_length[ob] # add the length of the light curve
    
    if len(test_obs) == 0:
        print(class_name)
    
    # remove test observations from the class_obs list
    class_obs = [ob for ob in class_obs if ob not in test_obs]
    
    # pick 10% of the remaining observations for the validation set, at least 1 observation
    val_obs = np.random.choice(class_obs, size=int(np.ceil(len(class_obs)*0.1)), replace=False)
    
    for ob in val_obs:
        val_set_size += lc_length[ob] # add the length of the light curve

    if len(val_obs) == 0:
        print(class_name)
        
    # remove val observations from the class_obs list
    class_obs = [ob for ob in class_obs if ob not in val_obs]

    # use the remaining observations as training set
    train_obs = []
    for ob in class_obs:
        train_obs.append(ob)
        train_set_size += lc_length[ob] # add the length of the light curve
    
    if len(train_obs) == 0:
        print(class_name)
    
    test_set.append(test_obs)
    val_set.append(val_obs)
    train_set.append(train_obs)
    
class_obs_all = inv_ob_state["eta"] # all labeled observation IDs of eta class
# exclude observations which did not produce any light curve segments, and append the rest to training set
# (their lengths are not added to train_set_size)
class_obs = [ob for ob in class_obs_all if ob in obs_to_split]
train_set.append(class_obs)

# flatten the per-class groups into flat lists of observation IDs
test_set=list(np.hstack(test_set))
val_set=list(np.hstack(val_set))
train_set=list(np.hstack(train_set))


# fill train/val/test sets with the remaining observations
remaining_obs_to_split = [ob for ob in obs_to_split if (ob not in test_set) and (ob not in val_set) and (ob not in train_set)]
np.random.shuffle(remaining_obs_to_split)

# add unclassified observations to the test set until it holds 20% of the data points
for ob in remaining_obs_to_split:
    test_set.append(ob)
    test_set_size += lc_length[ob]
    if test_set_size >= needed_testing_data:
        break

remaining_obs_to_split = [ob for ob in remaining_obs_to_split if ob not in test_set]
np.random.shuffle(remaining_obs_to_split)

# then to the validation set until it holds 10% of the data points
for ob in remaining_obs_to_split:
    val_set.append(ob)
    val_set_size += lc_length[ob]
    if val_set_size >= needed_validation_data:
        break
        
remaining_obs_to_split = [ob for ob in remaining_obs_to_split if ob not in val_set]

# everything that is left goes to the training set
for ob in remaining_obs_to_split:
    train_set.append(ob)
    train_set_size += lc_length[ob]

        
test_set=np.hstack(test_set)
val_set=np.hstack(val_set)
train_set=np.hstack(train_set)

split_obs = [train_set, val_set, test_set]

# val_set_size is the counter maintained above (the name valid_set_size was never defined)
print("Test set ", test_set_size/total_data_volume)
print("Validation set percentage", val_set_size/total_data_volume)
print("Training set percentage", (total_data_volume-val_set_size-test_set_size)/total_data_volume)

In [None]:
# each of the three counts is the number of observation IDs shared by two subsets
print("Observation ID intersection between: \ntest-valid {} \ntest-train {} \nvalid-train sets {}".format(
      len([ob for ob in test_set if ob in val_set]),
      len([ob for ob in test_set if ob in train_set]), 
      len([ob for ob in val_set if ob in train_set])))
print()
# total number of observation IDs over the three subsets
print("Sum of train/val/test sizes: {}".format(np.sum([len(subset) for subset in split_obs])))

In [None]:
# with open('{}/lightcurve1738_train70_val10_test20.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(split_obs, f)

In [None]:
np.unique([x.split("_")[0] for x in id_per_seg]).shape

In [None]:
len(seg_ids)

In [None]:
# NOTE(review): ids is only assigned in cells further down the notebook, so on a fresh
# kernel run from top to bottom this cell raises NameError
np.unique(ids).shape

In [None]:
# NOTE(review): these paths are hard-coded to ../../../data_GRS1915 instead of using data_dir
with open('../../../data_GRS1915/468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl', 'rb') as f:
    segments = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_stride8_4sec_cad_errors_sum_bin.pkl', 'rb') as f:
    errors = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl', 'rb') as f:
    ids = pickle.load(f)

# errors = ((errors)/np.expand_dims(np.std(segments, axis=1), axis=1)).astype(np.float32)
# segments = zscore(segments, axis=1).astype(np.float32)  # standardize per segment


# presumably the [train, validation, test] list of observation IDs written by the
# commented-out dump of split_obs — confirm
with open('../../../data_GRS1915/lightcurve1738_train70_val10_test20.pkl', 'rb') as f:
    split_ob_ids = pickle.load(f)
    
# observation ID of every segment (segment ID without its "_<index>" suffix)
ids_no_index = [obid.split("_")[0] for obid in ids]
# positions of the segments whose observation belongs to entry 0 / entry 1 of split_ob_ids
training_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[0]])
validation_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[1]])


In [None]:
# positions of the segments whose observation belongs to entry 2 of split_ob_ids
t_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[2]])


In [None]:
[ob for ob in np.unique(np.take(ids_no_index, training_segments_indices)) if ob in np.unique(np.take(ids_no_index, validation_segments_indices))]

In [None]:
[ob for ob in np.unique(np.take(ids_no_index, validation_segments_indices)) if ob in np.unique(np.take(ids_no_index, training_segments_indices))]

In [None]:
[ob for ob in np.unique(np.take(ids_no_index, validation_segments_indices)) if ob in np.unique(np.take(ids_no_index, t_segments_indices))]

In [None]:
[ob for ob in np.unique(np.take(ids_no_index, training_segments_indices)) if ob in np.unique(np.take(ids_no_index, t_segments_indices))]

In [None]:
len(training_segments_indices)

In [None]:
len(validation_segments_indices)

In [None]:
len(t_segments_indices)

In [None]:
327737+47198+93267

# Load Standard 1 light curves from txt, bin to 1 second cadence

In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, input_cadence=4):
    """
    Cut a time series into overlapping, temporally continuous segments.

    A window of seg_len samples is moved along the series in steps of stride samples. A window is
    kept only if its first and last time stamps are (seg_len-1)*input_cadence apart, i.e. no
    samples are missing inside it. Windows that would run past the end of the series are not produced.

    time_series = 2D array to be segmented; row 0 holds the time stamps
    seg_len = length of a segment in data points
    stride = step size; difference in the starting position of the consecutive segments
    keep_time_stamps = when True every segment keeps all rows, when False row 0 is dropped
    experimental = unused; kept so that existing calls keep working
    input_cadence = time difference between consecutive data points of time_series

    returns an array of shape [number of segments, rows, seg_len] (an empty array if none qualify)
    """
    times = time_series[0]
    first_row = 0 if keep_time_stamps else 1
    # the span is measured between the first and the last sample INSIDE the window; reading
    # times[start+seg_len] instead would reject windows that end at a gap or at the end of the series
    expected_span = (seg_len-1)*input_cadence
    # float time stamps rarely subtract to an exact value, so compare with a tolerance that is
    # far smaller than one cadence step
    tolerance = 1e-6*abs(input_cadence)

    segments=[]
    for start in range(0, len(times)-seg_len+1, stride):
        end=start+seg_len
        if abs((times[end-1]-times[start])-expected_span) > tolerance: #don't allow segments outside of good time intervals
            continue
        segments.append(time_series[first_row:,start:end])
    return np.array(segments)

def binning(time_series, output_cadence, input_cadence=0.125):
    """
    Bin the input time series to a coarser cadence using inverse-variance weights.

    Consecutive groups of round(output_cadence/input_cadence) samples are merged into one bin.
    The value of a bin is the mean of its samples weighted by 1/uncertainty**2, its uncertainty is
    sqrt(1/sum of weights) and its time stamp is the mean of its time stamps. Trailing samples
    that do not fill a whole bin are dropped. An uncertainty of zero yields an infinite weight.

    After binning, whenever two consecutive bins are more than output_cadence apart, the later
    bin of the pair is removed, and the removed bin is not compared against the bin after it.

    time_series = array of size [3, N], where N is the length of the series; the rows are the
                  time stamps, the values and the uncertainty of the values
    output_cadence = desired cadence in seconds
    input_cadence = input cadence in seconds

    returns an array of size [3, M] holding binned time stamps, values and uncertainties
    raises ValueError when output_cadence/input_cadence rounds to less than one sample
    """
    # round before converting: a quotient such as 0.3/0.1 evaluates to 2.9999999999999996,
    # which int() alone would truncate to 2
    binned_stamps = int(round(output_cadence/input_cadence)) # how many data points to bin
    if binned_stamps < 1:
        raise ValueError(
            "output_cadence ({}) must cover at least one sample of input_cadence ({})".format(
                output_cadence, input_cadence))

    def _whole_bins(values):
        # drop the trailing samples that do not fill a bin, then put one bin on each row
        return values[:(len(values)//binned_stamps)*binned_stamps].reshape(-1, binned_stamps)

    # read the rows of the argument (the body previously read an unrelated global named f)
    weights = time_series[2]**-2
    weighted_counts = time_series[1]*weights # weigh counts by the inverse of squared error
    binned_counts = np.sum(_whole_bins(weighted_counts), axis=1) # sum weighted counts within each bin
    binned_weights = np.sum(_whole_bins(weights), axis=1) # sum weights within each bin
    binned_counts/=binned_weights # normalise weighted values using sum of weights
    binned_errors = np.sqrt(1.0/(binned_weights)) # calculate uncertainty of each bin
    binned_time = np.mean(_whole_bins(time_series[0]), axis=1) # find the mean time of each bin

    # if bin crosses between two good time intervals, the difference between its binned time and the binned time of preceding bin will not
    # be equal to the desired cadence. Remove those bins from the light curve
    rm_points = []
    skip=False
    for i in range(len(binned_time)-1):
        if skip:
            # bin i was marked for removal in the previous iteration; do not use it as a reference
            skip=False
            continue
        delta = binned_time[i+1]-binned_time[i]
        if delta > output_cadence:
            rm_points.append(i+1)
            skip=True
    times=np.delete(binned_time,rm_points)
    counts=np.delete(binned_counts,rm_points)
    errors=np.delete(binned_errors,rm_points)

    return np.stack((times,counts, errors))


def std1_to_segments(in_data_dir, cadence, seg_len_s, stride_s, random_seed):
    """
    Read Standard1 light curves from text files, bin them and cut them into shuffled segments.

    in_data_dir = directory that will be searched for "*_std1_lc.txt" files containing Standard1 light curve data
    cadence = desired amount of time between data points of the final segments, unit of seconds, should be a multiple of 0.125 (std1 resolution)
    seg_len_s = desired segment length in seconds
    stride_s = time difference between consecutive segments; stride size of the moving window in seconds
    random_seed = set the seed of the numpy random state

    returns segments_counts, segments_errors, id_per_seg
        segments_counts, segments_errors = arrays of shape [number of segments, segment length, 1]
        id_per_seg = list of "<observation id>_<segment index within the observation>" strings,
                     shuffled starting from the same random state as the two arrays
    """
    np.random.seed(seed=random_seed)

    lcs = []
    ids=[]

    for root, dirnames, filenames in os.walk(in_data_dir): #Std1_PCU2
        for filename in fnmatch.filter(filenames, "*_std1_lc.txt"):
            lc = os.path.join(root, filename)
            ids.append(filename.split("_")[0])
            f=np.loadtxt(lc)
            f=np.transpose(f)#,axis=1)

            # binning takes the target cadence in seconds (it has no bin_size argument)
            binned_lc = binning(f, output_cadence=cadence)
            lcs.append(binned_lc)

    print("Binned {} light curves.".format(len(lcs)))
    clear_output(wait=True)

    segments_counts=[]
    segments_errors=[]
    seg_ids=[]

    # segment length and stride size in data points; int() keeps them usable as range()
    # arguments when cadence is a float
    seg_len = int(seg_len_s//cadence)
    stride = int(stride_s//cadence)

    for lc_index, lc in enumerate(lcs):
        # light curves with fewer data points than one segment are skipped
        if len(lc[1]) < seg_len:
            continue
        # pass the cadence explicitly so the continuity check does not depend on the default
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=False, experimental = False, input_cadence=cadence)
        if len(segments) > 0:
            # time stamps were dropped, so row 0 holds the counts and row 1 the uncertainties
            segments_counts.append(segments[:,0,:])
            segments_errors.append(segments[:,1,:])
            seg_ids.append(ids[lc_index])
            print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
            clear_output(wait=True)

    print("Stacking the segments and creating segment IDs, shuffling.")
    id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
    for lc_index, lc in enumerate(segments_counts):
        for i in range(len(lc)):
            id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

    # merge the per-observation arrays and append a trailing axis of length 1
    segments_counts=np.vstack(segments_counts)
    segments_errors=np.vstack(segments_errors)
    segments_counts = np.expand_dims(segments_counts, axis=-1)
    segments_errors = np.expand_dims(segments_errors, axis=-1)

    # the saved RNG state is restored before the second and third shuffle, so all three
    # in-place shuffles start from the same random state
    rng_state = np.random.get_state()
    np.random.shuffle(segments_counts)
    np.random.set_state(rng_state)
    np.random.shuffle(segments_errors)
    np.random.set_state(rng_state)
    np.random.shuffle(id_per_seg)

    print("Done")

    return segments_counts, segments_errors, id_per_seg

In [None]:
# 512 s segments at 4 s cadence with an 8 s stride
# NOTE(review): the input directory is an absolute path, so this cell only runs where that path exists
segments_counts, segments_errors, id_per_seg = std1_to_segments(in_data_dir="/export/data/jakubok/GRS1915+105/Std1_PCU2", cadence=4, seg_len_s=512, stride_s=8, random_seed=11)


In [None]:
# these rcParams changes are global and affect every later figure in the kernel
plt.rcParams['figure.figsize'] = (50.0, 5.0)
plt.rcParams.update({'font.size': 12})
# rows 0, 1, 2 of the light curve with index 4 are used as x, y and y error bars
plt.errorbar(lcs[4][0],lcs[4][1], yerr=lcs[4][2], ecolor="orange",barsabove=True)
# plt.errorbar(binned_lc[0],binned_lc[1], yerr=binned_lc[2], ecolor="orange",barsabove=True)
plt.show()

In [None]:
lcs[4][1]

In [None]:
(2112/8) #+-16

In [None]:
np.sqrt(264)

In [None]:
129.98461447/8

In [None]:
np.sqrt(lcs[4][1]/8)*8

In [None]:
lcs[4][2]

In [None]:
lcs[4][0]

In [None]:
lcs[4]

In [None]:
np.sqrt(lcs[4][1])

In [None]:
lcs[4][2]

In [None]:
# copy first: lcs[4] is an array, so dividing an alias in place would rescale the stored
# light curve again on every re-run of this cell
lc_notrate=np.copy(lcs[4])
# presumably converts the rate of the 0.125 s samples into counts per sample — confirm
lc_notrate[1]/=8
lc_notrate[2]/=8


binned_lc = binning(lc_notrate, 1)

In [None]:
lc_notrate

In [None]:
binned_lc[1]

In [None]:
np.sqrt(binned_lc[1])

In [None]:
binned_lc[2]

In [None]:
lcs[4][2].shape

In [None]:
binned_lc[2].shape

In [None]:
11544/4565

In [None]:
counts=lcs[0][1]
errors=lcs[0][2]

In [None]:
def normalised_variance(counts, errors):
    """
    Return (variance of counts - mean squared error) divided by the mean of the squared counts.

    counts = array of values
    errors = array of uncertainties of the values
    """
    # NOTE(review): the denominator is the mean of counts**2, not the squared mean — confirm this is intended
    excess_variance = np.var(counts) - np.mean(errors**2)
    mean_square = np.mean(counts**2)
    return excess_variance/mean_square

In [None]:
normalised_variance(counts, errors)

In [None]:
normalised_variance(binned_lc[1], binned_lc[2])

In [None]:
np.sqrt(lcs[0][1])

In [None]:
lcs[0][2]

In [None]:
lcs[0][2][:8]

In [None]:
np.sqrt(lcs[0][1])

In [None]:
lcs[0][2]

In [None]:
np.sqrt(    1/ sum(lcs[0][2][:8]**-2)    )

In [None]:
np.sqrt(    sum(lcs[0][2][:8]**2)    )/8

In [None]:
binned_lc[2][0]

In [None]:
lcs=[] # binned light curves, one [3, M] array (time stamps, mean values, uncertainties) per file
ids=[] # observation ids, taken from the file name up to the first underscore

# NOTE(review): the input directory is an absolute path, so this cell only runs where that path exists
for root, dirnames, filenames in os.walk("/data/jkok1g14/data_GRS1915/std1"): #Std1_PCU2
    for filename in fnmatch.filter(filenames, "*_std1_lc.txt"):
        lc = os.path.join(root, filename)
        ids.append(filename.split("_")[0])
        f=np.loadtxt(lc)
        f=np.transpose(f)#,axis=1)
        #f=f[0:2]
        ###1s average and time check to eliminate points outside of GTIs
        # groups of 8 consecutive samples form one bin; trailing samples that do not fill a bin are dropped
        f8t = np.mean(f[0][:(len(f[0])//8)*8].reshape(-1, 8), axis=1)
        f8c = np.mean(f[1][:(len(f[1])//8)*8].reshape(-1, 8), axis=1)
        # uncertainty of the mean of 8 values: square root of the summed squared uncertainties, divided by 8
        f8e = np.sqrt(np.sum(f[2][:(len(f[2])//8)*8].reshape(-1, 8)**2, axis=1))/8
        # mark the bin that follows a jump of more than 1.0 in the binned time stamps;
        # a marked bin is not used as the reference for the next comparison
        rm_points = []
        skip=False
        for i in range(len(f8t)-1):
            if skip==True:
                skip=False
                continue
            delta = f8t[i+1]-f8t[i]
            if delta > 1.0:
                rm_points.append(i+1)
                skip=True

        times=np.delete(f8t,rm_points)
        counts=np.delete(f8c,rm_points)
        errors=np.delete(f8e,rm_points)
        lcs.append(np.stack((times,counts, errors)))
#         break
#     if len(lcs)>0:
#         break
    


In [None]:
# with open('../../../data_GRS1915/1776_light_curves_1s_bin_errorfix.pkl', 'wb') as f:
#     pickle.dump(lcs, f)
# with open('../../../data_GRS1915/1776_light_curves_1s_bin_ids_errorfix.pkl', 'wb') as f:
#     pickle.dump(ids, f)

# overwrite lcs and ids with the previously saved copies
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself
with open('../../../data_GRS1915/1776_light_curves_1s_bin_errorfix.pkl', 'rb') as f:
    lcs = pickle.load(f)
with open('../../../data_GRS1915/1776_light_curves_1s_bin_ids_errorfix.pkl', 'rb') as f:
    ids = pickle.load(f)

In [None]:
# The first line of the file lists the state names; every following line is paired, in order,
# with one state name and holds the observation IDs of that state.
# The with-block closes the file as soon as it has been read.
with open('../../../data_GRS1915/1915Belloniclass_updated.dat') as clean_belloni:
    lines = clean_belloni.readlines()
states = lines[0].split()
belloni_clean = {}
for h,l in zip(states, lines[1:]):
    belloni_clean[h] = l.split()
    #state: obsID1, obsID2...
ob_state = {} # observation ID -> state name
for state, obs in belloni_clean.items():
    # the four chi sub-classes are merged into a single "chi" class
    if state in ("chi1", "chi2", "chi3", "chi4"): state = "chi"
    for ob in obs:
        ob_state[ob] = state


In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False):
    """
    Cut a time series with a cadence of 1 into overlapping, temporally continuous segments.

    A window of seg_len samples is moved along the series in steps of stride samples. A window is
    kept only if its first and last time stamps are seg_len-1 apart, i.e. no samples are missing
    inside it. Windows that would run past the end of the series are not produced.

    time_series = 2D array to be segmented; row 0 holds the time stamps
    seg_len = length of a segment in data points
    stride = step size; difference in the starting position of the consecutive segments
    keep_time_stamps = when True every segment keeps all rows, when False row 0 is dropped
    experimental = unused; kept so that existing calls keep working

    returns an array of shape [number of segments, rows, seg_len] (an empty array if none qualify)
    """
    times = time_series[0]
    first_row = 0 if keep_time_stamps else 1
    # the span is measured between the first and the last sample INSIDE the window; reading
    # times[start+seg_len] instead would reject windows that end at a gap or at the end of the series
    expected_span = seg_len-1
    # float time stamps rarely subtract to an exact value, so compare with a small tolerance
    tolerance = 1e-6

    segments=[]
    for start in range(0, len(times)-seg_len+1, stride):
        end=start+seg_len
        if abs((times[end-1]-times[start])-expected_span) > tolerance: #don't allow temporally discontinous segments
            continue
        segments.append(time_series[first_row:,start:end])
    return np.array(segments)


In [None]:
segments_counts=[] # one array of value segments per observation
segments_errors=[] # matching uncertainties
seg_ids=[] # observation id of every entry in the two lists above
for lc_index, lc in enumerate(lcs):
    # segments of 512 data points, started every 40 data points; shorter light curves are skipped
    if len(lc[1]) >= 512: 
        segments = segmentation(lc, 512, 40, keep_time_stamps=False, experimental = False)
    else:
        continue
    if len(segments) > 0:
        # time stamps were dropped, so row 0 holds the values and row 1 the uncertainties
        segments_counts.append(segments[:,0,:])
        segments_errors.append(segments[:,1,:])
        seg_ids.append(ids[lc_index])
        print(lc_index+1, "/{}".format(len(lcs)))
        clear_output(wait=True)

In [None]:
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        # segment id = "<observation id>_<index of the segment within the observation>"
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))
        
# merge the per-observation arrays into one array and append a trailing axis of length 1
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

In [None]:
# the saved RNG state is restored before the second and third shuffle, so all three
# in-place shuffles start from the same random state
# NOTE(review): no seed is set in this cell, so the order depends on the current state of the kernel's RNG
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

In [None]:
# with open('../../../data_GRS1915/94465_len512_s40_counts_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('../../../data_GRS1915/94465_len512_s40_errors_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('../../../data_GRS1915/94465_len512_s40_ids_errorfix.pkl', 'wb') as f:
#     pickle.dump(id_per_seg, f)

In [None]:

def std1_to_segments(in_data_dir, cadence, seg_len_s, stride_s, random_seed):
    """
    Build a shuffled set of fixed-length light curve segments from Standard1 text files.

    Every "*_std1_lc.txt" file found below in_data_dir is loaded (rows of the transposed table are
    used as time stamps, count rate and uncertainty), re-binned from the native 0.125 s resolution
    to `cadence`, stripped of the first bin after every time gap, and cut into overlapping windows.
    Counts, errors and segment ids are finally shuffled with the same permutation.

    in_data_dir = directory that will be searched for "*_std1_lc.txt" files containing Standard1 light curve data
    cadence = desired amount of time between data points of the final segments, unit of seconds, should be a multiple of 0.125 (std1 resolution)
    seg_len_s = desired segment length in seconds
    stride_s = time difference between consecutive segments; stride size of the moving window in seconds
    random_seed = set the seed of the numpy random state

    returns segments_counts, segments_errors, id_per_seg
        segments_counts, segments_errors = arrays of shape (number of segments, points per segment, 1)
        id_per_seg = list of "<observation id>_<segment number>" strings, in the same shuffled order

    raises ValueError when no segment could be produced from the input files
    """
    np.random.seed(seed=random_seed)

    def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=4):
        """
        Slide a window of seg_len points over time_series in steps of stride points and return the
        windows whose time stamps are continuous.

        time_series = 2D array, row 0 holds the time stamps
        seg_len = length of a segment in data points
        stride = step size; difference in the starting position of the consecutive segments
        keep_time_stamps = when True the time row is kept in the output, otherwise it is dropped
        experimental = unused, kept for signature compatibility
        cadence = spacing of the data points in seconds, used by the continuity check
        """
        segments=[]
        for start in range(0, len(time_series[0])-seg_len, stride):
            end=start+seg_len
            # the point just after the window must be exactly seg_len*cadence later than the first one
            if time_series[0][end]-time_series[0][start] != seg_len*cadence: #don't allow temporally discontinous segments
                continue
            if keep_time_stamps==True:
                segments.append(time_series[:,start:end])
            else:
                segments.append(time_series[1:,start:end])
        return np.array(segments)

    lcs=[]
    ids=[]

    binned_stamps = int(cadence/0.125) # number of native 0.125 s samples per output bin

    for root, dirnames, filenames in os.walk(in_data_dir): #Std1_PCU2
        for filename in fnmatch.filter(filenames, "*_std1_lc.txt"):
            lc = os.path.join(root, filename)
            ids.append(filename.split("_")[0])
            f=np.loadtxt(lc)
            f=np.transpose(f)
            # average groups of binned_stamps samples; the incomplete group at the end is discarded
            usable = (len(f[0])//binned_stamps)*binned_stamps
            fbinned_t = np.mean(f[0][:usable].reshape(-1, binned_stamps), axis=1)
            fbinned_c = np.mean(f[1][:usable].reshape(-1, binned_stamps), axis=1)
            # uncertainty of the mean: quadrature sum divided by the number of samples
            fbinned_e = np.sqrt(np.sum(f[2][:usable].reshape(-1, binned_stamps)**2, axis=1))/binned_stamps
            # drop the first bin after every gap longer than the cadence and do not
            # compare that dropped bin with its successor
            rm_points = []
            skip=False
            for i in range(len(fbinned_t)-1):
                if skip==True:
                    skip=False
                    continue
                delta = fbinned_t[i+1]-fbinned_t[i]
                if delta > cadence:
                    rm_points.append(i+1)
                    skip=True

            times=np.delete(fbinned_t,rm_points)
            counts=np.delete(fbinned_c,rm_points)
            errors=np.delete(fbinned_e,rm_points)
            lcs.append(np.stack((times,counts, errors)))

            print("Binned {} light curves.".format(len(lcs)))
            clear_output(wait=True)

    segments_counts=[]
    segments_errors=[]
    seg_ids=[]

    # segment length and stride size in data points; int() because a fractional
    # cadence makes // return a float, which range() does not accept
    seg_len = int(seg_len_s//cadence)
    stride = int(stride_s//cadence)

    for lc_index, lc in enumerate(lcs):
        if len(lc[1]) >= seg_len:
            # pass the cadence on, otherwise the inner default of 4 s rejects every
            # segment of a light curve binned to any other cadence
            segments = segmentation(lc, seg_len, stride, keep_time_stamps=False, experimental = False, cadence=cadence)
        else:
            continue
        if len(segments) > 0:
            # the time row was dropped, so row 0 holds counts and row 1 errors
            segments_counts.append(segments[:,0,:])
            segments_errors.append(segments[:,1,:])
            seg_ids.append(ids[lc_index])
            print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
            clear_output(wait=True)

    if len(segments_counts) == 0:
        raise ValueError("No segments were produced from the light curves in {}".format(in_data_dir))

    print("Stacking the segments and creating segment IDs, shuffling.")
    id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
    for lc_index, lc in enumerate(segments_counts):
        for i in range(len(lc)):
            id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

    segments_counts=np.vstack(segments_counts)
    segments_errors=np.vstack(segments_errors)
    segments_counts = np.expand_dims(segments_counts, axis=-1)
    segments_errors = np.expand_dims(segments_errors, axis=-1)

    # restore the RNG state before each shuffle so that all three containers are permuted identically
    rng_state = np.random.get_state()
    np.random.shuffle(segments_counts)
    np.random.set_state(rng_state)
    np.random.shuffle(segments_errors)
    np.random.set_state(rng_state)
    np.random.shuffle(id_per_seg)

    print("Done")

    return segments_counts, segments_errors, id_per_seg

In [None]:
# Run the full pipeline: 4 s cadence, 512 s segments, 8 s stride, seed 11.
# NOTE(review): the input directory is an absolute, machine-specific path.
segments_counts, segments_errors, id_per_seg = std1_to_segments(in_data_dir="/export/data/jakubok/GRS1915+105/Std1_PCU2", cadence=4, seg_len_s=512, stride_s=8, random_seed=11)


In [None]:
# Load the previously pickled light curves and their observation ids.
# This overrides the data_dir chosen in the first cell of the notebook.
# NOTE(review): pickle.load executes arbitrary code; only load files you trust.
data_dir = "../../../files_from_snuffy"
with open('{}/1776_light_curves_4s_bin_errorfix.pkl'.format(data_dir), 'rb') as f:
    lcs = pickle.load(f)
with open('{}/1776_light_curves_4s_bin_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    ids = pickle.load(f)

In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=4):
    """
    Extract overlapping, temporally continuous windows from a time series.

    time_series = 2D array whose row 0 holds the time stamps
    seg_len = number of data points per window
    stride = number of data points between the starts of consecutive windows
    keep_time_stamps = if True the time row is part of every window, otherwise it is left out
    experimental = unused
    cadence = spacing of the data points in seconds

    Returns np.array of the accepted windows. A window is accepted only when the point
    right after it lies exactly seg_len*cadence after its first point.
    """
    stamps = time_series[0]
    first_row = 0 if keep_time_stamps==True else 1
    expected_span = seg_len*cadence
    windows = [
        time_series[first_row:, lo:lo+seg_len]
        for lo in range(0, len(stamps)-seg_len, stride)
        if stamps[lo+seg_len]-stamps[lo] == expected_span
    ]
    return np.array(windows)


# Segment the loaded light curves into 512 s windows starting every 8 s,
# this time keeping the time stamps so that segment start times can be recovered.
cadence=4
seg_len_s=512
stride_s=8

segments_counts=[]
segments_times = []
segments_errors=[]
seg_ids=[]


seg_len = seg_len_s//cadence # segment length and stride size in data points
stride = stride_s//cadence



for lc_index, lc in enumerate(lcs):
    # NOTE(review): `cadence` is not passed on; segmentation() falls back to its own default
    if len(lc[1]) >= seg_len: 
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental = False)
    else:
        continue
    if len(segments) > 0:
        # rows of each segment: 0 = time stamps, 1 = counts, 2 = errors
        segments_times.append(segments[:,0,:])
        segments_counts.append(segments[:,1,:])
        segments_errors.append(segments[:,2,:])
        seg_ids.append(ids[lc_index])
        print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
        clear_output(wait=True)
        
# one "<observation id>_<segment number>" label per segment, in stacking order (not shuffled)
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

In [None]:
import pandas as pd

In [None]:
# Sanity check: shape of the first-time-stamp column for the first light curve's segments.
segments_times[0][:,0].shape

In [None]:
# First time stamp of every segment, concatenated over light curves in the same order as id_per_seg.
seg_start_times = np.concatenate([time[:,0] for time in segments_times])

In [None]:
# Table of segment start times indexed by segment id.
seg_id_start_time_df = pd.DataFrame(seg_start_times, columns=["Start_time"], index=id_per_seg)

In [None]:
# Display the start-time table.
seg_id_start_time_df

In [None]:
# NOTE(review): repeats the display of the previous cell — candidate for deletion.
seg_id_start_time_df

In [None]:
# Persist the start-time table.
# NOTE(review): the path is hard-coded here instead of being built from data_dir.
with open('../../../files_from_snuffy/468202_len128_s2_4cad_start_times_errorfix.pkl', 'wb') as f:
    pickle.dump(seg_id_start_time_df, f)

In [None]:
# with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'wb') as f:
#     pickle.dump(id_per_seg, f)

In [None]:
# Load every Standard1 light curve text file and re-bin it from 0.125 s to `cadence` seconds.
# NOTE(review): the input directory is an absolute, machine-specific path.
lcs=[]
ids=[]

cadence=4 # seconds
binned_stamps = int(cadence/0.125) # number of 0.125 s samples per output bin

for root, dirnames, filenames in os.walk("/data/jkok1g14/data_GRS1915/std1"): #Std1_PCU2
    for filename in fnmatch.filter(filenames, "*_std1_lc.txt"):
        lc = os.path.join(root, filename)
        # observation id = part of the file name before the first underscore
        ids.append(filename.split("_")[0])
        f=np.loadtxt(lc)
        f=np.transpose(f)#,axis=1)
        #f=f[0:2]
        # Average groups of binned_stamps samples (samples after the last complete group are
        # discarded), then run a time check to eliminate points outside of GTIs.
        fbinned_t = np.mean(f[0][:(len(f[0])//binned_stamps)*binned_stamps].reshape(-1, binned_stamps), axis=1)
        fbinned_c = np.mean(f[1][:(len(f[1])//binned_stamps)*binned_stamps].reshape(-1, binned_stamps), axis=1)
        # uncertainty of the averaged bin: quadrature sum of the sample errors divided by their number
        fbinned_e = np.sqrt(np.sum(f[2][:(len(f[2])//binned_stamps)*binned_stamps].reshape(-1, binned_stamps)**2, axis=1))/binned_stamps
        # Mark the first bin after every step larger than the cadence for removal; that bin is
        # then not compared with its successor (skip flag).
        rm_points = []
        skip=False
        for i in range(len(fbinned_t)-1):
            if skip==True:
                skip=False
                continue
            delta = fbinned_t[i+1]-fbinned_t[i]
            if delta > cadence:
                rm_points.append(i+1)
                skip=True

        times=np.delete(fbinned_t,rm_points)
        counts=np.delete(fbinned_c,rm_points)
        errors=np.delete(fbinned_e,rm_points)
        # each light curve is stored as a (3, n_bins) array: times, counts, errors
        lcs.append(np.stack((times,counts, errors)))


In [None]:
# with open('../../../data_GRS1915/1776_light_curves_4s_bin_errorfix.pkl', 'wb') as f:
#     pickle.dump(lcs, f)
# with open('../../../data_GRS1915/1776_light_curves_4s_bin_ids_errorfix.pkl', 'wb') as f:
#     pickle.dump(ids, f)

# with open('../../../data_GRS1915/1776_light_curves_4s_bin_errorfix.pkl', 'rb') as f:
#     lcs = pickle.load(f)
# with open('../../../data_GRS1915/1776_light_curves_4s_bin_ids_errorfix.pkl', 'rb') as f:
#     ids = pickle.load(f)


In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=4):
    """
    Cut a time series into overlapping windows and keep only the temporally continuous ones.

    time_series = 2D array whose row 0 holds the time stamps
    seg_len = number of data points per window
    stride = number of data points between the starts of consecutive windows
    keep_time_stamps = if True the time row is part of every window, otherwise it is left out
    experimental = unused
    cadence = spacing of the data points in seconds

    Returns np.array of the accepted windows. A window is accepted only when the point
    right after it lies exactly seg_len*cadence after its first point.
    """
    stamps = time_series[0]
    first_row = 0 if keep_time_stamps==True else 1
    expected_span = seg_len*cadence
    windows = [
        time_series[first_row:, lo:lo+seg_len]
        for lo in range(0, len(stamps)-seg_len, stride)
        if stamps[lo+seg_len]-stamps[lo] == expected_span
    ]
    return np.array(windows)


In [None]:
# Segment the binned light curves into 1024 s windows that start every 16 s.
segments_counts=[]
segments_errors=[]
seg_ids=[]

seg_len_s = 1024 # seconds
stride_s = 16 # seconds
cadence=4 # seconds

# segment length and stride size in data points
seg_len = seg_len_s//cadence
stride = stride_s//cadence

for lc_index, lc in enumerate(lcs):
    if len(lc[1]) >= seg_len:
        # cadence is passed explicitly so the continuity check follows the setting above
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=False, experimental = False, cadence=cadence)
    else:
        continue  # light curve shorter than one segment
    if len(segments) > 0:
        # the time row was dropped, so row 0 holds counts and row 1 errors
        segments_counts.append(segments[:,0,:])
        segments_errors.append(segments[:,1,:])
        seg_ids.append(ids[lc_index])
        print(lc_index+1, "/{}".format(len(lcs)))
        clear_output(wait=True)

In [None]:
# Build one "<observation id>_<segment number>" label per segment, in the same
# order in which the per-light-curve arrays are stacked below.
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))
        
# Merge the per-light-curve arrays into one array and append a trailing axis of size 1.
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

In [None]:
# Shuffle counts, errors and ids in place; the RNG state is restored before each
# shuffle so that every call starts from the same random state.
# NOTE(review): no seed is set in this cell, so the order differs between kernel sessions.
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

In [None]:
# `segments` is the loop variable left over from the segmentation cell, i.e. the
# result for the last light curve that was segmented — not the stacked data set.
segments.shape

In [None]:
# with open('../../../data_GRS1915/159927_len256_s4_4cad_counts_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('../../../data_GRS1915/159927_len256_s4_4cad_errors_errorfix.pkl', 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('../../../data_GRS1915/159927_len256_s4_4cad_ids_errorfix.pkl', 'wb') as f:
#     pickle.dump(id_per_seg, f)

# 96 data point segments, 4 second cadence

In [None]:
# Reload the pickled 4 s light curves and their observation ids from data_dir.
# NOTE(review): pickle.load executes arbitrary code; only load files you trust.
with open('{}/1776_light_curves_4s_bin_errorfix.pkl'.format(data_dir), 'rb') as f:
    lcs = pickle.load(f)
with open('{}/1776_light_curves_4s_bin_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    ids = pickle.load(f)

In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=4):
    """
    Produce overlapping windows of a time series, rejecting windows that span a time gap.

    time_series = 2D array whose row 0 holds the time stamps
    seg_len = number of data points per window
    stride = number of data points between the starts of consecutive windows
    keep_time_stamps = if True the time row is part of every window, otherwise it is left out
    experimental = unused
    cadence = spacing of the data points in seconds

    Returns np.array of the accepted windows. A window is accepted only when the point
    right after it lies exactly seg_len*cadence after its first point.
    """
    stamps = time_series[0]
    first_row = 0 if keep_time_stamps==True else 1
    expected_span = seg_len*cadence
    windows = [
        time_series[first_row:, lo:lo+seg_len]
        for lo in range(0, len(stamps)-seg_len, stride)
        if stamps[lo+seg_len]-stamps[lo] == expected_span
    ]
    return np.array(windows)


# Segment the loaded light curves into 384 s windows (96 points at 4 s cadence)
# starting every 8 s, keeping the time stamps.
cadence=4
seg_len_s=384
stride_s=8

segments_counts=[]
segments_times = []
segments_errors=[]
seg_ids=[]


seg_len = seg_len_s//cadence # segment length and stride size in data points
stride = stride_s//cadence



for lc_index, lc in enumerate(lcs):
    # NOTE(review): `cadence` is not passed on; segmentation() falls back to its own default
    if len(lc[1]) >= seg_len: 
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental = False)
    else:
        continue
    if len(segments) > 0:
        # rows of each segment: 0 = time stamps, 1 = counts, 2 = errors
        segments_times.append(segments[:,0,:])
        segments_counts.append(segments[:,1,:])
        segments_errors.append(segments[:,2,:])
        seg_ids.append(ids[lc_index])
        print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
        clear_output(wait=True)
        
# one "<observation id>_<segment number>" label per segment, in stacking order
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

In [None]:
# Merge the per-light-curve arrays into one array and append a trailing axis of size 1.
# NOTE(review): not idempotent — re-running this cell stacks already stacked arrays again.
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

In [None]:
# Shuffle counts, errors and ids in place; the RNG state is restored before each
# shuffle so that every call starts from the same random state.
# NOTE(review): no seed is set in this cell, so the order differs between kernel sessions.
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

In [None]:
# with open('{}/509201_len96_stride8_4sec_cad_counts_errorfix.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_counts, f)
    
# with open('{}/509201_len96_stride8_4sec_cad_errors_errorfix.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(segments_errors, f)
    
# with open('{}/509201_len96_stride8_4sec_cad_ids_errorfix.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(id_per_seg, f)

In [None]:
# Split the segments into training and validation sets by observation, so that all
# segments of one observation end up in the same set.
with open('{}/509201_len96_stride8_4sec_cad_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)

# observation id = part of the segment id before the first underscore
ObID_per_sample = np.array([seg_id.split("_")[0] for seg_id in seg_ids])

# target validation size: a quarter of the loaded segments (derived from the data
# instead of repeating the count that is encoded in the file name)
needed_validation_segments = len(seg_ids)*0.25
unique_ObIDs = np.unique(ObID_per_sample, return_counts=True)  # (ids, segments per id)
ObIDs_no = len(unique_ObIDs[0])
shuffle_indices = np.array(range(ObIDs_no))
np.random.seed(seed=11)
np.random.shuffle(shuffle_indices)


valid_set_obs = []
valid_set_size = 0

# take observations in shuffled order until the validation set exceeds the target size
for ob_index in shuffle_indices:
    valid_set_obs.append(unique_ObIDs[0][ob_index])
    valid_set_size += unique_ObIDs[1][ob_index]
    if valid_set_size > needed_validation_segments:
        break

valid_set_sample_indices = []
for valid_set_ob in np.array(valid_set_obs):
    valid_set_sample_indices.append(np.where(ObID_per_sample == valid_set_ob)[0])

# flatten the per-observation index arrays into one list
valid_set_sample_indices = [item for sublist in valid_set_sample_indices for item in sublist]

# every observation that was not consumed by the validation loop goes to training
train_set_sample_indices = []
for train_set_ob in shuffle_indices[len(valid_set_obs):]:
    train_set_sample_indices.append(np.where(ObID_per_sample == unique_ObIDs[0][train_set_ob])[0])

train_set_sample_indices = [item for sublist in train_set_sample_indices for item in sublist]


split_indices = [train_set_sample_indices, valid_set_sample_indices]

# with open('{}/509201_len96_stride8_4sec_cad_observation75-25split.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(split_indices, f)

In [None]:
# Total number of segments covered by the split.
len(ObID_per_sample)

In [None]:
# Number of segments in the validation set.
len(split_indices[1])

In [None]:
# Scratch arithmetic. NOTE(review): the origin of 381701 is not shown in the notebook — confirm or delete.
381701/3

# 128 data point segments, 1 second cadence

In [None]:
# Load the pickled 1 s light curves and their observation ids from data_dir.
# NOTE(review): pickle.load executes arbitrary code; only load files you trust.
with open('{}/1776_light_curves_1s_bin_errorfix.pkl'.format(data_dir), 'rb') as f:
    lcs = pickle.load(f)
with open('{}/1776_light_curves_1s_bin_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    ids = pickle.load(f)

In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=1):
    """
    Extract overlapping, temporally continuous windows from a time series.

    time_series = 2D array whose row 0 holds the time stamps
    seg_len = number of data points per window
    stride = number of data points between the starts of consecutive windows
    keep_time_stamps = if True the time row is part of every window, otherwise it is left out
    experimental = unused
    cadence = spacing of the data points in seconds (this copy defaults to 1)

    Returns np.array of the accepted windows. A window is accepted only when the point
    right after it lies exactly seg_len*cadence after its first point.
    """
    stamps = time_series[0]
    first_row = 0 if keep_time_stamps==True else 1
    expected_span = seg_len*cadence
    windows = [
        time_series[first_row:, lo:lo+seg_len]
        for lo in range(0, len(stamps)-seg_len, stride)
        if stamps[lo+seg_len]-stamps[lo] == expected_span
    ]
    return np.array(windows)


# Segment the 1 s light curves into 128 s windows starting every 8 s, keeping the time stamps.
cadence=1
seg_len_s=128
stride_s=8

segments_counts=[]
segments_times = []
segments_errors=[]
seg_ids=[]


seg_len = seg_len_s//cadence # segment length and stride size in data points
stride = stride_s//cadence



for lc_index, lc in enumerate(lcs):
    # light curves shorter than one segment are skipped
    if len(lc[1]) >= seg_len: 
        segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental = False, cadence=cadence)
    else:
        continue
    if len(segments) > 0:
        # rows of each segment: 0 = time stamps, 1 = counts, 2 = errors
        segments_times.append(segments[:,0,:])
        segments_counts.append(segments[:,1,:])
        segments_errors.append(segments[:,2,:])
        seg_ids.append(ids[lc_index])
        print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
        clear_output(wait=True)
        
# one "<observation id>_<segment number>" label per segment, in stacking order
id_per_seg = []  # for each light curve, copy the observation id for every segment of the light curve
for lc_index, lc in enumerate(segments_counts):
    for i in range(len(lc)):
        id_per_seg.append(seg_ids[lc_index]+"_{}".format(i))

In [None]:
# Merge the per-light-curve arrays into one array and append a trailing axis of size 1.
# NOTE(review): not idempotent — re-running this cell stacks already stacked arrays again.
segments_counts=np.vstack(segments_counts)
segments_errors=np.vstack(segments_errors)
segments_counts = np.expand_dims(segments_counts, axis=-1)
segments_errors = np.expand_dims(segments_errors, axis=-1)

In [None]:
# Shuffle counts, errors and ids in place; the RNG state is restored before each
# shuffle so that every call starts from the same random state.
# NOTE(review): no seed is set in this cell, so the order differs between kernel sessions.
rng_state = np.random.get_state()
np.random.shuffle(segments_counts)
np.random.set_state(rng_state)
np.random.shuffle(segments_errors)
np.random.set_state(rng_state)
np.random.shuffle(id_per_seg)

In [None]:
# Number of segments produced (this count is used as the prefix of the output file names below).
len(id_per_seg)

In [None]:
# Persist the shuffled counts, errors and segment ids.
# NOTE(review): the 594483 prefix is hard-coded; compare it with len(id_per_seg) before saving.
with open('{}/594483_len128_stride8_1sec_cad_counts_errorfix.pkl'.format(data_dir), 'wb') as f:
    pickle.dump(segments_counts, f)
    
with open('{}/594483_len128_stride8_1sec_cad_errors_errorfix.pkl'.format(data_dir), 'wb') as f:
    pickle.dump(segments_errors, f)
    
with open('{}/594483_len128_stride8_1sec_cad_ids_errorfix.pkl'.format(data_dir), 'wb') as f:
    pickle.dump(id_per_seg, f)

In [None]:
# Split the segments into training and validation sets by observation, so that all
# segments of one observation end up in the same set.
# Note: `seg_ids` is reused here for per-segment ids (earlier cells use it for per-light-curve ids).
with open('{}/594483_len128_stride8_1sec_cad_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)
    
# observation id = part of the segment id before the first underscore
ObID_per_sample = np.array([seg_id.split("_")[0] for seg_id in seg_ids])


# target validation size: a quarter of all segments
needed_validation_segments = len(seg_ids)*0.25
unique_ObIDs = np.unique(ObID_per_sample, return_counts=True)  # (ids, segments per id)
ObIDs_no = len(unique_ObIDs[0])
shuffle_indices = np.array(range(ObIDs_no))
np.random.seed(seed=11)
np.random.shuffle(shuffle_indices)


valid_set_obs = []
valid_set_size = 0

# take observations in shuffled order until the validation set exceeds the target size
for ob_index in shuffle_indices:
    valid_set_obs.append(unique_ObIDs[0][ob_index])
    valid_set_size += unique_ObIDs[1][ob_index]
    if valid_set_size > needed_validation_segments:
        break
        
valid_set_sample_indices = []
for valid_set_ob in np.array(valid_set_obs):
    valid_set_sample_indices.append(np.where(ObID_per_sample == valid_set_ob)[0])

# flatten the per-observation index arrays into one list
valid_set_sample_indices = [item for sublist in valid_set_sample_indices for item in sublist]

# every observation that was not consumed by the validation loop goes to training
train_set_sample_indices = []
for train_set_ob in shuffle_indices[len(valid_set_obs):]:
    train_set_sample_indices.append(np.where(ObID_per_sample == unique_ObIDs[0][train_set_ob])[0])
    
train_set_sample_indices = [item for sublist in train_set_sample_indices for item in sublist]


split_indices = [train_set_sample_indices, valid_set_sample_indices]

# with open('{}/594483_len128_stride8_1sec_cad_observation75-25split.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(split_indices, f)

In [None]:
# Training set size and the validation-to-training size ratio.
print(len(split_indices[0]), len(split_indices[1])/len(split_indices[0]))

# Histogram generation

In [None]:
# Load the saved segment counts, errors and ids.
# NOTE(review): paths are hard-coded instead of being built from data_dir;
# pickle.load executes arbitrary code, so only load files you trust.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments_counts = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl', 'rb') as f:
    segments_errors = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    id_per_seg = pickle.load(f)

In [None]:
# One 32-bin histogram of the count values per segment, over the fixed range 0-13000.
# np.histogram does not count values outside the given range.
histograms = np.zeros((segments_counts.shape[0], 32))
for seg_ind, seg in enumerate(segments_counts):
    histograms[seg_ind] = np.histogram(seg.squeeze(), bins=32, range=[0,13000])[0]
    print(seg_ind)
    clear_output(wait=True)
# append a trailing axis of size 1
histograms = np.expand_dims(histograms, axis=-1)

In [None]:
# Persist the histograms. NOTE(review): path is hard-coded instead of being built from data_dir.
with open('../../../data_GRS1915/468202_len128_s2_4cad_histograms_32bin_0-13k_errorfix.pkl', 'wb') as f:
    pickle.dump(histograms, f)

# Descriptive statistics/features of segments as an alternative to histograms

In [None]:
# Load the saved segment counts, errors and ids (same files as in the histogram section).
# NOTE(review): paths are hard-coded instead of being built from data_dir;
# pickle.load executes arbitrary code, so only load files you trust.
with open('../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl', 'rb') as f:
    segments_counts = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_errors_errorfix.pkl', 'rb') as f:
    segments_errors = pickle.load(f)
with open('../../../data_GRS1915/468202_len128_s2_4cad_ids_errorfix.pkl', 'rb') as f:
    id_per_seg = pickle.load(f)

In [None]:
from scipy import stats
import umap
from sklearn.mixture import GaussianMixture
from scipy.stats import zscore


In [None]:
# One row per segment, five columns: median, mean, std, skew, kurtosis.
# (The Gaussian-mixture BIC values are stored separately in GMM_bics.)
desc_stats = np.zeros((len(segments_counts), 5))
# search for descriptive statistics
#https://towardsdatascience.com/modality-tests-and-kernel-density-estimations-3f349bb9e595

In [None]:
# Per-segment statistics, computed along axis 1 of segments_counts.
desc_stats[:,0] = np.median(segments_counts, axis=1).flatten()
desc_stats[:,1] = np.mean(segments_counts, axis=1).flatten()
desc_stats[:,2] = np.std(segments_counts, axis=1).flatten()
desc_stats[:,3] = stats.skew(segments_counts, axis=1).flatten()
desc_stats[:,4] = stats.kurtosis(segments_counts, axis=1).flatten()


In [None]:
# Locate and load the saved segment encoding; its file name is built from the
# base names (without extension) of the model weights file and the segments file.
# Note: despite the *_dir names, these variables hold file paths.
weights_dir = "../../../model_weights/model_2020-04-29_09-12-23.h5"
segments_dir = '../../../data_GRS1915/468202_len128_s2_4cad_counts_errorfix.pkl'
segment_encoding_dir = '../../../data_GRS1915/segment_encoding_{}_segments_{}.pkl'.format(weights_dir.split("/")[-1].split(".")[0], segments_dir.split("/")[-1].split(".")[0])

with open(segment_encoding_dir, 'rb') as f:
    segment_encoding = pickle.load(f)
    
# axis=None standardises with a single mean/std over the whole selected array.
# NOTE(review): presumably index 0 on axis 1 selects the encoder's mean vector — confirm.
segment_encoding_scaled_means = zscore(segment_encoding[:,0,:], axis=None).astype(np.float32)  # standardize


In [None]:
# Standardise every statistic (column) separately.
zscore_desc_stats = zscore(desc_stats, axis=0)

In [None]:
# For every segment, fit Gaussian mixtures with 1, 2 and 3 components to its values
# and store the three BIC scores, z-scored within the segment.
# NOTE(review): no random_state is set on GaussianMixture, so fits may differ between runs.
GMM_bics = np.zeros((len(segments_counts), 3))

for n_seg ,segment in enumerate(segments_counts):
    clf = GaussianMixture(n_components=1, covariance_type='full', verbose=0)
    clf.fit(segment)
    GMM_bics[n_seg, 0] = clf.bic(segment)
    clf = GaussianMixture(n_components=2, covariance_type='full', verbose=0)
    clf.fit(segment)
    GMM_bics[n_seg, 1] = clf.bic(segment)
    clf = GaussianMixture(n_components=3, covariance_type='full', verbose=0)
    clf.fit(segment)
    GMM_bics[n_seg, 2] = clf.bic(segment)
    # standardise the three scores of this segment relative to each other
    GMM_bics[n_seg, :] = zscore(GMM_bics[n_seg, :])
    print(n_seg)
    clear_output(wait=True)

In [None]:
# with open('../../../data_GRS1915/468202_segment_GMM_bic_1-3components_zscored.pkl', 'wb') as f:
#     pickle.dump(GMM_bics, f)

In [None]:
# Join the standardised descriptive statistics with the BIC scores column-wise.
# The z-scoring repeats what an earlier cell stored in zscore_desc_stats.
desc_GM = np.hstack((zscore(desc_stats, axis=0), GMM_bics))

In [None]:
# Final feature matrix: standardised encoding followed by statistics and BIC scores.
shape_desc_GM = np.hstack((segment_encoding_scaled_means, desc_GM))

In [None]:
# Fit UMAP on the first 50000 rows only, then project the complete feature matrix.
# NOTE(review): no random_state is set, so the embedding may differ between runs.
UMAP_mapper = umap.UMAP(verbose=True)#n_neighbors=50, min_dist=0.0
UMAP_mapper.fit(shape_desc_GM[:50000,:])
umaped_data = UMAP_mapper.transform(shape_desc_GM)

In [None]:
# Scatter plot of the embedding. The figure size is set to the data extent plus a
# 0.5 margin on each side; this changes the global default for later figures too.
plt.rcParams['figure.figsize'] = (abs((np.min(umaped_data[:,0])-0.5) -(np.max(umaped_data[:,0])+0.5)), abs((np.min(umaped_data[:,1])-0.5)- (np.max(umaped_data[:,1])+0.5)))
plt.scatter(umaped_data[:,0], umaped_data[:,1], s=0.05)
plt.xlim([np.min(umaped_data[:,0])-0.5, np.max(umaped_data[:,0])+0.5])
plt.ylim([np.min(umaped_data[:,1])-0.5, np.max(umaped_data[:,1])+0.5])
plt.show()

In [None]:
# 

In [None]:
# Split the segments into training and validation sets by observation, so that all
# segments of one observation end up in the same set.
with open('{}/468202_len128_s2_4cad_ids_errorfix.pkl'.format(data_dir), 'rb') as f:
    seg_ids = pickle.load(f)

# observation id = part of the segment id before the first underscore
ObID_per_sample = np.array([seg_id.split("_")[0] for seg_id in seg_ids])

# target validation size: a tenth of the loaded segments (derived from the data
# instead of repeating the count that is encoded in the file name)
needed_validation_segments = len(seg_ids)/10
unique_ObIDs = np.unique(ObID_per_sample, return_counts=True)  # (ids, segments per id)
ObIDs_no = len(unique_ObIDs[0])
shuffle_indices = np.array(range(ObIDs_no))
np.random.seed(seed=11)
np.random.shuffle(shuffle_indices)


valid_set_obs = []
valid_set_size = 0

# take observations in shuffled order until the validation set exceeds the target size
for ob_index in shuffle_indices:
    valid_set_obs.append(unique_ObIDs[0][ob_index])
    valid_set_size += unique_ObIDs[1][ob_index]
    if valid_set_size > needed_validation_segments:
        break

valid_set_sample_indices = []
for valid_set_ob in np.array(valid_set_obs):
    valid_set_sample_indices.append(np.where(ObID_per_sample == valid_set_ob)[0])

# flatten the per-observation index arrays into one list
valid_set_sample_indices = [item for sublist in valid_set_sample_indices for item in sublist]

# every observation that was not consumed by the validation loop goes to training
train_set_sample_indices = []
for train_set_ob in shuffle_indices[len(valid_set_obs):]:
    train_set_sample_indices.append(np.where(ObID_per_sample == unique_ObIDs[0][train_set_ob])[0])

train_set_sample_indices = [item for sublist in train_set_sample_indices for item in sublist]


split_indices = [train_set_sample_indices, valid_set_sample_indices]

# with open('{}/468202_len128_s2_4cad_observation90-10split.pkl'.format(data_dir), 'wb') as f:
#     pickle.dump(split_indices, f)

In [None]:
# Sizes of the training and validation sets.
print(len(split_indices[0]), len(split_indices[1]))

In [None]:
# Scratch arithmetic. NOTE(review): the meaning of these numbers is not shown in the notebook.
(3224-565)**2/555**2

In [None]:
# Scratch arithmetic. NOTE(review): differs from the previous cell only in bracket
# placement (here only 565 is divided by 555) — confirm which form was intended.
(3224-565/555)**2

In [None]:
# Load the pickled 1 s light curves and their observation ids.
# NOTE(review): paths are hard-coded instead of being built from data_dir;
# pickle.load executes arbitrary code, so only load files you trust.
with open('../../../data_GRS1915/1776_light_curves_1s_bin_errorfix.pkl', 'rb') as f:
    lcs = pickle.load(f)
with open('../../../data_GRS1915/1776_light_curves_1s_bin_ids_errorfix.pkl', 'rb') as f:
    ids = pickle.load(f)

In [None]:
# Re-bin the errors of the second light curve by a factor of 4: quadrature sum of each
# group of 4 errors divided by 4; errors after the last complete group are discarded.
binned_stamps = int(4/1) # final cadence over original cadence
f = lcs[1]
bin_lc = np.sqrt(np.sum(f[2][:(len(f[2])//binned_stamps)*binned_stamps].reshape(-1, binned_stamps)**2, axis=1))/binned_stamps

In [None]:
# Quadrature sum of the first 32 errors of the first light curve, divided by 32.
np.sqrt(np.sum((lcs[0][2][:32])**2))/32

In [None]:
# Root-mean-square of the same 32 errors (division by 32 inside the square root).
np.sqrt(np.sum((lcs[0][2][:32])**2)/32)

In [None]:
# Display the current binning factor.
binned_stamps

In [None]:
# Display the re-binned errors.
bin_lc

In [None]:
# Compare two ways of propagating uncertainties when re-binning by a factor of 4:
#   bin_lc            = sqrt(sum(err**2))/n    (error of the unweighted mean)
#   new_binned_errors = sqrt(1/sum(err**-2))   (error of the inverse-variance weighted mean)
# perc_diff collects, per light curve, the mean difference as a percentage of the mean of bin_lc.
perc_diff = []
binned_stamps = int(4/1) # final cadence over original cadence
for n_lc in range(min(1000, len(lcs))): # at most the first 1000 light curves
    f = lcs[n_lc]
    bin_lc = np.sqrt(np.sum(f[2][:(len(f[2])//binned_stamps)*binned_stamps].reshape(-1, binned_stamps)**2, axis=1))/binned_stamps

    new_binned_errors=[]
    for i in range(len(bin_lc)):
        # errors of the samples that make up bin i (the same samples that went into bin_lc[i])
        firstBinErrs = f[2][i*binned_stamps:(i+1)*binned_stamps]
        weights =  np.sum(firstBinErrs**-2.0) # sum of inverse variances
        binErr=np.sqrt(1.0/(weights)) # root of the inverse of the summed inverse variances
        new_binned_errors.append(binErr)
    # print(new_binned_errors)

    # negative values mean that the first method gives larger uncertainties
    perc_diff.append(100*np.mean(np.array(new_binned_errors)-bin_lc)/np.mean(bin_lc))

In [None]:
# Distribution of the per-light-curve percentage differences.
plt.hist(perc_diff, bins=20)
plt.xlabel("Mean percentage difference between uncertainty values calculated with the two methods")
plt.show()

In [None]:
# Average percentage difference over all compared light curves.
np.mean(perc_diff)

In [None]:
# Formula for the error of an inverse-variance weighted mean.
# NOTE(review): `sqrt` and `er_array` are not defined in the visible notebook, so this
# cell presumably raises NameError — consider turning it into a markdown note.
sqrt(    1/sum(er_array**-2)    )

In [None]:
# Relative difference (as a fraction, not a percentage) for the last light curve processed
# by the comparison loop; new_binned_errors and bin_lc are leftovers from its final iteration.
np.mean(np.array(new_binned_errors)-bin_lc)/np.mean(bin_lc) # I've been slightly overestimating the errors

In [None]:
# Inverse-variance weighted binning of a folded light curve.
# NOTE(review): `flux`, `foldTimes` and `err` are not defined in the visible notebook;
# foldTimes is presumably a phase in [0, 1) — a value of exactly 1.0 would index past the last bin.
nbins=30# chosen number of bins across the period
width=1.0/float(nbins)# calculate the width of the bins

# create arrays for bin values and weights
bins=np.zeros(nbins)
weights=np.zeros(nbins)

# bin!
for i in range(len(flux)):
    n=int(foldTimes[i]/width)# calculate bin number for this value
    weight=err[i]**-2.0# calculate weight == error^-2
    bins[n]+=flux[i]*weight# add weighted value to bin (value times weight)
    weights[n]+=weight# add weight to bin weights

bins/=weights# normalise weighted values using sum of weights
binErr=np.sqrt(1.0/(weights))# calculate bin errors from the summed weights
binEdges=np.arange(nbins)*width# lower edge of every bin, used as x position for plotting

plt.errorbar(binEdges,bins,yerr=binErr,linestyle='none',marker='o')# plot binned lightcurve
plt.show()

In [None]:
# Toy example of StratifiedKFold on ten samples with an 8:2 class imbalance.
# NOTE(review): imports inside this cell duplicate/extend the notebook's import cell.
import numpy as np
from sklearn.model_selection import StratifiedKFold
X = np.array(list(range(8))+["a", "b"])
y = np.array([0, 0, 0,0,0,0,0,0,1,1])
skf = StratifiedKFold(n_splits=2)
# the return value is not used
skf.get_n_splits(X, y)

print(skf)

for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("TRAIN:", X_train, "TEST:", X_test)

In [None]:
# Test samples of the last fold produced by the loop above.
X_test

In [None]:
# Scratch value. NOTE(review): its meaning is not shown in the notebook — confirm or delete.
1.47e7

# Segments at 0.125 s cadence (16 s long)

# Segment re-binned light curves

In [None]:
def segmentation(time_series, seg_len, stride, keep_time_stamps=True, experimental = False, input_cadence=1):
    """
    Cut a time series into overlapping, fixed-length segments.

    Only segments whose samples are contiguous (evenly spaced by input_cadence) are kept,
    so a segment never spans a gap in the time stamps.

    time_series = 2D array whose row 0 holds the time stamps and whose remaining rows hold
                  the values to segment (e.g. counts and uncertainties)
    seg_len = length of a segment in data points (integral floats such as 128.0 are accepted)
    stride = step size in data points; difference in the starting position of consecutive segments
    keep_time_stamps = if True every segment keeps row 0 (time stamps); if False row 0 is dropped
    experimental = unused, kept for backward compatibility with existing calls
    input_cadence = spacing between consecutive time stamps, in the units of row 0

    Returns an array of shape (n_segments, n_rows, seg_len); an empty array when no segment qualifies.
    Raises ValueError when seg_len or stride is not a positive whole number.
    """
    # Callers compute these with float arithmetic (e.g. 16 // 0.125 == 128.0), which range() rejects.
    if int(seg_len) != seg_len or int(stride) != stride:
        raise ValueError("seg_len and stride must be whole numbers of data points")
    seg_len = int(seg_len)
    stride = int(stride)
    if seg_len < 1 or stride < 1:
        raise ValueError("seg_len and stride must be at least 1 data point")

    time_series = np.asarray(time_series)
    time_stamps = time_series[0]

    # A contiguous segment of seg_len points spans (seg_len - 1) cadences from first to last sample.
    expected_span = (seg_len - 1) * input_cadence
    # Compare with a small tolerance instead of exact float equality.
    tolerance = 1e-3 * abs(input_cadence)

    first_row = 0 if keep_time_stamps else 1
    segments = []
    # "+ 1" so that the segment ending on the very last sample is also produced.
    for start in range(0, len(time_stamps) - seg_len + 1, stride):
        end = start + seg_len
        # Check only samples inside the segment (last one is end - 1), so a gap
        # immediately after the segment does not cause it to be rejected.
        span = time_stamps[end - 1] - time_stamps[start]
        if abs(span - expected_span) > tolerance:
            continue
        segments.append(time_series[first_row:, start:end])
    return np.array(segments)

In [None]:
cadence = 0.125  # spacing of the light curve time stamps
seg_len_s = 16  # segment length, same time units as cadence
stride_s = 1  # stride between segment starts, same time units as cadence

segments_counts = []
segments_times = []
segments_errors = []
seg_ids = []

# Segment length and stride size in data points. They must be ints because they end up
# in range(); rounding (rather than float floor division) avoids losing a point when the
# cadence is not exactly representable in binary (e.g. 16 // 0.1 == 159.0).
seg_len = int(round(seg_len_s / cadence))
stride = int(round(stride_s / cadence))

for lc_index, lc in enumerate(lcs):
    if len(lc[1]) < seg_len:
        continue  # light curve is shorter than one segment
    segments = segmentation(lc, seg_len, stride, keep_time_stamps=True, experimental=False, input_cadence=cadence)
    if len(segments) == 0:
        continue  # no gap-free segment in this light curve
    # Segment rows: 0 = time stamps, 1 = counts, 2 = uncertainties.
    segments_times.append(segments[:, 0, :])
    segments_counts.append(segments[:, 1, :])
    segments_errors.append(segments[:, 2, :])
    # NOTE(review): `ids` must be the list of observation ids parallel to `lcs`. The loading
    # cell at the top of the notebook builds `lc_ids` for that purpose - confirm `ids` is not
    # a leftover from one of the pickle-loading cells.
    seg_ids.append(ids[lc_index])
    print("Segmented {}/{} light curves.".format(lc_index+1, len(lcs)))
    clear_output(wait=True)
        

# Shuffle light curve segments

In [None]:
print("Stacking the segments and creating segment IDs, shuffling.")
# One ID per segment: "<observation id>_<segment number within that light curve>".
id_per_seg = [
    seg_ids[obs_number] + "_{}".format(seg_number)
    for obs_number, obs_segments in enumerate(segments_counts)
    for seg_number in range(len(obs_segments))
]

# Stack the per-light-curve blocks into one array each and append a trailing axis of length 1.
segments_counts = np.vstack(segments_counts)[..., np.newaxis]
segments_errors = np.vstack(segments_errors)[..., np.newaxis]

# Shuffle all three containers in place, restoring the same RNG state before each shuffle
# so that (for containers of equal length) they receive the same permutation and stay aligned.
rng_state = np.random.get_state()
for container in (segments_counts, segments_errors, id_per_seg):
    np.random.set_state(rng_state)
    np.random.shuffle(container)

print("Done")

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle.load can execute arbitrary code - only use on files you created yourself.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


# Previously saved segments, their uncertainties and their IDs
# (file names indicate: segment length 128, stride 8, 4 s cadence).
segments = _load_pickle('../../../data_GRS1915/468202_len128_stride8_4sec_cad_countrates_sum_bin.pkl')
errors = _load_pickle('../../../data_GRS1915/468202_len128_stride8_4sec_cad_errors_sum_bin.pkl')
ids = _load_pickle('../../../data_GRS1915/468202_len128_stride8_4sec_cad_ids_sum_bin.pkl')

# errors = ((errors)/np.expand_dims(np.std(segments, axis=1), axis=1)).astype(np.float32)
# segments = zscore(segments, axis=1).astype(np.float32)  # standardize per segment


# with open('../../../data_GRS1915/lightcurve1776_train70_val10_test20.pkl', 'rb') as f:
#     split_ob_ids = pickle.load(f)
    
# ids_no_index = [obid.split("_")[0] for obid in ids]
# training_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[0]])
# validation_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[1]])
# test_segments_indices = np.array([seg_n for seg_n, seg in enumerate(ids_no_index) if seg in split_ob_ids[2]])

In [None]:
# Previously saved segments, their uncertainties and their IDs
# (file names indicate: segment length 128, stride 10, 1 s cadence).
# Rebinds `segments`, `errors` and `ids`, replacing anything loaded earlier under those names.
# pickle.load can execute arbitrary code - only use on files you created yourself.
with open('../../../data_GRS1915/474471_len128_stride10_1sec_cad_countrates_sum_bin.pkl', 'rb') as counts_file, \
        open('../../../data_GRS1915/474471_len128_stride10_1sec_cad_errors_sum_bin.pkl', 'rb') as errors_file, \
        open('../../../data_GRS1915/474471_len128_stride10_1sec_cad_ids_sum_bin.pkl', 'rb') as ids_file:
    segments = pickle.load(counts_file)
    errors = pickle.load(errors_file)
    ids = pickle.load(ids_file)

In [None]:
# Square root of the first entry of segment 990; presumably compared by eye with errors[990][0]
# to check that the stored uncertainty equals sqrt(counts).
np.sqrt((segments[990][0]))

In [None]:
# Stored uncertainty for the first entry of segment 990.
errors[990][0]

In [None]:
# Number of distinct observations among the segments: the part of each ID before the
# first "_" is taken as the observation id and duplicates are removed.
np.unique([x.split("_")[0] for x in ids]).shape

In [None]:
# Distinct observation ids (text before the first "_") of the segments selected by training_segments_indices.
# NOTE(review): training_segments_indices is only assigned in commented-out code in this part
# of the notebook, so this cell depends on kernel state from elsewhere.
train_obs = np.unique([x.split("_")[0] for x in np.take(ids, training_segments_indices)])

In [None]:
# Distinct observation ids (text before the first "_") of the segments selected by validation_segments_indices.
val_obs = np.unique([x.split("_")[0] for x in np.take(ids, validation_segments_indices)])

In [None]:
# Number of distinct observation ids among the segments selected by test_segments_indices.
np.unique([x.split("_")[0] for x in np.take(ids, test_segments_indices)]).shape

In [None]:
# Leakage check: observations present in both the validation and the training split.
# The original compared val_obs with itself, which always returns every validation observation.
# An empty list means the two splits share no observation.
[x for x in val_obs if x in train_obs]

In [None]:
# Display the indices held in training_segments_indices.
training_segments_indices

In [None]:
# Number of entries in ids_no_index.
# NOTE(review): ids_no_index is only assigned in commented-out code in this part of the notebook.
len(ids_no_index)

In [None]:
# Shape of the training index array.
training_segments_indices.shape

In [None]:
# Shape of the validation index array.
validation_segments_indices.shape

In [None]:
# Shape of the test index array.
test_segments_indices.shape

In [None]:
# Sanity check: the three numbers sum to 468202, the number that prefixes the 4 s cadence pickle file names.
# Presumably these are the training, validation and test split sizes - confirm.
326762+46445+94995