# Approach VI

1. Segment ordinary time series. 
2. Cluster the segments. 
3. Segment each validation time series of the *ordinary* class and reconstruct it from the cluster centroids. **Standardize the time series and its reconstruction.** Calculate the mean squared difference between the standardized time series and its standardized reconstruction as the error value.
4. Repeat 3. for validation time series of the *outlier* class. 

In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import importlib
import time

# Push any buffered standard output out before the long-running cells start.
sys.stdout.flush()
# Fix the seed of NumPy's global random number generator (np.random.normal is
# drawn from it further down in this notebook).
np.random.seed(0)

In [8]:
#"ordinary" lightcurves
ordinary_file = np.loadtxt("synthetic_rhos_v2.csv", delimiter=",")

# "outlier" lightcurves (an alternative outlier set, synthetic_sines_v3.csv,
# was used previously and is currently disabled)
outlier_file = np.loadtxt("synthetic_boxes_thick.csv", delimiter=",")

# Hold out 25% of the ordinary lightcurves for validation. The row indices are
# split alongside the data so every held-out series can be traced back to its
# row in ordinary_file.
(
    ordinary_train,
    ordinary_valid,
    ordinary_train_ids,
    ordinary_valid_ids,
) = train_test_split(
    ordinary_file,
    list(range(len(ordinary_file))),
    test_size=0.25,
    random_state=0,
)

# Hyperparameter grid scanned by the cells below:
# number of clusters x segment length.
k_clusters = [10, 50, 100, 200]
seg_lens = [10, 50, 100, 150, 200]

In [30]:
import segment_cluster as sc

# Reload the module so edits made to it since the last import are picked up.
importlib.reload(sc)

# Run the project's analysis on the stored results file over the hyperparameter
# grid. Judging by the printed output, the returned table has the segment
# lengths along the first row and the cluster counts down the first column
# -- TODO confirm against segment_cluster.analyse.
f1s = sc.analyse(
    "new_20190617.csv",
    k_clusters,
    seg_lens,
    save_histograms=True,
    save_grid=True,
)
print(f1s)

[[  0  10  50 100 150 200]
 [ 10 248 147 130  55  76]
 [ 50  95 321 229   0  12]
 [100  50 960  13 237  63]
 [200 141 979 722 144  26]]


In [17]:
import segment_cluster as sc

# Reload the module so edits made to it since the last import are picked up.
importlib.reload(sc)

t0 = time.time()

# Validation set: the held-out ordinary series stacked on top of the outlier
# series, labelled 0 (ordinary) and 1 (outlier) in the same order.
validation_data = np.vstack((ordinary_valid, outlier_file))
validation_labels = np.hstack(
    (np.zeros(len(ordinary_valid)), np.ones(len(outlier_file)))
)

validation_result = []
for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        # One detector per hyperparameter combination. The instance is
        # deliberately kept under the name TSSCOD because a later cell
        # inspects TSSCOD.cluster.
        TSSCOD = sc.TSSCOD(k_clusters=k_cluster, seg_len=seg_len)

        # Train on the outlier-free subset (per the original note: each series
        # is segmented with a slide of 1 and the segments are clustered).
        TSSCOD.train(ordinary_train, random_state=0)
        # NOTE: sys.getsizeof is shallow, so the sizes printed below do not
        # include data referenced by the objects.
        print("TSSCOD.train", sys.getsizeof(TSSCOD), time.time()-t0)

        # Per the original note: validation reconstructs the provided series
        # and returns the error values together with indices.
        validation_iteration = TSSCOD.validate(validation_data, validation_labels)
        print("validation_data", sys.getsizeof(validation_data), time.time()-t0)
        print("TSSCOD.validate", sys.getsizeof(TSSCOD))
        print("validation_iteration", sys.getsizeof(validation_iteration))

        # Prefix every result row with the (k_id, len_id) hyperparameter
        # indices so the stacked table can be fed to the analysis pipeline.
        hyperparameter_ids = np.full(
            (len(validation_iteration), 2), (k_id, len_id), dtype=float
        )
        validation_iteration = np.hstack((hyperparameter_ids, validation_iteration))
        validation_result.append(validation_iteration)
        print("validation_result", sys.getsizeof(validation_result), time.time()-t0)

# One table for the whole scan, written out as CSV.
validation_result = np.vstack(validation_result)
np.savetxt("chi2_20190613_box_v2.csv", validation_result, delimiter=",")

all_train_segments 29400112
cluster 56
TSSCOD.train 56 32.58978748321533
validation_data 2000112 40.413246393203735
TSSCOD.validate 56
validation_iteration 12112
validation_result 96 40.42237687110901
all_train_segments 135000112
cluster 56
TSSCOD.train 56 129.13998889923096
validation_data 2000112 131.06948614120483
TSSCOD.validate 56
validation_iteration 12112
validation_result 96 131.0717716217041
all_train_segments 240000112


KeyboardInterrupt: 

In [15]:
# sys.getsizeof is shallow: it reports the size of the object itself and does
# not follow references, so this is not the full memory footprint of whatever
# TSSCOD.cluster holds.
sys.getsizeof(TSSCOD.cluster)

56

In [6]:
# KMeans is imported here so that this cell runs on its own; in file order the
# only other import of it sits in a later cell.
from sklearn.cluster import KMeans

# Measure the CPU time (time.process_time) needed to segment the training set
# and fit the clustering for the FIRST hyperparameter combination only: both
# loops are left after their first iteration.
t0 = time.process_time()
for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # Segment every ordinary training light curve with a slide of 1 and
        # stack all segments into a single array.
        all_train_segments = np.vstack(
            [
                sc.segmentation(rho, seg_len, 1, time_stamps=False)
                for rho in ordinary_train
            ]
        )
        # NOTE: sys.getsizeof is shallow; for the stacked array it reports the
        # array object as NumPy accounts for it, not any referenced objects.
        print("all_train_segments", sys.getsizeof(all_train_segments))
        # Cluster the segments.
        cluster = KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)
        break
    break
print(time.process_time() - t0)

all_train_segments 29400112
519.5218885900103


In [None]:
from sklearn.cluster import KMeans
from scipy.stats import zscore
import segment_cluster as sc
importlib.reload(sc)


def _standardized_reco_error(series, seg_len, cluster):
    """Return the mean squared error between a standardized series and its reconstruction.

    The series is cut into segments of length ``seg_len`` with a slide of
    ``seg_len`` and reconstructed with ``sc.reconstruct`` from the fitted
    ``cluster`` model. Both the series and the reconstruction are z-scored
    before they are compared. The last ``seg_len`` samples are left out of the
    standardization and of the comparison.

    Args:
        series: one time series (a row of the validation data).
        seg_len: segment length, also used as the segmentation slide.
        cluster: the fitted clustering model handed to ``sc.reconstruct``.

    Returns:
        The mean of the squared differences between the standardized
        reconstruction and the standardized series.
    """
    segments = sc.segmentation(series, seg_len, seg_len, time_stamps=False)
    reco = sc.reconstruct(segments, series, cluster, rel_offset=False, seg_slide=seg_len)
    # The standardized values are written back into reco, as before, so the
    # comparison uses exactly what reco stores.
    reco[0:-seg_len] = zscore(reco[0:-seg_len])
    # zscore returns a new array, so no defensive copy of the input is needed.
    expected = zscore(series[0:-seg_len])
    return np.mean((reco[0:-seg_len] - expected) ** 2.0)


# Rows collected below: (k_id, len_id, class label, series id, error) with
# class label 0 for ordinary and 1 for outlier series.
reco_error = []
t0 = time.time()
for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # Segment every ordinary training light curve with a slide of 1 and
        # cluster the stacked segments.
        all_train_segments = np.vstack(
            [
                sc.segmentation(rho, seg_len, 1, time_stamps=False)
                for rho in ordinary_train
            ]
        )
        cluster = KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)

        ### reconstruction of the training class (held-out ordinary series);
        ### the series id is the row index in the original ordinary file
        for series_id, rho in zip(ordinary_valid_ids, ordinary_valid):
            error = _standardized_reco_error(rho, seg_len, cluster)
            row = (k_id, len_id, 0, series_id, error)
            reco_error.append(row)
            # Progress output, flushed immediately because a full scan is slow.
            print(row, flush=True)

        # reconstruction of the outlier class; the series id is the row index
        # in outlier_file
        for n_sine, sine in enumerate(outlier_file):
            error = _standardized_reco_error(sine, seg_len, cluster)
            reco_error.append((k_id, len_id, 1, n_sine, error))

reco_error_ar = np.array(reco_error)
# NOTE(review): the TSSCOD validation cell writes to this same file name, so
# whichever cell runs last overwrites the other's output -- confirm intended.
np.savetxt("chi2_20190613_box_v2.csv", reco_error_ar, delimiter=",")
print(time.time() - t0)

(0, 0, 0, 993, 0.12803731933680051)
(0, 0, 0, 859, 0.100167254872534)
(0, 0, 0, 298, 0.23509142143811376)
(0, 0, 0, 553, 0.10881630927520557)
(0, 0, 0, 672, 0.11648979012469314)
(0, 0, 0, 971, 0.14456155087574077)
(0, 0, 0, 27, 0.15913333019702153)
(0, 0, 0, 231, 0.16121382957015723)
(0, 0, 0, 306, 0.08396377320965227)
(0, 0, 0, 706, 0.08879960000334688)
(0, 0, 0, 496, 0.08618801626285197)
(0, 0, 0, 558, 0.20825910569615397)
(0, 0, 0, 784, 0.09305669841823369)
(0, 0, 0, 239, 0.08592147222506989)
(0, 0, 0, 578, 0.11001742569061342)
(0, 0, 0, 55, 0.268012377396398)
(0, 0, 0, 906, 0.18653913938571637)
(0, 0, 0, 175, 0.13633815632431107)
(0, 0, 0, 14, 0.1898122152247505)
(0, 0, 0, 77, 0.16572290775936765)
(0, 0, 0, 31, 0.16688574573576115)
(0, 0, 0, 481, 0.10028546874861635)
(0, 0, 0, 310, 0.09080393739476585)
(0, 0, 0, 311, 0.12362063537917085)
(0, 0, 0, 883, 0.08907433718194972)
(0, 0, 0, 788, 0.10334736785342741)
(0, 0, 0, 45, 0.12038875861556098)
(0, 0, 0, 103, 0.11951480342582782)
(0,

In [32]:
import time

#"ordinary" lightcurves
rho_file = np.loadtxt("data/synthetic_rhos_v2.csv", delimiter=",")

# "outlier" lightcurves
# Instead of loading an outlier file (synthetic_sines_v3.csv and
# synthetic_boxes_thick.csv were used before), build 250 structureless series
# of 500 points each: Gaussian noise around the mean of one ordinary light
# curve, with a standard deviation equal to the square root of that mean.
# The name sine_file is kept because the next cell reads it.
sine_file = np.zeros((250, 500))
for n, rho in enumerate(rho_file[0:250]):
    mean = np.mean(rho)
    error = np.sqrt(mean)
    sine_file[n] = np.random.normal(mean, error, 500)

# Hold out 25% of the ordinary light curves for validation; the row indices are
# split alongside the data so held-out series can be traced back to rho_file.
(
    rho_train,
    rho_valid,
    rho_train_ids,
    rho_valid_ids,
) = train_test_split(
    rho_file,
    list(range(len(rho_file))),
    test_size=0.25,
    random_state=0,
)

# Hyperparameter grid: number of clusters x segment length.
k_clusters = [10, 50, 100, 200]
seg_lens = [10, 50, 100, 150, 200]

In [33]:
# KMeans and zscore are imported here so this cell does not depend on another
# cell having been run first.
from sklearn.cluster import KMeans
from scipy.stats import zscore
import segment_cluster as sc
importlib.reload(sc)


def _standardized_reco_error(series, seg_len, cluster):
    """Return the mean squared error between a standardized series and its reconstruction.

    The series is cut into segments of length ``seg_len`` with a slide of
    ``seg_len`` and reconstructed with ``sc.reconstruct`` from the fitted
    ``cluster`` model. Both the series and the reconstruction are z-scored
    before they are compared. The last ``seg_len`` samples are left out of the
    standardization and of the comparison.

    Args:
        series: one time series (a row of the validation data).
        seg_len: segment length, also used as the segmentation slide.
        cluster: the fitted clustering model handed to ``sc.reconstruct``.

    Returns:
        The mean of the squared differences between the standardized
        reconstruction and the standardized series.
    """
    segments = sc.segmentation(series, seg_len, seg_len, time_stamps=False)
    reco = sc.reconstruct(segments, series, cluster, rel_offset=False, seg_slide=seg_len)
    # The standardized values are written back into reco, as before, so the
    # comparison uses exactly what reco stores.
    reco[0:-seg_len] = zscore(reco[0:-seg_len])
    # zscore returns a new array, so no defensive copy of the input is needed.
    expected = zscore(series[0:-seg_len])
    return np.mean((reco[0:-seg_len] - expected) ** 2.0)


real_t0 = time.time()
process_t0 = time.process_time()

# Rows collected below: (k_id, len_id, class label, series id, error) with
# class label 0 for ordinary and 1 for outlier series.
reco_error = []
loop_counter = 0
for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # Segment every ordinary training light curve. Unlike the slide-1
        # training used elsewhere in this notebook, the slide here equals the
        # segment length. Then cluster the stacked segments.
        all_train_segments = np.vstack(
            [
                sc.segmentation(rho, seg_len, seg_len, time_stamps=False)
                for rho in rho_train
            ]
        )
        cluster = KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)

        ### reconstruction of the training class (held-out ordinary series);
        ### the series id is the row index in rho_file
        for series_id, rho in zip(rho_valid_ids, rho_valid):
            error = _standardized_reco_error(rho, seg_len, cluster)
            reco_error.append((k_id, len_id, 0, series_id, error))

        # reconstruction of the outlier class; the series id is the row index
        # in sine_file
        for n_sine, sine in enumerate(sine_file):
            error = _standardized_reco_error(sine, seg_len, cluster)
            reco_error.append((k_id, len_id, 1, n_sine, error))

        # Progress marker: one line per finished hyperparameter combination.
        print("Loop: {}".format(loop_counter))
        loop_counter += 1

reco_error_ar = np.array(reco_error)
# Saving is currently disabled:
#np.savetxt("chi2_20190613_flat.csv", reco_error_ar, delimiter=",")
print(
    "Finished, elapsed time: {}s, total CPU time: {}s".format(
        time.time() - real_t0, time.process_time() - process_t0
    )
)


NameError: name 'test_segments' is not defined