In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import importlib
from scipy.stats import zscore
sys.stdout.flush()
np.random.seed(0)

# Approach I

1. Segment ordinary time series. 
2. Cluster the segments. 
3. Segment the validation time series of *ordinary* class and try to reconstruct using cluster centroids. Calculate the Chi-square, divided by the number of reconstructed data points, for each time series.
4. Repeat 3. for validation time series of the *outlier* class. 

In [2]:
import segment_cluster as sc
importlib.reload(sc)  # re-import so edits to the local module take effect without a kernel restart

# "normal" light curves (each row is iterated below as one light curve)
rho_file=np.loadtxt("synthetic_rhos_v2.csv", delimiter=',')
# "outlier" light curves
sine_file=np.loadtxt("synthetic_sines_v3.csv", delimiter=',')

# Hold out 25% of the normal light curves for validation. The row indices are split
# alongside the data so each validation curve can be traced back to its row in rho_file.
rho_train, rho_valid, rho_train_ids, rho_valid_ids= train_test_split(rho_file, list(range(len(rho_file))) ,test_size=0.25, random_state=0)

# One row per reconstructed light curve:
# (k_id, len_id, class label [0 = rho, 1 = sine], light-curve id, mean squared normalized residual)
reco_error=[]
k_clusters=[150]
seg_lens=[10, 50, 100,150,200]

for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # loop through the training light curves and segment them
        # (presumably the two seg_len arguments are segment length and slide -- confirm in sc.segmentation)
        all_train_segments=[]
        for rho in rho_train:
            train_segments=sc.segmentation(rho, seg_len, seg_len, time_stamps=False)
            all_train_segments.append(train_segments)
        all_train_segments=np.vstack(all_train_segments)
        # cluster the segments
        cluster=KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)

        ### reconstruction of the training class (held-out rho light curves)
        for n_rho, rho in enumerate(rho_valid):
            valid_segments= sc.segmentation(rho, seg_len, seg_len , time_stamps=False)
            reco= sc.reconstruct(valid_segments, rho, cluster, rel_offset=False, seg_slide=seg_len)

            # The last seg_len samples are excluded from the comparison.
            rho_expected=np.copy(rho[0:-seg_len])
            # exp(0.5*log(x) + c) == exp(c)*sqrt(x); with this c the factor exp(c) is ~2.83,
            # i.e. the assumed uncertainty scales with the square root of the expected value.
            rho_error=np.power(np.e,np.log(rho_expected)*0.5+1.0397207708265923)
            error = np.mean(((reco[0:-seg_len]-rho_expected)/rho_error)**2.0)
            reco_error.append((k_id,len_id,0, rho_valid_ids[n_rho], error))
            print((k_id,len_id,0, rho_valid_ids[n_rho], error), flush=True)


        # reconstruction loop through the light curves of the outlier (sine) class
        for n_sine, sine in enumerate(sine_file):
            valid_segments= sc.segmentation(sine, seg_len, seg_len , time_stamps=False)
            reco = sc.reconstruct(valid_segments, sine, cluster, rel_offset=False, seg_slide=seg_len)

            # Disabled alternative: rescale the reconstruction to the mean/std of the light curve.
            #reco[0:-seg_len] = np.mean(sine[0:-seg_len])+ (reco[0:-seg_len]- np.mean(reco[0:-seg_len]))*(np.std(sine[0:-seg_len])/np.std(reco[0:-seg_len]))
            sine_expected=np.copy((sine[0:-seg_len]))
            # The uncertainty must be derived from the sine curve being scored. It was previously
            # computed from rho_expected, a leftover of the last iteration of the rho loop above,
            # which normalized every outlier residual by an unrelated light curve.
            # NOTE(review): this changes the class-1 rows written to the CSV; np.log also
            # requires the sine values to be positive -- confirm for synthetic_sines_v3.csv.
            sine_error=np.power(np.e,np.log(sine_expected)*0.5+1.0397207708265923)
            error = np.mean(((reco[0:-seg_len]-sine_expected)/sine_error)**2.0)
            reco_error.append((k_id,len_id,1,n_sine, error))
            print((k_id,len_id,1,n_sine, error), flush=True)
reco_error_ar=np.array(reco_error)
np.savetxt("chi2_20190610_1.csv", reco_error_ar, delimiter=",")

(0, 0, 0, 993, 1.3236683207286344)
(0, 0, 0, 859, 1.4637904418192889)
(0, 0, 0, 298, 2.937551529739195)
(0, 0, 0, 553, 1.455011448350712)
(0, 0, 0, 672, 2.523262858669867)
(0, 0, 0, 971, 2.558944289843558)
(0, 0, 0, 27, 2.332421098846116)
(0, 0, 0, 231, 3.1226748949440717)
(0, 0, 0, 306, 1.4038640001581557)
(0, 0, 0, 706, 1.2815598641009462)
(0, 0, 0, 496, 1.6147172661175493)
(0, 0, 0, 558, 3.11058874870069)
(0, 0, 0, 784, 2.5581515801856223)
(0, 0, 0, 239, 1.982696423578518)
(0, 0, 0, 578, 1.7301019279440852)
(0, 0, 0, 55, 3.4818416987283323)
(0, 0, 0, 906, 1.9022762193573814)
(0, 0, 0, 175, 1.572082254226861)
(0, 0, 0, 14, 5.030524816607655)
(0, 0, 0, 77, 2.9377998898198157)
(0, 0, 0, 31, 1.203827748899043)
(0, 0, 0, 481, 1.4448835503446322)
(0, 0, 0, 310, 1.3512363816187096)
(0, 0, 0, 311, 2.0492836978772027)
(0, 0, 0, 883, 2.174562421277626)
(0, 0, 0, 788, 2.359546109453789)
(0, 0, 0, 45, 1.2092683506868962)
(0, 0, 0, 103, 3.933761937957051)
(0, 0, 0, 760, 11.878728093639543)
(0, 0

# Approach II

1. Segment ordinary time series. 
2. Cluster the segments. 
3. Segment the validation time series of the *ordinary* class and try to reconstruct them using cluster centroids **scaled and shifted to match the standard deviation and mean of each segment**. Calculate the chi-square, divided by the number of reconstructed data points, for each time series.
4. Repeat 3. for validation time series of the *outlier* class. 

In [3]:
import segment_cluster_scaled as sc
importlib.reload(sc)  # re-import so edits to the local module take effect without a kernel restart

# "normal" light curves (each row is iterated below as one light curve)
rho_file=np.loadtxt("synthetic_rhos_v2.csv", delimiter=',')
# "outlier" light curves
sine_file=np.loadtxt("synthetic_sines_v3.csv", delimiter=',')

# Hold out 25% of the normal light curves for validation. The row indices are split
# alongside the data so each validation curve can be traced back to its row in rho_file.
rho_train, rho_valid, rho_train_ids, rho_valid_ids= train_test_split(rho_file, list(range(len(rho_file))) ,test_size=0.25, random_state=0)

# One row per reconstructed light curve:
# (k_id, len_id, class label [0 = rho, 1 = sine], light-curve id, mean squared normalized residual)
reco_error=[]
k_clusters=[150]
seg_lens=[10, 50, 100,150,200]

for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # loop through the training light curves and segment them
        # (presumably the two seg_len arguments are segment length and slide -- confirm in sc.segmentation)
        all_train_segments=[]
        for rho in rho_train:
            train_segments=sc.segmentation(rho, seg_len, seg_len, time_stamps=False)
            all_train_segments.append(train_segments)
        all_train_segments=np.vstack(all_train_segments)
        # cluster the segments
        cluster=KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)

        ### reconstruction of the training class (held-out rho light curves)
        for n_rho, rho in enumerate(rho_valid):
            valid_segments= sc.segmentation(rho, seg_len, seg_len , time_stamps=False)
            reco= sc.reconstruct(valid_segments, rho, cluster, rel_offset=False, seg_slide=seg_len)

            # The last seg_len samples are excluded from the comparison.
            rho_expected=np.copy(rho[0:-seg_len])
            # exp(0.5*log(x) + c) == exp(c)*sqrt(x); with this c the factor exp(c) is ~2.83,
            # i.e. the assumed uncertainty scales with the square root of the expected value.
            rho_error=np.power(np.e,np.log(rho_expected)*0.5+1.0397207708265923)
            error = np.mean(((reco[0:-seg_len]-rho_expected)/rho_error)**2.0)
            reco_error.append((k_id,len_id,0, rho_valid_ids[n_rho], error))
            print((k_id,len_id,0, rho_valid_ids[n_rho], error), flush=True)


        # reconstruction loop through the light curves of the outlier (sine) class
        for n_sine, sine in enumerate(sine_file):
            valid_segments= sc.segmentation(sine, seg_len, seg_len , time_stamps=False)
            reco = sc.reconstruct(valid_segments, sine, cluster, rel_offset=False, seg_slide=seg_len)

            # Disabled alternative: rescale the reconstruction to the mean/std of the light curve.
            #reco[0:-seg_len] = np.mean(sine[0:-seg_len])+ (reco[0:-seg_len]- np.mean(reco[0:-seg_len]))*(np.std(sine[0:-seg_len])/np.std(reco[0:-seg_len]))
            sine_expected=np.copy((sine[0:-seg_len]))
            # The uncertainty must be derived from the sine curve being scored. It was previously
            # computed from rho_expected, a leftover of the last iteration of the rho loop above,
            # which normalized every outlier residual by an unrelated light curve.
            # NOTE(review): this changes the class-1 rows written to the CSV; np.log also
            # requires the sine values to be positive -- confirm for synthetic_sines_v3.csv.
            sine_error=np.power(np.e,np.log(sine_expected)*0.5+1.0397207708265923)
            error = np.mean(((reco[0:-seg_len]-sine_expected)/sine_error)**2.0)
            reco_error.append((k_id,len_id,1,n_sine, error))
            print((k_id,len_id,1,n_sine, error), flush=True)
reco_error_ar=np.array(reco_error)
np.savetxt("chi2_20190610_2.csv", reco_error_ar, delimiter=",")

(0, 0, 0, 993, 2.201606972288814)
(0, 0, 0, 859, 2.6823481715071176)
(0, 0, 0, 298, 3.5776724426735025)
(0, 0, 0, 553, 1.6522113856049312)
(0, 0, 0, 672, 5.521698469502302)
(0, 0, 0, 971, 2.8366183815793655)
(0, 0, 0, 27, 2.73585643016005)
(0, 0, 0, 231, 3.982732931878175)
(0, 0, 0, 306, 2.8214095273964572)
(0, 0, 0, 706, 3.37566224985187)
(0, 0, 0, 496, 3.212057509513115)
(0, 0, 0, 558, 5.363041478279381)
(0, 0, 0, 784, 5.5470610171432515)
(0, 0, 0, 239, 5.3809274333624595)
(0, 0, 0, 578, 4.020791645997309)
(0, 0, 0, 55, 3.3285934868475375)
(0, 0, 0, 906, 4.5445434151703905)
(0, 0, 0, 175, 4.04011965174517)
(0, 0, 0, 14, 6.337052037276847)
(0, 0, 0, 77, 4.864539945408323)
(0, 0, 0, 31, 2.046272051983212)
(0, 0, 0, 481, 2.4840075669960178)
(0, 0, 0, 310, 4.32871306963614)
(0, 0, 0, 311, 2.157049141200241)
(0, 0, 0, 883, 3.598014257298921)
(0, 0, 0, 788, 4.292106453120279)
(0, 0, 0, 45, 1.3120289322580168)
(0, 0, 0, 103, 8.126032550862444)
(0, 0, 0, 760, 4.96970935316853)
(0, 0, 0, 1, 5

# Approach III

1. Segment ordinary time series. 
2. **Standardize** and cluster the segments. 
3. Segment the validation time series of *ordinary* class, **standardize the segments** and try to reconstruct using cluster centroids. Calculate the Chi-square, divided by the number of reconstructed data points, for each time series.
4. Repeat 3. for validation time series of the *outlier* class. 

In [6]:
import segment_cluster_zscore as sc
importlib.reload(sc)  # re-import so edits to the local module take effect without a kernel restart

# "normal" light curves (each row is iterated below as one light curve)
rho_file=np.loadtxt("synthetic_rhos_v2.csv", delimiter=',')
# "outlier" light curves
sine_file=np.loadtxt("synthetic_sines_v3.csv", delimiter=',')

# Hold out 25% of the normal light curves for validation. The row indices are split
# alongside the data so each validation curve can be traced back to its row in rho_file.
rho_train, rho_valid, rho_train_ids, rho_valid_ids= train_test_split(rho_file, list(range(len(rho_file))) ,test_size=0.25, random_state=0)

# One row per reconstructed light curve:
# (k_id, len_id, class label [0 = rho, 1 = sine], light-curve id, error returned by sc.reconstruct)
reco_error=[]
k_clusters=[150]
seg_lens=[10, 50, 100,150,200]

for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        ## train the model
        # loop through the training light curves and segment them
        # (presumably the two seg_len arguments are segment length and slide -- confirm in sc.segmentation)
        all_train_segments=[]
        for rho in rho_train:
            train_segments=sc.segmentation(rho, seg_len, seg_len, time_stamps=False)
            all_train_segments.append(train_segments)
        all_train_segments=np.vstack(all_train_segments)
        # cluster the standardized segments
        # NOTE(review): zscore defaults to axis=0, so each column of the stacked matrix is
        # standardized across all training segments rather than each segment (row) on its
        # own. Confirm this matches the standardization applied inside sc.reconstruct.
        cluster=KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(zscore(all_train_segments))

        ### reconstruction of the training class (held-out rho light curves)
        for n_rho, rho in enumerate(rho_valid):
            valid_segments= sc.segmentation(rho, seg_len, seg_len , time_stamps=False)
            # In this approach the error is returned by sc.reconstruct itself, so the
            # reconstruction is not post-processed in this cell.
            reco, error= sc.reconstruct(valid_segments, rho, cluster, rel_offset=False, seg_slide=seg_len)
            reco_error.append((k_id,len_id,0, rho_valid_ids[n_rho], error))
            print((k_id,len_id,0, rho_valid_ids[n_rho], error), flush=True)


        # reconstruction loop through the light curves of the outlier (sine) class
        for n_sine, sine in enumerate(sine_file):
            valid_segments= sc.segmentation(sine, seg_len, seg_len , time_stamps=False)
            reco, error = sc.reconstruct(valid_segments, sine, cluster, rel_offset=False, seg_slide=seg_len)
            reco_error.append((k_id,len_id,1,n_sine, error))
            print((k_id,len_id,1,n_sine, error), flush=True)
reco_error_ar=np.array(reco_error)
np.savetxt("chi2_20190610_3.csv", reco_error_ar, delimiter=",")

(0, 0, 0, 993, 0.019561455047014602)
(0, 0, 0, 859, 0.031207797421457146)
(0, 0, 0, 298, 0.0343219933731992)
(0, 0, 0, 553, 0.03194576570504408)
(0, 0, 0, 672, 0.03598837246671463)
(0, 0, 0, 971, 0.03330958143034083)
(0, 0, 0, 27, 0.029275251624332334)
(0, 0, 0, 231, 0.04096371704381807)
(0, 0, 0, 306, 0.035201676582443046)
(0, 0, 0, 706, 0.01812572710997947)
(0, 0, 0, 496, 0.023349073886401412)
(0, 0, 0, 558, 0.05922988811883012)
(0, 0, 0, 784, 0.01595640770977604)
(0, 0, 0, 239, 0.021022588548438168)
(0, 0, 0, 578, 0.02103192897768596)
(0, 0, 0, 55, 0.033134981839554406)
(0, 0, 0, 906, 0.03269420310779868)
(0, 0, 0, 175, 0.02314582766536438)
(0, 0, 0, 14, 0.04292796599440723)
(0, 0, 0, 77, 0.03067384405270446)
(0, 0, 0, 31, 0.036677486836280214)
(0, 0, 0, 481, 0.02218982660565643)
(0, 0, 0, 310, 0.018189725783170217)
(0, 0, 0, 311, 0.020222250161239057)
(0, 0, 0, 883, 0.026468089043741115)
(0, 0, 0, 788, 0.06194332482916992)
(0, 0, 0, 45, 0.030335656545422026)
(0, 0, 0, 103, 0.023944