Take synthetic rho data and attempt outlier detection against one other type of synthetic lightcurve (sine, flat, box function, inverted rho, etc.) that is an obvious outlier.

In [1]:
import os
import fnmatch
import csv
import numpy as np
import sys
sys.stdout.flush()
import math

from collections import Counter

from sklearn import tree


import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import segment_cluster as sc
import importlib
# Reload segment_cluster so that the current on-disk version of the module is
# used even if it was already imported earlier in this session.
importlib.reload(sc)


# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

In [12]:
# Both datasets are comma-separated text files read with the same options:
#   rho_file  -- the "normal" lightcurves
#   sine_file -- the "outlier" lightcurves
_loadtxt_options = {"delimiter": ','}
rho_file = np.loadtxt("synthetic_rhos.csv", **_loadtxt_options)
sine_file = np.loadtxt("synthetic_sines.csv", **_loadtxt_options)

In [13]:
# Hold out 25% of the rho lightcurves for validation. The row indices are
# split alongside the data so each subset can be traced back to its rows in
# rho_file. No random_state is passed, so the shuffle draws from NumPy's
# global random state.
rho_ids = list(range(len(rho_file)))
rho_train, rho_valid, rho_train_ids, rho_valid_ids = train_test_split(
    rho_file,
    rho_ids,
    test_size=0.25,
)

In [14]:
# Grid search over the number of clusters and the segment length.
#
# For every (k_cluster, seg_len) combination a KMeans model is fitted on the
# segments of the training rho lightcurves. Each held-out rho lightcurve and
# each sine lightcurve is then reconstructed with that model and the
# reconstruction error is recorded.
#
# Every row of reco_error (and of the saved CSV) is
#     (k_id, len_id, class_label, curve_index, rmse)
# where k_id / len_id are positions in k_clusters / seg_lens, class_label is
# RHO_LABEL or SINE_LABEL, and curve_index is the position of the lightcurve
# within its dataset (rho_valid or sine_file).
RHO_LABEL = 0   # held-out lightcurves of the training class
SINE_LABEL = 1  # lightcurves of the other class

reco_error = []
k_clusters = [10, 50, 100]
seg_lens = [4, 8, 12, 30, 50, 100]

for k_id, k_cluster in enumerate(k_clusters):
    for len_id, seg_len in enumerate(seg_lens):
        # Train the model: segment every training lightcurve and stack all
        # segments into a single array.
        # NOTE(review): seg_len is passed twice to sc.segmentation --
        # presumably as segment length and slide step; confirm against
        # segment_cluster.
        all_train_segments = np.vstack([
            sc.segmentation(train_curve, seg_len, seg_len, time_stamps=False)
            for train_curve in rho_train
        ])

        # Cluster the segments.
        cluster = KMeans(n_clusters=k_cluster, random_state=0)
        cluster.fit(all_train_segments)

        # Reconstruct the held-out rho lightcurves first, then the sine
        # lightcurves. Both go through exactly the same scoring code so the
        # two classes cannot drift apart.
        for class_label, curves in ((RHO_LABEL, rho_valid),
                                    (SINE_LABEL, sine_file)):
            for n_curve, curve in enumerate(curves):
                valid_segments = sc.segmentation(
                    curve, seg_len, seg_len, time_stamps=False
                )
                reco = sc.reconstruct(
                    valid_segments, curve, cluster,
                    rel_offset=False, seg_slide=seg_len,
                )
                # Root-mean-square error, leaving out the first and last
                # seg_len samples of the lightcurve.
                # NOTE: for a lightcurve with at most 2 * seg_len samples this
                # slice is empty and the error is nan.
                error = np.sqrt(np.mean(
                    (curve[seg_len:-seg_len] - reco[seg_len:-seg_len]) ** 2
                ))
                result = (k_id, len_id, class_label, n_curve, error)
                reco_error.append(result)
                # Flush so progress is visible while the long loop runs.
                print(result, flush=True)

# Persist all results once the whole grid has been evaluated.
reco_error_ar = np.array(reco_error)
np.savetxt("valid_results_20190509.csv", reco_error_ar, delimiter=",")

(0, 0, 0, 0, 342.62018245156406)
(0, 0, 1, 0, 249.31751821844298)
(0, 0, 1, 1, 258.8310251998609)
(0, 0, 1, 2, 250.06104385816252)
(0, 0, 1, 3, 250.00189319329803)
(0, 0, 1, 4, 270.5024688389461)
(0, 0, 1, 5, 271.9493939842887)
(0, 0, 1, 6, 265.2026089815642)
(0, 0, 1, 7, 244.29957578073484)
(0, 0, 1, 8, 254.63831648860136)
(0, 0, 1, 9, 269.2810516376172)
(0, 0, 1, 10, 280.14217215074575)
(0, 0, 1, 11, 260.25812395506733)
(0, 0, 1, 12, 277.04883278912104)
(0, 0, 1, 13, 276.02605229956515)
(0, 0, 1, 14, 326.17681567676846)
(0, 0, 1, 15, 286.196465055262)
(0, 0, 1, 16, 259.10665208491974)
(0, 0, 1, 17, 264.96316279114774)
(0, 0, 1, 18, 262.2468296239352)
(0, 0, 1, 19, 254.3397899314815)
(0, 0, 1, 20, 278.16099431033615)
(0, 0, 1, 21, 267.6540352540229)
(0, 0, 1, 22, 278.1974469668898)
(0, 0, 1, 23, 280.79804548983503)
(0, 0, 1, 24, 288.97859655848345)
(0, 0, 1, 25, 255.588521069423)
(0, 0, 1, 26, 310.9912377604033)
(0, 0, 1, 27, 270.1529240273794)
(0, 0, 1, 28, 291.5751898330347)
(0, 0, 

KeyboardInterrupt: 