In [1]:
################################
# Scientific imports
################################
import gc
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from astropy.io import fits
from astroquery.mast import Observations
from astroquery.mast import Catalogs
from astropy import units as u
from astropy.timeseries import BoxLeastSquares
from astropy.timeseries import TimeSeries
from astropy.stats import sigma_clipped_stats

################################
# General imports
################################
import csv, math, io, os, os.path, sys, random, time, json, statistics
import pandas as pd
import seaborn as sb
from tqdm.notebook import tqdm, trange

################################
# SciKitLearn Imports
################################
import sklearn
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import SpectralClustering
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

################################
# SKTime Imports
################################
from sktime.classification.kernel_based import Arsenal
from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.classification.dictionary_based import ContractableBOSS
from sktime.classification.interval_based import DrCIF
from sktime.classification.hybrid import HIVECOTEV1
from sktime.classification.dictionary_based import IndividualBOSS
from sktime.classification.dictionary_based import IndividualTDE
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.feature_based import MatrixProfileClassifier
from sktime.classification.dictionary_based import MUSE
from sktime.classification.interval_based import RandomIntervalSpectralForest
from sktime.classification.distance_based import ShapeDTW
from sktime.classification.feature_based import SignatureClassifier
from sktime.classification.interval_based import SupervisedTimeSeriesForest
#from sktime.classification.feature_based import TSFreshClassifier
from sktime.classification.dictionary_based import WEASEL

from scipy.signal import butter,filtfilt

from IPython.display import display

################################
# MatPlotLib Settings
################################
# Large default figure size and seaborn's default theme for every plot in the notebook.
plt.rcParams["figure.figsize"] = (20,9)
sb.set()
# xNaNs: used below as the list of sample positions to overwrite with the median (see FIXNAN / FilterMyData).
# xTime: used below by GetNumDays, which takes last element minus first element as the observation span.
xNaNs = np.load("X_NAN_LIST.npy")
xTime = np.load("X_TIME_LIST.npy")

In [33]:
# sktime classifier classes (not instances) to search over.
# Entries that are commented out are excluded from every grid built from this list.
list_of_classifiers = [
        Arsenal,
        CanonicalIntervalForest,
        ContractableBOSS,
        DrCIF,
        IndividualBOSS,
        IndividualTDE,
        MUSE,
        #MatrixProfileClassifier,
        RandomIntervalSpectralForest,
        #ShapeDTW,
        SignatureClassifier,
        #SupervisedTimeSeriesForest,
        #TSFreshClassifier,
        WEASEL
    ]

# One slot per classifier, keyed by class name, in the order of list_of_classifiers.
algorithm_grid = {key.__name__: None for key in list_of_classifiers}

# Candidate hyper-parameter values for each classifier.
algorithm_grid.update({
    'Arsenal': {'num_kernels':  [1000,1500,2000,2500,3000], 'n_estimators': [15, 20, 25, 30, 35]},
    'CanonicalIntervalForest': {'n_estimators': [50, 200, 250, 300, 800], 'base_estimator': ['CIT', 'DTC']},
    'ContractableBOSS': {'n_parameter_samples':  [50, 200, 250, 300, 800], 'max_ensemble_size': [10, 45, 50, 55, 100]},
    'DrCIF': {'n_estimators':  [10, 150, 200, 250, 800], 'base_estimator': ['CIT', 'DTC']},
    'IndividualBOSS': {'window_size':  [10, 25, 50, 75, 100], 'alphabet_size': [2, 3, 4, 5, 6]},
    'IndividualTDE': {'window_size':  [10, 25, 50, 75, 100], 'alphabet_size': [2, 3, 4, 5, 6]},
    'MUSE': {'anova':  [True, False], 'window_inc': [2, 3, 4, 5, 6]},
    'RandomIntervalSpectralForest': {'n_estimators':  [10, 150, 200, 250, 800], 'min_interval': [4, 8, 16, 32, 64]},
    'SignatureClassifier': { 'n_estimators': [50, 200, 250, 300, 800]},
    'WEASEL': {'anova':  [True, False], 'window_inc': [2, 3, 4, 5, 6]},
})

# Flatten everything into a single dict whose keys are '<lower-cased class name>__<parameter>'.
param_grid = {}

for classifier in list_of_classifiers:
    cname = classifier.__name__
    prefix = cname.lower()+"__"

    for kwargs in algorithm_grid[cname]:
        param_grid[f"{prefix}{kwargs}"] = algorithm_grid[cname][kwargs]

param_grid

{'arsenal__num_kernels': [1000, 1500, 2000, 2500, 3000],
 'arsenal__n_estimators': [15, 20, 25, 30, 35],
 'canonicalintervalforest__n_estimators': [50, 200, 250, 300, 800],
 'canonicalintervalforest__base_estimator': ['CIT', 'DTC'],
 'contractableboss__n_parameter_samples': [50, 200, 250, 300, 800],
 'contractableboss__max_ensemble_size': [10, 45, 50, 55, 100],
 'drcif__n_estimators': [10, 150, 200, 250, 800],
 'drcif__base_estimator': ['CIT', 'DTC'],
 'individualboss__window_size': [10, 25, 50, 75, 100],
 'individualboss__alphabet_size': [2, 3, 4, 5, 6],
 'individualtde__window_size': [10, 25, 50, 75, 100],
 'individualtde__alphabet_size': [2, 3, 4, 5, 6],
 'muse__anova': [True, False],
 'muse__window_inc': [2, 3, 4, 5, 6],
 'randomintervalspectralforest__n_estimators': [10, 150, 200, 250, 800],
 'randomintervalspectralforest__min_interval': [4, 8, 16, 32, 64],
 'signatureclassifier__n_estimators': [50, 200, 250, 300, 800],
 'weasel__anova': [True, False],
 'weasel__window_inc': [2, 3

In [None]:
# Load the saved array and keep each entry without its first and last element.
masterX = [x[1:-1] for x in np.load("None_Or_One_Exoplanet.npy")]

################################

def Normal(masterX):
    """
    Normalise every light curve (LC) by its own median, in place.
    Vars:

    masterX:  Sequence of LCs; each LC must be a list or 1-D np array.

    Each LC is overwritten with LC / median(LC). The same 'masterX' object
    that was passed in is returned, with one normalised LC per input LC.
    """
    for X in masterX:
        # nanmedian gives the same value as median when the LC has no NaNs, but
        # does not turn into NaN (and wipe out the whole LC) when some are present
        median = np.nanmedian(X)
        # One vectorised division instead of a Python-level loop over every sample
        X[:] = np.asarray(X) / median
    return masterX

################################

def FilterMyDataLOOPED(masterX,cutoff=0.00005,order=2,nanList=xNaNs):
    
    """
    Function to apply a Butterworth low-pass filter to every timeseries in 'masterX', in place.
    Vars:
    
    masterX:  Sequence of timeseries. Each one must be a list or np array.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Order of the Butterworth filter, passed straight to scipy.signal.butter.
    nanList:  Sample positions that are overwritten with the series' median before filtering.

    Returns the same 'masterX' object, with every series replaced by its filtered version.
    """
    
    # The observation period is the same for every series, so work it out once
    numdays       = GetNumDays()
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    
    # The filter coefficients only depend on the series length, so cache them per length
    # instead of redesigning the same filter for every series
    coefficients  = {}
    
    for x in masterX:
        
        nSamples      = len(x)
        
        # Fill the flagged positions with the median of the series.
        # A separate array is used so that 'x' keeps pointing at the entry of masterX
        # (rebinding 'x' here would mean the result is never written back).
        xMedian       = np.nanmedian(x)
        filled        = np.array(x, dtype=float)
        filled[np.isin(np.arange(nSamples), nanList)] = xMedian
        
        if nSamples not in coefficients:
            freq          = nSamples/sec       # Frequency, in Hz, ie number of observations per second
            nyq           = 0.5 * freq
            normal_cutoff = cutoff / nyq
            coefficients[nSamples] = butter(order, normal_cutoff, btype='low', analog=False)
        b, a          = coefficients[nSamples]
        
        # Write the filtered samples back into the original series
        x[:]          = filtfilt(b, a, filled)
    
    return (masterX)

################################

def FilterAllMyData(masterX,cutoff=0.00005,order=2,nanList=xNaNs):
    """
    Run FilterMyData on every light curve (LC) in 'masterX', in place.
    Vars:

    masterX:  Sequence of LCs. Each one must be a list or np array.
    cutoff:   Cutoff frequency, forwarded to FilterMyData.
    order:    Filter order, forwarded to FilterMyData.
    nanList:  Sample positions to fill with the median, forwarded to FilterMyData.

    Returns the same 'masterX' object, with each LC filtered.
    """
    
    for X in masterX:
        # Forward the caller's nanList; the global xNaNs is only the default value
        X[:] = FilterMyData(X,cutoff,order,nanList)
    
    return masterX
    
################################

def FilterMyData(x,cutoff=0.00005,order=2,xNaNs=xNaNs):
    
    """
    Function to apply a Butterworth low-pass filter to one timeseries.
    Vars:
    
    x:        The timeseries. Must be list or np array. It is not modified.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Order of the Butterworth filter, passed straight to scipy.signal.butter.
    xNaNs:    Sample positions that are overwritten with the series' median before filtering.

    Returns a new np array holding the filtered series.
    """
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays       = GetNumDays()
    
    # Next, fix data: overwrite the flagged positions with the median of the series.
    # nanmedian equals median when there are no NaNs, and stays finite when there are some.
    xMedian       = np.nanmedian(x)
    filled        = np.array(x, dtype=float)
    # isin builds the whole "position is flagged" mask at once, instead of scanning xNaNs for every sample
    filled[np.isin(np.arange(len(filled)), xNaNs)] = xMedian
    
    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = len(filled)/sec    # Frequency, in Hz, ie number of observations per second
    
    # Butter Lowpass Filter
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    b, a          = butter(order, normal_cutoff, btype='low', analog=False)
    newX          = filtfilt(b, a, filled)
    
    # Finally, return the filtered values
    return (newX)

################################

def GetNumDays(time=xTime):
    """
    Length of the observation window: last element of 'time' minus its first element.

    The result is in whatever unit 'time' uses; the filter functions in this notebook treat it as days.
    """
    first, last = time[0], time[-1]
    return last - first

################################

def FIXNAN(masterX, nanList=xNaNs):
    """
    Overwrite the samples at the 'nanList' positions of every light curve (LC) with that LC's median, in place.
    Vars:

    masterX:  Sequence of LCs. Each one must be a list or np array.
    nanList:  Sample positions to overwrite. Positions outside an LC are ignored.

    Returns the same 'masterX' object that was passed in.
    """
    for X in masterX:
        # nanmedian equals median when the LC has no NaNs; with NaNs present a plain
        # median would itself be NaN, and the "fix" would just write NaN back
        XMedian = np.nanmedian(X)
        fixed = np.array(X, dtype=float)
        # Build the "position is flagged" mask in one go instead of scanning nanList for every sample
        fixed[np.isin(np.arange(len(fixed)), nanList)] = XMedian
        X[:] = fixed
    return masterX

################################

def Every_Nth_Value_EACH(y,nth=40):
    """Down-sample one series: keep its first element and then every nth-th element after it."""
    stride = slice(None, None, nth)
    return y[stride]

################################

def Every_Nth_Value(masterX,nth=40):
    """
    Down-sample every light curve (LC) by keeping every nth-th sample.
    Vars:

    masterX:  Sequence of LCs, all of the same length.
    nth:      Step between the samples that are kept (the first sample is always kept).

    Returns a new 2-D float np array with one row per LC. The input is not modified.
    An empty 'masterX' gives an empty (0, 0) array.
    """
    biglen = len(masterX)
    if biglen == 0:
        # Nothing to down-sample, and there is no first LC to take the row length from
        return np.zeros((0, 0))
    
    # Row length after down-sampling, taken from the first LC
    newlen = len(masterX[0][::nth])
    
    tmp = np.zeros((biglen,newlen))
    
    for n,X in enumerate(masterX):
        # Use the caller's nth here as well (same slice as Every_Nth_Value_EACH(X, nth));
        # relying on the default of 40 fills the rows with the wrong samples whenever nth != 40
        tmp[n] = X[::nth]
    
    return tmp

################################

In [None]:
# Work on every 10th light curve, and plot one of them (index lcnum) as a quick check.
# NOTE: slicing the masterX list is shallow - testX holds the same LC objects, so the
# in-place helpers (Normal, FIXNAN, FilterAllMyData) also change those entries of masterX.
lcnum = 0
testX = masterX[::10]
plt.plot(testX[lcnum])

In [None]:
# PIPE 1

%timeit tidyX = Normal(FIXNAN(testX))
plt.plot(tidyX[lcnum])

In [None]:
# Low-pass filter every LC in tidyX, then plot one filtered LC.
# FilterAllMyData works in place and returns its argument, so filtX is the same object as tidyX.
filtX = FilterAllMyData(tidyX)
plt.plot(filtX[lcnum])

In [None]:
# Down-sample every filtered LC (every 40th sample by default), then plot one of them.
# The function defined in this notebook is Every_Nth_Value; there is no Every_Nth_Value_ALL.
smallX = Every_Nth_Value(filtX)
plt.plot(smallX[lcnum])

In [None]:
# Wrap each pre-processing function as a scikit-learn transformer.
# No extra arguments are supplied, so every function runs with its defaults (e.g. nth=40 for Every_Nth_Value).
norm = FunctionTransformer(Normal)
fnan = FunctionTransformer(FIXNAN)
filt = FunctionTransformer(FilterAllMyData)
enth = FunctionTransformer(Every_Nth_Value)

# Chain the steps in the order they are applied.
# NOTE(review): this normalises before filling the flagged samples, whereas the "PIPE 1" cell
# applies FIXNAN first and Normal second - confirm which order is intended.
pipe = Pipeline(steps=[['normalise',norm],['fixnan',fnan],['filter',filt],['everynth',enth]])

In [None]:
# Display the pipeline built above.
pipe

In [None]:
# First 100 light curves, as a small sample to experiment on.
tmpX = masterX[:100]

In [None]:
# Show the first light curve of the sample.
tmpX[0]

In [None]:
# Down-sample the sample with the default step (nth=40).
Z = Every_Nth_Value(tmpX)

In [None]:
# (number of light curves, samples per light curve) after down-sampling.
len(Z), len(Z[0])

In [None]:
# Number of light curves.
len(masterX)

In [None]:
# Number of samples in the first light curve.
len(masterX[0])

In [None]:
# Count the light curves that have exactly 20338 samples.
c = sum(1 for n in masterX if len(n) == 20338)
print(c)

In [None]:
# Plot the first light curve.
plt.plot(masterX[0])

In [None]:
# Fill the flagged samples of every light curve.
# FIXNAN works in place and returns its argument, so fixedX is the same object as masterX.
fixedX = FIXNAN(masterX)

In [None]:
# Print a progress marker roughly every 1% of the light curves.
# The step is worked out first: '%' and '//' have the same precedence and group left to right,
# so "(i+1) % len(masterX)//100" means "((i+1) % len(masterX)) // 100", not a 1% step.
step = max(1, len(masterX)//100)   # at least 1, so short lists do not divide by zero
for i in range(len(masterX)):
    if (i+1) % step == 0:
        print(i+1)

In [8]:
# Look at the search space of a single classifier.
ars = algorithm_grid['Arsenal']
ars

{'num_kernels': [1000, 1500, 2000, 2500, 3000],
 'n_estimators': [15, 20, 25, 30, 35]}

In [67]:
# Build one parameter grid per classifier, with keys in '<lower-cased class name>__<parameter>' form.
# Every classifier starts with a placeholder value that is replaced inside the loop below.
new_algorithm_grid = {key.__name__: {None} for key in list_of_classifiers}

for c in new_algorithm_grid:
    print(c)
    pref = c.lower()+"__"
    newvals = {}
    #print(algorithm_grid[c])
    for subkey, subvals in algorithm_grid[c].items():
        
        # Prefixed name for this parameter
        pipe_name = pref+subkey
        
        print(pipe_name,"->",subvals)
        #print(new_algorithm_grid[c])
        newvals[pipe_name] = subvals
    
    print(newvals,'\n')
    # Only the value of an existing key is replaced, so the dict being iterated keeps its size
    new_algorithm_grid[c] = newvals

Arsenal
arsenal__num_kernels -> [1000, 1500, 2000, 2500, 3000]
arsenal__n_estimators -> [15, 20, 25, 30, 35]
{'arsenal__num_kernels': [1000, 1500, 2000, 2500, 3000], 'arsenal__n_estimators': [15, 20, 25, 30, 35]} 

CanonicalIntervalForest
canonicalintervalforest__n_estimators -> [50, 200, 250, 300, 800]
canonicalintervalforest__base_estimator -> ['CIT', 'DTC']
{'canonicalintervalforest__n_estimators': [50, 200, 250, 300, 800], 'canonicalintervalforest__base_estimator': ['CIT', 'DTC']} 

ContractableBOSS
contractableboss__n_parameter_samples -> [50, 200, 250, 300, 800]
contractableboss__max_ensemble_size -> [10, 45, 50, 55, 100]
{'contractableboss__n_parameter_samples': [50, 200, 250, 300, 800], 'contractableboss__max_ensemble_size': [10, 45, 50, 55, 100]} 

DrCIF
drcif__n_estimators -> [10, 150, 200, 250, 800]
drcif__base_estimator -> ['CIT', 'DTC']
{'drcif__n_estimators': [10, 150, 200, 250, 800], 'drcif__base_estimator': ['CIT', 'DTC']} 

IndividualBOSS
individualboss__window_size ->

In [68]:
# Display the per-classifier grids built in the previous cell.
new_algorithm_grid

{'Arsenal': {'arsenal__num_kernels': [1000, 1500, 2000, 2500, 3000],
  'arsenal__n_estimators': [15, 20, 25, 30, 35]},
 'CanonicalIntervalForest': {'canonicalintervalforest__n_estimators': [50,
   200,
   250,
   300,
   800],
  'canonicalintervalforest__base_estimator': ['CIT', 'DTC']},
 'ContractableBOSS': {'contractableboss__n_parameter_samples': [50,
   200,
   250,
   300,
   800],
  'contractableboss__max_ensemble_size': [10, 45, 50, 55, 100]},
 'DrCIF': {'drcif__n_estimators': [10, 150, 200, 250, 800],
  'drcif__base_estimator': ['CIT', 'DTC']},
 'IndividualBOSS': {'individualboss__window_size': [10, 25, 50, 75, 100],
  'individualboss__alphabet_size': [2, 3, 4, 5, 6]},
 'IndividualTDE': {'individualtde__window_size': [10, 25, 50, 75, 100],
  'individualtde__alphabet_size': [2, 3, 4, 5, 6]},
 'MUSE': {'muse__anova': [True, False], 'muse__window_inc': [2, 3, 4, 5, 6]},
 'RandomIntervalSpectralForest': {'randomintervalspectralforest__n_estimators': [10,
   150,
   200,
   250,
   

In [70]:
# Per-classifier parameter grids written out by hand: the outer key is the classifier's class name,
# the inner keys are '<lower-cased class name>__<parameter>' and the values are the candidates to try.
new_param_grid = {

    # Algorithm: {
    #   'algorithm__key1': [vals],
    #   'algorithm__key2': [vals],
    # },

'Arsenal':
        {'arsenal__num_kernels': [1000, 1500, 2000, 2500, 3000], 'arsenal__n_estimators': [15, 20, 25, 30, 35]},
 'CanonicalIntervalForest':
        {'canonicalintervalforest__n_estimators': [50, 200, 250, 300, 800], 'canonicalintervalforest__base_estimator': ['CIT', 'DTC']},
 'ContractableBOSS':
        {'contractableboss__n_parameter_samples': [50, 200, 250, 300, 800], 'contractableboss__max_ensemble_size': [10, 45, 50, 55, 100] },
 'DrCIF':
        {'drcif__n_estimators': [10, 150, 200, 250, 800], 'drcif__base_estimator': ['CIT', 'DTC']},
 'IndividualBOSS':
        {'individualboss__window_size': [10, 25, 50, 75, 100], 'individualboss__alphabet_size': [2, 3, 4, 5, 6]},
 'IndividualTDE':
        {'individualtde__window_size': [10, 25, 50, 75, 100], 'individualtde__alphabet_size': [2, 3, 4, 5, 6]},
 'MUSE':
        {'muse__anova': [True, False], 'muse__window_inc': [2, 3, 4, 5, 6]},
 'RandomIntervalSpectralForest':
        {'randomintervalspectralforest__n_estimators': [10, 150, 200, 250, 800],  'randomintervalspectralforest__min_interval': [4, 8, 16, 32, 64]},
 'SignatureClassifier':
        {'signatureclassifier__n_estimators': [50, 200, 250, 300, 800]},
 'WEASEL':
        {'weasel__anova': [True, False], 'weasel__window_inc': [2, 3, 4, 5, 6]}
}

In [73]:
# Check that every classifier in list_of_classifiers has an entry in new_param_grid, and show it.
for classifier in list_of_classifiers:
    cname = classifier.__name__
    print(new_param_grid[cname])

{'arsenal__num_kernels': [1000, 1500, 2000, 2500, 3000], 'arsenal__n_estimators': [15, 20, 25, 30, 35]}
{'canonicalintervalforest__n_estimators': [50, 200, 250, 300, 800], 'canonicalintervalforest__base_estimator': ['CIT', 'DTC']}
{'contractableboss__n_parameter_samples': [50, 200, 250, 300, 800], 'contractableboss__max_ensemble_size': [10, 45, 50, 55, 100]}
{'drcif__n_estimators': [10, 150, 200, 250, 800], 'drcif__base_estimator': ['CIT', 'DTC']}
{'individualboss__window_size': [10, 25, 50, 75, 100], 'individualboss__alphabet_size': [2, 3, 4, 5, 6]}
{'individualtde__window_size': [10, 25, 50, 75, 100], 'individualtde__alphabet_size': [2, 3, 4, 5, 6]}
{'muse__anova': [True, False], 'muse__window_inc': [2, 3, 4, 5, 6]}
{'randomintervalspectralforest__n_estimators': [10, 150, 200, 250, 800], 'randomintervalspectralforest__min_interval': [4, 8, 16, 32, 64]}
{'signatureclassifier__n_estimators': [50, 200, 250, 300, 800]}
{'weasel__anova': [True, False], 'weasel__window_inc': [2, 3, 4, 5, 6