In [56]:
#!/usr/bin/env python3

################################
# General Imports
################################
import csv, math, io, os, os.path, sys, random, time, json, gc, glob, statistics
from datetime import datetime
import joblib
from joblib import Parallel, delayed, dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

################################
# Scientific Imports
################################
import scipy
from scipy.signal import butter,filtfilt

################################
# SKLearn Imports
################################
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

################################
# SKTime Imports
################################
from sktime.datatypes._panel._convert import from_2d_array_to_nested, from_nested_to_2d_array, is_nested_dataframe, from_nested_to_multi_index
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.forecasting.model_selection import ForecastingGridSearchCV

from sktime.classification.kernel_based import Arsenal
from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.classification.dictionary_based import ContractableBOSS
from sktime.classification.interval_based import DrCIF
from sktime.classification.hybrid import HIVECOTEV1
from sktime.classification.dictionary_based import IndividualBOSS
from sktime.classification.dictionary_based import IndividualTDE
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.feature_based import MatrixProfileClassifier
from sktime.classification.dictionary_based import MUSE
from sktime.classification.interval_based import RandomIntervalSpectralForest
from sktime.classification.distance_based import ShapeDTW
from sktime.classification.feature_based import SignatureClassifier
from sktime.classification.interval_based import SupervisedTimeSeriesForest
from sktime.classification.feature_based import TSFreshClassifier
from sktime.classification.dictionary_based import WEASEL

################################
# Suppress Warnings
################################
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

################################
# Initialisers
################################
default_rc_params = (16,9)
plt.rcParams["figure.figsize"] = default_rc_params
sb.set()

xNaNs = np.load("X_NAN_LIST.npy")
xTime = np.load("X_TIME_LIST.npy")

In [95]:
masterX = np.asarray([x[1:-1] for x in np.load("None_Or_One_Exoplanet.npy")])
masterY = np.load("None_Or_One_isplanetlist.npy")

In [152]:
################################
# Functions
################################

def Every_Nth_Value(y,nth=40):
    
    newY = np.zeros(len(y))
    
    print("Every Nth Value")
    
    for row in y:
        #row[:] = row[::nth]
        row = np.delete(row, np.arange(None, None, nth))
    
    return y

################################

def Every_Nth_ValueXY(x,y,n=40):
    return (Every_Nth_Value(x,nth=n), Every_Nth_Value(y,nth=n))

################################

def EveryNthNested(y,nth=40):
    print("Every Nth Value (NESTED)")
    
    if type(y) is not np.ndarray:
        print(f"Datatype {type(y)} found! Processing...")
        # De-nest
        y_ = from_nested_to_2d_array(y,True)

        # Subsample
        y_ = y[::nth]

        # Re-nest
        y_ = from_2d_array_to_nested(y_)
    else:
        print("np array detected; returning as-is")
        y_ = y
    # Return
    print("Returning")
    return y_

################################

def GetNumDays(time=xTime):
    
    #xTime = np.load("X_TIME_LIST.npy")
    nDays = time[-1]-time[0]
    
    return (nDays)

################################

def FilterMyData(x,cutoff=0.00005,order=2):
    
    """
    Function to apply a Butter Filter to timeseries.
    Vars:
    
    y:        The timeseries. Must be list or np array.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Approximation via polynomial of the order'th degree (2=quadratic, 3=cubic, 4=quartic, etc)
    """
    
    # DATA VALIDATION
    # Flag
    isNested = False
    
    print(f"INITIAL shape of x is {x.shape}; INITIAL type of x is {type(x)}")
    
    # Check to see if x is a nested dataframe or not
    if type(x) == pd.core.frame.DataFrame:
        isNested = True
        print("NESTED DATAFRAME FOUND! UNPACKING...")
        x = from_nested_to_2d_array(x,True)
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays       = GetNumDays()
    
    # Next, fix data                           
    xMedian       = np.median(x)                                                    # Get the median value of 'x' before changing it
    x             = [xMedian if n in xNaNs else item for n,item in enumerate(x)]    # Change all the missing values to the median value of the whole array
    
    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = len(x)/sec         # Frequency, in Hz, ie number of observations per second
    # FREQ IS APPROX 1/120 OR ~0.008333333
    
    # Butter Lowpass Filter
    #polynomOrder  = order
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    #b, a          = butter(polynomOrder, normal_cutoff, btype='low', analog=False)
    b, a          = butter(order, normal_cutoff, btype='low', analog=False)
    
    newX          = np.array(filtfilt(b, a, x))
    
    print(f"newX generated; curr shape is {newX.shape} and current dtype is {newX.dtype}")
    
    if isNested == True:
        newX = np.vstack(newX[:,]).astype('float32')   # <-- This apparently fixes some of the issues I've been having?
        # see: https://stackoverflow.com/questions/19459017/how-to-convert-a-numpy-2d-array-with-object-dtype-to-a-regular-2d-array-of-float
        
        newX = from_2d_array_to_nested(newX)
    
    print(f"Returning Values; final shape is {newX.shape} and final type is {type(newX)}")
    
    # Finally, return the new X and Y values
    return (newX)

################################

def FilterMyUnNestedData(x,cutoff=0.00005,order=2):
    
    """
    Function to apply a Butter Filter to timeseries.
    Vars:
    
    y:        The timeseries. Must be list or np array.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Approximation via polynomial of the order'th degree (2=quadratic, 3=cubic, 4=quartic, etc)

    """
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays = GetNumDays()
    sec     = numdays*24*60*60   # Number of seconds in the overall observation period
    rowLen  = len(x[0])          # Technically bad practice, bu since we know every row has same length, it's okay to do outside func
    freq    = rowLen/sec         # Frequency, in Hz, ie number of observations per second
    
    # Butter Lowpass Filter
    #polynomOrder  = order
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    
    for row in x:

        # Next, fix data                           
        ####rowMedian = np.median(row)                                                    # Get the median value of 'x' before changing it
        ####print(f"Median is {rowMedian}")
        ####row[...]  = [rowMedian if n in xNaNs else item for n,item in enumerate(row)]    # Change all the missing vals to median of the whole row
        row       = FIXNAN(row)
        
        b, a      = butter(order, normal_cutoff, btype='low', analog=False)
        row       = np.array(filtfilt(b, a, x))
    
    # Finally, return the new X and Y values
    return (x)

################################

def Normalise(X,fixnan=True):
    # First of all, decide if wan to Fix all the 0s / NaNs
    if fixnan:
        X = FIXNANNEW(X)

    median = np.median(X)

    #print(f"OldNormal median = {median}")

    X[:] = [(number/median) for number in X]
    return X

################################

def FIXNANNEW(y, nanList=xNaNs):
    
    print(f"Array has [{len(y)}] elements, with each element having [{len(y[0])}] sub-elements")
    
    for row in y:
        m = np.median(row)
        #print(f"median = {m}")
        row[...] = [m if n in nanList else item for n,item in enumerate(row)]
    
    return y

################################

def FIXNAN(y, nanList=xNaNs):
    yMedian = np.median(y)
    y = [yMedian if n in nanList else item for n,item in enumerate(y)]
    return y

################################

def ConvertDataToNestedDF(X):
    print(f"Shape of X is {X.shape}")
    x = from_2d_array_to_nested(X)
    return x

################################

def OUTPUT(inp):
    print(f"Number of inputs: {len(inp)}")
    return(inp)

################################

OUT = FunctionTransformer(OUTPUT)
flt = FunctionTransformer(FilterMyUnNestedData)
nth = FunctionTransformer(Every_Nth_Value)
nthNest = FunctionTransformer(EveryNthNested)
nrm = FunctionTransformer(Normalise)

#cnv = FunctionTransformer(ConvertDataToNestedDF)

#pipe = Pipeline(steps=[['filter',flt],['everynth',nth],['convert_to_nested',cnv],['drcif',DrCIF()]])
pipe = Pipeline(steps=[('filter',flt),('everynth',nth),('normalise', nrm),('algorithm',DrCIF())])
#pipe = Pipeline(steps=[['output', OUT],['everynth',nthNest],['drcif',DrCIF()]])

In [157]:
pipe.set_params(filter__kw_args = {'cutoff': 0.00001})

Pipeline(steps=[('filter',
                 FunctionTransformer(func=<function FilterMyUnNestedData at 0x7ff1483cb940>,
                                     kw_args={'cutoff': 1e-05})),
                ('everynth',
                 FunctionTransformer(func=<function Every_Nth_Value at 0x7ff1483cb040>)),
                ('normalise',
                 FunctionTransformer(func=<function Normalise at 0x7ff1483cb9d0>)),
                ('algorithm', DrCIF())])

In [105]:
useNested = False

if useNested == True:
    print("Nesting...")
    X_nested = from_2d_array_to_nested(masterX)
    Xtrain, Xtest, ytrain, ytest  = train_test_split(X_nested, masterY, random_state=42)
    
else:
    print("Not Nesting...")
    Xtrain, Xtest, ytrain, ytest  = train_test_split(masterX, masterY, random_state=42)
    miniXtrain, miniXtest, miniytrain, miniytest  = train_test_split(masterX[::100], masterY[::100], random_state=42)

Not Nesting...


In [130]:
len(miniXtrain[::40])

2

In [148]:
Q = masterX[:4].copy()
len(Q)

4

In [149]:
QQ = Every_Nth_Value(Q)
len(QQ[0])

Every Nth Value


TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'

In [158]:
pipe.fit(miniXtrain, miniytrain)

Every Nth Value


TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'

## Using Every_Nth on a nested dataframe removes every nth COLUMN, not every nth DATAPOINT;
### This is my attemot to fix!

In [25]:
Xtrain

array([[2.6676536e+03, 2.6655479e+03, 2.6676201e+03, ..., 2.6671768e+03,
        2.6650269e+03, 2.6726589e+03],
       [6.4500504e+01, 6.0147106e+01, 6.4060493e+01, ..., 6.0992054e+01,
        6.1895298e+01, 6.5518791e+01],
       [1.3567809e+04, 1.3572647e+04, 1.3554360e+04, ..., 1.3562414e+04,
        1.3607798e+04, 1.3531218e+04],
       ...,
       [5.9423371e+04, 5.9394555e+04, 5.9418496e+04, ..., 5.9405496e+04,
        5.9440094e+04, 5.9456051e+04],
       [2.3478081e+05, 2.3455773e+05, 2.3558998e+05, ..., 2.3511486e+05,
        2.3510678e+05, 2.3480664e+05],
       [4.7179578e+04, 4.7236531e+04, 4.7258133e+04, ..., 4.7205141e+04,
        4.7223762e+04, 4.7193320e+04]], dtype=float32)

In [33]:
X_ = Every_Nth_Value(Xtrain)
X_

Every Nth Value


Unnamed: 0,0
8816,0 2667.653564 1 2665.547852 2 ...
26,0 10155.880859 1 10100.121094 2 ...
8810,0 275.919403 1 270.289001 2 ...
4302,0 15592.483398 1 15585.524414 2 ...
6144,0 67675.625000 1 67521.695312 2 ...
...,...
7777,0 23350.113281 1 23359.029297 2 ...
5575,0 64565.648438 1 64446.183594 2 ...
7385,0 55137.886719 1 55187.375000 2 ...
6439,0 28503.615234 1 28385.777344 2 ...


Note: 172 ROWS instead of 6865 ROWS; each row still has same number of datapoints

Current theory:
* de-nest
* remove every nth column
* re-nest

In [5]:
def EveryNthNested(y,nth=40):
    # De-nest
    y_ = from_nested_to_2d_array(y,True)
    
    # Subsample
    y_ = y_[:,::nth]
    
    # Re-nest
    y_ = from_2d_array_to_nested(y_)
    
    # Return
    return y_

In [7]:
X__ = EveryNthNested(Xtrain)
X__

Unnamed: 0,0
0,0 2667.653564 1 2666.842529 2 2...
1,0 64.500504 1 66.167274 2 65.67...
2,0 13567.808594 1 13527.958008 2 ...
3,0 16274.025391 1 16289.065430 2 ...
4,0 70679.109375 1 70648.226562 2 ...
...,...
6860,0 108310.476562 1 107993.578125 2 ...
6861,0 242.809174 1 246.112396 2 239...
6862,0 59423.371094 1 59436.429688 2 ...
6863,0 234780.812500 1 236407.375000 2 ...


In [117]:
test2 = masterX[:6]
test2 = test2[:,::40]
test2

array([[5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477],
       [5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477],
       [5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477],
       [5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477],
       [5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477],
       [5934.8354, 6033.544 , 5957.0825, ..., 5938.8906, 6016.2983,
        5977.3477]], dtype=float32)

In [118]:
len(test2[0])

509

In [56]:
test = np.linspace(0,100,100,False)
print(len(test))
[k for i, k in enumerate(test) if i % 40 == 0]

100


[0.0, 40.0, 80.0]

In [142]:
xtr = FilterMyData(Xtrain)
#xtr = xtr.astype('float32')
xtr

INITIAL shape of x is (6865, 20338); INITIAL type of x is <class 'numpy.ndarray'>
newX generated; curr shape is (6865,) and current dtype is object


array([array([49781.81 , 49781.574, 49794.18 , ..., 49749.12 , 49668.13 ,
              49753.5  ], dtype=float32)                                 ,
       array([56270.883, 56270.332, 56284.406, ..., 56234.746, 56143.965,
              56239.133], dtype=float32)                                 ,
       array([63001.887, 63001.035, 63016.625, ..., 62962.152, 62861.027,
              62966.625], dtype=float32)                                 ,
       ...,
       array([36622.816, 36447.844, 36114.363, ..., 33483.43 , 33472.086,
              33623.75 ], dtype=float32)                                 ,
       array([35169.773, 34995.96 , 34661.74 , ..., 32110.855, 32098.975,
              32250.125], dtype=float32)                                 ,
       array([33710.098, 33538.055, 33204.152, ..., 30737.941, 30725.61 ,
              30875.78 ], dtype=float32)                                 ],
      dtype=object)

In [143]:
float_arr = np.vstack(xtr[:,]).astype('float32')
print(float_arr.dtype)
float_arr

float32


array([[49781.81 , 49781.574, 49794.18 , ..., 49749.12 , 49668.13 ,
        49753.5  ],
       [56270.883, 56270.332, 56284.406, ..., 56234.746, 56143.965,
        56239.133],
       [63001.887, 63001.035, 63016.625, ..., 62962.152, 62861.027,
        62966.625],
       ...,
       [36622.816, 36447.844, 36114.363, ..., 33483.43 , 33472.086,
        33623.75 ],
       [35169.773, 34995.96 , 34661.74 , ..., 32110.855, 32098.975,
        32250.125],
       [33710.098, 33538.055, 33204.152, ..., 30737.941, 30725.61 ,
        30875.78 ]], dtype=float32)

In [144]:
xf = from_2d_array_to_nested(float_arr)

In [145]:
xf

Unnamed: 0,0
0,0 49781.808594 1 49781.574219 2 ...
1,0 56270.882812 1 56270.332031 2 ...
2,0 63001.886719 1 63001.035156 2 ...
3,0 69913.664062 1 69912.539062 2 ...
4,0 76937.351562 1 76936.000000 2 ...
...,...
6860,0 39387.917969 1 39213.664062 2 ...
6861,0 38036.839844 1 37861.613281 2 ...
6862,0 36622.816406 1 36447.843750 2 ...
6863,0 35169.773438 1 34995.960938 2 ...
