In [1]:
################################
# General Imports
################################
import csv, math, io, os, os.path, sys, random, time, json, gc, glob
from datetime import datetime
import joblib
from joblib import Parallel, delayed, dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

################################
# Scientific Imports
################################
import scipy
from scipy.signal import butter,filtfilt

################################
# SKLearn Imports
################################
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

################################
# SKTime Imports
################################
from sktime.datatypes._panel._convert import from_2d_array_to_nested, from_nested_to_2d_array, is_nested_dataframe
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.forecasting.model_selection import ForecastingGridSearchCV

from sktime.classification.kernel_based import Arsenal
from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.classification.dictionary_based import ContractableBOSS
from sktime.classification.interval_based import DrCIF
from sktime.classification.hybrid import HIVECOTEV1
from sktime.classification.dictionary_based import IndividualBOSS
from sktime.classification.dictionary_based import IndividualTDE
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.feature_based import MatrixProfileClassifier
from sktime.classification.dictionary_based import MUSE
from sktime.classification.interval_based import RandomIntervalSpectralForest
from sktime.classification.distance_based import ShapeDTW
from sktime.classification.feature_based import SignatureClassifier
from sktime.classification.interval_based import SupervisedTimeSeriesForest
from sktime.classification.feature_based import TSFreshClassifier
from sktime.classification.dictionary_based import WEASEL

################################
# Suppress Warnings
################################
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

################################
# Initialisers
################################
default_rc_params = (16,9)
plt.rcParams["figure.figsize"] = default_rc_params
sb.set()

In [2]:
################################
# Data Initialisers
################################
xNaNs = np.load("X_NAN_LIST.npy")
xTime = np.load("X_TIME_LIST.npy")

In [3]:
#masterX = [x[1:-1] for x in np.load("None_Or_One_Exoplanet_NORMALISED.npy")]
masterX = [x[1:-1] for x in np.load("None_Or_One_Exoplanet.npy")]
masterY = np.load("None_Or_One_isplanetlist.npy")

#X_nested = from_2d_array_to_nested(np.array(masterX))
#Xtrain, Xtest, ytrain, ytest  = train_test_split(X_nested, masterY, random_state=42)
Xtrain_,Xtest_,ytrain_,ytest_ = train_test_split (masterX, masterY, random_state=42)

In [10]:
################################
# Functions
################################

def GetLCData(rndFile=-1, outputFileName=False):
    
    # rndFile is random, unless specified
    rndFile = random.randint(0,len(fitsarr)) if rndFile==-1 else rndFile
    
    # Get LC data from the requisite fits file
    fitsFile = fitsarr[rndFile]

    # The following line of code gives us the header values
    fitsHeaders = fits.getheader(fitsFile)

    with fits.open(fitsFile, mode="readonly") as hdulist:

        tess_bjds     = hdulist[1].data['TIME']
        pdcsap_fluxes = hdulist[1].data['PDCSAP_FLUX']
    
    if outputFileName:
        return (tess_bjds[1:-1], pdcsap_fluxes[1:-1], rndFile)
    else:
        return (tess_bjds[1:-1], pdcsap_fluxes[1:-1])

################################

def Every_Nth_Value(y,n=40):
    return (y[::n])

################################

def Every_Nth_ValueXY(x,y,n=40):
    return (Every_Nth_Value(x,n), Every_Nth_Value(y,n))

################################

def GetNumDays(time=xTime):
    
    #xTime = np.load("X_TIME_LIST.npy")
    nDays = time[-1]-time[0]
    
    return (nDays)

################################

def FilterMyData(x,cutoff=0.00005,order=2):
    
    """
    Function to apply a Butter Filter to timeseries.
    Vars:
    
    y:        The timeseries. Must be list or np array.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Approximation via polynomial of the order'th degree (2=quadratic, 3=cubic, 4=quartic, etc)
    """
    
    # DATA VALIDATION
    
    # Flag
    isNested = False
    
#####    # Check to see if x is a nested dataframe or not
#####    if type(x) == pd.core.frame.DataFrame:
#####        isNested = True
#####        print("NESTED DATAFRAME FOUND! UNPACKING FOR CALCULATIONS, THE REPACKING...")
#####        x = from_nested_to_2d_array(x)
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays       = GetNumDays()
    
    # Next, fix data
    xMedian       = np.median(x)                                                    # Get the median value of 'x' before changing it
    x             = [xMedian if n in xNaNs else item for n,item in enumerate(x)]    # Change all the missing values to the median value of the whole array
    
    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = len(x)/sec         # Frequency, in Hz, ie number of observations per second
    # FREQ IS APPROX 1/120 OR ~0.008333333
    
    # Butter Lowpass Filter
    polynomOrder  = order
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    b, a          = butter(polynomOrder, normal_cutoff, btype='low', analog=False)
    newX          = filtfilt(b, a, x)
    
    if isNested == True:
        nexX = from_2d_array_to_nested(newX)
    
    # Finally, return the new X and Y values
    return (newX)

################################

def GetFreqData(x,cutoff=0.00005):
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays       = GetNumDays()
    
    # Next, fix data                           
    xMedian       = np.median(x)                                                    # Get the median value of 'y' before changing it
    x             = [xMedian if n in xNaNs else item for n,item in enumerate(x)]    # Change all the missing values to the median value of the whole array
    
    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = len(x)/sec         # Frequency, in Hz, ie number of observations per second
    # FREQ IS APPROX 1/120 OR ~0.008333333
    
    # Butter Lowpass Filter
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    
    # Finally, return the new X and Y values
    return (freq,normal_cutoff)

################################

def FIXNAN(y, nanList=xNaNs):
    yMedian = np.median(y)
    y = [yMedian if n in nanList else item for n,item in enumerate(y)]
    return y

################################

def GetMetrics(classifierType, Xtrain, Xtest, ytrain, ytest, param_grid):
    
    # Make a PCA Pipeline
    print("> GM: START")
    
    model = classifierType()
    cname = classifierType.__name__
    print(f"\t> Model: {cname}")
    
    print("> GM: GENERATING TRANSFORMERS")
    flt = FunctionTransformer(FilterMyData)
    nth = FunctionTransformer(Every_Nth_Value)
    
    print("> GM: MAKE PIPELINE")
    #pipe = make_pipeline(flt,mdl)
    #pipe = make_pipeline(FunctionTransformer(FilterMyData), FunctionTransformer(Every_Nth_Value), model)
    pipe = Pipeline(steps=[['filter',flt],['nth',nth],['algo',model]])

    #print(pipe.get_params().keys())
    
    # Do gridsearch for svc params
    print("> GM: GRIDSEARCH")
    #grid = GridSearchCV(pipe, param_grid)
    grid = pipe
    
    return grid

def STUFFFROMABOVE(grid, Xtrain, Xtest, ytrain, ytest):
    
    # Fit model
    print("> GM: FIT")
    grid.fit(Xtrain, ytrain)
    
    # Use svc params and predict
    print("> GM: MAKESTATS")
    moreStats = grid.cv_results_
    #print("> > Best parameter (CV score=%0.3f):" % grid.best_score_)
    #print("> > {}".format(grid.best_params_))
    
    # Use svc params and predict
    print("> GM: PREDICT")
    model = grid.best_estimator_
    yfit = model.predict(Xtest)
    
    # Now that model has done, time for confusion matrix shenanigans
    print("> GM: CONFUSION")
    mat = confusion_matrix(ytest, yfit)
    
    return (mat, moreStats)

################################

def WriteJSON(targetname, acc, pre, rec, stats):
    # Preparing the stats text
    data = {}
    data[targetname] = []
    data[targetname].append({
        'Accuracy' : acc,
        'Precision' : pre,
        'Recall' : rec,
        'CV Stats': stats
    })

    # File saving stuff
    fname = targetname+".json"
    targetdest = "./sktime_results/"

    print("Saving {}".format(fname))

    # Write all the info to a file
    with open(targetdest+fname, "w") as f:
        #f.write(stats)
        json.dump(data, f, indent=4, default=str)

################################

def Merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [5]:
#list_of_classifiers = [x.split('./sktime_results/sktime_')[1].split('_fitted.json')[0] for x in glob.glob('./sktime_results/*.json')]
list_of_classifiers = [x.split('_')[2] for x in glob.glob('./sktime_results/*.json')]
list_of_classifiers.sort()
list_of_classifiers

['Arsenal',
 'CanonicalIntervalForest',
 'ContractableBOSS',
 'DrCIF',
 'IndividualBOSS',
 'IndividualTDE',
 'MUSE',
 'MatrixProfileClassifier',
 'RandomIntervalSpectralForest',
 'ShapeDTW',
 'SignatureClassifier',
 'SupervisedTimeSeriesForest',
 'TSFreshClassifier',
 'WEASEL']

In [6]:
def MakeClassifierKWArgs(classifier):
    c = {}
    c[classifier] = []
    c[classifier].append({
        "Name" : classifier
    })
    print(c)
    return c

D = {}
for C in list_of_classifiers:
    D = Merge(D, MakeClassifierKWArgs(C))

{'Arsenal': [{'Name': 'Arsenal'}]}
{'CanonicalIntervalForest': [{'Name': 'CanonicalIntervalForest'}]}
{'ContractableBOSS': [{'Name': 'ContractableBOSS'}]}
{'DrCIF': [{'Name': 'DrCIF'}]}
{'IndividualBOSS': [{'Name': 'IndividualBOSS'}]}
{'IndividualTDE': [{'Name': 'IndividualTDE'}]}
{'MUSE': [{'Name': 'MUSE'}]}
{'MatrixProfileClassifier': [{'Name': 'MatrixProfileClassifier'}]}
{'RandomIntervalSpectralForest': [{'Name': 'RandomIntervalSpectralForest'}]}
{'ShapeDTW': [{'Name': 'ShapeDTW'}]}
{'SignatureClassifier': [{'Name': 'SignatureClassifier'}]}
{'SupervisedTimeSeriesForest': [{'Name': 'SupervisedTimeSeriesForest'}]}
{'TSFreshClassifier': [{'Name': 'TSFreshClassifier'}]}
{'WEASEL': [{'Name': 'WEASEL'}]}


In [7]:
param_grid = dict(filter__kw_args = 
                  [
                      {'cutoff': np.linspace(0.00001,0.0018755128487341842)},
                      {'order': [1,2,3]},
                  ],
                  nth__kw_args = 
                  [
                      {'nth': [10, 20, 30, 40, 50]}
                  ],
                  #flt__cutoff: np.linspace(0.00001,0.0018755128487341842),
                  #flt__order: [1,2,3]   #,
                  #drcif__base_estimator = ['DTC', 'CIT'],
                  #drcif__n_estimators   =   np.linspace(100,1000,19)
             )

param_grid = {}

In [None]:
fnx = list(np.array(from_nested_to_2d_array(Xtrain)))
#fnx = fnx.tolist()
fnx[:3]

In [None]:
Xtrain_[:3]

In [None]:
#GetMetrics(Arsenal, from_nested_to_2d_array(Xtrain), Xtest, ytrain, ytest, param_grid)
#G = GetMetrics(Arsenal, Xtrain_, Xtest_, ytrain_, ytest_, param_grid)
G = GetMetrics(Arsenal, Xtrain_, Xtest_, ytrain_, ytest_, param_grid)

> GM: START
	> Model: Arsenal
> GM: GENERATING TRANSFORMERS
> GM: MAKE PIPELINE
> GM: GRIDSEARCH


In [None]:
Gg = STUFFFROMABOVE(G, Xtrain_, Xtest_, ytrain_, ytest_)

> GM: FIT


ValueError: Found input variables with inconsistent numbers of samples: [172, 6865]

In [None]:
for classifier in list_of_classifiers:
        print(f"Model: {classifier}")
        c = eval(classifier)()
        print(type(c).__name__)
        #Parallel(n_jobs=10)(delayed(MakeModels)(X_train,y_train,c) for c in classifier)

### Refreshing memory of SKLearn Pre-Processing

In [None]:
plt.rcParams["figure.figsize"] = (16,5)

Z = rawX[0]
plt.plot(Z)

#### Seeing if any functions from preprocessing do what I want to do

In [None]:
def Normalise(X,fixnan=True):
    
    # First of all, decide if wan to Fix all the 0s / NaNs
    if fixnan:
        X = FIXNAN(X)
    
    # Reshape because apparently it doesn't work if I don't
    n = np.reshape(X, (-1,1))
    
    # Make and fit scaler
    scaler = StandardScaler() #(0.75,1.25))
    scaler.fit(n)
    
    # Calculate values
    #scMean = scaler.mean_
    scTrans = scaler.transform(n)    # <--- +1 to center it on y=1 rather than y=0
    
    # Return transformed array
    return scTrans

def MaxABS(X,fixnan=True,center=1):
    # First of all, decide if wan to Fix all the 0s / NaNs
    if fixnan:
        X = FIXNAN(X)
    
    n = np.reshape(X, (-1,1))
    scaler = MaxAbsScaler() #(0.75,1.25))
    scaler.fit(n)
    scTrans = scaler.transform(n)
    #Median = np.median(scTrans)
    #scTrans = scTrans + (center-Median)     # <-- Center on one
    return scTrans

def Normal(X,fixnan=True):
    # First of all, decide if wan to Fix all the 0s / NaNs
    if fixnan:
        X = FIXNAN(X)
    
    median = np.median(X)
    
    #print(f"OldNormal median = {median}")
    
    X[:] = [(number/median) for number in X]
    return X

NormaliseData = FunctionTransformer(Normal)

A = rawX[0]
B = rawX[1]
C = rawX[2]
zA = MaxABS(A)
zB = MaxABS(B)
zC = MaxABS(C)
nA = NormaliseData.transform(A)
nB = NormaliseData.transform(B)
nC = NormaliseData.transform(C)

print(f"Medians:\tMedian(zA) =\t{np.median(zA)}\n\t\tMedian(nA) =\t{np.median(nA)}\n")
print(f"Medians:\tMedian(zB) =\t{np.median(zB)}\n\t\tMedian(nB) =\t{np.median(nB)}\n")
print(f"Medians:\tMedian(zC) =\t{np.median(zC)}\n\t\tMedian(nC) =\t{np.median(nC)}\n")

#plt.plot(zA)
plt.plot(zA, 'b-')
#plt.plot(zC)
plt.plot(nA, 'g-')

#### So it seems that nothing really does what I want it to do; MaxAbsScaler comes close but isn't quite on the ball. Seeing as it's close enough, this might work, but for now I'm sticking with making my own transformer out of the normalise function, so I can use it in Pipelines

### ON TO MAIN STUFF!

Order of steps:
* Load X and Y
* Train-test-split them (use nested for SKTime? Will work with Nested Array? Checked, both t-t-s sets contain same data)
* Start Loop:

- For each classifier:
  1. 
  2. B
  3. C

In [None]:
list(np.linspace(0.00001,0.0018755128487341842))

So, taking "raw" data (ie, not normalised) and applying a `scaler.mean__` function to it does the same thing as my normalising function.  
Therefore, I should use the raw data (ie, nothing done to it) and then apply the scaler as part of the pipeline

## GRIDSEARCHING SKTIME
Please refer to https://www.sktime.org/en/stable/api_reference/auto_generated/sktime.forecasting.model_selection.ForecastingGridSearchCV.html  
and  
https://www.sktime.org/en/stable/examples/01_forecasting.html#

In [None]:
na,nb,nc,nd = train_test_split(X_nested, masterY, random_state=42)
ma,mb,mc,md = train_test_split( masterX, masterY, random_state=42)

In [None]:
na

In [None]:
ma[:10]

### They are the same