In [15]:
#!/usr/bin/env python3

################################
# General Imports
################################
import csv, math, io, os, os.path, sys, random, time, json, gc, glob, statistics
from datetime import datetime
import joblib
from joblib import Parallel, delayed, dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

################################
# Scientific Imports
################################
import scipy
from scipy.signal import butter,filtfilt

################################
# SKLearn Imports
################################
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.datasets import fetch_california_housing

################################
# SKTime Imports
################################
from sktime.datatypes._panel._convert import from_2d_array_to_nested, from_nested_to_2d_array, is_nested_dataframe, from_nested_to_multi_index
#from sktime.forecasting.compose import TransformedTargetForecaster
#from sktime.forecasting.model_selection import ForecastingGridSearchCV

from sktime.datasets import load_arrow_head

from sktime.registry import all_estimators

from sktime.classification.kernel_based import Arsenal
#from sktime.classification.interval_based import CanonicalIntervalForest
#from sktime.classification.dictionary_based import ContractableBOSS
#from sktime.classification.interval_based import DrCIF
#from sktime.classification.hybrid import HIVECOTEV1
#from sktime.classification.dictionary_based import IndividualBOSS
#from sktime.classification.dictionary_based import IndividualTDE
#from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
#from sktime.classification.feature_based import MatrixProfileClassifier
#from sktime.classification.dictionary_based import MUSE
#from sktime.classification.interval_based import RandomIntervalSpectralForest
#from sktime.classification.distance_based import ShapeDTW
#from sktime.classification.feature_based import SignatureClassifier
#from sktime.classification.interval_based import SupervisedTimeSeriesForest
#from sktime.classification.feature_based import TSFreshClassifier
#from sktime.classification.dictionary_based import WEASEL

################################
# Initialisers
################################
# Default figure size (width, height) applied to every matplotlib figure in this notebook
default_rc_params = (16,9)
plt.rcParams["figure.figsize"] = default_rc_params
# Switch on seaborn's default plot styling
sb.set()

# xNaNs: positions within a series that the helper functions below overwrite with a median value.
# The file must be present in the current working directory.
xNaNs = np.load("X_NAN_LIST.npy")
# xTime: timestamps of the observations; GetNumDays() uses last-minus-first as the observation span.
xTime = np.load("X_TIME_LIST.npy")
# Every classifier registered with sktime
ESTIM = all_estimators(estimator_types="classifier")

################################
# Suppress Warnings
################################
# These filters are global: they silence the listed categories for everything run after this cell.
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
# NOTE(review): newer NumPy releases expose this class under np.exceptions instead of the
# top-level namespace -- confirm against the installed NumPy version.
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)

## FUNCTIONS

In [117]:
################################
# Functions
################################

def Every_Nth_Value(y,nth=40):
    """
    Function to subsample timeseries by keeping every nth value.
    Vars:

    y:        The timeseries. Array-like; for a 2D input (one series per row) every
              row is subsampled, for a 1D input the series itself is subsampled.
    nth:      Step between the values that are kept (1 keeps everything). Must be >= 1.

    Returns a new numpy array; the input is never modified.
    Raises ValueError if nth is smaller than 1.
    """

    if nth < 1:
        raise ValueError(f"nth must be >= 1, got {nth}")

    print("Every Nth Value")

    # Slice along the last axis so the same expression covers both 1D and 2D input.
    # np.array() copies the slice, so later in-place edits of the result cannot
    # leak back into the caller's data through a view.
    return np.array(np.asarray(y)[..., ::nth])

################################

def Every_Nth_ValueXY(x,y,n=40):
    """
    Apply Every_Nth_Value to both x and y using the same step n.

    Returns the two results as an (x, y) tuple.
    """
    # x is handled before y, the same order the tuple expression evaluates them in
    sampled_x = Every_Nth_Value(x, nth=n)
    sampled_y = Every_Nth_Value(y, nth=n)
    return sampled_x, sampled_y

################################

def EveryNthNested(y,nth=40):
    """
    Function to subsample a nested (sktime-style) dataframe, keeping every nth value
    of each timeseries.
    Vars:

    y:        The data. Anything that is not a plain np.ndarray is treated as a nested
              dataframe; a plain np.ndarray is returned untouched.
    nth:      Step between the values that are kept along each series.

    Returns the subsampled data re-packed as a nested dataframe, or y itself when it
    is a plain np.ndarray.
    """
    print("Every Nth Value (NESTED)")

    if type(y) is np.ndarray:
        print("np array detected; returning as-is")
        y_ = y
    else:
        print(f"Datatype {type(y)} found! Processing...")
        # De-nest into a 2D numpy array (second argument requests a numpy return)
        flat = np.asarray(from_nested_to_2d_array(y,True))

        # Subsample the DE-NESTED values along each series (columns); slicing the
        # nested frame itself would drop whole rows instead of thinning the series
        flat = flat[:, ::nth]

        # Re-nest
        y_ = from_2d_array_to_nested(flat)

    # Return
    print("Returning")
    return y_

################################

def GetNumDays(time=None):
    """
    Function to get the length of the observation period.
    Vars:

    time:     Sequence of timestamps. Defaults to the notebook-level xTime, looked up
              when the function is CALLED, so reloading xTime takes effect without
              having to re-run this definition.

    Returns the last timestamp minus the first one. The callers in this notebook
    treat the result as a number of days.
    Raises ValueError if time is empty.
    """

    if time is None:
        time = xTime

    if len(time) == 0:
        raise ValueError("time must contain at least one timestamp")

    return time[-1] - time[0]

################################

def FilterMyData(x,cutoff=0.00005,order=2,nanList=None,numdays=None):

    """
    Function to apply a Butter (low-pass) Filter to timeseries.
    Vars:

    x:        The timeseries. A 1D list / np array (one series), a 2D np array
              (one series per row) or a nested dataframe.
    cutoff:   The cutoff frequency in Hz. Used to determine where the filter cut off is.
    order:    Order of the Butterworth filter.
    nanList:  Sample positions to overwrite with the series median before filtering.
              Defaults to the notebook-level xNaNs.
    numdays:  Length of the observation period in days. Defaults to GetNumDays().

    Returns the filtered data: a float np array with the same shape as the input, or
    a nested dataframe if a nested dataframe was passed in. The input is not modified.
    Raises ValueError for empty input, unsupported dimensions or a non-positive numdays.
    """

    # np.shape also works for plain lists, which have no .shape attribute
    print(f"INITIAL shape of x is {np.shape(x)}; INITIAL type of x is {type(x)}")

    # Check to see if x is a nested dataframe or not
    isNested = isinstance(x, pd.DataFrame)
    if isNested:
        print("NESTED DATAFRAME FOUND! UNPACKING...")
        x = from_nested_to_2d_array(x,True)

    # Resolve the notebook-level defaults at call time
    if nanList is None:
        nanList = xNaNs
    if numdays is None:
        numdays = GetNumDays()

    # Work on a float copy so the caller's data is left untouched
    data = np.array(x, dtype=float)
    if data.ndim not in (1, 2):
        raise ValueError(f"x must be 1D or 2D, got {data.ndim} dimensions")

    nSamples = data.shape[-1]        # samples per series, whatever the number of rows
    if nSamples == 0:
        raise ValueError("x must contain at least one sample per series")
    if not numdays > 0:
        raise ValueError(f"numdays must be positive, got {numdays}")

    # Next, fix data: overwrite the listed sample positions with the median of their
    # own series. Positions outside the series are ignored.
    badIdx = np.unique(np.asarray(nanList).ravel().astype(int))
    badIdx = badIdx[(badIdx >= 0) & (badIdx < nSamples)]
    if badIdx.size:
        data[..., badIdx] = np.median(data, axis=-1, keepdims=True)

    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = nSamples/sec       # Frequency, in Hz, ie number of observations per second

    # Butter Lowpass Filter (cutoff expressed as a fraction of the Nyquist frequency)
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    b, a          = butter(order, normal_cutoff, btype='low', analog=False)

    # Filter along the sample axis so every row is treated as its own series
    newX          = np.array(filtfilt(b, a, data, axis=-1))

    print(f"newX generated; curr shape is {newX.shape} and current dtype is {newX.dtype}")

    if isNested:
        # Re-pack as float32 before nesting, as the original conversion did
        newX = from_2d_array_to_nested(newX.astype('float32'))

    print(f"Returning Values; final shape is {newX.shape} and final type is {type(newX)}")

    # Finally, return the new X values
    return (newX)

################################

def FilterMyUnNestedData(x,cutoff=0.00005,order=2):
    
    """
    Function to apply a Butter Filter to timeseries.
    Vars:
    
    y:        The timeseries. Must be list or np array.
    cutoff:   The cutoff frequency. Used to determine where the filter cut off is.
    order:    Approximation via polynomial of the order'th degree (2=quadratic, 3=cubic, 4=quartic, etc)

    """
    
    # First, let's calculate the observational time period;
    # This is done separately so that I can change this in the future for any TESS fits file
    numdays = GetNumDays()
    sec     = numdays*24*60*60   # Number of seconds in the overall observation period
    rowLen  = len(x[0])          # Technically bad practice, bu since we know every row has same length, it's okay to do outside func
    freq    = rowLen/sec         # Frequency, in Hz, ie number of observations per second
    
    # Butter Lowpass Filter
    #polynomOrder  = order
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    
    for row in x:

        # Next, fix data                           
        ####rowMedian = np.median(row)                                                    # Get the median value of 'x' before changing it
        ####print(f"Median is {rowMedian}")
        ####row[...]  = [rowMedian if n in xNaNs else item for n,item in enumerate(row)]    # Change all the missing vals to median of the whole row
        row       = FIXNAN(row)
        
        b, a      = butter(order, normal_cutoff, btype='low', analog=False)
        row       = np.array(filtfilt(b, a, x))
    
    # Finally, return the new X and Y values
    return (x)

################################

def Normalise(X,fixnan=True):
    """
    Function to normalise data by dividing every value by the overall median.
    Vars:

    X:        The data. Array-like; must be 2D (one series per row) when fixnan is True.
    fixnan:   If True, first overwrite the known bad sample positions with the row
              median (via FIXNANNEW).

    Returns a new float np array. The input is not modified, and integer input is no
    longer truncated back to integers. A median of 0 is not special-cased: the
    division then yields inf / nan, following numpy's rules.
    """
    # Float copy: keeps the caller's array intact and avoids integer truncation
    data = np.array(X, dtype=float)

    # First of all, decide if we want to fix all the 0s / NaNs
    if fixnan:
        data = np.asarray(FIXNANNEW(data), dtype=float)

    # One median over ALL values, not one per row
    median = np.median(data)

    return data / median

################################

def FIXNANNEW(y, nanList=None):
    """
    Function to overwrite the known bad sample positions of every row with that
    row's median.
    Vars:

    y:        The data, one series per row. Rows must be np arrays (e.g. the rows of
              a 2D np array). MODIFIED IN PLACE.
    nanList:  Sample positions to overwrite. Defaults to the notebook-level xNaNs,
              looked up when the function is CALLED. Positions outside a row are ignored.

    Returns y (the same object that was passed in).
    """

    if nanList is None:
        nanList = xNaNs

    nRows  = len(y)
    rowLen = len(y[0]) if nRows else 0      # Avoid indexing into an empty input
    print(f"Array has [{nRows}] elements, with each element having [{rowLen}] sub-elements")

    # Turn the position list into an index array once, instead of scanning it for
    # every single sample of every row
    badIdx = np.unique(np.asarray(nanList).ravel().astype(int))
    badIdx = badIdx[badIdx >= 0]

    for row in y:
        hits = badIdx[badIdx < len(row)]
        if hits.size:
            # The median is taken over the whole row before any value is replaced
            row[hits] = np.median(row)

    return y

################################

def FIXNAN(y, nanList=None):
    """
    Function to overwrite the known bad sample positions of ONE series with the
    series median.
    Vars:

    y:        The timeseries. 1D list or np array; not modified.
    nanList:  Sample positions to overwrite. Defaults to the notebook-level xNaNs,
              looked up when the function is CALLED.

    Returns a new list with the same length as y.
    """
    if nanList is None:
        nanList = xNaNs

    yMedian = np.median(y)

    # A set makes each membership test O(1) instead of a scan over the whole list
    badPositions = set(np.asarray(nanList).ravel().tolist())

    return [yMedian if n in badPositions else item for n,item in enumerate(y)]

################################

def ConvertDataToNestedDF(X):
    """
    Function to convert a 2D array into a nested dataframe.
    Vars:

    X:        The data. Must expose a .shape attribute (it is printed first).

    Returns whatever from_2d_array_to_nested produces for X.
    """
    print(f"Shape of X is {X.shape}")
    return from_2d_array_to_nested(X)

################################

In [109]:
def StretchArray(oldArr):
    """
    Function to stretch a 1D array by inserting the average of each pair of
    neighbouring values between them.
    Vars:

    oldArr:   The values. 1D list or np array of length n.

    Returns a new float np array of length 2n-1 (original values at the even
    positions, pairwise averages at the odd ones). Empty input gives an empty array.
    Raises ValueError if oldArr is not 1D.
    """

    values = np.asarray(oldArr, dtype=float)
    if values.ndim != 1:
        raise ValueError(f"oldArr must be 1D, got {values.ndim} dimensions")

    if values.size == 0:
        return np.zeros(0)              # 2n-1 would be negative here

    q = np.zeros(2*values.size-1)
    q[0::2] = values                                # Original values
    q[1::2] = 0.5*(values[:-1] + values[1:])        # Average of each neighbouring pair

    return q
    
    #q[i]   = X[0][i//2]
    #q[i+1] = 0.5*(X[0][i] + X[0][i+1])

In [16]:
# Load the California housing dataset as a (features, target) pair of arrays.
# NOTE(review): this is tabular data rather than a timeseries panel -- presumably a
# stand-in while the pipeline below is being wired up; confirm.
X, y = fetch_california_housing(return_X_y=True)

In [116]:
# Display the number of rows in X and in y side by side
len(X), len(y)

(20640, 20640)

In [119]:
# Wrap each preprocessing function so it can act as a stage in an sklearn Pipeline
flt = FunctionTransformer(FilterMyUnNestedData)   # low-pass filter
nth = FunctionTransformer(Every_Nth_Value)        # subsample
nrm = FunctionTransformer(Normalise)              # divide by the median
cnv = FunctionTransformer(ConvertDataToNestedDF)  # 2D array -> nested dataframe

# NOTE(review): Normalise defaults to fixnan=True, so it re-applies the xNaNs positions
# AFTER 'everynth' has run -- confirm those positions still refer to the right samples.
pipe = Pipeline(
    steps=[
        ('filter', flt),
        ('everynth', nth),
        ('normalise', nrm),
        ('recombine', cnv),
        ('algorithm', Arsenal()),
    ]
)