In [56]:
################################
# General Imports
################################
import csv, math, io, os, os.path, sys, random, time, json, gc, glob
from datetime import datetime
import joblib
from joblib import Parallel, delayed, dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

################################
# Multiprocessing maybe?
################################
import multiprocessing
multiprocessing

################################
# Scientific Imports
################################
import scipy
from scipy.signal import butter,filtfilt

################################
# SKLearn Imports
################################
#from sklearn.naive_bayes import GaussianNB
#from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import ComplementNB

from sklearn.naive_bayes import BernoulliNB     # <------ USE THIS ONE!!! Bernoulli is good for yes/no classification
                                                # https://towardsdatascience.com/naive-bayes-classifier-81d512f50a7c
                                    # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

################################
# SKTime Imports
################################
#from sktime.datatypes._panel._convert import from_2d_array_to_nested, from_nested_to_2d_array, is_nested_dataframe
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.forecasting.model_selection import ForecastingGridSearchCV

################################
# Suppress Warnings
################################
import warnings
# Silence noisy warning categories so cell output stays readable.
# NOTE(review): these filters are process-wide; they also hide warnings that
# scikit-learn / SciPy may emit about questionable inputs further down.
for _category in (UserWarning, RuntimeWarning, FutureWarning):
    warnings.simplefilter(action="ignore", category=_category)

# `VisibleDeprecationWarning` lives in `np.exceptions` on recent NumPy and was
# removed from the top-level namespace in NumPy 2.0; resolve it from whichever
# location exists so this cell does not raise AttributeError.
_visible_deprecation = getattr(getattr(np, "exceptions", np), "VisibleDeprecationWarning", None)
if _visible_deprecation is not None:
    warnings.filterwarnings("ignore", category=_visible_deprecation)

################################
# Initialisers
################################
# Default figure size (width, height) applied to every matplotlib figure.
default_rc_params = (16,4)
plt.rcParams["figure.figsize"] = default_rc_params
sb.set()

# xNaNs: sample positions that the functions below overwrite with the curve median.
# xTime: time stamps of the samples; used as the default for GetNumDays().
xNaNs = np.load("X_NAN_LIST.npy")
xTime = np.load("X_TIME_LIST.npy")

# ---- Main data set ----
# masterX: one light curve (flux series) per row.
# masterY: one label per row -- presumably 1 = planet, 0 = no planet; TODO confirm.
masterX = np.load("True_NOO_fluxes.npy")
masterY = np.load("True_NOO_isplanetlist.npy")

In [2]:
################################
# Functions
################################

def GetNumDays(time=xTime):
    """
    Return the length of the observation window covered by `time`.

    time:   Sequence of time stamps; defaults to the module-level `xTime`.
            Assumed to be sorted ascending and expressed in days (the caller
            FilterMyData converts the result to seconds) -- TODO confirm.

    Returns `time[-1] - time[0]`.
    """
    first_stamp = time[0]
    last_stamp = time[-1]
    return last_stamp - first_stamp

################################

def Every_Nth_Value_EACH(y,nth=40):
    """Subsample one sequence: keep element 0 and every `nth` element after it."""
    stride = slice(None, None, nth)
    return y[stride]

################################

def Every_Nth_Value(masterX,nth=40):
    """
    Subsample every light curve in `masterX`, keeping every `nth` sample.

    masterX:  2-D collection of equally long curves (one curve per row).
    nth:      Stride between the samples that are kept.

    Returns a NEW float array of shape (number of curves, subsampled length);
    `masterX` itself is left untouched. Progress information is printed.
    """

    print(f"nth = {nth}")

    n_curves = len(masterX)
    first_curve = masterX[0]
    full_len = len(first_curve)
    short_len = len(first_curve[::nth])
    print(f"Old = {full_len}; new = {short_len}")

    # Pre-allocate the output and fill it one curve at a time.
    thinned = np.zeros((n_curves, short_len))
    for row, curve in enumerate(masterX):
        thinned[row] = Every_Nth_Value_EACH(curve, nth)

    print(f"len(tmp) = {len(thinned)}; len(tmp[0]) = {len(thinned[0])}; shape = {np.shape(thinned)}")

    return np.asarray(thinned)

################################

def FilterMyData(x,cutoff=0.00005,order=2,xNaNs=xNaNs):
    
    """
    Function to apply a Butterworth low-pass filter to one timeseries.
    Vars:
    
    x:        The timeseries (1-D). Must be list or np array.
    cutoff:   The cutoff frequency, in the same unit as the sampling frequency
              computed below (Hz). Used to determine where the filter cuts off.
    order:    Order of the Butterworth filter (passed straight to scipy's butter).
    xNaNs:    Positions in x whose values are replaced by the median of x
              before filtering. Defaults to the module-level list.

    Returns the filtered series as produced by scipy's filtfilt
    (forward-backward filtering).
    """
    
    # First, let's calculate the observational time period;
    # This is done separately so that it can be changed in the future for any TESS fits file
    numdays       = GetNumDays()
    
    # Next, fix data: every flagged position gets the median of the ORIGINAL series.
    # The mask is built in one vectorised pass instead of an `n in xNaNs` scan
    # per sample (which was quadratic); values and dtype are the same as before.
    x             = np.asarray(x)
    xMedian       = np.median(x)
    flagged       = np.isin(np.arange(len(x)), xNaNs)
    x             = np.where(flagged, xMedian, x)
    
    # Frequency Data Stuff
    sec           = numdays*24*60*60   # Number of seconds in the overall observation period
    freq          = len(x)/sec         # Frequency, in Hz, ie number of observations per second
    # FREQ IS APPROX 1/120 OR ~0.008333333
    
    # Butter Lowpass Filter (butter expects the cutoff as a fraction of Nyquist)
    nyq           = 0.5 * freq
    normal_cutoff = cutoff / nyq
    b, a          = butter(order, normal_cutoff, btype='low', analog=False)
    newX          = filtfilt(b, a, x)
    
    # Finally, return the filtered values
    return (newX)

################################

def FilterAllMyData(masterX,cutoff=0.00005,order=2,nanList=xNaNs):
    """
    Low-pass filter every light curve in `masterX` IN PLACE.

    masterX:  Collection of curves; each row is overwritten with its filtered version.
    cutoff:   Cutoff frequency forwarded to FilterMyData.
    order:    Filter order forwarded to FilterMyData.
    nanList:  Positions to replace with the row median before filtering.

    Returns the same `masterX` object, so it can be used in a FunctionTransformer.
    """
    
    for X in masterX:
        # Forward the caller's nanList; the previous code passed the global
        # xNaNs here, silently ignoring this function's own argument.
        X[:] = FilterMyData(X,cutoff,order,nanList)
    
    return masterX
    
################################

def Normal(masterX):
    """
    Normalise every light curve by its own median, IN PLACE.

    masterX:  Collection of curves (one per row). Each row is overwritten with
              row / median(row), so a typical value ends up near 1.

    Returns the same `masterX` object.
    NOTE: a row whose median is 0 yields inf/nan values (unchanged from before).
    """
    
    for X in masterX:
        # One vectorised division per curve instead of a Python-level loop
        # over every sample.
        X[:] = np.asarray(X) / np.median(X)
    
    return masterX

################################

def FIXNAN(masterX, nanList=xNaNs):
    """
    Replace the flagged samples of every light curve with that curve's median, IN PLACE.

    masterX:  Collection of curves (one per row).
    nanList:  Positions (sample indices) to overwrite in every row.
              Defaults to the module-level list.

    Returns the same `masterX` object. The median is taken over the whole row
    as it is on entry, i.e. including the flagged positions.
    """
    
    # The mask only depends on the row length, so build it once per distinct
    # length instead of scanning nanList for every single sample.
    mask_by_length = {}
    
    for X in masterX:
        row = np.asarray(X)
        n = len(row)
        if n not in mask_by_length:
            mask_by_length[n] = np.isin(np.arange(n), nanList)
        X[:] = np.where(mask_by_length[n], np.median(row), row)
    
    return masterX

################################################################

def TEST2(masterX,nth=10):
    """
    Experimental subsampler: keep every `nth` sample of every curve.

    masterX:  2-D collection of equally long curves (one per row).
    nth:      Stride between the samples that are kept.

    Returns a NEW array holding the subsampled curves; `masterX` is not modified.
    (The previous version tried to write the shorter rows back into `masterX`,
    which cannot work for a fixed-width array, and then printed an undefined
    variable `tmp`, so it always ended in an error.)
    """
    
    # Nothing to subsample; also avoids indexing masterX[0] below.
    if len(masterX) == 0:
        return np.asarray(masterX)
    
    oldlen = len(masterX[0])
    tmp = np.asarray([np.asarray(row)[::nth] for row in masterX])
    newlen = len(tmp[0])
    print(f"Old = {oldlen}; new = {newlen}")
    
    print(f"len(tmp) = {len(tmp)}; len(tmp[0]) = {len(tmp[0])}; shape = {np.shape(tmp)}")
    
    return tmp

################################

def LCScaler(masterX):
    """
    Min-max scale (to the range 0..1) every light curve that contains a
    negative value, IN PLACE. Curves whose minimum is >= 0 are left untouched.

    masterX:  Collection of curves (one per row).

    Returns the same `masterX` object.
    A curve that is constant (max == min) and negative is mapped to all zeros
    instead of dividing by zero.
    """
    for idx, LC in enumerate(masterX):
        curve = np.asarray(LC)
        
        # Compute the extremes ONCE per curve; the old code re-evaluated
        # min()/max() for every single sample, which was quadratic.
        lowest = curve.min()
        if not lowest < 0:
            continue
        
        span = curve.max() - lowest
        if span == 0:
            masterX[idx] = np.zeros(curve.shape)
        else:
            masterX[idx] = (curve - lowest) / span
    return masterX

################################

## Make the Transformers and the pipeline

In [3]:
# Estimator used by the grid search further down (BernoulliNB was the earlier choice).
#algorithm = BernoulliNB()
algorithm = ComplementNB()

# NOTE: the model name below is hard-coded; keep it in sync with `algorithm`.
print("\t> Model: Complement Naive-Bayes")

# Wrap each pre-processing function so it can be used as a pipeline step.
# All of these functions except Every_Nth_Value modify their input in place.
print("> GENERATING TRANSFORMERS")
fnan = FunctionTransformer(FIXNAN)
norm = FunctionTransformer(Normal)
filt = FunctionTransformer(FilterAllMyData)
enth = FunctionTransformer(Every_Nth_Value)
lcsc = FunctionTransformer(LCScaler)

# Construct the Pipeline: fix flagged samples -> normalise -> rescale negative curves -> filter.
# Subsampling (`enth`) is deliberately left out and applied separately in a later cell.
print("> MAKE PIPELINE")
# Earlier variant without the 'scale' step:
#pipe = Pipeline(steps=[['fixnan',fnan],['normalise',norm],['filter',filt]]) #,['everynth',enth]])
pipe = Pipeline(steps=[['fixnan',fnan],['normalise',norm], ['scale', lcsc],['filter',filt]]) # 'everynth' step intentionally omitted
# Unused alternative that would put the estimator inside the pipeline:
#pipe2 = Pipeline(steps=[['filter',filt],['nb', algorithm]])

	> Model: Complement Naive-Bayes
> GENERATING TRANSFORMERS
> MAKE PIPELINE


## Use pipe to transform the data

In [4]:
print("> INITIAL TRANSFORMATION")
# Keep the returned array explicitly. Previously the result was discarded and
# later cells only worked because every step mutates `masterX` in place.
# NOTE: this cell is not idempotent -- re-running it transforms the data again.
masterX = pipe.transform(masterX)
masterX

> INITIAL TRANSFORMATION


array([[0.99940985, 0.99944544, 0.99948084, ..., 1.0000775 , 1.0000783 ,
        1.0000789 ],
       [1.0212766 , 1.0212839 , 1.0212952 , ..., 0.9403424 , 0.9403301 ,
        0.94031984],
       [0.97599834, 0.9764575 , 0.97691923, ..., 0.9931079 , 0.9931133 ,
        0.99311805],
       ...,
       [0.9930401 , 0.99337405, 0.99370676, ..., 1.0034362 , 1.0034384 ,
        1.0034401 ],
       [1.2709858 , 1.2662818 , 1.2615675 , ..., 0.9601087 , 0.96007675,
        0.96004766],
       [0.9999215 , 0.9999676 , 1.0000149 , ..., 1.0014694 , 1.0014718 ,
        1.0014735 ]], dtype=float32)

## Subsample

In [5]:
# Subsample into a separate variable first, so the result can be inspected
# before it replaces masterX (Every_Nth_Value returns a new array).
print("> Subsample")
tmpX = enth.transform(masterX)
# Sanity check: (number of curves, original curve length, subsampled curve length)
len(tmpX), len(masterX[0]), len(tmpX[0])

> Subsample
nth = 40
Old = 20338; new = 509
len(tmp) = 8063; len(tmp[0]) = 509; shape = (8063, 509)


(8063, 20338, 509)

In [10]:
# The cell above was only a check, and it works. In the final code, assign the
# subsampled result to masterX directly instead of going through tmpX.
masterX = tmpX

In [None]:
# Toy example for the min-max scaling idea: a sine wave shifted up by 0.5,
# so part of it dips below zero.
# NOTE(review): this rebinds the name `time`, which the imports cell bound to
# the stdlib `time` module; the module is unreachable under that name afterwards.

# Get x values of the sine wave

time        = np.arange(0, 10, 0.1);

# Amplitude of the sine wave is sine of a variable like time

amplitude   = (np.sin(time) + 0.5)

# Plot a sine wave using time and amplitude obtained for the sine wave
# (red line: y = 0, dotted green line: y = 0.5, the offset added above)

plt.plot(time, amplitude)
plt.axhline(y = 0, color = 'r', linestyle = '-')
plt.axhline(y = 0.5, color = 'g', linestyle = ':')

In [None]:
# Summary statistics of the toy curve; mx and mn feed the min-max scaling below.
mean = np.mean(amplitude)
mx = max(amplitude)
mn = min(amplitude)

# Display (mean, max, min)
mean, mx, mn

In [None]:
# Min-max scale the toy curve so that its minimum maps to 0 and its maximum to 1.
ScaledCurve = (np.asarray(amplitude) - mn) / (mx - mn)

In [None]:
# Plot the scaled toy curve. `newAmp` was never defined anywhere in the
# notebook; the scaled data computed in the previous cell is `ScaledCurve`.
plt.plot(time, ScaledCurve)
plt.axhline(y = 0, color = 'r', linestyle = '-')
plt.axhline(y = 0.5, color = 'g', linestyle = ':')

In [None]:
# Collect (row index, minimum) for every light curve that dips below zero.
lst = []
for idx, LC in enumerate(masterX):
    lowest = min(LC)          # evaluate the minimum once per curve
    if lowest < 0:
        lst.append((idx, lowest))
# Display the pairs and, separately, just the row indices.
lst, [pair[0] for pair in lst]

In [None]:
def LCScaler(masterX):
    """
    Min-max scale (to the range 0..1) every light curve that contains a
    negative value, IN PLACE. Curves whose minimum is >= 0 are left untouched.

    NOTE: this cell redefines the LCScaler already defined in the Functions
    cell near the top of the notebook; keep only one of the two.

    masterX:  Collection of curves (one per row).

    Returns the same `masterX` object.
    A curve that is constant (max == min) and negative is mapped to all zeros
    instead of dividing by zero.
    """
    for idx, LC in enumerate(masterX):
        curve = np.asarray(LC)
        
        # Compute the extremes ONCE per curve. (The previous version also
        # appended to a misspelled name, `idlst`, and raised NameError.)
        lowest = curve.min()
        if not lowest < 0:
            continue
        
        span = curve.max() - lowest
        if span == 0:
            masterX[idx] = np.zeros(curve.shape)
        else:
            masterX[idx] = (curve - lowest) / span
    return masterX

In [None]:
# Row indices of the curves that contained negative values (from `lst` above).
idlist = [int(i[0]) for i in lst]
print(idlist)

# Rescale those curves; LCScaler modifies masterX in place and returns it.
masterX = LCScaler(masterX)

In [None]:
# Visual check: the first curve that needed rescaling.
plt.plot(masterX[idlist[0]])

In [None]:
# `plot` alone is not defined in this notebook (matplotlib is imported as `plt`).
# NOTE(review): this cell duplicates the previous one and can probably be removed.
plt.plot(masterX[idlist[0]])

In [None]:
# Wrap LCScaler as a transformer and apply it.
# transform() requires the data as its argument; calling it with no argument
# raises TypeError. NOTE(review): masterX is assumed to be the intended input.
nrm2 = FunctionTransformer(LCScaler)
nrm2.transform(masterX)

## Make param grid
Note: the <code>step__param</code> prefix (e.g. <code>nb__alpha</code>) is not needed in the grid below, because the grid search wraps a single estimator rather than a pipeline.

In [32]:
# Earlier, coarser grid:
#param_grid = {'alpha': [0.2, 0.4, 0.6, 0.8, 1.0]}
# Current grid: 101 alpha values from 0 to 1 in steps of 0.01, each tried with and without fit_prior.
# NOTE(review): the grid includes alpha = 0 (no smoothing); confirm that this is
# intended, since the search below selects exactly that value.
param_grid = {'alpha': np.linspace(0,1,101,True), # i.e. 0.0, 0.01, 0.02, ..., 1.0
                  'fit_prior': [True, False]}

## Split the (now transformed) data, and make a <code>GridSearchCV</code> with the ALGORITHM ONLY

In [33]:
# Split the transformed data into train and test sets (fixed seed for reproducibility).
# NOTE(review): the split is not stratified although the classes look imbalanced.
print("> TEST-TRAIN-SPLIT")
X_train, X_test, y_train, y_test = train_test_split(masterX, masterY, random_state=42)

# Grid search over the Naive Bayes hyper-parameters in `param_grid`
# (default scoring and cross-validation settings are used).
print("> GRIDSEARCH")
grid = GridSearchCV(algorithm, param_grid, return_train_score=True, n_jobs=3) # n_jobs=3 chosen for a shared 16-CPU machine: 4 programs x 3 jobs = 12 CPUs, leaving spare capacity

# Fit model
print("> FIT")
grid.fit(X_train, y_train)

> TEST-TRAIN-SPLIT
> GRIDSEARCH
> FIT


GridSearchCV(estimator=ComplementNB(), n_jobs=3,
             param_grid={'alpha': array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ]),
                         'fit_prior': [True, False]},
             return_train_score=True)

In [34]:
# Take the best estimator found by the grid search and predict on the held-out test set.
print("> PREDICT")
model = grid.best_estimator_
y_pred = model.predict(X_test)

> PREDICT


In [35]:
# Test-set metrics for the best model: accuracy, precision and recall.
mAcc = accuracy_score(y_test, y_pred)
mPre = precision_score(y_test, y_pred)
mRec = recall_score(y_test, y_pred)

In [36]:
# Display (accuracy, precision, recall)
mAcc, mPre, mRec

(0.5426587301587301, 0.09450830140485313, 0.2578397212543554)

In [37]:
# Display the selected estimator and its non-default hyper-parameters.
model

ComplementNB(alpha=0.0)



In [40]:
# Scratch check: a coarser alpha grid (0.1 ... 1.0) with the leading 0 dropped.
np.linspace(0,1,11,True)[1:]

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [42]:
# Reload the RAW data, keeping every 4th light curve, and split again.
# NOTE(review): this overwrites masterX / masterY and the train/test variables
# with untransformed data; any cell run afterwards sees this reduced set.
masterX = np.load("True_NOO_fluxes.npy")[::4]
masterY = np.load("True_NOO_isplanetlist.npy")[::4]
X_train, X_test, y_train, y_test = train_test_split(masterX, masterY, random_state=42)

In [48]:
# Training labels: (array, number of positive labels, total count)
y_train, sum(y_train), len(y_train)

(array([0, 0, 0, ..., 0, 0, 0]), 210, 1512)

In [49]:
# Test labels: (array, number of positive labels, total count)
y_test, sum(y_test), len(y_test)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 

In [52]:
# Spot check of a single label.
y_test[17]

1

In [53]:
def ListCheck(arr):
    """
    Report every entry of `arr` that is neither 0 nor 1.

    One line is printed per offending entry, giving its index and value.
    Nothing is printed for a clean list; the function always returns None.
    """
    for position, value in enumerate(arr):
        if value != 0 and value != 1:
            print (f"Index {position} has odd value: {value}")

In [55]:
# Validate both label arrays; no printed lines means every label is 0 or 1.
# (ListCheck returns None, hence the `(None, None)` cell result.)
ListCheck(y_test), ListCheck(y_train)

(None, None)

In [76]:
# Scratch check: a grid of very small values, 1e-09 to 1e-08 in steps of 1e-10.
np.linspace(1e-09,1e-08,91,True)

array([1.0e-09, 1.1e-09, 1.2e-09, 1.3e-09, 1.4e-09, 1.5e-09, 1.6e-09,
       1.7e-09, 1.8e-09, 1.9e-09, 2.0e-09, 2.1e-09, 2.2e-09, 2.3e-09,
       2.4e-09, 2.5e-09, 2.6e-09, 2.7e-09, 2.8e-09, 2.9e-09, 3.0e-09,
       3.1e-09, 3.2e-09, 3.3e-09, 3.4e-09, 3.5e-09, 3.6e-09, 3.7e-09,
       3.8e-09, 3.9e-09, 4.0e-09, 4.1e-09, 4.2e-09, 4.3e-09, 4.4e-09,
       4.5e-09, 4.6e-09, 4.7e-09, 4.8e-09, 4.9e-09, 5.0e-09, 5.1e-09,
       5.2e-09, 5.3e-09, 5.4e-09, 5.5e-09, 5.6e-09, 5.7e-09, 5.8e-09,
       5.9e-09, 6.0e-09, 6.1e-09, 6.2e-09, 6.3e-09, 6.4e-09, 6.5e-09,
       6.6e-09, 6.7e-09, 6.8e-09, 6.9e-09, 7.0e-09, 7.1e-09, 7.2e-09,
       7.3e-09, 7.4e-09, 7.5e-09, 7.6e-09, 7.7e-09, 7.8e-09, 7.9e-09,
       8.0e-09, 8.1e-09, 8.2e-09, 8.3e-09, 8.4e-09, 8.5e-09, 8.6e-09,
       8.7e-09, 8.8e-09, 8.9e-09, 9.0e-09, 9.1e-09, 9.2e-09, 9.3e-09,
       9.4e-09, 9.5e-09, 9.6e-09, 9.7e-09, 9.8e-09, 9.9e-09, 1.0e-08])

### The cells below test other Naive Bayes variants. Gaussian, Bernoulli and Categorical all come out *exactly the same*, so there is no need to run each one.

In [123]:
# String version of the test labels: "True" where the label equals 1, "False" otherwise.
y_test2 = np.array(["True" if x==1 else "False" for x in y_test])

In [121]:
# Labels for the pre-transformed data set loaded below.
# NOTE(review): `masterY` is whatever the kernel currently holds; an earlier cell
# rebinds it to a 1-in-4 subsample, so confirm that it lines up with newX.
newY = masterY

# Previously transformed light curves saved to disk.
newX = np.load("NOO_TRANSFORMED_DATA.npy")

In [105]:
# Experiment: swap in another Naive Bayes variant and repeat split / search / predict.
# NOTE(review): GaussianNB is imported here but not used in this cell; imports
# normally belong in the first cell of the notebook.
from sklearn.naive_bayes import GaussianNB
algorithm = CategoricalNB()

X_train, X_test, y_train, y_test = train_test_split(newX, newY, random_state=42)

# Only the smoothing parameter is searched for this variant.
params = {'alpha': np.linspace(0,1,101,True)}

grid = GridSearchCV(algorithm, params, return_train_score=True, n_jobs=3)

# NOTE(review): the recorded output of this cell shows IndexError tracebacks
# raised while scoring inside the search.
grid.fit(X_train, y_train)

model = grid.best_estimator_
y_pred = model.predict(X_test)

Traceback (most recent call last):
  File "/home/jdowzell/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/jdowzell/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/home/jdowzell/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 646, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/home/jdowzell/anaconda3/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 83, in predict
    jll = self._joint_log_likelihood(X)
  File "/home/jdowzell/anaconda3/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 1461, in _joint_log_likelihood
    jll += self.feature_log_prob_[i][:, indices].T
IndexError: index 2 is out of bounds for axis 1 with size 2

Traceback (most recent call last):
  File "/home/jdowzell/anaconda3/lib/pyth

IndexError: index 4 is out of bounds for axis 1 with size 2

In [110]:
# String version of the test labels: 'True' where the label equals 1, 'False' otherwise.
newY_test = np.array(['True' if x==1 else 'False' for x in y_test])

In [118]:
# Convert string labels back to 1/0 so the sklearn metrics can use their default positive label.
# NOTE(review): this only makes sense if y_pred holds the strings "True"/"False";
# with numeric predictions every entry becomes 0. Confirm which model produced y_pred.
newY_pred = np.array([1 if x=="True" else 0 for x in y_pred])
newY_test = np.array([1 if x=="True" else 0 for x in newY_test])

In [122]:
# Metrics for the converted labels: accuracy, precision and recall.
mAcc = accuracy_score(newY_test, newY_pred)
mPre = precision_score(newY_test, newY_pred)
mRec = recall_score(newY_test, newY_pred)

# Display (accuracy, precision, recall)
mAcc,mPre,mRec

(0.21478174603174602, 0.14972972972972973, 0.9651567944250871)