In [1]:
#!/usr/bin/env python3

################################
# Scientific imports
###
%matplotlib inline
from astropy.io import fits
import matplotlib.pyplot as plt
import numpy as np
from astroquery.mast import Observations
from astroquery.mast import Catalogs

###
# General imports
###
import csv, math, io, os, os.path, sys, random, time
import pandas as pd
import seaborn as sb
import sklearn
from sklearn import metrics
from IPython.display import display

###
# Time imports
###
from tqdm.notebook import tqdm, trange

###
# MatPlotLib Settings
###

# Default figure size (width, height) for every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (20,9)
# Apply seaborn's default styling to matplotlib plots.
sb.set()

###
# Global Variables
###
# Lists
fitsList=[]        # Paths of the FITS light-curve files (filled by Initialise via MakingAList)
starlist=[]        # Entries read from the star data file (filled by LoadList)
planetlist=[]      # Entries read from the planet data file (filled by LoadList)
eblist=[]          # Entries read from the eclipsing-binary data file (filled by LoadList)
beblist=[]         # Entries read from the back-eclipsing-binary data file (filled by LoadList)
# NOTE(review): Initialise() stores its result in a global named `dataset`, not in
# this `dataframe` - confirm which name is intended.
dataframe = pd.DataFrame(columns=['id', 'vals', 'isplanet'])
idlist=[]          # Object IDs, one per processed FITS file (filled by Initialise)
fluxlist=[]        # PDCSAP flux series, one list per processed FITS file (filled by Initialise)
isplanetlist=[]    # Planet flags, one per processed FITS file (filled by Initialise)

# List Holder
# Maps "s" / "p" / "eb" / "beb" to the four lists above; rebuilt by LoadList.
alllists = {}

# Keep track of the current LC and its TIC identifier
# "number" is the index into fitsList, "id" the identifier parsed from the file name
# (both are written by GetRandomLC).
lastRandom={
    "number": 0,
    "id": 0
}

################################
# Functions
###
# Function for Reading which LC datafiles we have into a list
def MakingAList(prnt=False, root="SIM_DATA/"):
    """
    Recursively collects the paths of all FITS files found under a directory.

    A file counts as a FITS file when its extension is ".fits", compared
    case-insensitively. Paths are returned in the order os.walk yields them.

    PARAMS:
    prnt -- when truthy, print how many FITS files were found
    root -- directory to search (defaults to "SIM_DATA/")

    RETURNS:
    A list of path strings, each the containing directory joined with the
    file name. The list is empty when nothing matches (os.walk yields nothing
    for a directory that cannot be listed).
    """
    fl = []
    # A single walk is enough; the tree is only traversed once.
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if os.path.splitext(filename)[1].lower() == '.fits':
                fl.append(os.path.join(dirpath, filename))
    if prnt:
        print("Number of FITS files: {}".format(len(fl)))
    return fl

# Chooses a random number
def GetRandomLC(n = None):
    """
    Chooses the index of a light curve in fitsList and records it in lastRandom.

    PARAMS:
    n -- index to use. When it is not an int, or lies outside
         0 <= n < len(fitsList), a random valid index is drawn instead.

    RETURNS:
    The chosen index into fitsList.

    RAISES:
    IndexError if fitsList is empty (there is nothing to choose from).

    SIDE EFFECTS:
    Sets lastRandom["number"] to the index and lastRandom["id"] to the third
    "-"-separated field of the file path with leading zeroes stripped.
    """
    global lastRandom
    if not (isinstance(n, int) and 0 <= n < len(fitsList)):
        if not fitsList:
            raise IndexError("fitsList is empty - populate it before picking a light curve")
        # randrange excludes the upper bound. random.randint(0, len(fitsList))
        # could return len(fitsList) itself, which is one past the last index.
        n = random.randrange(len(fitsList))

    lastRandom["number"] = n
    lastRandom["id"] = str(fitsList[n].split("-")[2].lstrip("0"))
    return n

def DrawACurve(n=None):
    """
    Plots the PDCSAP flux time series of one light curve from fitsList.

    PARAMS:
    n -- index into fitsList. It is handed to GetRandomLC, which substitutes
         a random index when n is None or not a usable index.

    Shows the figure with plt.show() and returns nothing.
    """
    # GetRandomLC treats None as "pick at random" and records the choice,
    # which CurrentLC() then uses for the figure title.
    fitsFile = fitsList[GetRandomLC(n)]

    with fits.open(fitsFile, mode="readonly") as hdulist:
        # The timestamps and fluxes are columns of the binary table stored
        # in the first extension.
        lightcurve = hdulist[1].data
        tess_bjds = lightcurve['TIME']
        pdcsap_fluxes = lightcurve['PDCSAP_FLUX']

    # Start figure and axis.
    fig, ax = plt.subplots()

    # Plot the timeseries in black circles.
    ## Using the [1:-1] slice to drop the first and last samples
    ## (noted by the original author as leading and trailing zeroes)
    ax.plot(tess_bjds[1:-1], pdcsap_fluxes[1:-1], 'k.', markersize=1)

    # Label the axes and give the figure a title.
    fig.suptitle(CurrentLC())
    ax.set_ylabel("PDCSAP Flux (e-/s)")
    ax.set_xlabel("Time (TBJD)")

    # Adjust the left margin so the y-axis label shows up.
    plt.subplots_adjust(left=0.15)

    plt.show()
    
def LoadListGeneral(f):
    """
    Reads the leading token of every data line in ./SIM_DATA/unpacked/<f>.

    Each row is parsed with csv.reader and only its first comma-separated
    field is examined. Rows that are blank, whose first field begins with
    "#", or whose first field holds only whitespace are skipped.

    PARAMS:
    f -- name of the file inside ./SIM_DATA/unpacked/

    RETURNS:
    A list of strings: the first whitespace-separated token of each data row.
    If the file does not exist, "FNF" is printed and an empty list is
    returned, so that later membership tests on the result keep working.
    """
    lst = []
    try:
        with open('./SIM_DATA/unpacked/{}'.format(f)) as df:
            for row in csv.reader(df):
                if not row:
                    # csv.reader yields an empty list for a blank line
                    continue
                line = row[0]
                if line.startswith("#"):
                    # Commented line
                    continue
                tokens = line.split()
                if tokens:
                    lst.append(tokens[0])
    except FileNotFoundError:
        print("FNF")
        return []
    return lst

def LoadList(itemtype="all"):
    """
    Loads one or all of the star / planet / EB / BEB lists into their globals.

    PARAMS:
    itemtype -- case-insensitive selector: "s"/"star", "p"/"planet",
                "eb"/"eclipsing binary"/"eclipsingbinary",
                "beb"/"back eclipsing binary"/"backeclipsingbinary",
                or "all" (the default) for every list.

    Each selected list is read with LoadListGeneral and stored in its global
    (starlist, planetlist, eblist, beblist); afterwards alllists is rebuilt
    from the four globals. An unrecognised selector only prints a usage hint.
    """
    global alllists

    wanted = itemtype.lower()

    # (global to fill, data file, accepted selectors, progress message)
    catalogue = (
        ("starlist", "tsop301_star_data.txt",
         ["s", "star", "all"],
         "Loading star list: {}"),
        ("planetlist", "tsop301_planet_data.txt",
         ["p", "planet", "all"],
         "loading planet list: {}"),
        ("eblist", "tsop301_eb_data.txt",
         ["eb", "eclipsing binary", "eclipsingbinary", "all"],
         "loading eb list: {}"),
        ("beblist", "tsop301_backeb_data.txt",
         ["beb", "back eclipsing binary", "backeclipsingbinary", "all"],
         "loading beb list: {}"),
    )

    module_vars = globals()
    matched = False
    for target, data_file, selectors, message in catalogue:
        if wanted not in selectors:
            continue
        matched = True
        module_vars[target] = LoadListGeneral(data_file)
        print(message.format(data_file))

    if not matched:
        # If an invalid selection has been entered
        print("You must enter either:\n"
               "* \"S\" (or \"Star\")\n"
               "* \"P\" (or \"Planet\")\n"
               "* \"EB\" (or \"Eclipsing Binary\")\n"
               "* \"BEB\" (or \"Back Eclipsing Binary\")")
        return

    alllists = {"s": starlist, "p": planetlist, "eb": eblist, "beb": beblist}
        
def _InLoadedList(key, n):
    """
    Shared membership test behind the IsThisA* helpers.

    The loaded lists hold strings (the tokens read from the data files), so
    n is compared in its text form; this lets callers pass a str, an int or
    a numpy integer alike. A list that is None counts as empty.
    """
    return str(n) in (alllists[key] or ())

def IsThisAStar(n):
    """Returns True when ID n appears in the loaded star list (alllists["s"])."""
    return _InLoadedList("s", n)

def IsThisAPlanet(n):
    """Returns True when ID n appears in the loaded planet list (alllists["p"])."""
    return _InLoadedList("p", n)

def IsThisAEB(n):
    """Returns True when ID n appears in the loaded eclipsing-binary list (alllists["eb"])."""
    return _InLoadedList("eb", n)

def IsThisABEB(n):
    """Returns True when ID n appears in the loaded back-eclipsing-binary list (alllists["beb"])."""
    return _InLoadedList("beb", n)

# Function to tell you what an item is
def WhatIsMyLC(n):
    """
    Names every category that object ID n is listed under.

    PARAMS:
    n -- object ID, passed unchanged to the IsThisA* membership helpers

    RETURNS:
    A list holding any of "Star", "Planet", "EB", "BEB" (in that order), or
    the string "UNKNOWN" when n is in none of the loaded lists.
    """
    checks = (
        ("Star", IsThisAStar),
        ("Planet", IsThisAPlanet),
        ("EB", IsThisAEB),
        ("BEB", IsThisABEB),    # was mislabelled "BRB"
    )
    lbl = [label for label, belongs in checks if belongs(n)]
    return lbl if lbl else "UNKNOWN"

# Purely for convenience
def CurrentLC():
    """Returns a caption built from the index and ID stored in lastRandom."""
    file_number = lastRandom["number"]
    object_id = lastRandom["id"]
    return "File № {} - {}".format(file_number, object_id)

# For writing lists to files
def WriteToFile(file,l):
    """
    Writes a list (param=l) to a txt file (param=file), one item per line.

    PARAMS:
    file -- target path; its extension (if any) is replaced by ".txt".
            Only the extension of the final path component is touched, so
            dots in directory names (or a leading "./") are left alone.
    l    -- iterable of items; each is written using its str() form.

    The target file is overwritten if it already exists.
    """
    fn = os.path.splitext(file)[0] + '.txt'
    with open(fn, 'w') as f:
        # Wrap each item in a 1-tuple so that tuple items are written whole
        # instead of being unpacked as %-format arguments.
        f.writelines("%s\n" % (item,) for item in l)

def MakeDataFrame(fitsList,ilist,flist,plist,divisor=100):
    """
    Builds parallel ID / flux / is-planet lists from the leading FITS files.

    Only the first len(fitsList) // divisor files are read. For each one the
    object ID is parsed from the file name (third "-"-separated field, leading
    zeroes stripped), the PDCSAP flux column of the first extension is read
    with its first and last samples dropped, and IsThisAPlanet is asked
    whether the ID is a planet.

    PARAMS:
    fitsList -- list of FITS file paths to examine
    ilist, flist, plist -- accepted for backward compatibility only; their
                           contents are ignored and fresh lists are built
    divisor  -- positive integer; the fraction of files processed is
                1/divisor (default 100, i.e. the first 1%). Use 1 for all.

    RETURNS:
    A tuple (ids, fluxes, isplanet) of three equal-length lists: np.uint32
    IDs, flux values as plain Python lists, and the IsThisAPlanet results.
    """
    rng = len(fitsList) // divisor
    ilist=[None]*rng
    dlist=[None]*rng
    plist=[None]*rng

    print("Lists have the following lengths: ID={}; DAT={}; P={}".format(len(ilist),len(dlist), len(plist)))

    # Loop thru the selected FITS files
    for n, file in enumerate(tqdm(fitsList[0:rng])):
        # Print the file number (NOT ID, but the number of the file opened)
        if(n%1000==0):
            print("Opening file {}".format(n))

        # The object ID is embedded in the file name
        id_text = file.split("-")[2].lstrip("0")
        ilist[n] = np.uint32(id_text)

        # Get the PDCSAP flux data
        with fits.open(file) as hdu:
            dlist[n] = hdu[1].data['PDCSAP_FLUX'][1:-1].tolist()

        # Work out if it is a planet or not. The lookup lists hold ID strings,
        # so pass the text form; a numpy integer never equals a str and would
        # make every entry False.
        plist[n] = IsThisAPlanet(id_text)
    print("--------")
    return(ilist,dlist,plist)

################################
# RUN ALL INITIALISERS
###
def Initialise():
    """
    Runs every set-up step and leaves the results in module globals.

    In order: collects the FITS paths into fitsList (and writes them to
    FITSLIST.txt), loads the star/planet/EB/BEB lists, builds the ID, flux
    and is-planet lists with MakeDataFrame, combines them into the `dataset`
    DataFrame, and writes the three lists out via WriteToFile.
    """
    global fitsList, dataset, idlist, fluxlist, isplanetlist

    # Step 1: the list of FITS files
    print("Populating fitsList...")
    fitsList = MakingAList()
    WriteToFile("FITSLIST", fitsList)

    # Step 2: the star / planet / eclipsing binary / back eclipsing binary ID lists
    print("Loading the s/p/eb/beb Lists")
    LoadList()

    # Step 3: the ID, Flux and IsPlanet lists
    print("Populating the DataFrame")
    idlist, fluxlist, isplanetlist = MakeDataFrame(fitsList, idlist, fluxlist, isplanetlist)

    # Step 4: one DataFrame row per (id, flux series, planet flag) triple
    rows = list(zip(idlist, fluxlist, isplanetlist))
    dataset = pd.DataFrame(rows, columns=['id', 'vals', 'isplanet'])

    # Step 5: keep a text copy of each list
    for stem, values in (("IDLIST", idlist),
                         ("FLUXLIST", fluxlist),
                         ("ISPLANETLIST", isplanetlist)):
        WriteToFile(stem, values)

    # Finally
    print("Done!")

In [2]:
# Populate fitsList, the label lists and the global `dataset` DataFrame
# (also writes FITSLIST/IDLIST/FLUXLIST/ISPLANETLIST .txt files).
Initialise()

Populating fitsList...
Loading the s/p/eb/beb Lists
Loading star list: tsop301_star_data.txt
loading planet list: tsop301_planet_data.txt
loading eb list: tsop301_eb_data.txt
loading beb list: tsop301_backeb_data.txt
Populating the DataFrame
Lists have the following lengths: ID=160; DAT=160; P=160


  0%|          | 0/160 [00:00<?, ?it/s]

Opening file 0
--------
Done!


## Well... that was far simpler than I was making it out to be?  
Now I can just convert these three lists to a pandas DataFrame and wahey!

https://www.geeksforgeeks.org/create-a-pandas-dataframe-from-lists/  
https://dask.org/  
https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65  

## Next to do:
- ~~Combine the three lists into a PD DataFrame~~
- do the literal first NL algorithm with the data
- do some tests
- TAKE A BREAK YOU DESERVE IT JAKE

In [3]:
# Display the DataFrame assembled by Initialise().
dataset

Unnamed: 0,id,vals,isplanet
0,277688819,"[179952.0, 179979.34375, 179668.578125, 179833...",False
1,272466550,"[21610.82421875, 21618.3203125, 21631.96875, 2...",False
2,101544343,"[108729.734375, 108670.5390625, 108613.875, 10...",False
3,384387355,"[56039.61328125, 56027.9765625, 55994.33984375...",False
4,31500881,"[18019.5078125, 17992.53125, 17918.802734375, ...",False
...,...,...,...
155,406169392,"[13633.4765625, 13635.998046875, 13603.9121093...",False
156,280096428,"[794186.375, 794335.375, 794127.875, 794119.93...",False
157,292409661,"[27368.95703125, 27352.611328125, 27355.195312...",False
158,360740653,"[36381.796875, 36631.8046875, 36256.30859375, ...",False


## Maybe write these lists to physical files for future use?