# Database creation
This file is used to play with the data and once the objective is reached, functions will be written in .py files in order to be used by the model part of the project.

In [1]:
from astropy.io import fits
from tabulate import tabulate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dataset_creation_utils import *
# Retrieve the contrast curves from the database
path_db = 'C:/Users/ludin/Documents/Master Thesis/Dataset_creation/SPHERE_DC_DATA/'
foldername = '2MASS J01543773+0043005_DB_H23_2014-10-07_ird_specal_dc_cADI_softsorting_200967/'
filename = 'ird_specal_dc-IRD_SPECAL_CONTRAST_CURVE_TABLE-contrast_curve_tab.fits'

if not os.path.exists(path_db):
    print('ERROR! Folder {} does not exist.'.format(path_db)) # exit doesn't print anything in jupyter notebook

if not os.path.exists(os.path.join(path_db, foldername)):
    print('ERROR! Folder {} does not exist.'.format(foldername))

if not os.path.exists(os.path.join(path_db, foldername, filename)):
    print('ERROR! File {} does not exist.'.format(filename))

with fits.open(os.path.join(path_db, foldername, filename)) as hdul:
    data = hdul[1].data #numpy record
    print(data.dtype)
    print(hdul[1].header['DATE-OBS'])
    print(hdul[1].data['TARGET_NAME'])
    print(data['SEPARATION'][0])


(numpy.record, [('TARGET_NAME', 'S3'), ('LAM', 'S10'), ('PIXSCALE', 'S10'), ('NSIGMA', 'S3'), ('REPERTORY', 'S55'), ('SEPARATION', '>f4', (1148,)), ('NSIGMA_CONTRAST', '>f4', (1148,))])
2014-10-08T05:51:18.0343
['FS4' 'FS4' 'FS4']
[6.12500e-03 2.79787e-02 4.98323e-02 ... 2.50286e+01 2.50504e+01
 2.50723e+01]


In [2]:
df = get_df_with_headers(path_db, ['ESO OBS ID', 'DATE-OBS', 'OBJECT'])
df

Unnamed: 0,ESO OBS ID,DATE-OBS,OBJECT,SEPARATION,NSIGMA_CONTRAST
0,2177825,2018-09-15T07:33:22.60,CD-52 381,"[0.0061185, 0.029195273, 0.052272048, 0.075348...","[0.07546934, 0.081046954, 0.0072249672, 0.0053..."
1,1182440,2016-04-04T07:59:57.8491,ScoPMS_048,"[0.01225, 0.037514854, 0.06277971, 0.08804456,...","[0.024168868, 0.00965169, 0.002824313, 0.00203..."
2,1182371,2015-04-12T07:15:49.8435,HIP_76629,"[0.01225, 0.038996268, 0.06574254, 0.0924888, ...","[0.018838352, 0.003571751, 0.0013111836, 0.000..."
3,1962017,2018-07-05T01:24:56.95,RXJ1846,"[0.024482, 0.080440864, 0.13639972, 0.19235858...","[0.13829023, 0.015755469, 0.005646824, 0.00268..."
4,1962017,2018-07-05T01:24:56.95,RXJ1846,"[0.024482, 0.080440864, 0.13639972, 0.19235858...","[0.13829023, 0.015755469, 0.005646824, 0.00268..."
...,...,...,...,...,...
509,2296758,2019-03-20T02:23:21.23,TYC 7692-2943-2,"[0.012247, 0.04235727, 0.07246754, 0.1025778, ...","[0.02178879, 0.003522512, 0.00074191904, 0.000..."
510,2296758,2019-03-25T02:18:36.93,TYC 7692-2943-2,"[0.012247, 0.041378073, 0.07050915, 0.09964023...","[0.011126063, 0.0045869034, 0.00084505096, 0.0..."
511,1424674,2016-10-02T08:50:23.1564,HIP 28153,"[0.006125, 0.030178692, 0.054232385, 0.0782860...","[0.030811656, 0.0077921706, 0.00096568564, 0.0..."
512,2028801,2018-11-01T07:33:29.10,TYC 8097-337-1,"[0.01223, 0.03781431, 0.06339863, 0.08898293, ...","[0.011241959, 0.005084048, 0.0022800567, 0.001..."


In [3]:
# Print the types of the columns
print(df.dtypes)

ESO OBS ID          int64
DATE-OBS           object
OBJECT             object
SEPARATION         object
NSIGMA_CONTRAST    object
dtype: object


In [4]:
write_stats_in_file(df, path_db)

In [54]:
def plot_contrast_curves(df, path):
    """
    Plot the contrast curves.
    """

    if not os.path.exists(path):
        exit('ERROR! Folder {} does not exist.'.format(path))

    if not path.endswith('/'):
        path += '/'

    if not os.path.exists(path + 'plots/'):
        os.makedirs(path + 'plots/')
        
    path += 'plots/'

    if not ('SEPARATION' in df.keys() and 'NSIGMA_CONTRAST' in df.keys() and 'OBJECT' in df.keys() and 'DATE-OBS' in df.keys(), 'ESO OBS ID' in df.keys()):
        exit('ERROR! The dataframe must contain the columns SEPARATION, NSIGMA_CONTRAST, OBJECT, DATE-OBS and ESO OBS ID.')
        
    # Dataframes are not meant to access elements by index.
    df_dict = df.to_dict()

    # 2D numpy array (n_objects, len(separation)) in order to plot the summary of the contrast curves
    contrast_curves = np.zeros((len(df_dict['OBJECT']), len(df_dict['SEPARATION'][0])))

    for i in range(len(df_dict['OBJECT'])):
        break
        # Log transform of the contrast and warning suppression
        with np.errstate(divide='ignore' , invalid='ignore'):
            contrast = np.log10(df_dict['NSIGMA_CONTRAST'][i]) # There are some negative and zero values in the contrast curves, maybe process them before plotting
            print(contrast)

        # Plot the contrast curve for each object and save it in separate files
        plt.figure()
        plt.plot(df_dict['SEPARATION'][i], contrast, label=df_dict['OBJECT'][i])
        plt.xlabel('Separation (arcsec)')
        plt.ylabel('Contrast (5-sigma)')
        plt.title('Contrast curve for {} on {}'.format(df_dict['OBJECT'][i], df_dict['DATE-OBS'][i]))
        filename = os.path.join(path, 'contrast_curve_{}.png'.format(df_dict['ESO OBS ID'][i]))
        # print('Saving figure {}...'.format(filename))
        plt.plot()
        plt.savefig(filename)
        plt.close()

        # Filling the numpy array
        for j in range(len(df_dict['NSIGMA_CONTRAST'][0])):
            contrast_curves[i][j] = contrast[j]

    
    # print(contrast_curves)
    mean_contrast_curve = np.mean(contrast_curves[:5], axis=0)
    std_contrast_curve = np.std(contrast_curves[:5], axis=0)
    first_quartile_contrast_curve = np.percentile(contrast_curves, 25, axis=0)
    third_quartile_contrast_curve = np.percentile(contrast_curves, 75, axis=0)

    # Plot the mean contrast curve and fill the area between the first and third quartiles
    plt.figure()
    plt.plot(df_dict['SEPARATION'][0], mean_contrast_curve, label='Mean contrast curve')
    plt.fill_between(df_dict['SEPARATION'][0], first_quartile_contrast_curve, third_quartile_contrast_curve, alpha=0.5, label='First and third quartiles')
    plt.xlabel('Separation (arcsec)')
    plt.ylabel('Contrast (5-sigma)')
    plt.title('Summary of the contrast curves')
    plt.legend()
    filename = os.path.join(path, 'contrast_curves_summary.png')
    # print('Saving figure {}...'.format(filename))
    plt.plot()
    plt.savefig(filename)
    plt.close()

In [53]:
plot_contrast_curves(df, 'C:/Users/ludin/Documents/Master Thesis/Dataset_creation')

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [35]:
# Transform df['SIGMA_CONTRAST'] into a 2D (514,1086) shaped numpy array whose lines are the contrast curves

def get_contrast_curves(df):
    """
    Transform df['NSIGMA_CONTRAST'] into a 2D (514,1086) shaped numpy array whose lines are the contrast curves.
    """

    if not ('NSIGMA_CONTRAST' in df.keys()):
        exit('ERROR! The dataframe must contain the column NSIGMA_CONTRAST.')

    # Dataframes are not meant to access elements by index.
    df_dict = df.to_dict()

    contrast_curves = np.zeros((len(df_dict['NSIGMA_CONTRAST']), len(df_dict['NSIGMA_CONTRAST'][0])))

    for i in range(len(df_dict['NSIGMA_CONTRAST'])):
        contrast_curves[i] = df_dict['NSIGMA_CONTRAST'][i]

    return contrast_curves

get_contrast_curves(df)

ValueError: could not broadcast input array from shape (993,) into shape (1086,)