In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

from matplotlib.pyplot import figure
from astropy.io import fits
from astropy.table import Table

from astropy.coordinates import SkyCoord
from astropy import units as u
from astropy.table import QTable

filename_path = '../data/astraAllStarASPCAP-0.7.fits'

# this function is Lucy's way to get rid of columns that are more than 1D (pandas doesn't support that)
def Table_to_pandas(fn):
    data = fits.open(fn)
    df = QTable(data[2].data)
    cols = []
    cols_drop = []
    for i in df.columns:
        if np.size(df[i][0])==1:
            cols.append(i)
        else:
            cols_drop.append(i)
    print(cols_drop)
    return df[cols].to_pandas()

df_sdss5 = Table_to_pandas(filename_path)

df_sdss5 = df_sdss5[(df_sdss5['teff']>4500) & (df_sdss5['teff']<6500)]
df_sdss5 = df_sdss5[(df_sdss5['snr']>=200)]

['sdss5_target_flags']


In [2]:
# -------------------------------
# 1. SDSS cut on Fe/H error
# -------------------------------
df_sdss5 = df_sdss5[df_sdss5['gaia_dr3_source_id'] > 0]
df_sdss_metcut = df_sdss5[(df_sdss5['e_fe_h'] < 0.1) & (df_sdss5['e_mg_h'] < 0.1)]

In [3]:
# Ensure unique SDSS rows per Gaia source
df_sdss_metcut = df_sdss_metcut.sort_values(['snr'], ascending=False)
df_sdss_metcut = df_sdss_metcut.drop_duplicates(subset='gaia_dr3_source_id', keep='first')

In [4]:
# -------------------------------
# 2. Save Gaia IDs from SDSS cut
# -------------------------------
df_sdss_metcut['gaia_dr3_source_id'].to_csv('../data/gaia_ids_from_sdss.txt', index=False, header=False)

In [5]:
# -------------------------------
# 3. Gaia RVS flags / high SNR selection
# -------------------------------
df_gaia_rvs_flags = pd.read_csv("../data/gaia_rvs_flags.csv")

# Keep only sources with RV spectra
df_gaia_has_rvs = df_gaia_rvs_flags[df_gaia_rvs_flags['has_rvs'] == True]

# High SNR spectra
df_gaia_high_snr = df_gaia_has_rvs[df_gaia_has_rvs['rvs_spec_sig_to_noise'] > 100]

# Optional: save high SNR source_ids to VOTable, to upload to gaia archive and get their spectra
t = Table()
t['source_id'] = df_gaia_high_snr['source_id'].values
t.write('../data/gaia_high_snr_source_ids_revised.vot', format='votable', overwrite=True)

In [6]:
# lambda values are based on GAIA documentation
# Load the downloaded spectra into a dataframe called rvs_spectra_df
# Plot an example 

lambdas = np.linspace(846, 870, 2401)

t = Table.read("../data/rvs_1.xml", format="votable")

rvs_spectra_df = t.to_pandas()

rvs_spectra_df['source_id'] = rvs_spectra_df['datalinkID']
rvs_spectra_df = rvs_spectra_df[['source_id', 'flux', 'flux_error']]

In [7]:
# Keep spectra who are in sdss_metcut

stars_spectra = rvs_spectra_df[rvs_spectra_df['source_id'].isin(df_sdss_metcut['gaia_dr3_source_id'])]
print(len(stars_spectra))

2894


In [8]:
# Drop duplicates. By looking at a few examples, seemed like duplicates have the exact same spectra. so I just keep first.
# But maybe there is a better way?

stars_spectra = stars_spectra.drop_duplicates(subset='source_id', keep='first')
len(stars_spectra)

2842

In [9]:
# Keep stars from sdss_metcut who are in the spectra

stars_labels = df_sdss_metcut[df_sdss_metcut['gaia_dr3_source_id'].isin(stars_spectra['source_id'])]
print(len(stars_labels))

2842


In [10]:
stars_labels['source_id'] = stars_labels['gaia_dr3_source_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stars_labels['source_id'] = stars_labels['gaia_dr3_source_id']


In [12]:
# For giants and dwarfs
most_reliable_giants = ['c_h', 'n_h', 'o_h', 'mg_h', 'al_h', 'si_h', 'mn_h', 'fe_h', 'ni_h']
reliable_giants = ['na_h', 'k_h', 'ca_h', 'co_h', 'ce_h']
most_reliable_dwarfs = ['c_h', 'mg_h', 'si_h', 'fe_h', 'ni_h']
reliable_dwarfs = ['o_h', 'al_h', 'k_h', 'ca_h', 'mn_h']

gaia_elements = ['n_h', 'mg_h', 'si_h', 's_h', 'ca_h', 'ti_h', 'cr_h', 'fe_h', 'ni_h', 'ce_h', 'nd_h']

giant_elements = [] #most_reliable + reliable
dwarf_elements = [] #most_reliable + reliable

print("*** Gaia elements in most reliable + reliable giants: ***")
for i in range(len(most_reliable_giants)):
    if most_reliable_giants[i] in gaia_elements:
        giant_elements.append(most_reliable_giants[i])

for i in range(len(reliable_giants)):
    if reliable_giants[i] in gaia_elements:
        giant_elements.append(reliable_giants[i])

print(giant_elements)

print("*** Gaia elements in most reliable + reliable dwarfs: ***")
for i in range(len(most_reliable_dwarfs)):
    if most_reliable_dwarfs[i] in gaia_elements:
        dwarf_elements.append(most_reliable_dwarfs[i])

for i in range(len(reliable_dwarfs)):
    if reliable_dwarfs[i] in gaia_elements:
        dwarf_elements.append(reliable_dwarfs[i])
    
print(dwarf_elements)

*** Gaia elements in most reliable + reliable giants: ***
['n_h', 'mg_h', 'si_h', 'fe_h', 'ni_h', 'ca_h', 'ce_h']
*** Gaia elements in most reliable + reliable dwarfs: ***
['mg_h', 'si_h', 'fe_h', 'ni_h', 'ca_h']


In [17]:
giants_label_names = ['source_id', 'teff', 'e_teff', 'logg', 'e_logg']
giants_label_names += [
    col
    for el in giant_elements
    for col in (el, f"e_{el}")
]
giant_labels = stars_labels[giants_label_names]
giants_df = stars_spectra.merge(giant_labels, on='source_id', how='inner')
giants_df = giants_df[(giants_df['logg'] > 1.5) & (giants_df['logg'] < 3.5)]
giants_df = giants_df[giants_df['e_fe_h'] < 0.1]
giants_df = giants_df[giants_df['e_mg_h'] < 0.1]
giants_df = giants_df[giants_df['e_si_h'] < 0.1]
giants_df = giants_df[giants_df['e_ni_h'] < 0.1]
giants_df = giants_df[giants_df['e_ca_h'] < 0.1]
giants_df = giants_df[giants_df['e_ce_h'] < 0.1]
print("number of giants = ", len(giants_df))
print(giants_label_names)

for p in ['teff', 'logg', 'n_h', 'mg_h', 'si_h', 'fe_h', 'ni_h', 'ca_h', 'ce_h']:
    err_col = f"e_{p}"
    # Boolean mask of missing values (NaN)
    mask = giants_df[p].isna() | giants_df[p].astype(str).isin(['--', 'masked', '<Masked>']) | giants_df[err_col].isna() | giants_df[err_col].astype(str).isin(['--', 'masked', '<Masked>'])
    stars_masked = giants_df[~mask]
    med = np.median(stars_masked[p])
    giants_df.loc[mask, p] = med
    giants_df.loc[mask, err_col] = 9999

giants_df.to_pickle('../data/giants_rel_most_rel.pkl')

number of giants =  1667
['source_id', 'teff', 'e_teff', 'logg', 'e_logg', 'n_h', 'e_n_h', 'mg_h', 'e_mg_h', 'si_h', 'e_si_h', 'fe_h', 'e_fe_h', 'ni_h', 'e_ni_h', 'ca_h', 'e_ca_h', 'ce_h', 'e_ce_h']


In [18]:
dwarfs_label_names = ['source_id', 'teff', 'e_teff', 'logg', 'e_logg']
dwarfs_label_names += [
    col
    for el in dwarf_elements
    for col in (el, f"e_{el}")
]
dwarf_labels = stars_labels[dwarfs_label_names]
dwarfs_df = stars_spectra.merge(dwarf_labels, on='source_id', how='inner')
dwarfs_df = dwarfs_df[(dwarfs_df['logg'] > 3.5)]
dwarfs_df = dwarfs_df[dwarfs_df['e_fe_h'] < 0.1]
dwarfs_df = dwarfs_df[dwarfs_df['e_mg_h'] < 0.1]
dwarfs_df = dwarfs_df[dwarfs_df['e_si_h'] < 0.1]
dwarfs_df = dwarfs_df[dwarfs_df['e_ni_h'] < 0.1]
dwarfs_df = dwarfs_df[dwarfs_df['e_ca_h'] < 0.1]

print("number of dwarfs = ", len(dwarfs_df))
print(dwarfs_label_names)

for p in ['teff', 'logg', 'mg_h', 'si_h', 'fe_h', 'ni_h', 'ca_h']:
    err_col = f"e_{p}"
    # Boolean mask of missing values (NaN)
    mask = dwarfs_df[p].isna() | dwarfs_df[p].astype(str).isin(['--', 'masked', '<Masked>']) | dwarfs_df[err_col].isna() | dwarfs_df[err_col].astype(str).isin(['--', 'masked', '<Masked>'])
    stars_masked = dwarfs_df[~mask]
    med = np.median(stars_masked[p])
    dwarfs_df.loc[mask, p] = med
    dwarfs_df.loc[mask, err_col] = 9999

dwarfs_df.to_pickle('../data/dwarfs_rel_most_rel.pkl')

number of dwarfs =  1149
['source_id', 'teff', 'e_teff', 'logg', 'e_logg', 'mg_h', 'e_mg_h', 'si_h', 'e_si_h', 'fe_h', 'e_fe_h', 'ni_h', 'e_ni_h', 'ca_h', 'e_ca_h']


In [None]:
# For a general example
tutorial_labels = ['source_id', 'teff', 'e_teff', 'logg', 'e_logg', 'fe_h', 'e_fe_h','o_h', 'e_o_h', 'mg_h', 'e_mg_h', 'si_h', 'e_si_h']
stars_labels = stars_labels[tutorial_labels]

stars_df = stars_spectra.merge(stars_labels, on='source_id', how='inner')
print(len(stars_df))

params = ['teff', 'logg', 'fe_h', 'o_h', 'mg_h', 'si_h']

for p in params:
    err_col = f"e_{p}"
    # Boolean mask of missing values (NaN)
    mask = stars_df[p].isna() | stars_df[p].astype(str).isin(['--', 'masked', '<Masked>']) | stars_df[err_col].isna() | stars_df[err_col].astype(str).isin(['--', 'masked', '<Masked>'])
    stars_masked = stars_df[~mask]
    med = np.median(stars_masked[p])
    stars_df.loc[mask, p] = med
    stars_df.loc[mask, err_col] = 9999

stars_df.to_pickle('../data/stars_feb4.pkl')