In [1]:
import pandas as pd
import numpy as np

In [2]:
# Watermark is not required for this code, but is included for information. 
import watermark
%load_ext watermark
%watermark -a "ELEANOR LUTZ" -d -v -iv -m

ModuleNotFoundError: No module named 'watermark'

## Data Source
The data used in this Jupyter Notebook is from the [HYG Database version 3](http://www.astronexus.com/hyg) by David Nash. 

In [None]:
# Load the HYG catalogue. low_memory=False makes pandas read the file in a
# single pass so that every column gets one consistently inferred dtype.
df = pd.read_csv('./data/hygdata_v3/hygdata_v3.csv', low_memory=False)
display(df.head())

# Remove the sun because it doesn't make sense in a star chart.
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignments made further down modify `df` itself rather than a
# slice of the loaded data (which would raise SettingWithCopyWarning).
df = df[df['proper'] != 'Sol'].copy()

# Translate plaintext Bayer designations into non-ASCII greek letters
greek_dict = {'Alp': u"α",'Bet': u"β",'Chi': u"χ",'Del': u"δ",'Eps': u"ε",'Eta': u"η",
              'Gam': u"γ",'Iot': u"ι",'Kap': u"κ",'Lam': u"λ",'Mu': u"μ",'Nu': u"ν",
              'Ome': u"ω",'Omi': u"ο",'Phi': u"φ",'Pi': u"π",'Psi': u"ψ",'Rho': u"ρ",
              'Sig': u"σ",'Tau': u"τ",'The': u"θ",'Ups': u"υ",'Xi': u"ξ",'Zet': u"ζ"}

# Show every distinct (non-missing) Bayer string present in the data.
print(df['bayer'].dropna().unique())
def get_greek_letter(n):
    """Convert a plaintext Bayer designation into its Greek-letter form.

    The part before the first hyphen is looked up in ``greek_dict``; the part
    between the first and second hyphen (if any) is appended directly after
    the Greek letter, e.g. ``'Alp-1'`` becomes ``'α1'``.

    Parameters
    ----------
    n : str or missing value
        Bayer designation such as ``'Alp'`` or ``'Kap-2'``.

    Returns
    -------
    str or float
        The translated designation; ``np.nan`` when ``n`` is missing
        (NaN, None, pd.NA, or the literal string ``'nan'``); ``n`` unchanged
        when its abbreviation is not a key of ``greek_dict``.
    """
    if pd.isna(n) or str(n) == 'nan':
        return np.nan
    split = n.split("-")
    greek = greek_dict.get(split[0])
    if greek is None:
        # Unknown abbreviation: keep the original text rather than returning
        # None or failing on `None + str` below.
        return n
    if len(split) > 1:
        return greek + split[1]
    return greek

# Store the Greek-letter version of every Bayer designation in its own column.
bayer_as_greek = df['bayer'].apply(get_greek_letter)
df['greek_letters'] = bayer_as_greek
display(df.head())

# Count the distinct spectral strings, ignoring missing entries.
spect_variety = df['spect'].nunique()
print(spect_variety, 'unique spectral designations')
def get_first_letter(name):
    '''Reduce a spectral designation to a single upper-case character.

    A leading 'sd' prefix is dropped, the characters ``?:!/;.,[]{}()`` are
    removed, and the first remaining character is returned upper-cased, which
    discards everything (including numbers) that follows it.

    Parameters
    ----------
    name : str or missing value
        Raw spectral designation, e.g. ``'G2V'`` or ``'sdM4'``.

    Returns
    -------
    str or float
        The upper-cased first character; ``np.nan`` when ``name`` is missing
        or when nothing is left after stripping (e.g. ``'sd'``, ``'??'``,
        ``''``). A literal ``'nan'`` string is returned unchanged.
    '''
    if pd.isna(name):
        return np.nan
    if name == 'nan':
        # Kept from the previous implementation: the text 'nan' passes through.
        return name
    if name[0:2] == 'sd':
        # remove MK system luminosity class to look just at
        # Morgan-Keenan designations
        name = name[2:]
    alphas = ''.join(c for c in name if c not in '?:!/;.,[]{}()')
    if not alphas:
        # Nothing usable remains; report it as missing instead of raising
        # IndexError on alphas[0] or leaking punctuation downstream.
        return np.nan
    return alphas[0].upper()
    
# Turn the 100000 distance value into a missing value.
# NOTE(review): 100000 looks like the catalogue's placeholder for an unknown
# distance — confirm against the HYG field documentation.
# The result is assigned back to the column: calling replace(inplace=True) on
# df['dist'] acts on a temporary Series, which emits a chained-assignment
# warning and leaves `df` untouched when pandas Copy-on-Write is active.
df['dist'] = df['dist'].replace(to_replace=100000, value=np.nan)

# Collapse every spectral string to its one-character designation.
df['spect_desig'] = df['spect'].apply(get_first_letter)

# Report the distinct (non-missing) designations that resulted.
spect_desig_values = df['spect_desig'].dropna().unique()
print(len(spect_desig_values), 'unique spectral designations')
print(spect_desig_values)

# Hex colour used for each one-character spectral designation.
color_dict = { 
    'O':'#5A90C3', 'B':'#93C2F1', 'A':'#f3e8d3', 'F':'#d4bf94',
    'G':'#FFD423', 'K':'#F99220', 'M':'#FF2620',  'L':'#FF2620',
    'T':'#FF6199', 'Y':'#6B22FF', 
    'C':'#979330', 'R':'#979330', 'W':'#979330', 'N':'#979330',
    'S':'#979330', 'D':'#979330', 'P':'#979330',
    'nan': '#000000' # unknown
}

# map() looks every designation up in color_dict and yields NaN for anything
# without an entry (missing values as well as characters the dict does not
# list). Those are then filled with black, so the colour column only ever
# contains hex codes. replace() would instead have left an unlisted
# designation in place as if it were a colour.
df['color'] = df['spect_desig'].map(color_dict).fillna('#000000')
df['linecolor'] = df['color'].replace(['#000000'], ['#f3e8d3']) # beige outline for black NANs

display(df.head())
df.to_csv('./data/processed/hygdata_processed.csv', index=False)

# Size of the full processed catalogue, reported before any filtering.
total_star_count = len(df)
print(total_star_count, 'total stars available in database')

# Keep only rows whose magnitude is at most 6.5 and write them to their own file.
is_visible = df['mag'].le(6.5)
df = df.loc[is_visible]
print(len(df), 'stars visible to the human eye')
df.to_csv('./data/processed/hygdata_processed_mag65.csv', index=False)