In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from astropy.io.fits import getdata
from astropy import units as u
from astropy.coordinates import SkyCoord, Angle
from astroquery.vizier import Vizier
from astropy.table import Table
from astroquery.xmatch import XMatch
from astroquery.simbad import Simbad
import time



import sys  
sys.path.insert(0, '../')

from prepare_library import atnf_pos, create_perobs_data, cal_ave, add_MW, confusion_clean, TD_clean
from muwclass_library import prepare_cols

from TD_dictionary import Simbad_dict,crowd_fields_dict,rare_sources_removed_dict,rare_sources_saving_dict 

import warnings
warnings.filterwarnings('ignore')

Vizier.ROW_LIMIT = -1
exnum = -999999.

## Comments
### 1. The updated TD can consist of two parts, one for the confidently classified (well-known) sources, the other for the candidates

### 2. X-ray sources match to SIMBAD, use SIMBAD classification (main_type as well as other_types), to add more rare-type sources and verifiy the existing classifications

SRG_ART_XC = Vizier(catalog="J/A+A/661/A38",row_limit=-1, #2022A&A...661A..38P
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
print(SRG_ART_XC.to_pandas()['Type'].value_counts())
#print(SRG_ART_XC.to_pandas())
# ∼4×10−12 erg s−1 cm−2, 15", 4-12 keV


## Low-Mass X-ray Binaries from https://arxiv.org/abs/2206.10053

* Galactic UCXBs (~20 + 20 cands)
* Galactic LMXBs exhibiting clear total eclipses (~15)
* Galactic SyXBs (~20)
* LMXBs that are quasi-persistent (~20), i.e., transient systems but exhibiting prolonged outbursts of >1 yr
* VFXBs in our Galaxy (~40)
* Galactic LMXBs accreting around the Eddington limit.  (~20)
* Galactic globular cluster LMXBs (~20)


## HMXB checking https://arxiv.org/abs/2207.02114

## do not use NS https://github.com/NickSwainston/pulsar_spectra

## add a binary star type? 

* binaries candidates from APOGEE survey which we excluded [(APOGEE.s_HRV <= 1) & (APOGEE.s_HRV <= 5*APOGEE.errHRV)]
* Gaia binaries (Gaia Spectroscopic Orbits Validated with LAMOST and GALAH Radial Velocities; )


## More 

* AGNs: 6dF Optical AGN Catalog; BAT AGN Spectroscopic Survey; 2MASS Redshift Survey; https://astrocloud.nrao.edu/s/L2R3fYHGt2jCbTz; FIRST-NVSS-SDSS AGN catalog; WISE and Sloan Digital Sky Survey (SDSS) spectroscopic data catalog (Toba et al. 2014)
* YSOs: HST ULLYSES Program
* 4XMM-DR10 (Tranin et al. 2022).

## Make use of SIMBAD
### do not use their coordinates! 



In [3]:
# define some directories and output name

data_dir = './data'
old_TD_dir = '../buildTD/data'
field_name = 'CSCv2_TD_11162022'
verb = 0

query_dir = '../demo/data/query'

In [4]:
open_CV = pd.read_csv('./data/updates/Open_CV_catalog.csv')
# no Nova

print(len(open_CV))
#print(open_CV['Type'].value_counts())
print(Counter(open_CV['Type']))
open_CV = open_CV[open_CV['Type'].isin(['Candidate', 'CataclyV', 'known CV', 'CataclyV_Candidate', 'Cataclysmic_Variable', 'Known CV'])]
print(len(open_CV))
print(Counter(open_CV['Type']))
open_CV['Class'] = np.nan
open_CV.loc[open_CV['Type'].isin([ 'CataclyV', 'known CV', 'Cataclysmic_Variable', 'Known CV']), 'Class'] = 'CV'
open_CV.loc[open_CV['Type'].isin([ 'Candidate', 'CataclyV_Candidate']), 'Class'] = 'CV-candidate'

print(len(open_CV))
#print(open_CV['Type'].value_counts())
#print(open_CV.columns)

open_CV['e_Pos'], open_CV['ref'] = np.nan, 'The Open Cataclysmic Variable Catalog'
open_CV = open_CV.rename(columns={'Name':'name_cat','Type':'SubClass'})

open_CV['R.A.'] = open_CV.apply(lambda r: str(r['R.A.']).split(',', 1)[0],axis=1)
open_CV['Dec.'] = open_CV.apply(lambda r: str(r['Dec.']).split(',', 1)[0],axis=1)
open_CV.to_csv('./data/updates/Open_CV_catalog_test.csv',index=False)
open_CV = pd.read_csv('./data/updates/Open_CV_catalog_test.csv')
open_CV['_RAJ2000'] = Angle(open_CV['R.A.'], 'hourangle').degree
open_CV['_DEJ2000'] = Angle(open_CV['Dec.'], 'deg').degree

open_CV = open_CV.drop(columns=['R.A.','Dec.','Disc. Date', 'Mag.'])



14413
Counter({'Candidate': 3819, 'CataclyV': 3669, nan: 2331, 'N': 1627, 'known CV': 902, 'CataclyV_Candidate': 851, 'DN': 439, 'Cataclysmic_Variable': 323, 'Known CV': 240, 'Nova_Candidate': 194, 'Candidate Nova': 17, 'AntiNova': 1})
9804
Counter({'Candidate': 3819, 'CataclyV': 3669, 'known CV': 902, 'CataclyV_Candidate': 851, 'Cataclysmic_Variable': 323, 'Known CV': 240})
9804


In [5]:
# some CVs catalog that may not included in Open CV catalog
CV_ZTF = Vizier(catalog="J/AJ/162/94/table1",row_limit=-1, #https://ui.adsabs.harvard.edu/abs/2021AJ....162...94S/abstract
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
#CV_ZTF.to_pandas().to_csv('./data/updates/CV_ZTF.csv',index=False)
CV_ZTF = CV_ZTF['_RAJ2000','_DEJ2000','ZTF','Spec'].to_pandas().rename(columns={'Spec':'SubClass','ZTF':'name_cat'})
CV_ZTF['e_Pos'], CV_ZTF['ref'] = np.nan, '2021AJ....162...94S'
CV_ZTF['SubClass'] = CV_ZTF.apply(lambda r: 'Spec-'+r['SubClass'] if r['SubClass']!='' else '', axis=1)
CV_ZTF['Class'] = 'CV'
 
#print(CV_ZTF)

CV_can_ZTF = Vizier(catalog="J/AJ/162/94/table2",row_limit=-1, #https://ui.adsabs.harvard.edu/abs/2021AJ....162...94S/abstract
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
#CV_can_ZTF.to_pandas().to_csv('./data/updates/CV_can_ZTF.csv',index=False)
CV_can_ZTF = CV_can_ZTF['_RAJ2000','_DEJ2000','ZTF','Spec'].to_pandas().rename(columns={'Spec':'SubClass','ZTF':'name_cat'})
CV_can_ZTF['e_Pos'], CV_can_ZTF['ref'] = np.nan, '2021AJ....162...94S'
CV_can_ZTF['SubClass'] = CV_can_ZTF.apply(lambda r: 'Spec-'+r['SubClass'] if r['SubClass']!='' else '', axis=1)
CV_can_ZTF['Class'] = 'CV-candidate'

#print(CV_can_ZTF)

#'''
CV_LAMOST_dr5 = Vizier(catalog="J/AJ/159/43",row_limit=-1, #2020AJ....159...43H 
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
#CV_LAMOST.to_pandas().to_csv('./data/updates/CV_LAMOST.csv',index=False)
CV_LAMOST_dr5 = CV_LAMOST_dr5['_RAJ2000','_DEJ2000','LAMOST','Type','Type-lit'].to_pandas().rename(columns={'Type-lit':'SubClass','Type':'Class','LAMOST':'name_cat'})
CV_LAMOST_dr5['e_Pos'], CV_LAMOST_dr5['ref'] = np.nan, '2020AJ....159...43H'
CV_LAMOST_dr5 = CV_LAMOST_dr5[CV_LAMOST_dr5['Class']=='CV'].reset_index(drop=True)
CV_LAMOST_dr5['Class'] = 'CV-candidate'


#print(CV_LAMOST.to_pandas()
#print(CV_LAMOST_dr5)


CV_LAMOST_dr6 = Vizier(catalog="J/ApJS/257/65",row_limit=-1, #https://ui.adsabs.harvard.edu/abs/2021ApJS..257...65S/abstract
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
#CV_LAMOST.to_pandas().to_csv('./data/updates/CV_LAMOST.csv',index=False)
CV_LAMOST_dr6 = CV_LAMOST_dr6['_RAJ2000','_DEJ2000','LAMOST','Set','sType1'].to_pandas().rename(columns={'sType1':'SubClass','LAMOST':'name_cat'})
CV_LAMOST_dr6['e_Pos'], CV_LAMOST_dr6['ref'] = np.nan, '2021ApJS..257...65S'
CV_LAMOST_dr6['Class'] = 'CV-candidate'
#CV_LAMOST_dr6.loc[CV_LAMOST_dr6['Set']=='new', 'Class'] = 'CV'
CV_LAMOST_dr6 = CV_LAMOST_dr6.drop(columns=['Set'])

#print(CV_LAMOST_dr6)
#'''


In [6]:
# Swift BAT 105-Month Hard X-ray Survey
# https://swift.gsfc.nasa.gov/results/bs105mon/
# add them to candidates
# https://ui.adsabs.harvard.edu/abs/2015AJ....150..170H/citations
# spectroscopically verified CVs can be used, but need accurate coordinates 

BAT105 = pd.read_csv(f'./data/updates/BAT105.csv')

BAT105.columns = BAT105.columns.str.strip() 
BAT105 = BAT105.applymap(lambda x: x.strip() if isinstance(x, str) else x)
BAT105.columns

print(len(BAT105))
#BAT105['RA']
#BAT105['BAT_NAME']
#print(Counter(BAT105['TYPE']))

#print(BAT105.columns)
#print(BAT105.groupby(['CL2','TYPE']).size())

BAT105 = BAT105[BAT105['TYPE'].isin(['Sy1.9','Sy1.5','Sy2','Sy1.2','Sy1.8','Beamed AGN','Sy1','CV','HMXB','LMXB','Pulsar'])]
BAT105['Class'] = np.nan
BAT105.loc[BAT105['TYPE'].isin(['Sy1.9','Sy1.5','Sy2','Sy1.2','Sy1.8','Beamed AGN','Sy1']), 'Class'] = 'AGN'
BAT105.loc[BAT105['TYPE'].isin([ 'CV']), 'Class'] = 'CV'
BAT105.loc[BAT105['TYPE'].isin([ 'HMXB']), 'Class'] = 'HMXB'
BAT105.loc[BAT105['TYPE'].isin([ 'LMXB']), 'Class'] = 'LMXB'
BAT105.loc[BAT105['TYPE'].isin([ 'Pulsar']), 'Class'] = 'NS'
print(Counter(BAT105['Class']))
#print(open_CV['Type'].value_counts())
#print(open_CV.columns)
print(BAT105.groupby(['CL2','TYPE']).size())

BAT105['e_Pos'], BAT105['ref'] = np.nan, '2018ApJS..235....4O'
BAT105 = BAT105.rename(columns={'COUNTERPART_NAME':'name_cat','TYPE':'SubClass','CTPT_RA':'_RAJ2000','CTPT_DEC':'_DEJ2000'})
BAT105 = BAT105[['name_cat','_RAJ2000','_DEJ2000','e_Pos','Class','SubClass','ref']]

#print(BAT105)


# 8.40×10−12 erg s−1 cm−2, 14-195 keV, ~10 arcmin


1632
Counter({'AGN': 973, 'LMXB': 109, 'HMXB': 108, 'CV': 75, 'NS': 25})
CL2  TYPE      
40   Sy1           161
     Sy1.2          96
     Sy1.5         111
     Sy1.8           9
50   Sy1.9         128
     Sy2           310
80   Beamed AGN    158
90   CV             75
150  Pulsar         25
180  HMXB          108
190  LMXB          109
dtype: int64


In [7]:
LAMOST_OB = Vizier(catalog="J/ApJS/241/32",row_limit=-1, #https://ui.adsabs.harvard.edu/abs/2021ApJS..257...65S/abstract
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
#CV_LAMOST.to_pandas().to_csv('./data/updates/CV_LAMOST.csv',index=False)
print(LAMOST_OB.to_pandas())
LAMOST_OB = LAMOST_OB['_RAJ2000','_DEJ2000','ObsID','SpT'].to_pandas().rename(columns={'SpT':'SubClass','ObsID':'name_cat'})
LAMOST_OB['e_Pos'], LAMOST_OB['ref'] = np.nan, '2019ApJS..241...32L'
LAMOST_OB['Class'] = 'HM-STAR'
#CV_LAMOST_dr6 = CV_LAMOST_dr6.drop(columns=['Set'])
#LAMOST_OB


         _RAJ2000   _DEJ2000      ObsID     RAJ2000    DEJ2000         S_N  \
0       68.068632  53.143738   29814218   68.068632  53.143738  296.359985   
1       82.819712  28.924288     513098   82.819712  28.924288   28.740000   
2       95.782966  26.778336    1902033   95.782966  26.778336   69.180000   
3       32.732299  58.305086    2404184   32.732299  58.305086   84.250000   
4       35.327500  57.147800    2407116   35.327500  57.147800   47.400002   
...           ...        ...        ...         ...        ...         ...   
22896  288.815187  43.674379  581808073  288.815187  43.674379  179.630005   
22897  200.848058  26.275522  582314065  200.848058  26.275522   39.900002   
22898  233.317108  44.587383  582404042  233.317108  44.587383   99.750000   
22899  233.013558  42.962747  582405046  233.013558  42.962747   75.930000   
22900  172.122059  29.251185  584705034  172.122059  29.251185  122.010002   

       m_ObsID  Nobs   SSpT MKSpT    SpT       Comm  
0        

In [8]:
LAMOST_AFGK = pd.read_csv('./data/updates/dr8_v2.0_LRS_stellar.csv')
#CV_LAMOST.to_pandas().to_csv('./data/updates/CV_LAMOST.csv',index=False)
#print(LAMOST_OB.to_pandas())

LAMOST_AFGK = LAMOST_AFGK[['ra','dec','uid','subclass']].rename(columns={'ra':'_RAJ2000','dec':'_DEJ2000','subclass':'SubClass','uid':'name_cat'})
LAMOST_AFGK['e_Pos'], LAMOST_AFGK['ref'] = np.nan, 'LAMOST-DR8-AFGK'
LAMOST_AFGK['Class'] = 'LM-STAR'
#CV_LAMOST_dr6 = CV_LAMOST_dr6.drop(columns=['Set'])
#LAMOST_AFGK 

LAMOST_M = pd.read_csv('./data/updates/dr8_v2.0_LRS_mstellar.csv')
#CV_LAMOST.to_pandas().to_csv('./data/updates/CV_LAMOST.csv',index=False)
#print(LAMOST_OB.to_pandas())

LAMOST_M = LAMOST_M[['ra','dec','uid','subclass']].rename(columns={'ra':'_RAJ2000','dec':'_DEJ2000','subclass':'SubClass','uid':'name_cat'})
LAMOST_M['e_Pos'], LAMOST_M['ref'] = np.nan, 'LAMOST-DR8-M'
LAMOST_M['Class'] = 'LM-STAR'
#CV_LAMOST_dr6 = CV_LAMOST_dr6.drop(columns=['Set'])
#LAMOST_M

#print(LAMOST_AFGK['SubClass'].value_counts())

#print(LAMOST_M['SubClass'].value_counts())

# YSOs 
### from multiple molecular clouds and open clusters (Megeath et al. 2012; Povich et al. 2011; Ozawa et al. 2005; Giardino et al. 2007; Rebull et al. 2011; Delgado et al. 2011);

In [9]:
YSO1 = Vizier(catalog="J/AJ/144/192",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2012AJ....144..192M
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Cl='=|P|D')[0]
YSO1 = YSO1['_RAJ2000','_DEJ2000','Cl'].to_pandas().rename(columns={'Cl':'SubClass'})
YSO1['e_Pos'], YSO1['ref'] = np.nan, '2012AJ....144..192M'
print(len(YSO1),Counter(YSO1['SubClass']))

YSO2 = Vizier(catalog="J/ApJS/194/14/catalog",row_limit=-1,#https://ui.adsabs.harvard.edu/?#abs/2011ApJS..194...14P
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Stage='!=A')[0]
YSO2 = YSO2['_RAJ2000','_DEJ2000','Stage'].to_pandas().rename(columns={'Stage':'SubClass'})
YSO2['e_Pos'], YSO2['ref'] = np.nan, '2011ApJS..194...14P'
print(len(YSO2),Counter(YSO2['SubClass']))


YSO3 = Vizier(catalog="J/A+A/429/963",row_limit=-1,#https://ui.adsabs.harvard.edu/?#abs/2005A%26A...429..963O
    columns=['*', '_RAJ2000', '_DEJ2000','e_Pos']).query_constraints(Class='!=nIII')[0]
YSO3 = YSO3['_RAJ2000','_DEJ2000','e_Pos','Class'].to_pandas().rename(columns={'Class':'SubClass'})
YSO3['ref'] = '2005A&A...429..963O'
print(len(YSO3),Counter(YSO3['SubClass']))


YSO4 = Vizier(catalog="J/A%2bA/463/275/table5",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2007A%26A...463..275G
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(clYSO='=|I|I/II|II|II/III|III')[0]
YSO4 = YSO4['_RAJ2000','_DEJ2000','clYSO'].to_pandas().rename(columns={'clYSO':'SubClass'})
YSO4['e_Pos'], YSO4['ref'] = np.nan, '2007A&A...463..275G'
print(len(YSO4),Counter(YSO4['SubClass']))

YSO5 = Vizier(catalog="J/ApJS/196/4",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2011ApJS..196....4R
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(St='=|k|n')[0]
YSO5 = YSO5['_RAJ2000','_DEJ2000','St'].to_pandas().rename(columns={'St':'SubClass'})
YSO5['e_Pos'], YSO5['ref'] = np.nan,  '2011ApJS..196....4R'
print(len(YSO5),Counter(YSO5['SubClass']))

YSO6 = Vizier(catalog="J/A%2bA/531/A141/catalog",row_limit=-1,#https://ui.adsabs.harvard.edu/?#abs/2011A%26A...531A.141D
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(MmD='=2')[0]
YSO6 = YSO6['_RAJ2000','_DEJ2000','MmD'].to_pandas().rename(columns={'MmD':'SubClass'})
YSO6['e_Pos'], YSO6['ref'] = np.nan, '2011A&A...531A.141D'
print(len(YSO6),Counter(YSO6['SubClass']))



3419 Counter({'D': 2991, 'P': 428})
808 Counter({'II': 478, '0/I': 247, 'III': 83})
72 Counter({'II': 26, '': 22, 'III': 17, 'I': 7})
56 Counter({'II': 20, 'III': 16, 'I/II': 9, 'I': 9, 'II/III': 2})
272 Counter({'k': 178, 'n': 94})
308 Counter({2: 308})


In [10]:
df_YSOs = pd.concat([YSO1, YSO2, YSO3, YSO4, YSO5, YSO6])
df_YSOs = df_YSOs.reset_index(drop=True)
df_YSOs['Class']='YSO'
print(len(df_YSOs),'YSOs')

4935 YSOs


# STARs
### from the Catalog of Stellar Spectral Classifications (Skiff 2014) with O, B or W (e.g., WN, WR stars) types are labeled as HM-STARs and A, F, G, K, or M types are labeled as LM-STARs;

In [11]:
stars = Vizier(catalog="B/mk/mktypes",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2014yCat....1.2023S
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Mag='<=23')[0]# Fainter sources with Mag > 23 were removed
stars = stars['_RAJ2000','_DEJ2000','Name','SpType','Bibcode','Remarks','Mag'].to_pandas()
stars = stars.replace(r'^\s*$', np.nan, regex=True)
print(len(stars)) 

937000


In [12]:
# Sources with their SpType column including strings of “e”, “s”, “n”, “p”, “f”, “cv”, “i”, “r”, “a”, “D”, “C”, “cont”, “l”,“H”, “h”, “abs”, “+”, “:”, “*”, “?” were removed
# since their spectral type may not be reliable 
stars_d1 = stars[stars['SpType'].str.contains('e|s|n|p|f|cv|i|r|a|D|C|cont|l|H|h|abs|\+|\*|\:|\?', na=False)]
stars_f1 = stars[~stars.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d1.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f1 = stars_f1.reset_index(drop=True)

#print(len(stars))
#print(len(stars_d1))
#print(len(stars_f1))

# any sources with non-empty Remarks column were removed;
stars_d2 = stars_f1[stars_f1['Remarks'].isnull() == False]
stars_f2 = stars_f1[~stars_f1.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d2.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f2 = stars_f2.reset_index(drop=True)

#print(len(stars_d2))
#print(len(stars_f2))

# Sources with “H97b” in their Name column were removed. They are Orion stars which are likely a mix of faint low-mass stars and YSOs and better to be dropped
stars_d3 = stars_f2[stars_f2['Name'].str.contains('H97b')]
stars_f3 = stars_f2[~stars_f2.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d3.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f3 = stars_f3.reset_index(drop=True)

#print(len(stars_d3))
#print(len(stars_f3))

In [13]:
# Seperate high and low mass stars into their respective classes
star_hm = stars_f3[stars_f3['SpType'].str.startswith(tuple(['O','B','W']), na=False)]
star_lm = stars_f3[stars_f3['SpType'].str.startswith(tuple(['A','F','G','K','M']), na=False)]
star_hm = star_hm.reset_index(drop=True)
star_lm = star_lm.reset_index(drop=True)

star_hm['e_Pos'], star_hm['Class'] = np.nan, 'HM-STAR'
star_lm['e_Pos'], star_lm['Class'] = np.nan, 'LM-STAR'
df_HMSTARs = star_hm.rename(columns={'Name':'name_cat','SpType':'SubClass','Bibcode':'ref'}).drop(columns=['Remarks','Mag'])
df_LMSTARs = star_lm.rename(columns={'Name':'name_cat','SpType':'SubClass','Bibcode':'ref'}).drop(columns=['Remarks','Mag'])
print(len(df_HMSTARs))
print(len(df_LMSTARs))



62124
450224


#### Spectroscopically classified low-mass stars from the APOGEE data in SDSS Data Release 16 were obtained. We filtered out those unreliable sources if they don’t have effective temperature measurements or surface gravity measurements. We also removed those likely binary systems by filtering on the VSCATTER if VSCATTER > 1 km/s and/or VSCATTER > 5*VERR_MED. We also removed sources that are not flagged as a star based on Washington/DDO 51 photometry

In [14]:
APOGEE_all = Vizier(catalog="III/284/allstars",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2020AJ....160..120J
    columns=['*','_RAJ2000', '_DEJ2000','AName','Giant','Star']).query_constraints(Teff='>3000 & <10000',logg='>-1 & <7')[0]

APOGEE = APOGEE_all['_RAJ2000','_DEJ2000','AName','Giant','Star','TClass','Teff','logg','s_HRV','errHRV'].to_pandas()

APOGEE_STAR = APOGEE[APOGEE.Star == 1].reset_index(drop=True)
#APOGEE_STAR['Binary'] = ''
APOGEE_STAR.loc[(APOGEE_STAR.s_HRV <= 1) & (APOGEE_STAR.s_HRV <= 5*APOGEE_STAR.errHRV), 'TClass'] = APOGEE_STAR.loc[(APOGEE_STAR.s_HRV <= 1) & (APOGEE_STAR.s_HRV <= 5*APOGEE_STAR.errHRV), 'TClass'] + '|Binary'#.reset_index(drop=True)# & APOGEE.Teff.isnull() & (APOGEE.logg <= 7) & (APOGEE.logg >= -1) ]
print(Counter(APOGEE_STAR['TClass']))

APOGEE_STAR['e_Pos'], APOGEE_STAR['Class'], APOGEE_STAR['ref'] = np.nan, 'LM-STAR', '2020AJ....160..120J'
APOGEE_STAR = APOGEE_STAR.rename(columns={'AName':'name_cat','TClass':'SubClass'})
APOGEE_STAR = APOGEE_STAR.replace('none', np.nan, regex=True)
APOGEE_STAR = APOGEE_STAR.drop(columns=['Giant','Star','Teff','logg','s_HRV','errHRV'])
print(len(APOGEE_STAR))

Counter({'GKg_c|Binary': 54641, 'GKg_b|Binary': 52765, 'GKg_a|Binary': 43870, 'GKd_c|Binary': 23912, 'GKd_b|Binary': 23526, 'GKg_d|Binary': 22472, 'GKg_c': 19166, 'GKg_b': 18996, 'GKd_a|Binary': 16669, 'GKg_a': 12251, 'GKd_d|Binary': 10204, 'Mg_a|Binary': 7846, 'GKg_d': 7420, 'Mg_b': 7112, 'Mg_c': 7099, 'Fd_b|Binary': 6535, 'Fd_c|Binary': 6517, 'Mg_b|Binary': 6385, 'Mg_a': 6128, 'Mg_c|Binary': 5938, 'GKd_c': 5132, 'GKd_b': 5103, 'Md_b|Binary': 4996, 'Md_c|Binary': 4983, 'Fd_a|Binary': 4958, 'BA': 4786, 'Md_a|Binary': 3488, 'Fd_d|Binary': 3068, 'Mg_d|Binary': 2955, 'GKd_a': 2930, 'Mg_d': 2904, 'BA|Binary': 2561, 'Fd_c': 2381, 'Fd_b': 2251, 'GKd_d': 2083, 'Md_d|Binary': 1882, 'Md_c': 1592, 'Md_b': 1589, 'Fd_a': 1514, 'Fd_d': 924, 'Md_a': 791, 'Md_d': 635})
422958


## WRs
### HM-STARs from the VIIth Catalog of Galactic Wolf-Rayet Stars (van der Hucht 2001) and its annex catalog (van der Hucht 2006)

In [15]:
WRs1 = Vizier(catalog="III/215",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2001NewAR..45..135V
    columns=['*', '_RAJ2000', '_DEJ2000','OName']).query_constraints()[0]
WRs1 = WRs1['_RAJ2000','_DEJ2000','Name','OName','Aname'].to_pandas()
WRs1['Class'], WRs1['e_Pos'], WRs1['ref'], WRs1['SubClass'] = 'HM-STAR', np.nan, '2001NewAR..45..135V', np.nan#III/215
WRs1 = WRs1.replace(r'^\s*$', np.nan, regex=True)
WRs1['name_cat'] = WRs1['Name'].combine_first(WRs1['OName'].combine_first(WRs1['Aname']))
df_WRs1 = WRs1.drop(columns=['Name','OName','Aname'])
print(len(df_WRs1))


226


In [16]:
WRs2 = Vizier(catalog="J/A+A/458/453/table1",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2006A%26A...458..453V
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
WRs2 = WRs2['_RAJ2000','_DEJ2000','SpType','SpType0','SimbadName','WRori'].to_pandas()
WRs2['Class'], WRs2['e_Pos'], WRs2['ref']= 'HM-STAR', np.nan, '2006A&A...458..453V'#J/A+A/458/453/table1
WRs2 = WRs2.replace(r'^\s*$', np.nan, regex=True)
WRs2['name_cat'] = WRs2['SimbadName'].combine_first(WRs2['WRori'])
WRs2['SubClass'] = WRs2['SpType'].combine_first(WRs2['SpType0'])
df_WRs2 = WRs2.drop(columns=['SpType','SpType0','SimbadName','WRori'])
print(len(df_WRs2))


118


# Quasars & AGNs 
### from Veron Catalog of Quasars & AGN 13th Edition (Veron-Cetty & Veron 2010)

In [17]:
AGNs = Vizier(catalog="VII/258/vv10",row_limit=-1,#https://ui.adsabs.harvard.edu/?#abs/2010A%26A...518A..10V
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Cl='|Q|A|B')[0]
AGNs = AGNs['_RAJ2000','_DEJ2000','Name','Cl'].to_pandas()
AGNs['Class'], AGNs['e_Pos'], AGNs['ref']= 'AGN', np.nan, '2010A&A...518A..10V'#VII/258/vv10
df_AGNs = AGNs.rename(columns={'Name':'name_cat','Cl':'SubClass'})

print(len(df_AGNs), Counter(df_AGNs['SubClass']))

168940 Counter({'Q': 133335, 'A': 34231, 'B': 1374})


# HMXBs
### from the Catalog of HMXBs in the Galaxy 4th Edition (Liu et al. 2006)

In [18]:
HMXBs = Vizier(catalog="J/A+A/455/1165/table1",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2006A%26A...455.1165L
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
HMXBs = HMXBs['_RAJ2000','_DEJ2000','Name','Type'].to_pandas()
HMXBs['Class'], HMXBs['e_Pos'], HMXBs['ref'] = 'HMXB', np.nan, '2006A&A...455.1165L'#J/A+A/455/1165/table1
df_HMXBs = HMXBs.rename(columns={'Name':'name_cat','Type':'SubClass'})
print(len(df_HMXBs))


114


# LMXBs
### from the Low Mass X-ray Binary Catalog (Liu et al. 2007) and from the Catalog of CVs, LMXBs and related objects (Seventh edition) (Ritter & Kolb 2003)

In [19]:
LMXBs1 = Vizier(catalog="J/A+A/469/807",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2007A%26A...469..807L
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
LMXBs1 = LMXBs1['_RAJ2000','_DEJ2000','Name','Type'].to_pandas()
LMXBs1['Class'], LMXBs1['e_Pos'], LMXBs1['ref'] = 'LMXB', np.nan, '2007A&A...469..807L'#J/A+A/469/807
df_LMXBs1 = LMXBs1.rename(columns={'Name':'name_cat','Type':'SubClass'})
print(len(df_LMXBs1))


187


In [20]:
LMXBs2 = Vizier(catalog="B/cb/lmxbdata",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2003A%26A...404..301R
    columns=['*', '_RAJ2000', '_DEJ2000','epos']).query_constraints()[0]

LMXBs2 = LMXBs2['_RAJ2000','_DEJ2000','epos','Name','Type1'].to_pandas()
LMXBs2['Class'], LMXBs2['ref'] = 'LMXB', '2003A&A...404..301R' #B/cb/lmxbdata
df_LMXBs2 = LMXBs2.rename(columns={'Name':'name_cat','Type1':'SubClass','epos':'e_Pos'})
print(len(df_LMXBs2))


108


# CVs
### from the Cataclysmic Variables Catalog 2006 Edition (Downes et al. 2001) and the Catalog of CVs, LMXBs and related objects (Seventh edition) (Ritter & Kolb 2003)

In [21]:
CVs1 = Vizier(catalog="V/123A",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2001PASP..113..764D
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
CVs1 = CVs1['_RAJ2000','_DEJ2000','Names','VarType'].to_pandas()
CVs1['Class'], CVs1['e_Pos'], CVs1['ref'] = 'CV', np.nan, '2005JAD....11....2D'#'2001PASP..113..764D'#V/123A
CVs1 = CVs1[CVs1['VarType']!='non-CV'].reset_index(drop=True)
df_CVs1 = CVs1.rename(columns={'Names':'name_cat','VarType':'SubClass'})
print(len(df_CVs1))


1618


In [22]:
CVs2 = Vizier(catalog="B/cb/cbdata",row_limit=-1, #https://ui.adsabs.harvard.edu/?#abs/2003A%26A...404..301R
    columns=['*', '_RAJ2000', '_DEJ2000','epos']).query_constraints()[0]
CVs2 = CVs2['_RAJ2000','_DEJ2000','epos','Name','Type1'].to_pandas()
CVs2['Class'], CVs2['ref'] = 'CV', '2003A&A...404..301R'#B/cb/cbdata
df_CVs2 = CVs2.rename(columns={'Name':'name_cat','Type1':'SubClass','epos':'e_Pos'})
print(len(df_CVs2))


1429


# NS & NS_BIN
### from the ATNF Pulsar Catalog (Manchester et al. 2005)

In [23]:
import urllib3
#https://ui.adsabs.harvard.edu/abs/2005AJ....129.1993M/abstract
http = urllib3.PoolManager()
r = http.request('GET', 'https://www.atnf.csiro.au/research/pulsar/psrcat/proc_form.php?version=1.65&Name=Name&RaJ=RaJ&DecJ=DecJ&Binary=Binary&Type=Type&startUserDefined=true&c1_val=&c2_val=&c3_val=&c4_val=&sort_attr=jname&sort_order=asc&condition=&pulsar_names=&ephemeris=short&coords_unit=raj%2Fdecj&radius=&coords_1=&coords_2=&style=Long+csv+with+errors&no_value=*&fsize=3&x_axis=&x_scale=linear&y_axis=&y_scale=linear&state=query&table_bottom.x=35&table_bottom.y=15') # it's a file like object and works just like a file
r.status

ATNF = r.data.decode('utf-8').partition('\n<pre>\n')[2].partition('\n</pre>\n')[0].replace('*',' ').split('\n')
NSs = pd.DataFrame(columns=['src', 'NAME','Name_ref','RAJ','e_RA','RAJ_ref','DECJ','e_DEC','DECJ_ref','Binary','Binary_ref','PSR_type','Type_ref'], 
                data=[row.split(';') for row in ATNF[2:]])

NSs['_RAJ2000'] = NSs.apply(lambda row: atnf_pos(row.RAJ, row.e_RA, 'hms', 'pos'), axis=1)
NSs['_e_RAJ2000'] = NSs.apply(lambda row: atnf_pos(row.RAJ, row.e_RA, 'hms', 'err'), axis=1)
NSs['_DEJ2000'] = NSs.apply(lambda row: atnf_pos(row.DECJ, row.e_DEC, 'dms', 'pos'), axis=1)
NSs['_e_DEJ2000'] = NSs.apply(lambda row: atnf_pos(row.DECJ, row.e_DEC, 'dms', 'err'), axis=1)
NSs['e_Pos'] = NSs.apply(lambda row: max(row._e_RAJ2000 , row._e_DEJ2000), axis=1)


# correcting the inaccurate coordinates of two NSs
NSs.loc[NSs.NAME=='J1819-1458', '_RAJ2000'] = 274.8924
NSs.loc[NSs.NAME=='J1819-1458', '_DEJ2000'] = -14.9676579999999
NSs.loc[NSs.NAME=='J1741-2054', '_RAJ2000'] = 265.48868
NSs.loc[NSs.NAME=='J1741-2054', '_DEJ2000'] = -20.903278
NSs.loc[NSs.NAME=='J1718-3718', '_RAJ2000'] = 259.5409420
NSs.loc[NSs.NAME=='J1718-3718', '_DEJ2000'] = -37.3143054

# non-empty Binary column are binary non-accreting NSs (NS_BIN class) and share similar properties with LMXBs.
NSs['ref']= '2005AJ....129.1993M'#B/psr/psr
NSs['Class'] = 'NS_BIN'
print(len(NSs),Counter(NSs['Binary']))
NSs.loc[NSs['Binary']==' ', 'Class'] = 'NS'

# adding a few new NSs and NS_BINs
new_NS_BINs = pd.read_csv(f'{old_TD_dir}/new_NS_BIN.csv')
print(NSs.loc[NSs.NAME.isin(new_NS_BINs.name_cat.values), ['NAME','Binary','Class']])

df_NSs = NSs[['NAME','_RAJ2000','_DEJ2000','e_Pos','Class','PSR_type','ref']].rename(columns={'NAME':'name_cat','PSR_type':'SubClass'})
print(len(df_NSs))

print(len(df_NSs),Counter(df_NSs['Class']))


3177 Counter({' ': 2844, 'ELL1': 143, 'BT': 107, 'DD': 39, 'DDH': 14, 'ELL1H': 10, 'BTX': 7, 'DDGR': 4, 'T2': 3, 'MSS': 3, 'DDS': 1, 'BT2P': 1, 'DDK': 1})
             NAME Binary   Class
6      J0023+0923   ELL1  NS_BIN
60     J0101-6422     BT  NS_BIN
313   J0737-3039B     DD  NS_BIN
560    J1124-3653             NS
644    J1231-1411     BT  NS_BIN
714    J1311-3430   ELL1  NS_BIN
921    J1514-4946   ELL1  NS_BIN
984      B1534+12     DD  NS_BIN
1107   J1614-2230  ELL1H  NS_BIN
1172   J1628-3205     BT  NS_BIN
1294   J1653-0158   ELL1  NS_BIN
1494   J1731-1847    BTX  NS_BIN
1836   J1810+1744     BT  NS_BIN
1881   J1816+4510   ELL1  NS_BIN
2545   J1909-3744   ELL1  NS_BIN
2909     B1957+20     BT  NS_BIN
2953   J2017+0603  ELL1H  NS_BIN
3002   J2043+1711   ELL1  NS_BIN
3013   J2047+1053     BT  NS_BIN
3018   J2051-0827   ELL1  NS_BIN
3100   J2214+3000   ELL1  NS_BIN
3102   J2215+5135   ELL1  NS_BIN
3106   J2222-0137     DD  NS_BIN
3125   J2241-5236     BT  NS_BIN
3133   J2256-1024   

# HMXBs, LMXBs, and CVs from INTEGRAL General Reference Catalog (IGRS) and HMXBs from Be Star catalog

In [24]:
df_HMXB_Be = pd.read_csv(f'{old_TD_dir}/raretype_BeStar_IGRS.csv')
print(Counter(df_HMXB_Be['Class']))

Counter({'HMXB': 55, 'LMXB': 8, 'CV': 5})


# Combining sources together

In [25]:
df_TD = pd.concat([df_AGNs, df_YSOs, df_LMSTARs, APOGEE_STAR, df_HMSTARs, df_WRs1, df_WRs2, df_NSs, df_HMXBs, df_LMXBs1, df_LMXBs2, df_CVs1, df_CVs2, df_HMXB_Be,open_CV,CV_ZTF,CV_can_ZTF,CV_LAMOST_dr5,CV_LAMOST_dr6,BAT105,LAMOST_OB,LAMOST_AFGK,LAMOST_M], ignore_index=True, sort=False)



In [26]:
df_TD.head(5)

Unnamed: 0,_RAJ2000,_DEJ2000,name_cat,SubClass,Class,e_Pos,ref
0,0.005417,-2.033333,FIRST J00000-0202,Q,AGN,,2010A&A...518A..10V
1,0.005833,-30.6075,2QZ J000001-3036,Q,AGN,,2010A&A...518A..10V
2,0.007083,-31.373889,2QZ J000001-3122,Q,AGN,,2010A&A...518A..10V
3,0.01125,-25.193611,XMM J00000-2511,Q,AGN,,2010A&A...518A..10V
4,0.011667,-35.059167,MS 23574-3520,Q,AGN,,2010A&A...518A..10V


In [27]:
print(len(df_TD), sorted(Counter(df_TD['Class']).items()))

8591100 [('AGN', 169913), ('CV', 8354), ('CV-candidate', 5641), ('HM-STAR', 85369), ('HMXB', 277), ('LM-STAR', 8312997), ('LMXB', 412), ('NS', 2869), ('NS_BIN', 333), ('YSO', 4935)]


In [28]:
df_TD.to_csv(f'./data/TD_{field_name}_all.csv',index=False)

In [29]:
df_TD = pd.read_csv(f'./data/TD_{field_name}_all.csv')

In [30]:
# matching with CSCv2 

for i in range(9):
    print(i)
    if i !=8:
        TD_CSC = XMatch.query(cat1= Table.from_pandas(df_TD[i*1000000:(i+1)*1000000]), #open('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/versions/CSC_TD_v5_09062021.csv'),
                              cat2='vizier:IX/57/csc2master',
                              max_distance=3*u.arcsec, colRA1='_RAJ2000',colDec1='_DEJ2000')

        TD_CSC = TD_CSC.to_pandas()
        print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))
        TD_CSC.to_csv(f'./data/TD_CSC_{i}.csv',index=False)
    else:
        TD_CSC = XMatch.query(cat1= Table.from_pandas(df_TD[i*1000000:]), #open('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/versions/CSC_TD_v5_09062021.csv'),
                              cat2='vizier:IX/57/csc2master',
                              max_distance=3*u.arcsec, colRA1='_RAJ2000',colDec1='_DEJ2000')

        TD_CSC = TD_CSC.to_pandas()
        print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))
        TD_CSC.to_csv(f'./data/TD_CSC_{i}.csv',index=False)

0
9821 [('AGN', 6174), ('LM-STAR', 2202), ('YSO', 1445)]
1
3004 [('AGN', 254), ('CV', 607), ('CV-candidate', 68), ('HM-STAR', 977), ('HMXB', 123), ('LM-STAR', 352), ('LMXB', 232), ('NS', 243), ('NS_BIN', 148)]
2
344 [('LM-STAR', 344)]
3
301 [('LM-STAR', 301)]
4
350 [('LM-STAR', 350)]
5
356 [('LM-STAR', 356)]
6
437 [('LM-STAR', 437)]
7
642 [('LM-STAR', 642)]
8
636 [('LM-STAR', 636)]


In [31]:
TD_CSC = pd.DataFrame()
for i in range(9):
    #print(i)
    
    TD_CSC = pd.concat([TD_CSC, pd.read_csv(f'./data/TD_CSC_{i}.csv')], ignore_index=True, sort=False)

print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))

15891 [('AGN', 6428), ('CV', 607), ('CV-candidate', 68), ('HM-STAR', 977), ('HMXB', 123), ('LM-STAR', 5620), ('LMXB', 232), ('NS', 243), ('NS_BIN', 148), ('YSO', 1445)]


In [32]:
TD_CSC.head(5)

TD_CSC.to_csv(f'{data_dir}/{field_name}_Xmatch_all.csv',index=False)



In [33]:
TD_CSC = pd.read_csv(f'{data_dir}/{field_name}_Xmatch_all.csv')

# drop duplicated sources
TD_CSC = TD_CSC.sort_values(by=['angDist']) 
TD_CSC = TD_CSC.drop_duplicates(subset=['_RAJ2000_1', '_DEJ2000_1', 'name_cat', 'SubClass', 'Class','e_Pos', 'ref']).reset_index(drop=True)

print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))


14500 [('AGN', 6170), ('CV', 490), ('CV-candidate', 57), ('HM-STAR', 847), ('HMXB', 110), ('LM-STAR', 4954), ('LMXB', 192), ('NS', 183), ('NS_BIN', 90), ('YSO', 1407)]


In [34]:
# calculate the combined positional uncertainties (PUs) from X-ray positions and class-specific catalog coordinates 
TD_CSC['PU'] = np.sqrt(TD_CSC.e_Pos.fillna(0)*2**2+TD_CSC.r0.fillna(0)**2)

TD_CSC['name'] = TD_CSC.apply(lambda row: '2CXO '+str(row['2CXO']),axis=1)

# Sources from populous classes (AGNs, HM-STARs, LM-STARs and YSOs) are omitted if their class-specific catalog 
# and X-ray combined 2-σ PUs are > 1" or 
# if the separations of the class-specific catalog and the CSCv2 coordinates exceed the 2-σ PUs.

idx = np.where( ((TD_CSC['angDist']>TD_CSC['PU']) | (TD_CSC['PU'] >1.) )& ((TD_CSC['Class']=='AGN') | (TD_CSC['Class']=='YSO') | (TD_CSC['Class']=='HM-STAR') | (TD_CSC['Class']=='LM-STAR') ))[0]
print('Remove', len(idx), sorted(Counter(TD_CSC.loc[idx, 'Class']).items()))
TD_CSC = TD_CSC.drop(TD_CSC.index[idx])
TD_CSC = TD_CSC.reset_index(drop=True)
print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))

TD = TD_CSC.rename(columns={'_RAJ2000_1':'ra_cat','_DEJ2000_1':'dec_cat','angDist':'sep','RAICRS':'ra','DEICRS':'dec'})[['name_cat','ra_cat','dec_cat','e_Pos','Class','SubClass','ref','sep','name','ra','dec','r0','r1','PA','PU']].sort_values(by=['Class','ra']).reset_index(drop=True)


#TD['remove_code'] = 0



Remove 8608 [('AGN', 4534), ('HM-STAR', 405), ('LM-STAR', 3186), ('YSO', 483)]
5892 [('AGN', 1636), ('CV', 490), ('CV-candidate', 57), ('HM-STAR', 442), ('HMXB', 110), ('LM-STAR', 1768), ('LMXB', 192), ('NS', 183), ('NS_BIN', 90), ('YSO', 924)]


In [35]:
# cross-matching to SIMBAD 

CSCv2 = Table.read('./data/CSCv2.vot').to_pandas()

#CSCv2['_RAJ2000'] = Angle(CSCv2['_RAJ2000'], 'deg').degree*u.degree 
#CSCv2['_DEJ2000'] = Angle(CSCv2['_DEJ2000'], 'deg').degree*u.degree 

CSCv2_simbad = XMatch.query(cat1=Table.from_pandas(CSCv2),
                         cat2='vizier:SIMBAD',max_distance=3 * u.arcsec, colRA1='_RAJ2000',colDec1='_DEJ2000')

CSCv2_simbad = CSCv2_simbad.to_pandas().rename(columns={'ra':'ra_simbad', 'dec':'dec_simbad', 'angDist':'_r_simbad'}).sort_values(by=['_r_simbad']) 
CSCv2_simbad = CSCv2_simbad.drop_duplicates(subset=['_2CXO'], keep='first').reset_index(drop=True)
print(len(CSCv2_simbad))
#print(CSCv2_simbad.columns)
#TD= pd.merge(TD, TD_simbad, how='outer', on = ['name_cat','ra_cat','dec_cat','e_Pos','Class','SubClass','ref', 'sep', 'name', 'ra', 'dec', 'r0','PU','remove_code'])

#TD.loc[TD.name_cat.isnull(), 'name_cat'] = TD.loc[TD.name_cat.isnull(), 'main_id']
#TD.loc[TD.name_cat.isnull(), 'name_cat'] = TD.loc[TD.name_cat.isnull(), 'name']
#print(TD.columns[:50])


87838


In [36]:
CSCv2_simbad['name'] = CSCv2_simbad.apply(lambda r: '2CXO '+ r['_2CXO'],axis=1)
CSCv2_simbad['ref'] = 'SIMBAD'
CSCv2_simbad = CSCv2_simbad.rename(columns={'_r_simbad':'sep','RAICRS':'ra', 'DEICRS':'dec','main_id':'name_cat',
                    'ra_simbad':'ra_cat','dec_simbad':'dec_cat', 'coo_err_maj':'e_Pos','main_type':'Class', 'other_types':'SubClass'})
CSCv2_simbad['PU'] = np.sqrt((CSCv2_simbad.e_Pos.fillna(0)*2)**2+CSCv2_simbad.r0.fillna(0)**2)


In [37]:
CSCv2_simbad[['name_cat', 'nbref', 'ra_sexa', 'dec_sexa', 'coo_qual','coo_bibcode']]

Unnamed: 0,name_cat,nbref,ra_sexa,dec_sexa,coo_qual,coo_bibcode
0,2CXO J113510.4-605605,1,11 35 10.4690,-60 56 05.335,D,2010ApJS..189...37E
1,2CXO J113503.0-605521,1,11 35 03.0329,-60 55 21.149,D,2010ApJS..189...37E
2,2CXO J113509.9-605500,1,11 35 09.9680,-60 55 00.531,D,2010ApJS..189...37E
3,2CXO J113512.2-605559,1,11 35 12.2187,-60 55 59.691,D,2010ApJS..189...37E
4,2CXO J113514.1-605605,1,11 35 14.1441,-60 56 05.662,D,2010ApJS..189...37E
...,...,...,...,...,...,...
87833,2XMM J122224.7+043324,1,12 22 24.710,+04 33 24.04,C,2009ApJS..182..543A
87834,[KCE2014] 183.3660+02.9600,1,12 13 27.8,+02 57 36,D,2014MNRAS.445.1430K
87835,XMMU J005010.7-731931,1,00 50 10.707,-73 19 31.23,D,2013A&A...558A...3S
87836,GPM 210.409761+54.557029,1,14 01 38.35648,+54 33 24.0570,A,2020yCat.1350....0G


In [38]:
print(CSCv2_simbad.columns)
print(TD.columns)
print(TD['Class'].value_counts())

Index(['sep', '_RAJ2000', '_DEJ2000', '_2CXO', 'ra', 'dec', 'r0', 'r1', 'PA',
       'name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'coo_err_min',
       'coo_err_angle', 'nbref', 'ra_sexa', 'dec_sexa', 'coo_qual',
       'coo_bibcode', 'Class', 'SubClass', 'radvel', 'radvel_err', 'redshift',
       'redshift_err', 'sp_type', 'morph_type', 'plx', 'plx_err', 'pmra',
       'pmdec', 'pm_err_maj', 'pm_err_min', 'pm_err_pa', 'size_maj',
       'size_min', 'size_angle', 'B', 'V', 'R', 'J', 'H', 'K', 'u', 'g', 'r',
       'i', 'z', 'name', 'ref', 'PU'],
      dtype='object')
Index(['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU'],
      dtype='object')
LM-STAR         1768
AGN             1636
YSO              924
CV               490
HM-STAR          442
LMXB             192
NS               183
HMXB             110
NS_BIN            90
CV-candidate      57
Name: Class, dtype: int64


In [39]:
#print(len(CSCv2_simbad), len(TD))
#TD_simbad = pd.merge(CSCv2_simbad[['name','_r_simbad','RAICRS', 'DEICRS', 'r0',
       #'r1', 'PA', 'main_id','ra_simbad', 'dec_simbad','main_type', 'other_types']],
       #         TD[['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       #'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU']], on='name', how='outer' )
#print(len(TD_simbad))

print(len(TD), len(CSCv2_simbad))
TD_simbad  = pd.concat([TD, CSCv2_simbad[['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU','nbref','coo_bibcode']]], ignore_index=True, sort=False)
print(len(TD_simbad))
TD_simbad = TD_simbad[TD_simbad['Class']!='CV-candidate'].reset_index(drop=True)
print(len(TD_simbad))

5892 87838
93730
93673


In [40]:
print(TD_simbad.columns)

Index(['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU', 'nbref',
       'coo_bibcode'],
      dtype='object')


In [41]:

TD_simbad[TD_simbad.duplicated(subset=['name'])]['Class'].unique()

array(['AGN', 'CV', 'HM-STAR', 'HMXB', 'LM-STAR', 'LMXB', 'NS', 'NS_BIN',
       'YSO', 'X', 'Orion_V*', 'RSCVn', 'QSO', 'TTau*', 'CataclyV*',
       'NIR', 'V*', 'Star', 'WR*', 'PulsV*delSct', 'Seyfert_1', 'Radio',
       'Seyfert_2', 'Pulsar', 'Em*', 'Planet', 'Seyfert', 'EB*', 'Cl*',
       'BClG', 'low-mass*', 'SB*', 'LINER', 'GlCl', 'PM*', '**', 'IR',
       'BLLac', 'LensSystem_Candidate', 'Unknown_Candidate',
       'YSO_Candidate', 'EmObj', 'RotV*', 'Radio(cm)', 'WD*_Candidate',
       'BYDra', 'GinPair', 'QSO_Candidate', 'Erupt*RCrB', 'EB*_Candidate',
       'Ae*_Candidate', 'Blazar', 'Nova', 'HH', 'ULX?_Candidate',
       'Galaxy', 'CV*_Candidate', 'PN', 'PartofG', 'LPV*', 'BH_Candidate',
       'PulsV*bCep', 'denseCore', 'Be*', 'GravLensSystem',
       'RGB*_Candidate', 'PulsV*', 'LensedImage', 'XB', 'Eruptive*',
       'RedSG*', 'Irregular_V*', 'MolCld', 'Cloud', 'Symbiotic*', 'ClG',
       'Ae*', 'MIR', 'BlueSG*', 'HII', 'GinCl', 'EmG', 'brownD*',
       'Neutron*', 'AGN_C

In [42]:
TD_simbad['rare-type'] = 4
#TD_simbad['rare-type'] = TD_simbad.apply(lambda r: True if r['Class'] is in Simbad_dict['rare-type'] else True if set(r['SubClass'].str.split("|")).isdisjoint(set(Simbad_dict['rare-type'])), axis=1)
#TD_simbad['rare-type'] = TD_simbad.apply(lambda r: True if r['Class'] in Simbad_dict['rare-type'] else False, axis=1)
#TD_simbad['rare-type'] = TD_simbad.apply(lambda r: False if type(r['SubClass']) == float else True if set(r['SubClass'].split("|")).isdisjoint(set(Simbad_dict['rare-type'])) else False, axis=1)
TD_simbad['rare-type'] = TD_simbad.apply(lambda r: 1 if r['Class'] in Simbad_dict['rare-type'] else 2 if type(r['SubClass']) == float else 3 if not set(r['SubClass'].split("|")).isdisjoint(set(Simbad_dict['rare-type'])) else 4, axis=1)
#

In [43]:
print(TD_simbad['rare-type'].value_counts())
print(TD_simbad.loc[TD_simbad['rare-type']==3, ['Class','SubClass']])
print(TD_simbad.loc[TD_simbad['rare-type']==1, 'Class'].value_counts())
print(TD_simbad[TD_simbad['rare-type']==1].groupby(['Class','ref']).size())

4    89431
1     3835
2      342
3       65
Name: rare-type, dtype: int64
                Class            SubClass
5977   ULX?_Candidate       HXB|ULX|UX?|X
6885              Be*  *|BS*|Be*|HXB|IR|X
6931              AGN           AGN|CV*|X
7010   ULX?_Candidate           HXB|UX?|X
9479         SFregion       HXB|SFR|ULX|X
...               ...                 ...
79563               X               CV*|X
79797  ULX?_Candidate       HXB|ULX|UX?|X
81324            Nova          CV*|No*|V*
87554           gamma           Psr|X|gam
88340            LPV*      *|**|CV*|LP*|X

[65 rows x 2 columns]
HMXB         1211
XB            929
LMXB          596
CV            490
CataclyV*     188
NS            183
Pulsar        146
NS_BIN         90
Neutron*        2
Name: Class, dtype: int64
Class      ref                                  
CV         2003A&A...404..301R                        94
           2005JAD....11....2D                       177
           2018ApJS..235....4O                  

In [44]:


df_crowd_fields = pd.DataFrame.from_dict(crowd_fields_dict, orient='index')

df_crowd_fields.reset_index(inplace=True)

df_crowd_fields = df_crowd_fields.rename(columns = {'index':'Name','ra':'RAJ2000', 'dec':'DEJ2000','r':'r2'})

#df_crowd_fields


In [45]:
# Globular Clusters 2013A&A...558A..53K

GCs = Vizier(columns=["**"], catalog="J/A+A/558/A53/catalog", row_limit=-1,column_filters={"Type":"g"}).query_constraints()
df_GCs = GCs[0].to_pandas()
#df_GCs = df_GCs[df_GCs['Type']=='g'].reset_index(drop=True)

#print(df_GCs.columns)
#print(df_GCs[['Name','RAJ2000','DEJ2000','r2']])


# nearby star-forming galaxies (2012MNRAS.419.2095M)

SFgal = Vizier(columns=["**"], catalog="J/MNRAS/419/2095/sfgal", row_limit=-1).query_constraints()
df_SFgal = SFgal[0].to_pandas()
#print(df_SFgal.columns)
#print(df_SFgal[['Galaxy','Rx','SimbadName']])

df_SFgal['RAJ2000'] = np.nan
df_SFgal['DEJ2000'] = np.nan
df_SFgal['r2'] = df_SFgal['Rx']*2/60 # a factor of 2 to be conservative 
df_SFgal['Name'] = df_SFgal.apply(lambda r: 'NGC_' + r['Galaxy'] if r['Galaxy'][:1].isdigit() else r['Galaxy'],axis=1)

for simbad_name in df_SFgal['SimbadName']:
    df_r = Simbad.query_object(simbad_name).to_pandas()
    df_SFgal.loc[df_SFgal['SimbadName']==simbad_name, 'RAJ2000'] = Angle(df_r['RA'],'hourangle').degree
    df_SFgal.loc[df_SFgal['SimbadName']==simbad_name, 'DEJ2000'] = Angle(df_r['DEC'],'deg').degree
    #print(simbad_name)
    #print(df_r.to_pandas())
    
#print(df_SFgal[['Name','RAJ2000','DEJ2000','r2']])

print(len(df_crowd_fields),len(df_GCs),len(df_SFgal))
df_crowd_fields_all = pd.concat([df_crowd_fields, df_GCs[['Name','RAJ2000','DEJ2000','r2']], df_SFgal[['Name','RAJ2000','DEJ2000','r2']]], ignore_index=True, sort=False)
   

print(df_crowd_fields_all[['Name','RAJ2000','DEJ2000','r2']])


34 147 29
         Name     RAJ2000    DEJ2000        r2
0      NGC_55    3.723333 -39.196667  0.164384
1       IC_10    5.072083  59.303889  0.066667
2     Haro_11    9.219583 -33.554778  0.001667
3        M_31   10.684580  41.269160  1.000000
4         SMC   13.186700 -72.828600  5.330000
..        ...         ...        ...       ...
205  NGC_7090  324.120271 -54.557319  0.103333
206  NGC_7541  348.682737   4.533900  0.046667
207  NGC_7793  359.457308 -32.591028  0.066667
208   UGC5720  158.133150  54.400981  0.020000
209   CARTWHE    9.421305 -33.716254  0.023333

[210 rows x 4 columns]


In [46]:
df_crowd_fields_all[df_crowd_fields_all.duplicated(subset=['Name'],keep=False)].sort_values(by='Name')


Unnamed: 0,Name,RAJ2000,DEJ2000,r2
14,NGC_2403,114.214167,65.6025,0.168675
188,NGC_2403,114.213909,65.602681,0.15
18,NGC_4214,183.913333,36.326944,0.069644
196,NGC_4214,183.913225,36.326889,0.083333
22,NGC_5457,210.802429,54.34875,0.166667
202,NGC_5457,210.802429,54.34875,0.133333
27,NGC_6388,264.072754,-44.73565,0.051667
111,NGC_6388,264.071991,-44.736,0.22
28,NGC_6397,265.175375,-53.674333,0.266667
115,NGC_6397,265.175995,-53.674,0.405


In [47]:
print(len(df_crowd_fields_all))
df_crowd_fields_all = df_crowd_fields_all.sort_values(by='r2',ascending=False).reset_index(drop=True)
df_crowd_fields_all = df_crowd_fields_all[~df_crowd_fields_all.duplicated(subset=['Name'])].reset_index(drop=True)
print(len(df_crowd_fields_all))


210
203


In [48]:

for i in range(10):
    c1 = SkyCoord(ra=df_crowd_fields_all.loc[i, 'RAJ2000']*u.degree, dec=df_crowd_fields_all.loc[i, 'DEJ2000']*u.degree, frame='icrs')
    #print(df_crowd_fields_all.loc[i,:])
    for j in range(len(df_crowd_fields_all)):
        c2 = SkyCoord(ra=df_crowd_fields_all.loc[j, 'RAJ2000']*u.degree, dec=df_crowd_fields_all.loc[j, 'DEJ2000']*u.degree, frame='icrs')
    
        if (c1.separation(c2).degree < df_crowd_fields_all.loc[i, 'r2']) & (j!=i):
            print(c1.separation(c2).degree, '\n', df_crowd_fields_all.loc[j, :])


print(len(df_crowd_fields_all))
df_crowd_fields_all = df_crowd_fields_all[~(df_crowd_fields_all['Name'].isin(['NGC_104','NGC_362','M_13']))].reset_index(drop=True)
print(len(df_crowd_fields_all))    


2.289041868442859 
 Name         NGC_104
RAJ2000        6.004
DEJ2000   -72.081001
r2              0.95
Name: 3, dtype: object
2.14499221821194 
 Name       NGC_362
RAJ2000     15.825
DEJ2000    -70.847
r2           0.235
Name: 33, dtype: object
0.00019073197755802303 
 Name             M_13
RAJ2000    250.421833
DEJ2000     36.459861
r2           0.166667
Name: 72, dtype: object
203
200


In [49]:
TD_simbad_name = TD_simbad[~TD_simbad.duplicated(subset='name')].reset_index(drop=True)
print(len(TD_simbad_name))

ras = np.array(TD_simbad_name['ra'].values)
decs = np.array(TD_simbad_name['dec'].values)
ras_cat =  np.array(df_crowd_fields_all['RAJ2000'].values)
decs_cat =  np.array(df_crowd_fields_all['DEJ2000'].values)

87967


In [50]:

c = SkyCoord(ra=ras*u.degree, dec=decs*u.degree)
catalog = SkyCoord(ra=ras_cat*u.degree, dec=decs_cat*u.degree)
idxs, d2d, d3d = c.match_to_catalog_sky(catalog)
#print(idx, d2d, d3d)
#print(len(idx))

TD_simbad['remove_regions'] = ''

start = time.time()
for i in range(len(TD_simbad_name)):
    #if TD_simbad_name.loc[i, 'name']=='2CXO J010206.6-714116':
        #print(df_crowd_fields_all.loc[idxs[i], :])
    #'''
    if d2d[i].deg < df_crowd_fields_all.loc[idxs[i], 'r2']:
        #print(i, d2d[i].deg, df_crowd_fields_all.loc[idxs[i], 'r2'])
        TD_simbad.loc[TD_simbad['name']==TD_simbad_name.loc[i, 'name'], 'remove_regions'] += '|'+df_crowd_fields_all.loc[idxs[i], 'Name']
    #'''
end = time.time() 
print(end - start)      
        

51.93120265007019


In [51]:
TD_simbad['remove_regions'].value_counts()

             84318
|SMC          1737
|NGC_2264      912
|M_31          711
|M_33          516
             ...  
|NGC_6638        1
|NGC_5946        1
|NGC_6256        1
|NGC_6717        1
|Terzan_3        1
Name: remove_regions, Length: 127, dtype: int64

In [52]:
TD_simbad['GLAT'] = SkyCoord(ra=TD_simbad['ra']*u.degree, dec=TD_simbad['dec']*u.degree, frame='icrs').galactic.l.degree
TD_simbad['GLON'] = SkyCoord(ra=TD_simbad['ra']*u.degree, dec=TD_simbad['dec']*u.degree, frame='icrs').galactic.b.degree

TD_simbad[['ra','dec','GLAT','GLON']]

Unnamed: 0,ra,dec,GLAT,GLON
0,0.039115,13.938493,104.492215,-47.088916
1,0.627974,0.833072,98.134883,-59.662290
2,0.912396,16.039005,106.501181,-45.302672
3,1.594324,-0.073573,99.280958,-60.859243
4,1.747051,-0.294661,99.412899,-61.120224
...,...,...,...,...
93668,185.602940,4.555845,284.608751,66.389505
93669,183.366218,2.960804,280.709825,64.181837
93670,12.541707,-73.325356,303.058260,-43.802585
93671,210.408386,54.556618,102.642273,59.719589


In [53]:
idx = np.where(((TD_simbad.GLAT > 350.) | (TD_simbad.GLAT < 10.)) & (TD_simbad.GLON > -5.) & (TD_simbad.GLON < 5.))[0]
TD_simbad.loc[idx, 'remove_regions'] = TD_simbad.loc[idx, 'remove_regions'] + '|GalacticCenter'

idx = np.where((TD_simbad['name'].str.strip().str[-1].str.isalpha()) & (~TD_simbad['name'].isin(['2CXO J043715.9-471509X'])))[0]
TD_simbad.loc[idx, 'remove_regions'] = TD_simbad.loc[idx, 'remove_regions'] + '|CSCconfused'

In [54]:
TD_simbad['remove_regions'].value_counts()


                         75052
|GalacticCenter           8770
|SMC                      1733
|NGC_2264                  912
|M_31                      708
                         ...  
|NGC_6355                    1
|NGC_5253|CSCconfused        1
|NGC_4697|CSCconfused        1
|NGC_6388|CSCconfused        1
|NGC_5194|CSCconfused        1
Name: remove_regions, Length: 156, dtype: int64

In [55]:
print(TD_simbad.loc[TD_simbad['ref']=='2004ApJ...610.1045S','Class'].value_counts())
#print(TD_simbad.loc[TD_simbad['ref']=='2004ApJ...610.1045S',''].value_counts())
print(TD_simbad.loc[TD_simbad['name'].isin(TD_simbad.loc[TD_simbad['ref']=='2004ApJ...610.1045S','name']), 'Class'].value_counts())

print(TD_simbad.loc[TD_simbad['name'].isin(TD_simbad.loc[TD_simbad['ref']=='2004ApJ...610.1045S','name']), ['name','Class','ref']])

LM-STAR    10
Name: Class, dtype: int64
LM-STAR     11
YSO          5
Star         3
X            2
Orion_V*     1
Name: Class, dtype: int64
                        name     Class                  ref
3220   2CXO J053507.0-052500   LM-STAR  2004ApJ...610.1045S
3225   2CXO J053508.4-052230   LM-STAR  2004ApJ...610.1045S
3230   2CXO J053510.3-052451   LM-STAR  2004ApJ...610.1045S
3234   2CXO J053511.3-052426   LM-STAR  2004ApJ...610.1045S
3236   2CXO J053511.7-052155   LM-STAR  2004ApJ...610.1045S
3260   2CXO J053515.9-052152   LM-STAR  2004ApJ...610.1045S
3275   2CXO J053518.0-052140   LM-STAR  2004ApJ...610.1045S
3305   2CXO J053523.5-052350   LM-STAR  2004ApJ...610.1045S
3310   2CXO J053525.0-052438   LM-STAR  2004ApJ...610.1045S
3314   2CXO J053525.3-052529   LM-STAR     arXiv:2011.14483
3315   2CXO J053525.3-052529   LM-STAR  2004ApJ...610.1045S
5414   2CXO J053525.3-052529       YSO  2012AJ....144..192M
8577   2CXO J053507.0-052500      Star               SIMBAD
15146  2CXO J053511

In [56]:
#TD_simbad.loc[TD_simbad['ref']=='2004ApJ...610.1045S','Class'] = 'YSO' 
TD_simbad.columns

Index(['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU', 'nbref',
       'coo_bibcode', 'rare-type', 'remove_regions', 'GLAT', 'GLON'],
      dtype='object')

In [57]:
print(TD_simbad.loc[TD_simbad['name']=='2CXO J171809.8-371851', ['name_cat','name','ra_cat','dec_cat','Class','ref','SubClass','remove_regions']])
print(TD_simbad.loc[TD_simbad['name']=='2CXO J222552.6+653535', ['name_cat','name','Class','ref','SubClass','remove_regions']])
#print(TD_simbad.loc[TD_simbad['name']=='2CXO J063354.2+174616', ['name_cat','name','Class','ref','SubClass','remove_regions']])






        name_cat                   name      ra_cat    dec_cat Class  \
4729  J1718-3718  2CXO J171809.8-371851  259.540942 -37.314305    NS   

                      ref SubClass remove_regions  
4729  2005AJ....129.1993M      NaN                 
           name_cat                   name   Class     ref   SubClass  \
89348  PSR B2223+65  2CXO J222552.6+653535  Pulsar  SIMBAD  Psr|Rad|X   

      remove_regions  
89348                 


In [58]:
print(TD_simbad.loc[(TD_simbad['ref']=='2005JAD....11....2D') & (TD_simbad['remove_regions']==''), ['name','Class','ref','SubClass','remove_regions']])
#print(TD_simbad.loc[(TD_simbad['ref']=='2005JAD....11....2D') & (TD_simbad['SubClass']=='CV') & (TD_simbad['remove_regions']==''), ['name','Class','ref','SubClass','remove_regions']])
print(TD_simbad.loc[(TD_simbad['ref']=='2005JAD....11....2D') & (TD_simbad['remove_regions']==''), ['SubClass']].value_counts())

                       name Class                  ref SubClass remove_regions
1637  2CXO J002257.6+614107    CV  2005JAD....11....2D       CV               
1736  2CXO J012940.0+384210    CV  2005JAD....11....2D     IBWD               
1739  2CXO J020052.2-092431    CV  2005JAD....11....2D     IBWD               
1744  2CXO J025608.1+192634    CV  2005JAD....11....2D       DQ               
1745  2CXO J030346.9+645435    CV  2005JAD....11....2D       CV               
1748  2CXO J033108.1+435750    CV  2005JAD....11....2D      CV:               
1750  2CXO J033111.9+435415    CV  2005JAD....11....2D    NA/DQ               
1754  2CXO J033131.4+435648    CV  2005JAD....11....2D      CV:               
1760  2CXO J052728.2-124150    CV  2005JAD....11....2D      NL:               
1762  2CXO J054320.3-410154    CV  2005JAD....11....2D       DQ               
1765  2CXO J054748.3+283511    CV  2005JAD....11....2D       UG               
1772  2CXO J080622.9+152731    CV  2005JAD....11....

In [59]:




df_rare_sources_saving = pd.DataFrame(rare_sources_saving_dict).T



#df_rare_sources_saving.reset_index(inplace=True)
#df_rare_sources_saving = df_rare_sources_saving.rename(columns = {'index':'name'})

df_rare_sources_saving

Unnamed: 0,Class,Comment
2CXO J073751.2-303940,NS,change from NS_BIN to NS since it is a double ...
2CXO J153709.9+115555,NS,change from NS_BIN to NS since it is a double ...
2CXO J112401.1-365319,NS_BIN,change from NS to NS_BIN as a black widow pulsar
2CXO J185843.6+032606,LMXB,2021ApJ...909..154T
2CXO J130247.6-635008,HMXB,recorded in the ATNF catalog
...,...,...
2CXO J191404.2+095258,HMXB,confused with 2CXO J191404.2+095258X.
2CXO J193030.1+185214,NS,confused with 2CXO J193029.9+185213X.
2CXO J202105.4+365104,NS,confused with 2CXO J202105.4+365104X.
2CXO J222552.8+653536,NS,same NS as 2CXO J222552.6+653535 in different ...


In [60]:
TD_simbad[(TD_simbad.name.isin(TD_simbad.loc[TD_simbad['ref']=='2020RNAAS...4..219J', 'name'])) & (TD_simbad.name.isin(TD_simbad.loc[TD_simbad['Class']=='CV*_Candidate', 'name'])) ]

Unnamed: 0,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,name,ra,...,r0,r1,PA,PU,nbref,coo_bibcode,rare-type,remove_regions,GLAT,GLON


In [61]:
TD_simbad.columns

Index(['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU', 'nbref',
       'coo_bibcode', 'rare-type', 'remove_regions', 'GLAT', 'GLON'],
      dtype='object')

In [62]:
ref_changes = {'Simbad':'old-SIMBAD', 
               'The Open Cataclysmic Variable Catalog': '2020RNAAS...4..219J',
               'Open CV catalog': '2020RNAAS...4..219J',
               'arXiv:2008.09917': '2020A&A...642A.168B',
               'arXiv:2011.14483': '2021ApJ...908...49F',
               'arXiv:2103.00196': '2021A&A...648A..34P',
               'arXiv:2110.01464': '2022A&A...657A.131M'}

for ref in ref_changes:
    TD_simbad.loc[TD_simbad['ref']==ref, 'ref'] = ref_changes[ref]
    
TD_simbad = TD_simbad.sort_values(by=['name','ref','e_Pos']).reset_index(drop=True)




#print(TD_simbad[['name','ref','e_Pos']][100:150])




In [63]:
TD_simbad[(TD_simbad['remove_regions']=='|GalacticCenter') & (TD_simbad['rare-type']==1)]

Unnamed: 0,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,name,ra,...,r0,r1,PA,PU,nbref,coo_bibcode,rare-type,remove_regions,GLAT,GLON
70227,4U 1705-32,257.226667,-32.315972,,LMXB,LXB|N*?|X|gam,SIMBAD,1.763204,2CXO J170854.2-321856,257.226113,...,1.227,1.227,0.000000,1.227000,56.0,2005A&A...432L..49S,1,|GalacticCenter,352.793524,4.681810
70422,4U 1711-34,258.582417,-34.046472,,LMXB,LXB|X|gam,SIMBAD,0.699489,2CXO J171419.7-340246,258.582251,...,0.714,0.713,44.020000,0.714000,43.0,2007A&A...469..807L,1,|GalacticCenter,352.060411,2.745959
70802,XTE J1719-291,259.820708,-29.069542,,XB,HX?|X|XB*,SIMBAD,2.733208,2CXO J171917.1-290409,259.821564,...,0.711,0.711,65.419998,0.711000,18.0,2008ATel.1451....1D,1,|GalacticCenter,356.745498,4.753352
71080,2MASS J17200591-3116596,260.024625,-31.283222,0.06,HMXB,HXB|IR|X|gam,SIMBAD,0.179051,2CXO J172005.9-311659,260.024608,...,0.713,0.713,0.000000,0.723028,34.0,2003yCat.2246....0C,1,|GalacticCenter,355.022141,3.347450
71648,J1723-2837,260.846607,-28.632547,0.11,NS_BIN,,2005AJ....129.1993M,0.348727,2CXO J172323.1-283757,260.846644,...,0.710,0.710,0.000000,0.971648,,,1,|GalacticCenter,357.616243,4.260026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80517,SAX J1810.8-2609,272.685292,-26.150333,,LMXB,"T, B",2007A&A...469..807L,0.147194,2CXO J181044.4-260901,272.685276,...,0.719,0.713,93.280000,0.719000,,,1,|GalacticCenter,5.197213,-3.431088
80518,SAX J1810.8-2609,272.685000,-26.150000,,LMXB,LMXB,2018ApJS..235....4O,1.609226,2CXO J181044.4-260901,272.685276,...,0.719,0.713,93.280000,0.719000,,,1,|GalacticCenter,5.197213,-3.431088
80519,V* V4722 Sgr,272.685292,-26.150333,,LMXB,LXB|V*|X|gam,SIMBAD,0.147180,2CXO J181044.4-260901,272.685276,...,0.719,0.713,93.279999,0.719000,88.0,2007A&A...469..807L,1,|GalacticCenter,5.197213,-3.431088
81794,V4641 Sgr,274.840000,-25.406944,1.00,LMXB,XT,2003A&A...404..301R,0.894903,2CXO J181921.6-252425,274.840069,...,0.711,0.711,104.100000,2.122621,,,1,|GalacticCenter,6.773924,-4.789018


In [64]:
df_TD_single =  TD_simbad[~(TD_simbad.duplicated(subset=['name'],keep=False)) ].reset_index(drop=True)

df_TD_single_keep = df_TD_single[(df_TD_single['name'].isin(rare_sources_saving_dict)) | (df_TD_single['ref']!='SIMBAD')].reset_index(drop=True)

df_TD_single_keep.set_index('name', inplace=True)
df_TD_single_keep.update(df_rare_sources_saving)
df_TD_single_keep.reset_index(inplace=True)  # to recover t
#df_TD_single_keep = df_TD_single_keep.rename(columns = {'index':'name'})

#print(df_TD_single_keep.columns)
print(df_TD_single_keep['Class'].value_counts())

LM-STAR    78
NS         18
HMXB        8
LMXB        4
CV          4
NS_BIN      3
AGN         3
HM-STAR     2
Name: Class, dtype: int64


In [65]:

df_TD_multi = TD_simbad[ (TD_simbad.duplicated(subset=['name'],keep=False)) ].reset_index(drop=True)
#print(len(TD_simbad), len(df_TD_single), len(df_TD_multi))

df_TD_multi_save = df_TD_multi[df_TD_multi['name'].isin(rare_sources_saving_dict)].reset_index(drop=True)
#print(len(df_TD_multi_save))

df_TD_multi_save.set_index('name', inplace=True)
df_TD_multi_save.update(df_rare_sources_saving)
df_TD_multi_save.reset_index(inplace=True)  # to recover t



df_TD_multi_save_final = df_TD_multi_save[ ~(df_TD_multi_save.duplicated(subset=['name'])) ].reset_index(drop=True)
df_TD_multi_save['SubClass'] = df_TD_multi_save['SubClass'].fillna('')
for i, row in df_TD_multi_save_final.iterrows():
    
    #print(row['name'])
    #refs = []
    #print(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'ref'].values)
    refs = ";".join(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'ref'].values)
    SubClasses = ";".join(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'SubClass'].values)
    #print(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'SubClass'].values)
    #print(refs)
    #print(SubClasses)
    df_TD_multi_save_final.loc[i, 'refs'] = refs
    df_TD_multi_save_final.loc[i, 'SubClasses'] = SubClasses


#print(len(df_TD_multi_save['name'].unique()))

#print(df_TD_multi_save_final['Class'].value_counts())
#for name in df_TD_multi_save['name'].unique()



#

In [66]:
df_TD_multi_save_final[['ref','refs','SubClass','SubClasses']]

df_TD_multi_save_final = df_TD_multi_save_final.drop(columns=['ref','SubClass']).rename(columns={'refs':'ref', 'SubClasses':'SubClass'})

df_TD_multi_save_final[['ref','SubClass']]

Unnamed: 0,ref,SubClass
0,2005AJ....129.1993M;SIMBAD,HE;
1,2005JAD....11....2D;SIMBAD,CV;*|CV*|IR|PN|UV|V*
2,2005JAD....11....2D;2020RNAAS...4..219J;SIMBAD,CV:;CataclyV;*|CV*|IR|X
3,2003A&A...404..301R;2007A&A...469..807L;SIMBAD,"XT;T, R;*|BH?|HXB|LXB|No*|V*|X|gam"
4,2006A&A...455.1165L;2018ApJS..235....4O;SIMBAD...,"T, P, C;HMXB;*|Be*|Em*|Er*|HXB|IR|N*|Psr|Rad|S..."
...,...,...
77,2005AJ....129.1993M;SIMBAD,HE;*|Be*|IR|Psr|Rad|X|gam
78,2020RNAAS...4..219J;SIMBAD,CataclyV;CV*|X
79,2005JAD....11....2D;SIMBAD,CV:;*|IR|WD*
80,2020RNAAS...4..219J;SIMBAD,CataclyV;*|CV*|EB*|EB?|IR|PM*|V*|WD*


In [67]:
df_TD_multi_check = df_TD_multi[~df_TD_multi['name'].isin(rare_sources_saving_dict)].reset_index(drop=True)
df_TD_multi_check['comment'] = ''

for name_uniq in df_TD_multi_check['name'].unique():
    
    dup = df_TD_multi_check[df_TD_multi_check['name']==name_uniq]
    if 'SIMBAD' in dup['ref'].unique():
        if len(dup)==2:
            other_class = dup.loc[dup['ref']!='SIMBAD', 'Class'].values
            #print(other_class)
            if other_class[0] == 'HM-STAR' or other_class[0] == 'LM-STAR':
                if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']+Simbad_dict['STAR']:
                    #print(dup[['Class','SubClass','ref','name_cat','ra_cat','dec_cat','sep']])
                    df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|1'
                            
            elif other_class[0] == 'HMXB' or other_class[0] == 'LMXB':
                if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']+ Simbad_dict['XRB']:
                    #print(dup[['Class','SubClass','name_cat','ra_cat','dec_cat','sep']])
                    df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|2'
      
            else:       
                if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']:
                    #print(dup[['Class','SubClass','name_cat','ra_cat','dec_cat','sep']])
                    df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|3'
        else:
            other_class = dup.loc[dup['ref']!='SIMBAD', 'Class'].unique()
            
            if len(other_class)>1:
                #print('>3!!!', dup[['Class','SubClass','ref','name_cat','ra_cat','dec_cat']])
                df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|4'

            else:
                if other_class[0] == 'HM-STAR' or other_class[0] == 'LM-STAR':
                    if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']+Simbad_dict['STAR']:
                        #print(dup[['Class','SubClass','name_cat','ra_cat','dec_cat','sep']])
                        df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|5'

                elif other_class[0] == 'HMXB' or other_class[0] == 'LMXB':
                    if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']+ Simbad_dict['XRB']:
                        #print(dup[['Class','SubClass','name_cat','ra_cat','dec_cat','sep']])
                        df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|6'

                else:       
                    if dup.loc[dup['ref']=='SIMBAD', 'Class'].values not in Simbad_dict[other_class[0]] +  Simbad_dict['Common']:
                        #print(dup[['Class','SubClass','name_cat','ra_cat','dec_cat','sep']])
                        df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|7'

        

    else:
        other_class = dup['Class'].unique()
        if len(other_class)>1:
            
            #print('ha',dup[['Class','SubClass','ref','name_cat','ra_cat','dec_cat']])
            df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] = df_TD_multi_check.loc[df_TD_multi_check['name']==name_uniq, 'comment'] + '|8'

        
#'''    


In [68]:
print(len(df_TD_multi_check[df_TD_multi_check['comment']=='']))
print(len(df_TD_multi_check[df_TD_multi_check['remove_regions']=='']))
print(len(df_TD_multi_check[(df_TD_multi_check['remove_regions']=='') & (df_TD_multi_check['comment']=='')]))



df_TD_multi_keep = df_TD_multi_check[(df_TD_multi_check['comment']=='')]
df_TD_multi_keep_final = df_TD_multi_keep[~df_TD_multi_keep.duplicated(subset=['name'])].reset_index(drop=True)



df_TD_multi_keep['SubClass'] = df_TD_multi_keep['SubClass'].fillna('')
for i, row in df_TD_multi_keep_final.iterrows():
    
    #print(row['name'])
    #refs = []
    #print(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'ref'].values)
    refs = ";".join(df_TD_multi_keep.loc[df_TD_multi_keep['name']==row['name'], 'ref'].values)
    SubClasses = ";".join(df_TD_multi_keep.loc[df_TD_multi_keep['name']==row['name'], 'SubClass'].values)
    #print(df_TD_multi_save.loc[df_TD_multi_save['name']==row['name'], 'SubClass'].values)
    #print(refs)
    #print(SubClasses)
    df_TD_multi_keep_final.loc[i, 'refs'] = refs
    df_TD_multi_keep_final.loc[i, 'SubClasses'] = SubClasses


print(len(df_TD_multi_keep))

7936
8318
7006
7936


In [69]:
print(df_TD_multi_keep_final[['ref','refs','SubClass','SubClasses']])

df_TD_multi_keep_final = df_TD_multi_keep_final.drop(columns=['ref','SubClass']).rename(columns={'refs':'ref', 'SubClasses':'SubClass'})

print(df_TD_multi_keep_final[['ref','SubClass']])


                      ref                                            refs  \
0     2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
1     2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
2     2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
3     2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
4     2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
...                   ...                                             ...   
3614  2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
3615  2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
3616  2010A&A...518A..10V  2010A&A...518A..10V;2018ApJS..235....4O;SIMBAD   
3617  2010A&A...518A..10V                      2010A&A...518A..10V;SIMBAD   
3618  1982MSS...C03....0H                      1982MSS...C03....0H;SIMBAD   

     SubClass                         SubClasses  
0           Q           

In [102]:
#print(df_TD_single_keep['Class'].value_counts())
#print(df_TD_multi_save['Class'].value_counts())
#print(df_TD_multi_keep['Class'].value_counts())


df_TD_final = pd.concat([df_TD_single_keep, df_TD_multi_save_final, df_TD_multi_keep_final], ignore_index=True, sort=False)



In [103]:
print(df_TD_final.loc[df_TD_final['name'].isin(rare_sources_saving_dict), ['remove_regions']].value_counts())
#df_TD_multi_keep_final#[df_TD_multi_keep_final['name']=='2CXO J153709.9+115555']

remove_regions 
                   88
|GalacticCenter    11
dtype: int64


In [104]:
#print(df_TD_final.loc[df_TD_final['remove_regions']!='', 'name'].values)
print(df_TD_final[(df_TD_final['remove_regions']=='|GalacticCenter') & (df_TD_final['rare-type']==1) & (~df_TD_final['name'].isin(rare_sources_saving_dict))])
print(df_TD_final.loc[(df_TD_final['remove_regions']=='|GalacticCenter') & (df_TD_final['rare-type']==1) & (~df_TD_final['name'].isin(rare_sources_saving_dict)), 'Class'].value_counts())
df_TD_final[(df_TD_final['remove_regions']=='|GalacticCenter') & (df_TD_final['rare-type']==1) & (~df_TD_final['name'].isin(rare_sources_saving_dict))].to_csv('./data/TD_raretype_inGalacticPlane.csv',index=False)
#print(df_TD_final['remove_regions'].value_counts())

                       name         name_cat      ra_cat    dec_cat  e_Pos  \
86    2CXO J173233.5-313123       J1732-3131  263.139750 -31.523056  2.000   
87    2CXO J174157.2-205412       J1741-2054  265.488680 -20.903278  0.045   
89    2CXO J174618.4-220946        AT2021kwb  266.577108 -22.163050    NaN   
90    2CXO J174722.7-280914       J1747-2809  266.845000 -28.154167  0.500   
91    2CXO J174726.1-295957     SLX 1744-299  266.857917 -29.999389    NaN   
...                     ...              ...         ...        ...    ...   
3370  2CXO J180451.3-274512  OGLE-BLG-DN-645  271.213958 -27.753333    NaN   
3371  2CXO J180457.9-283353  OGLE-BLG-DN-651  271.241500 -28.564694    NaN   
3372  2CXO J180507.0-274309       J1805-2743  271.279583 -27.719167  1.000   
3373  2CXO J180540.4-273425  OGLE-BLG-DN-673  271.418580 -27.574288    NaN   
3382  2CXO J180950.2-233222       J1809-2332  272.459375 -23.539633  0.450   

     Class                    SubClass  \
86      NS           

In [105]:
#df_TD_final = df_TD_final[df_TD_final['remove_regions']==''].reset_index(drop=True)
print(len(df_TD_single_keep),len(df_TD_multi_save),len(df_TD_multi_keep),len(df_TD_final))

print(df_TD_final['Class'].value_counts())

120 220 7936 3821
AGN        1571
YSO         754
LM-STAR     743
CV          236
HM-STAR     167
NS          134
LMXB         98
HMXB         74
NS_BIN       44
Name: Class, dtype: int64


In [106]:
df_TD_final.columns

Index(['name', 'name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass',
       'ref', 'sep', 'ra', 'dec', 'r0', 'r1', 'PA', 'PU', 'nbref',
       'coo_bibcode', 'rare-type', 'remove_regions', 'GLAT', 'GLON',
       'comment'],
      dtype='object')

In [107]:
df_rare_save = pd.DataFrame(rare_sources_saving_dict).T
print(df_rare_save)
df_rare_save.reset_index(inplace=True)
df_rare_save =  df_rare_save.rename(columns={'index':'name'}) 
print(len(df_rare_save['name'].unique()))

df_rare_remove = pd.DataFrame(rare_sources_removed_dict.items(), columns=['name', 'comment'])

#df_rare_remove = pd.DataFrame.from_dict(rare_sources_removed_dict) #pd.DataFrame(rare_sources_removed_dict).T
print(len(df_rare_remove))
#df_rare_remove.reset_index(inplace=True)
#df_rare_remove =  df_rare_remove.rename(columns={'index':'name'}) 
print(df_rare_remove)
print(len(df_rare_remove['name'].unique()))
print(df_rare_remove[df_rare_remove['name'].isin(df_rare_save['name'])])
print(df_rare_save[df_rare_save['name'].isin(df_rare_remove['name'])])

                        Class  \
2CXO J073751.2-303940      NS   
2CXO J153709.9+115555      NS   
2CXO J112401.1-365319  NS_BIN   
2CXO J185843.6+032606    LMXB   
2CXO J130247.6-635008    HMXB   
...                       ...   
2CXO J191404.2+095258    HMXB   
2CXO J193030.1+185214      NS   
2CXO J202105.4+365104      NS   
2CXO J222552.8+653536      NS   
2CXO J222552.6+653535      NS   

                                                                 Comment  
2CXO J073751.2-303940  change from NS_BIN to NS since it is a double ...  
2CXO J153709.9+115555  change from NS_BIN to NS since it is a double ...  
2CXO J112401.1-365319   change from NS to NS_BIN as a black widow pulsar  
2CXO J185843.6+032606                                2021ApJ...909..154T  
2CXO J130247.6-635008                       recorded in the ATNF catalog  
...                                                                  ...  
2CXO J191404.2+095258              confused with 2CXO J191404.2+095258X.  
2CX

In [108]:
TD_rare_GB = df_TD_final[(~df_TD_final['name'].isin(rare_sources_saving_dict)) & (df_TD_final['rare-type']==1) & (df_TD_final['remove_regions']=='|GalacticCenter')].reset_index(drop=True)

print(TD_rare_GB)

TD_rare_GB.to_csv(f'{data_dir}/{field_name}_raretype_GalacticBulge.csv',index=False)


                     name         name_cat      ra_cat    dec_cat  e_Pos  \
0   2CXO J173233.5-313123       J1732-3131  263.139750 -31.523056  2.000   
1   2CXO J174157.2-205412       J1741-2054  265.488680 -20.903278  0.045   
2   2CXO J174618.4-220946        AT2021kwb  266.577108 -22.163050    NaN   
3   2CXO J174722.7-280914       J1747-2809  266.845000 -28.154167  0.500   
4   2CXO J174726.1-295957     SLX 1744-299  266.857917 -29.999389    NaN   
..                    ...              ...         ...        ...    ...   
88  2CXO J180451.3-274512  OGLE-BLG-DN-645  271.213958 -27.753333    NaN   
89  2CXO J180457.9-283353  OGLE-BLG-DN-651  271.241500 -28.564694    NaN   
90  2CXO J180507.0-274309       J1805-2743  271.279583 -27.719167  1.000   
91  2CXO J180540.4-273425  OGLE-BLG-DN-673  271.418580 -27.574288    NaN   
92  2CXO J180950.2-233222       J1809-2332  272.459375 -23.539633  0.450   

   Class                    SubClass  \
0     NS                          HE   
1     N

In [109]:
#df_TD_final = df_TD_final[(df_TD_final['name'].isin(rare_sources_saving_dict)) | (df_TD_final['remove_regions']=='')].reset_index(drop=True)



In [110]:
df_TD_final.to_csv(f'{data_dir}/{field_name}_final.csv',index=False)

In [111]:
print(len(df_TD_final))

print(df_TD_final['remove_regions'].value_counts())
print(df_TD_final[df_TD_final['remove_regions']!=''])

3821
                               3416
|GalacticCenter                 146
|SMC                             75
|LMC                             19
|Westerlund_1                    19
|NGC_6397                        11
|NGC_6440|GalacticCenter         10
|M_31                            10
|NGC_6752                         9
|Terzan_5|GalacticCenter          8
|NGC_6715                         7
|NGC_6093                         6
|NGC_6121                         6
|NGC_6791                         5
|NGC_6388                         4
|GalacticCenter|CSCconfused       4
|NGC_3079                         3
|NGC_3201                         3
|NGC_2264                         3
|NGC_5139                         3
|NGC_6341                         3
|NGC_6266                         3
|Circinus                         3
|M_33                             3
|NGC_6522|GalacticCenter          2
|NGC_6656                         2
|NGC_6205                         2
|IC_348                

In [153]:
df_TD_final = pd.read_csv(f'{data_dir}/{field_name}_final.csv')
CSCv2 = Table.read('./data/CSCv2.vot').to_pandas()
print(len(CSCv2))
print(CSCv2.columns)

CSCv2 = CSCv2.sort_values(by=['_2CXO']).reset_index(drop=True)

#CSCv2[['ra','dec']]


#c = SkyCoord(ra=ras*u.degree, dec=decs*u.degree)
catalog = SkyCoord(ra=CSCv2['RAICRS']*u.degree, dec=CSCv2['DEICRS']*u.degree)
idxs, d2d, d3d = catalog.match_to_catalog_sky(catalog, nthneighbor=2)

#print(idx, d2d, d3d)
CSCv2['neb_index'] = idxs 
CSCv2['sep_2'] = d2d.arcsec

print(len(CSCv2.loc[CSCv2['sep_2']<3, ['_2CXO', 'RAICRS', 'DEICRS', 'r0','neb_index','sep_2']]))

CSCv2['name'] = '2CXO '+ CSCv2['_2CXO']



317167
Index(['_RAJ2000', '_DEJ2000', '_2CXO', 'RAICRS', 'DEICRS', 'r0', 'r1', 'PA'], dtype='object')
5637


In [154]:
CSCv2[CSCv2.name=='2CXO J155058.6-562835'].index[0]
CSCv2.loc[CSCv2.name=='2CXO J073751.2-303940', 'neb_index'].values[0]

82219

In [155]:
df_Xconfused = df_TD_final.loc[df_TD_final.name.isin(CSCv2.loc[CSCv2['sep_2']<3, 'name']) & (df_TD_final['rare-type']==1), :]#.to_csv(f'{data_dir}/TD_X-confused.csv',index=False)
df_Xconfused = df_Xconfused.sort_values(by=['name']).reset_index(drop=True)
#print(df_Xconfused)
df_Xconfused['sep'] = 0
df_Xconfused['srcs'] = ''
for i in range(len(df_Xconfused)):
    src_query = df_Xconfused.iloc[i]['name']#.values[0]
    #print(src_query)
    df_Xconfused.loc[i, 'sep'] = CSCv2.loc[CSCv2.name==src_query, 'sep_2'].values[0]
    #print(CSCv2.loc[CSCv2.name==src_query, 'sep_2'].values[0])
    df_Xconfused.loc[i, 'srcs'] = str(CSCv2.loc[CSCv2[CSCv2.name==src_query].index[0], 'name'])+','+str(CSCv2.loc[CSCv2.loc[CSCv2.name==src_query, 'neb_index'], 'name'].values[0])
    #print(str(CSCv2.loc[CSCv2[CSCv2.name==src_query].index[0], 'name'])+','+str(CSCv2.loc[CSCv2.name==src_query, 'name'].values[0]))
    #CSCv2[CSCv2.name==df_Xconfused.loc[i, 'name'].values[0], 'sep_2'].values[0]
src_query = '2CXO J073751.2-303940'
print(CSCv2[CSCv2.name==src_query])
print(CSCv2.iloc[[CSCv2[CSCv2.name==src_query].index[0], CSCv2.loc[CSCv2.name==src_query, 'neb_index'].values[0]], 10].values)  
#print(CSCv2.iloc[[208257, 208255], :])                                                                                               

         _RAJ2000   _DEJ2000             _2CXO      RAICRS     DEICRS     r0  \
82213  114.463516 -30.661306  J073751.2-303940  114.463516 -30.661306  0.712   

          r1          PA  neb_index      sep_2                   name  
82213  0.712  167.899994      82219  34.118986  2CXO J073751.2-303940  
['2CXO J073751.2-303940' '2CXO J073753.0-303915']


In [156]:
df_Xconfused[['name_cat','Class','sep','srcs']]

print(df_Xconfused[['name_cat','Class','sep','name','srcs']].to_latex(index=False))  

\begin{tabular}{llrll}
\toprule
       name\_cat &  Class &      sep &                   name &                                         srcs \\
\midrule
   PSR J0437-47 &     NS & 1.317310 &  2CXO J043715.8-471508 & 2CXO J043715.8-471508,2CXO J043715.9-471509X \\
     J0437-4715 & NS\_BIN & 1.317310 & 2CXO J043715.9-471509X & 2CXO J043715.9-471509X,2CXO J043715.8-471508 \\
    1A 0535+262 &   HMXB & 0.640470 &  2CXO J053854.5+261856 & 2CXO J053854.5+261856,2CXO J053854.5+261855X \\
     J0633+1746 &     NS & 1.844926 &  2CXO J063354.2+174613 &  2CXO J063354.2+174613,2CXO J063354.3+174614 \\
     J1050-5953 &     NS & 2.841059 &  2CXO J105007.1-595321 &  2CXO J105007.1-595321,2CXO J105007.5-595321 \\
    4U 1119-603 &   HMXB & 0.634059 &  2CXO J112115.1-603725 & 2CXO J112115.1-603725,2CXO J112115.1-603725X \\
      HD 100199 &   HMXB & 0.720804 &  2CXO J113106.9-625648 & 2CXO J113106.9-625648,2CXO J113106.9-625648X \\
     J1308+2127 &     NS & 1.066825 &  2CXO J130848.2+212706 &  2CXO 

In [62]:
df_TD_final

Unnamed: 0,name,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,ra,...,r1,PA,PU,nbref,coo_bibcode,rare-type,remove_regions,GLAT,GLON,comment
0,2CXO J000150.5+233015,G17197224783680,0.460598,23.504296,,LM-STAR,K3,LAMOST-DR8-AFGK,0.278577,0.460528,...,0.753,141.30,0.807000,,,4,,108.467901,-37.970470,
1,2CXO J000701.5+730308,J0007+7303,1.757083,73.052056,3.0,NS,NRAD,2005AJ....129.1993M,0.918716,1.756555,...,0.713,0.00,3.536717,,,1,,119.659483,10.463306,
2,2CXO J001826.4+300400,GSC 2261-1462,4.610125,30.066639,,LM-STAR,G5,1971PASP...83..810T,0.350398,4.610055,...,0.816,79.38,0.912000,,,4,,114.487557,-32.263400,
3,2CXO J005240.0+562837,G16901528933076,13.166510,56.477011,,LM-STAR,G8,LAMOST-DR8-AFGK,0.596529,13.166810,...,0.774,40.07,0.858000,,,4,,123.102718,-6.394337,
4,2CXO J010705.6+321056,G17222139826623,16.773656,32.182487,,LM-STAR,K4,LAMOST-DR8-AFGK,0.186472,16.773717,...,0.721,46.15,0.747000,,,4,,126.779542,-30.572263,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3423,2CXO J235718.3+004350,SDSS J23573+0043,359.326667,0.730556,,AGN,Q,2010A&A...518A..10V,0.760310,359.326474,...,0.789,11.34,0.844000,,,4,,95.698536,-59.248481,
3424,2CXO J235833.2+003453,SDSS J23585+0034,359.638333,0.581389,,AGN,A,2010A&A...518A..10V,0.383086,359.638394,...,0.716,53.94,0.717000,,,4,,96.140239,-59.511001,
3425,2CXO J235904.2-605459,PKS 2356-61,359.767917,-60.916389,,AGN,A,2010A&A...518A..10V,0.143655,359.767910,...,0.711,162.20,0.711000,,,4,,314.018635,-55.070324,
3426,2CXO J235939.7-250056,XMM J23596-2500,359.915833,-25.015833,,AGN,Q,2010A&A...518A..10V,0.206691,359.915826,...,0.746,113.10,0.756000,,,4,,40.331786,-78.188942,


In [157]:
TD_old = pd.read_csv(f'/Users/huiyang/Research/GitHub/MUWCLASS_CSCv2/codes/buildTD/data/CSCv2_TD.csv')
TD_old = TD_old[TD_old['remove_code']==0].reset_index(drop=True)
print(TD_old['Class'].value_counts())

#TD_simbad

TD_old_rare = TD_old[TD_old['Class'].isin(['NS','CV','LMXB','HMXB','NS_BIN'])]
print(len(TD_old_rare))

TD_notsimbad = TD_simbad[TD_simbad['ref']!='SIMBAD']
TD_check = TD_notsimbad[(TD_notsimbad['name'].isin(TD_old_rare['name'])) & ~(TD_notsimbad.duplicated(subset=['name'])) ]


AGN        1484
YSO        1057
LM-STAR     263
HM-STAR     120
NS          101
CV           57
LMXB         56
HMXB         44
NS_BIN       25
Name: Class, dtype: int64
283


In [158]:
TD_old_missing = TD_old_rare[~TD_old_rare.name.isin(df_TD_final.name)].reset_index(drop=True)

where_missing = TD_simbad[TD_simbad['name'].isin(TD_old_missing['name'])]

where_missing.to_csv(f'{data_dir}/where_missingcsv',index=False)

In [None]:
['2CXO J143308.3-611540','2CXO J173527.5-325554','2CXO J174445.7-271344','2CXO J175834.5-212321','2CXO J124215.9+323249',\
'2CXO J173413.4-260518','2CXO J173953.9-282946','2CXO J174433.0-284426',]

In [159]:
TD_old_missing

Unnamed: 0,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,name,ra,...,H,K,u,g,r,i,z,remove_4,Gal_Long,Gal_Lat
0,CXOAYSB J033108+4357,52.784125,43.964028,,CV,CV:,2005JAD....11....2D,0.21293,2CXO J033108.1+435750,52.784148,...,14.14,13.93,,,,,,0,150.911336,-10.062167
1,V1129 Cen,189.782917,-45.562222,1.0,CV,DN,2003A&A...404..301R,0.483387,2CXO J123907.9-453344,189.783019,...,8.649,8.584,,,,,,0,300.676982,17.255526
2,IGR J14331-6112,218.284708,-61.261028,,HMXB,HMXB,INTEGRAL General Reference Catalog,0.330785,2CXO J143308.3-611540,218.284719,...,,,,,,,,0,314.846301,-0.764284
3,IGR J17354-3255,263.865,-32.931778,,HMXB,HMXB (SFXT),INTEGRAL General Reference Catalog,0.148531,2CXO J173527.5-325554,263.864952,...,10.993,10.272,,,,,,1,355.457633,-0.272983
4,RX J1744.7-2713,266.190417,-27.228889,,HMXB,,2006A&A...455.1165L,1.098317,2CXO J174445.7-271344,266.190723,...,6.843,6.507,,9.58,8.95,7.93,,1,1.357847,1.051931
5,IGR J17586-2129,269.644,-21.389314,,HMXB,Unclassified (HMXB?),INTEGRAL General Reference Catalog,0.23036,2CXO J175834.5-212321,269.644027,...,9.53,8.437,,,,,,1,7.986212,1.326527
6,J1242+3232,190.567083,32.547222,1.0,LMXB,UL,2003A&A...404..301R,1.593596,2CXO J124215.9+323249,190.566575,...,,,,,,,,0,142.525496,84.228177
7,KS 1731-260,263.556125,-26.088556,,LMXB,"T, B",2007A&A...469..807L,0.310493,2CXO J173413.4-260518,263.556034,...,17.7,16.7,,,23.6,22.3,21.0,1,1.072952,3.652581
8,XTE J1739-285,264.974792,-28.496333,,LMXB,"T, B",2007A&A...469..807L,0.39782,2CXO J173953.9-282946,264.974746,...,,,,,,,,1,359.714107,1.298137
9,1E 1740.7-2942,265.978458,-29.745167,,LMXB,"M, R",2007A&A...469..807L,0.464477,2CXO J174354.8-294443,265.978511,...,,18.2,,,,,,1,359.115883,-0.105854


In [55]:
TD_compare = pd.merge(df_TD_final[['name','Class']], TD_old[['name','Class']], on='name', how='inner')

In [57]:
TD_compare[TD_compare['Class_x']!=TD_compare['Class_y']]

Unnamed: 0,name,Class_x,Class_y
657,2CXO J053515.7-052424,LM-STAR,YSO


In [69]:
TD_new = df_TD_final[~df_TD_final.name.isin(TD_old.name)].reset_index(drop=True)

TD_new

where_new = df_TD_final[df_TD_final['name'].isin(TD_new['name'])]

print(where_new['Class'].value_counts())
print(where_new.groupby(['Class','ref']).size())#.value_counts()#.to_csv(f'{data_dir}/where_missingcsv',index=False)

LM-STAR    413
AGN         93
CV          14
HM-STAR     14
NS          10
HMXB         9
LMXB         6
NS_BIN       2
YSO          2
Name: Class, dtype: int64
Class    ref                
AGN      2018ApJS..235....4O     93
CV       2003A&A...404..301R      1
         2005JAD....11....2D      1
         2018ApJS..235....4O      1
         2020RNAAS...4..219J     11
HM-STAR  2019ApJS..241...32L     14
HMXB     2006A&A...455.1165L      2
         2018ApJS..235....4O      4
         SIMBAD                   3
LM-STAR  2020AJ....160..120J     29
         LAMOST-DR8-AFGK        288
         LAMOST-DR8-M            96
LMXB     2007A&A...469..807L      2
         2018ApJS..235....4O      4
NS       2005AJ....129.1993M      5
         2018ApJS..235....4O      1
         SIMBAD                   4
NS_BIN   2003A&A...404..301R      2
YSO      2004ApJ...610.1045S      1
         2012AJ....144..192M      1
dtype: int64


In [87]:
where_new.loc[~where_new.name.isin(rare_sources_removed_dict), 'Class'].value_counts()

#where_new[~where_new.name.isin(rare_sources_removed_dict)].groupby(['Class','ref']).size()

LM-STAR    413
AGN         93
CV          14
HM-STAR     14
HMXB         8
NS           6
LMXB         6
NS_BIN       2
YSO          2
Name: Class, dtype: int64

In [74]:
where_new_rare = df_TD_final[(df_TD_final['name'].isin(TD_new['name'])) & (df_TD_final['rare-type']==1)]

print(where_new_rare['Class'].value_counts())
print(where_new_rare.groupby(['Class','ref']).size())#.value_counts()#.to_csv(f'{data_dir}/where_missingcsv',index=False)

TD_simbad[(TD_simbad['name'].isin(where_new_rare['name'])) ].to_csv(f'{data_dir}/where_new.csv',index=False)

CV        14
NS        10
HMXB       9
LMXB       6
NS_BIN     2
Name: Class, dtype: int64
Class   ref                
CV      2003A&A...404..301R     1
        2005JAD....11....2D     1
        2018ApJS..235....4O     1
        2020RNAAS...4..219J    11
HMXB    2006A&A...455.1165L     2
        2018ApJS..235....4O     4
        SIMBAD                  3
LMXB    2007A&A...469..807L     2
        2018ApJS..235....4O     4
NS      2005AJ....129.1993M     5
        2018ApJS..235....4O     1
        SIMBAD                  4
NS_BIN  2003A&A...404..301R     2
dtype: int64


In [70]:
TD_simbad[TD_simbad.name.isin(where_new.loc[(where_new['Class']=='LM-STAR') & (where_new['ref']=='2020AJ....160..120J'), 'name'])]

Unnamed: 0,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,name,ra,...,r0,r1,PA,PU,nbref,coo_bibcode,rare-type,remove_regions,GLAT,GLON
8332,,36.647319,62.326859,,LM-STAR,GKg_a,2020AJ....160..120J,0.493922,2CXO J022635.3+621937,36.647098,...,0.974,0.953,90.630000,0.974,,,4,,133.732220,1.467253
8912,,37.885535,-7.456915,,LM-STAR,Fd_a,2020AJ....160..120J,0.212633,2CXO J023132.5-072725,37.885501,...,0.982,0.872,9.922000,0.982,,,4,,177.888661,-59.181010
8913,TYC 4704-81-1,37.885538,-7.456935,0.0,Star,*|IR|X,SIMBAD,0.167670,2CXO J023132.5-072725,37.885501,...,0.982,0.872,9.922000,0.982,13.0,2020yCat.1350....0G,4,,177.888661,-59.181010
9328,,40.010370,61.128960,,LM-STAR,GKd_c,2020AJ....160..120J,0.398288,2CXO J024002.4+610744,40.010347,...,0.715,0.713,48.430000,0.715,,,4,,135.662551,0.970692
9329,BD+60 536,40.010361,61.128995,0.0,EB*,*|EB*|IR|PM*|V*|X,SIMBAD,0.271995,2CXO J024002.4+610744,40.010347,...,0.715,0.713,48.430000,0.715,9.0,2020yCat.1350....0G,4,,135.662551,0.970692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84711,HD 180299,289.139351,-16.130987,0.0,Star,*|IR,SIMBAD,0.075581,2CXO J191633.4-160751,289.139332,...,0.788,0.744,121.300003,0.788,6.0,2020yCat.1350....0G,4,,21.139187,-12.721153
84881,,291.238795,50.180389,,LM-STAR,GKd_c,2020AJ....160..120J,0.144638,2CXO J192457.3+501049,291.238834,...,0.719,0.714,66.370000,0.719,,,4,,81.827346,15.496032
84882,2MASS J19245731+5010494,291.238866,50.180385,0.0,Eruptive*,*|El*|Er*|IR|Ro*,SIMBAD,0.126084,2CXO J192457.3+501049,291.238834,...,0.719,0.714,66.370003,0.719,7.0,2020yCat.1350....0G,4,,81.827346,15.496032
85205,,297.018068,27.702742,,LM-STAR,BA,2020AJ....160..120J,0.210165,2CXO J194804.3+274209,297.018011,...,0.903,0.799,160.100000,0.903,,,4,,63.768282,1.102329


In [58]:
TD_simbad[TD_simbad.name=='2CXO J053515.7-052424']

Unnamed: 0,name_cat,ra_cat,dec_cat,e_Pos,Class,SubClass,ref,sep,name,ra,...,r0,r1,PA,PU,nbref,coo_bibcode,rare-type,remove_regions,GLAT,GLON
17917,[FKP2021] 134,83.81575,-5.406861,,LM-STAR,M4,2021ApJ...908...49F,0.170849,2CXO J053515.7-052424,83.815762,...,0.723,0.719,63.35,0.723,,,4,,209.025576,-19.394317
17918,COUP 736,83.815745,-5.406862,0.0,Star,*|IR|Rad|X,SIMBAD,0.179718,2CXO J053515.7-052424,83.815762,...,0.723,0.719,63.349998,0.723,13.0,2020yCat.1350....0G,4,,209.025576,-19.394317


In [None]:
# still want to save ??
# 2CXO J102347.6+003840
# 2CXO J133001.0+471343



# accept TD_rare_multi_good.csv, but still remove remove_reasons with 18 

# maybe 2CXO J130848.1+212707 a new NS PSR J1308+2127 (2012MNRAS.419.1525H)
# maybe 2CXO J184625.8+091949 is a new NS PSR J1846+0919 (Fermi source)


# many are from LMXBs in early-type galaxies. I. Chandra (Humphrey+, 2008)
# remove from 2012MNRAS.419.2095M

# keeping 

# 2012ApJ...759..123S: DEM L241, a supernova remnant containing a high-mass X-ray binary


# 47_Tuc
# 15.*(24+05.67/60)/60
# -(72+(4+52.6/60)/60)
# 43.8/60

# accept TD_rare_multi_good.csv, but still remove remove_reasons with 18 

# maybe 2CXO J130848.1+212707 a new NS PSR J1308+2127 (2012MNRAS.419.1525H)
# maybe 2CXO J184625.8+091949 is a new NS PSR J1846+0919 (Fermi source)


# many are from LMXBs in early-type galaxies. I. Chandra (Humphrey+, 2008)
# remove from 2012MNRAS.419.2095M

## do not use 2020AJ....159...43H CVs from LAMOST with Machine learning

### Cross-matching to multi-wavelength catalogs

In [4]:
#THIS PART TAKES several hours to run for the first time as it is performing all cross-matching. After that it saves to cache and only takes ~200 seconds to run.

df_ave = pd.read_csv(f'{data_dir}/{field_name}_ave.csv')

# cross-match with MW catalogs
start = time.time()
add_MW(df_ave, data_dir, field_name, Chandratype='CSC')
end = time.time() 
print(end - start)



I/350/gaiaedr3
cross-matching to gaia
cross-matching to gaiadist
II/246/out
cross-matching to 2mass
II/365/catwise
cross-matching to catwise
II/363/unwise
cross-matching to unwise
II/328/allwise
cross-matching to allwise
244.75267958641052


In [None]:
### cleaning the counterpart based on the combined PUs from X-ray sources and multiwavelength counterparts 

In [5]:
df_MW = pd.read_csv(f'{data_dir}/{field_name}_MW.csv')

# correcting a source's PU so that the correct counterpart can be matched
df_MW.loc[df_MW.name == '2CXO J182615.0-145055', 'err_ellipse_r0'] = 1.2 

df_MW_cf = confusion_clean(df_MW,X_PU='err_ellipse_r0',Chandratype='CSC')
df_MW_cf.to_csv(f'{data_dir}/{field_name}_MW_clean.csv',index=False)

df_MW_cf = pd.read_csv(f'{data_dir}/{field_name}_MW_clean.csv')
df_ave = TD_clean(df_MW_cf, remove_codes = [1, 32]) 

# removing a few unreliable sources
s_remove = np.where(df_ave.name_cat.isin(['Mon R2- 5b','Cl* NGC 2264    SBL    1025B','** RAT   17A','NGC 6611 374','NGC 6611 245','NGC 6611 213','2MASS J13121845-6237309','V* V700 Per','ESO-HA 1171', 'VSS II- 7']))[0]

df_ave.loc[s_remove, 'remove_code'] = df_ave.loc[s_remove, 'remove_code']+128

df_ave.to_csv(f'{data_dir}/{field_name}_MW_before_remove.csv', index=False)

df_remove = df_ave[df_ave['remove_code']==0].reset_index(drop=True)
print('Final breakdown', len(df_remove), Counter(df_remove['Class']))
df_remove.to_csv(f'{data_dir}/{field_name}_MW_remove.csv', index=False)




2275 counterparts matched for gaia
1854 counterparts matched for 2mass
1909 counterparts matched for catwise
2117 counterparts matched for unwise
1810 counterparts matched for allwise
[('LMXB', 1), ('NS_BIN', 1)]
[('AGN', 1390), ('CV', 44), ('HM-STAR', 117), ('HMXB', 26), ('LM-STAR', 243), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1038)]
Final breakdown 3002 Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 236, 'HM-STAR': 116, 'NS': 87, 'CV': 44, 'LMXB': 41, 'HMXB': 26, 'NS_BIN': 24})


In [6]:
print(len(df_MW_cf),Counter(df_MW_cf['Class']))

print(Counter(df_ave['remove_code']))

3012 Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 243, 'HM-STAR': 117, 'NS': 87, 'CV': 44, 'LMXB': 42, 'HMXB': 26, 'NS_BIN': 25})
Counter({0: 3002, 128: 8, 32: 1, 33: 1})


In [7]:
df = pd.read_csv(f'{data_dir}/{field_name}_MW_remove.csv')
df_final = prepare_cols(df, cp_thres=0, vphas=False,gaiadata=False,cp_conf_flag=False, TD=True, NS_MWdrop=False, STAR_classremove=['HM-STAR','LM-STAR','YSO'])


Remove 25 [('LM-STAR', 4), ('YSO', 21)]
Final breakdown 2977 [('AGN', 1390), ('CV', 44), ('HM-STAR', 116), ('HMXB', 26), ('LM-STAR', 232), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1017)]


In [9]:
df_final.head()

Unnamed: 0,name,ra,dec,PU,significance,Fcsc_s,e_Fcsc_s,Fcsc_m,e_Fcsc_m,Fcsc_h,...,W4mag,e_W4mag,cp_flag_allwise,Class,cp_flag_wise12,which_wise12,W1mag,e_W1mag,W2mag,e_W2mag
0,2CXO J000009.3+135618,0.039125,13.938494,0.79,1.95,1.671074e-15,1.098049e-15,4.844e-16,4.844e-16,1.989924e-15,...,8.827,0.462,0,AGN,0.0,allwise,15.489,0.045,14.605,0.059
1,2CXO J000230.7+004959,0.627958,0.833072,0.72,11.07,5.675297e-14,6.911736e-15,2.874798e-14,5.285001e-15,5.735159e-14,...,7.915,0.235,0,AGN,0.0,allwise,14.497,0.03,13.178,0.031
2,2CXO J000622.6-000424,1.594333,-0.073572,0.78,25.42,1.544e-13,1.22e-14,1.165798e-13,7.650074e-15,4.038e-13,...,8.398,,0,AGN,0.0,allwise,15.299,0.041,14.245,0.049
3,2CXO J000659.2-001740,1.747042,-0.294661,0.84,4.11,1.069845e-14,3.658216e-15,6.211934e-15,2.77351e-15,1.372718e-14,...,,,-8,AGN,0.0,catwise,16.549,0.037,15.65,0.057
4,2CXO J000703.6+155423,1.765,15.906575,0.72,8.63,4.66e-15,2.824e-15,1.593202e-14,5.625001e-15,5.35e-13,...,5.208,0.039,0,AGN,0.0,allwise,11.61,0.023,10.588,0.021


In [17]:
df_current = pd.read_csv('/home/orion51/Desktop/Research/MUWCLASS/MUWCLASS_CSCv2/files/CSC_TD_MW_remove.csv')
df_f = prepare_cols(df_current, cp_thres=0, vphas=False,gaiadata=False,cp_conf_flag=False, TD=True, NS_MWdrop=False, STAR_classremove=['HM-STAR','LM-STAR','YSO'])
df_final = df_current[df_current['name'].isin(df_f['name'])]

Remove 21 [('LM-STAR', 1), ('YSO', 20)]
Final breakdown 2941 [('AGN', 1390), ('CV', 44), ('HM-STAR', 118), ('HMXB', 26), ('LM-STAR', 207), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1004)]


In [18]:
df_final['SubClass']

0          Q
1          Q
2          Q
3          Q
4          A
        ... 
2957     III
2958    I/II
2959      II
2960      II
2961      II
Name: SubClass, Length: 2941, dtype: object

In [19]:
df_star = df_final[df_final['Class']=='LM-STAR'].reset_index(drop=True)
df_star['Spectral_type'] = df_star.apply(lambda r: r.SubClass[0:2],axis=1)
print(len(df_star))
print(df_star['Spectral_type'].value_counts())

df_GK = df_star[df_star['Spectral_type']=='GK'].reset_index(drop=True)

GK = df_f[df_f.name.isin(df_GK.name)]

GK = GK.rename(columns={'PU':'PU_X','Fcsc_s':'F_s','Fcsc_m':'F_m','Fcsc_h':'F_h','Fcsc_b':'F_b',\
                            'Gmag':'G','BPmag':'BP','RPmag':'RP','Jmag':'J','Hmag':'H','Kmag':'K','W1mag':'W1','W2mag':'W2','W3mag':'W3',\
                            'var_intra_prob':'P_intra', 'var_inter_prob':'P_inter'})

GK.reset_index(drop=True).to_csv('./data/TD_GK.csv',index=False)

df_star['Spectral_type'] = df_star.apply(lambda r: r.SubClass[0],axis=1)
print(len(df_star))
print(df_star['Spectral_type'].value_counts())

for tp in ['G','K','M']:

    df_tp = df_star[df_star['Spectral_type']==tp].reset_index(drop=True)

    tpe = df_f[(df_f.name.isin(df_tp.name)) & ~(df_f.name.isin(GK.name))].reset_index(drop=True)

    tpe = tpe.rename(columns={'PU':'PU_X','Fcsc_s':'F_s','Fcsc_m':'F_m','Fcsc_h':'F_h','Fcsc_b':'F_b',\
                            'Gmag':'G','BPmag':'BP','RPmag':'RP','Jmag':'J','Hmag':'H','Kmag':'K','W1mag':'W1','W2mag':'W2','W3mag':'W3',\
                            'var_intra_prob':'P_intra', 'var_inter_prob':'P_inter'})
    print(len(tpe))
    
    tpe.to_csv(f'./data/TD_{tp}.csv',index=False)



207
GK    21
F8    17
F5    17
Md    13
G0    13
Fd    12
A2    11
A0     9
G5     9
K0     9
F2     9
F0     5
K1     4
G8     4
F6     4
F4     3
M      3
M5     3
F      3
A1     2
G6     2
A3     2
K2     2
A6     2
G9     2
F7     2
F3     2
G3     2
M3     2
F9     2
M4     1
A8     1
K      1
G1     1
K3     1
A7     1
M1     1
A4     1
A      1
G7     1
K5     1
F1     1
M7     1
A5     1
M0     1
G2     1
Name: Spectral_type, dtype: int64
207
F    77
G    56
A    31
M    25
K    18
Name: Spectral_type, dtype: int64
35
18
25


In [None]:
df_M = df_star[df_star['Spectral_type']=='M'].reset_index(drop=True)

df_Md = df_f[df_f.name.isin(df_M.name)]

In [5]:
df_star = df_final[df_final['Class']=='LM-STAR'].reset_index(drop=True)
df_star['Spectral_type'] = df_star.apply(lambda r: r.SubClass[0:2],axis=1)
print(len(df_star))
print(df_star['Spectral_type'].value_counts())
df_M = df_star[df_star['Spectral_type']=='M'].reset_index(drop=True)
df_apoge = df_star[df_star['ref']=='2020AJ....160..120J'].reset_index(drop=True)
df_spec = df_star[df_star['ref']!='2020AJ....160..120J'].reset_index(drop=True)
print(len(df_apoge),len(df_spec))
print(df_apoge['SubClass'].value_counts())
print(df_spec['SubClass'].value_counts())
#df_spec['Spectral_type'] = df_spec.apply(lambda r: r.SubClass[0],axis=1)
print(df_spec['Spectral_type'].value_counts())
#'2020AJ....160..120J'

207
GK    21
F8    17
F5    17
Md    13
G0    13
Fd    12
A2    11
A0     9
G5     9
K0     9
F2     9
F0     5
K1     4
G8     4
F6     4
F4     3
M      3
M5     3
F      3
A1     2
G6     2
A3     2
K2     2
A6     2
G9     2
F7     2
F3     2
G3     2
M3     2
F9     2
M4     1
A8     1
K      1
G1     1
K3     1
A7     1
M1     1
A4     1
A      1
G7     1
K5     1
F1     1
M7     1
A5     1
M0     1
G2     1
Name: Spectral_type, dtype: int64
46 161
GKd_b    8
GKd_c    6
Md_a     5
Fd_b     5
Fd_a     4
GKd_a    4
Md_c     3
Md_d     3
Fd_c     2
Md_b     2
GKd_d    1
GKg_a    1
Fd_d     1
GKg_c    1
Name: SubClass, dtype: int64
F8       14
F5       12
K0        8
F2        8
A2        8
         ..
F3        1
F5/7      1
K5        1
F6/7V     1
K1        1
Name: SubClass, Length: 75, dtype: int64
F5    17
F8    17
G0    13
A2    11
A0     9
G5     9
K0     9
F2     9
F0     5
K1     4
G8     4
F6     4
M      3
F4     3
F      3
M5     3
K2     2
A3     2
F7     2
A6     2
G9   

In [23]:
df_Md = df_f[df_f.name.isin(df_M.name)]
df_Md.columns[:40]

Index(['name', 'ra', 'dec', 'PU', 'significance', 'Fcsc_s', 'e_Fcsc_s',
       'Fcsc_m', 'e_Fcsc_m', 'Fcsc_h', 'e_Fcsc_h', 'flux_aper90_ave_b',
       'e_flux_aper90_ave_b', 'var_intra_prob', 'var_inter_prob', 'CSC_flags',
       'EDR3Name_gaia', 'RA_pmcor_gaia', 'DEC_pmcor_gaia', 'Gmag', 'e_Gmag',
       'BPmag', 'e_BPmag', 'RPmag', 'e_RPmag', 'Plx_gaia', 'e_Plx_gaia',
       'PM_gaia', 'rgeo', 'b_rgeo', 'B_rgeo', 'rpgeo', 'b_rpgeo', 'B_rpgeo',
       'cp_flag_gaia', '_2MASS_2mass', 'Jmag', 'e_Jmag', 'Hmag', 'e_Hmag'],
      dtype='object')

In [24]:
df_Md = df_Md.rename(columns={'PU':'PU_X','Fcsc_s':'F_s','Fcsc_m':'F_m','Fcsc_h':'F_h','Fcsc_b':'F_b',\
                            'Gmag':'G','BPmag':'BP','RPmag':'RP','Jmag':'J','Hmag':'H','Kmag':'K','W1mag':'W1','W2mag':'W2','W3mag':'W3',\
                            'var_intra_prob':'P_intra', 'var_inter_prob':'P_inter'})
# add broad band flux
#TD[]
df_Md.reset_index(drop=True).to_csv('./data/TD_Mdwarf.csv',index=False)