In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from astropy.io.fits import getdata
from astropy import units as u
from astropy.coordinates import SkyCoord, Angle
from astroquery.vizier import Vizier
from astropy.table import Table
from astroquery.xmatch import XMatch
import time

import sys  
sys.path.insert(0, '../')
#sys.path.insert(0, '/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/ML_pipelines_merge/MUWCLASS_pipeline_github/MUWCLASS/')
#from muwclass import datasets, prep, red,  classify, distribution, plots, prepare_data, prepare_cxo

from prepare_library import atnf_pos, create_perobs_data, cal_ave, add_MW, confusion_clean, TD_clean
from muwclass_library import prepare_cols

Vizier.ROW_LIMIT = -1
exnum = -999999.

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
data_dir = './data'
field_name = 'CSC_TD'
# 11122021: not removing stars without MW counterparts, not using PU = max(sep, PU)
verb = 0

#query_dir = '/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/query'
query_dir = '../demo/data/query'

# YSOs 

In [43]:
YSO1 = Vizier(catalog="J/AJ/144/192",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Cl='=|P|D')[0]
YSO1 = YSO1['_RAJ2000','_DEJ2000','Cl'].to_pandas().rename(columns={'Cl':'SubClass'})
YSO1['e_Pos'], YSO1['ref'] = np.nan, '2012AJ....144..192M'
print(len(YSO1),Counter(YSO1['SubClass']))

YSO2 = Vizier(catalog="J/ApJS/194/14/catalog",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Stage='!=A')[0]
YSO2 = YSO2['_RAJ2000','_DEJ2000','Stage'].to_pandas().rename(columns={'Stage':'SubClass'})
YSO2['e_Pos'], YSO2['ref'] = np.nan, '2011ApJS..194...14P'
print(len(YSO2),Counter(YSO2['SubClass']))


YSO3 = Vizier(catalog="J/A+A/429/963",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000','e_Pos']).query_constraints(Class='!=nIII')[0]
YSO3 = YSO3['_RAJ2000','_DEJ2000','e_Pos','Class'].to_pandas().rename(columns={'Class':'SubClass'})
YSO3['ref'] = '2005A&A...429..963O'
print(len(YSO3),Counter(YSO3['SubClass']))


YSO4 = Vizier(catalog="J/A%2bA/463/275/table5",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(clYSO='=|I|I/II|II|II/III|III')[0]
YSO4 = YSO4['_RAJ2000','_DEJ2000','clYSO'].to_pandas().rename(columns={'clYSO':'SubClass'})
YSO4['e_Pos'], YSO4['ref'] = np.nan, '2007A&A...463..275G'
print(len(YSO4),Counter(YSO4['SubClass']))

YSO5 = Vizier(catalog="J/ApJS/196/4",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(St='=|k|n')[0]
YSO5 = YSO5['_RAJ2000','_DEJ2000','St'].to_pandas().rename(columns={'St':'SubClass'})
YSO5['e_Pos'], YSO5['ref'] = np.nan,  '2011ApJS..196....4R'
print(len(YSO5),Counter(YSO5['SubClass']))

YSO6 = Vizier(catalog="J/A%2bA/531/A141/catalog",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(MmD='=2')[0]
YSO6 = YSO6['_RAJ2000','_DEJ2000','MmD'].to_pandas().rename(columns={'MmD':'SubClass'})
YSO6['e_Pos'], YSO6['ref'] = np.nan, '2011A&A...531A.141D'
print(len(YSO6),Counter(YSO6['SubClass']))



3419 Counter({'D': 2991, 'P': 428})
808 Counter({'II': 478, '0/I': 247, 'III': 83})
72 Counter({'II': 26, '': 22, 'III': 17, 'I': 7})
56 Counter({'II': 20, 'III': 16, 'I/II': 9, 'I': 9, 'II/III': 2})
272 Counter({'k': 178, 'n': 94})
308 Counter({2: 308})




In [44]:
df_YSOs = pd.concat([YSO1, YSO2, YSO3, YSO4, YSO5, YSO6])
df_YSOs = df_YSOs.reset_index(drop=True)
df_YSOs['Class']='YSO'
print(len(df_YSOs),'YSOs')

4935 YSOs


# STARs

In [9]:
stars = Vizier(catalog="B/mk/mktypes",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Mag='<=23')[0]
stars = stars['_RAJ2000','_DEJ2000','Name','SpType','Bibcode','Remarks','Mag'].to_pandas()
stars = stars.replace(r'^\s*$', np.nan, regex=True)
print(len(stars)) 

937000


In [46]:
stars_d1 = stars[stars['SpType'].str.contains('e|s|n|p|f|cv|i|r|a|D|C|cont|l|H|h|abs|\+|\*|\:|\?', na=False)]
stars_f1 = stars[~stars.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d1.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f1 = stars_f1.reset_index(drop=True)

print(len(stars))
print(len(stars_d1))
print(len(stars_f1))


stars_d2 = stars_f1[stars_f1['Remarks'].isnull() == False]
stars_f2 = stars_f1[~stars_f1.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d2.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f2 = stars_f2.reset_index(drop=True)

print(len(stars_d2))
print(len(stars_f2))


stars_d3 = stars_f2[stars_f2['Name'].str.contains('H97b')]
stars_f3 = stars_f2[~stars_f2.set_index(['_RAJ2000','_DEJ2000']).index.isin(stars_d3.set_index(['_RAJ2000','_DEJ2000']).index)]
stars_f3 = stars_f3.reset_index(drop=True)

print(len(stars_d3))
print(len(stars_f3))

937000
238222
622683
69952
516991
692
516182


In [47]:

star_hm = stars_f3[stars_f3['SpType'].str.startswith(tuple(['O','B','W']), na=False)]
star_lm = stars_f3[stars_f3['SpType'].str.startswith(tuple(['A','F','G','K','M']), na=False)]
star_hm = star_hm.reset_index(drop=True)
star_lm = star_lm.reset_index(drop=True)

star_hm['e_Pos'], star_hm['Class'] = np.nan, 'HM-STAR'
star_lm['e_Pos'], star_lm['Class'] = np.nan, 'LM-STAR'
df_HMSTARs = star_hm.rename(columns={'Name':'name_cat','SpType':'SubClass','Bibcode':'ref'}).drop(columns=['Remarks','Mag'])
df_LMSTARs = star_lm.rename(columns={'Name':'name_cat','SpType':'SubClass','Bibcode':'ref'}).drop(columns=['Remarks','Mag'])
print(len(df_HMSTARs))
print(len(df_LMSTARs))



62124
450224


In [48]:
APOGEE_all = Vizier(catalog="III/284/allstars",row_limit=-1,
    columns=['*','_RAJ2000', '_DEJ2000','AName','Giant','Star']).query_constraints(Teff='>3000 & <10000',logg='>-1 & <7')[0]

APOGEE = APOGEE_all['_RAJ2000','_DEJ2000','AName','Giant','Star','TClass','Teff','logg','s_HRV','errHRV'].to_pandas()

#APOGEE_STAR = APOGEE[(APOGEE.s_HRV <= 1) & (APOGEE.s_HRV <= 5*APOGEE.errHRV)].reset_index(drop=True)# & APOGEE.Teff.isnull() & (APOGEE.logg <= 7) & (APOGEE.logg >= -1) ]

APOGEE_STAR = APOGEE[(APOGEE.s_HRV <= 1) & (APOGEE.s_HRV <= 5*APOGEE.errHRV) & (APOGEE.Star == 1)].reset_index(drop=True)# & APOGEE.Teff.isnull() & (APOGEE.logg <= 7) & (APOGEE.logg >= -1) ]

APOGEE_STAR['e_Pos'], APOGEE_STAR['Class'], APOGEE_STAR['ref'] = np.nan, 'LM-STAR', '2020AJ....160..120J'
APOGEE_STAR = APOGEE_STAR.rename(columns={'AName':'name_cat','TClass':'SubClass'})
APOGEE_STAR = APOGEE_STAR.replace('none', np.nan, regex=True)
APOGEE_STAR = APOGEE_STAR.drop(columns=['Giant','Star','Teff','logg','s_HRV','errHRV'])
print(len(APOGEE_STAR))

310171


## WRs

In [49]:
WRs1 = Vizier(catalog="III/215",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000','OName']).query_constraints()[0]
WRs1 = WRs1['_RAJ2000','_DEJ2000','Name','OName','Aname'].to_pandas()
WRs1['Class'], WRs1['e_Pos'], WRs1['ref'], WRs1['SubClass'] = 'HM-STAR', np.nan, '2001NewAR..45..135V', np.nan#III/215
WRs1 = WRs1.replace(r'^\s*$', np.nan, regex=True)
WRs1['name_cat'] = WRs1['Name'].combine_first(WRs1['OName'].combine_first(WRs1['Aname']))
df_WRs1 = WRs1.drop(columns=['Name','OName','Aname'])
print(len(df_WRs1))


226


In [50]:
WRs2 = Vizier(catalog="J/A+A/458/453/table1",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
WRs2 = WRs2['_RAJ2000','_DEJ2000','SpType','SpType0','SimbadName','WRori'].to_pandas()
WRs2['Class'], WRs2['e_Pos'], WRs2['ref']= 'HM-STAR', np.nan, '2006A&A...458..453V'#J/A+A/458/453/table1
WRs2 = WRs2.replace(r'^\s*$', np.nan, regex=True)
WRs2['name_cat'] = WRs2['SimbadName'].combine_first(WRs2['WRori'])
WRs2['SubClass'] = WRs2['SpType'].combine_first(WRs2['SpType0'])
df_WRs2 = WRs2.drop(columns=['SpType','SpType0','SimbadName','WRori'])
print(len(df_WRs2))


118


# Quasars & AGNs 

In [51]:
AGNs = Vizier(catalog="VII/258/vv10",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints(Cl='|Q|A|B')[0]
AGNs = AGNs['_RAJ2000','_DEJ2000','Name','Cl'].to_pandas()
AGNs['Class'], AGNs['e_Pos'], AGNs['ref']= 'AGN', np.nan, '2010A&A...518A..10V'#VII/258/vv10
df_AGNs = AGNs.rename(columns={'Name':'name_cat','Cl':'SubClass'})

print(len(df_AGNs), Counter(df_AGNs['SubClass']))

168940 Counter({'Q': 133335, 'A': 34231, 'B': 1374})


# HMXBs

In [52]:
HMXBs = Vizier(catalog="J/A+A/455/1165/table1",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
HMXBs = HMXBs['_RAJ2000','_DEJ2000','Name','Type'].to_pandas()
HMXBs['Class'], HMXBs['e_Pos'], HMXBs['ref'] = 'HMXB', np.nan, '2006A&A...455.1165L'#J/A+A/455/1165/table1
df_HMXBs = HMXBs.rename(columns={'Name':'name_cat','Type':'SubClass'})
print(len(df_HMXBs))


114


# LMXBs

In [53]:
LMXBs1 = Vizier(catalog="J/A+A/469/807",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
LMXBs1 = LMXBs1['_RAJ2000','_DEJ2000','Name','Type'].to_pandas()
LMXBs1['Class'], LMXBs1['e_Pos'], LMXBs1['ref'] = 'LMXB', np.nan, '2007A&A...469..807L'#J/A+A/469/807
df_LMXBs1 = LMXBs1.rename(columns={'Name':'name_cat','Type':'SubClass'})
print(len(df_LMXBs1))


187


In [54]:
LMXBs2 = Vizier(catalog="B/cb/lmxbdata",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000','epos']).query_constraints()[0]

#'''
LMXBs2 = LMXBs2['_RAJ2000','_DEJ2000','epos','Name','Type1'].to_pandas()
LMXBs2['Class'], LMXBs2['ref'] = 'LMXB', '2003A&A...404..301R' #B/cb/lmxbdata
df_LMXBs2 = LMXBs2.rename(columns={'Name':'name_cat','Type1':'SubClass','epos':'e_Pos'})
print(len(df_LMXBs2))
#'''

108


# CVs

In [55]:
CVs1 = Vizier(catalog="V/123A",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000']).query_constraints()[0]
CVs1 = CVs1['_RAJ2000','_DEJ2000','Names','VarType'].to_pandas()
CVs1['Class'], CVs1['e_Pos'], CVs1['ref'] = 'CV', np.nan, '2005JAD....11....2D'#'2001PASP..113..764D'#V/123A
CVs1 = CVs1[CVs1['VarType']!='non-CV'].reset_index(drop=True)
df_CVs1 = CVs1.rename(columns={'Names':'name_cat','VarType':'SubClass'})
print(len(df_CVs1))


1618


In [56]:
CVs2 = Vizier(catalog="B/cb/cbdata",row_limit=-1,
    columns=['*', '_RAJ2000', '_DEJ2000','epos']).query_constraints()[0]
CVs2 = CVs2['_RAJ2000','_DEJ2000','epos','Name','Type1'].to_pandas()
CVs2['Class'], CVs2['ref'] = 'CV', '2003A&A...404..301R'#B/cb/cbdata
df_CVs2 = CVs2.rename(columns={'Name':'name_cat','Type1':'SubClass','epos':'e_Pos'})
print(len(df_CVs2))


1429


# NS & NS_BIN

In [57]:
import urllib3

http = urllib3.PoolManager()
r = http.request('GET', 'https://www.atnf.csiro.au/research/pulsar/psrcat/proc_form.php?version=1.65&Name=Name&RaJ=RaJ&DecJ=DecJ&Binary=Binary&Type=Type&startUserDefined=true&c1_val=&c2_val=&c3_val=&c4_val=&sort_attr=jname&sort_order=asc&condition=&pulsar_names=&ephemeris=short&coords_unit=raj%2Fdecj&radius=&coords_1=&coords_2=&style=Long+csv+with+errors&no_value=*&fsize=3&x_axis=&x_scale=linear&y_axis=&y_scale=linear&state=query&table_bottom.x=35&table_bottom.y=15') # it's a file like object and works just like a file
r.status

ATNF = r.data.decode('utf-8').partition('\n<pre>\n')[2].partition('\n</pre>\n')[0].replace('*',' ').split('\n')
#print(ATNF)
#ATNF = str(r.data).partition('\\n<pre>\\n')[2].partition('\\n</pre>\\n')[0]

NSs = pd.DataFrame(columns=['src', 'NAME','Name_ref','RAJ','e_RA','RAJ_ref','DECJ','e_DEC','DECJ_ref','Binary','Binary_ref','PSR_type','Type_ref'], 
                data=[row.split(';') for row in ATNF[2:]])
#NSs = pd.DataFrame(ATNF)
#NSs = pd.read_csv(ATNF, sep=';',lineterminator='\n' )
#print(NSs)
NSs['_RAJ2000'] = NSs.apply(lambda row: atnf_pos(row.RAJ, row.e_RA, 'hms', 'pos'), axis=1)
NSs['_e_RAJ2000'] = NSs.apply(lambda row: atnf_pos(row.RAJ, row.e_RA, 'hms', 'err'), axis=1)
NSs['_DEJ2000'] = NSs.apply(lambda row: atnf_pos(row.DECJ, row.e_DEC, 'dms', 'pos'), axis=1)
NSs['_e_DEJ2000'] = NSs.apply(lambda row: atnf_pos(row.DECJ, row.e_DEC, 'dms', 'err'), axis=1)
#print(NSs[['_e_RAJ2000','_e_DEJ2000']])
NSs['e_Pos'] = NSs.apply(lambda row: max(row._e_RAJ2000 , row._e_DEJ2000), axis=1)

#'''
NSs.loc[NSs.NAME=='J1819-1458', '_RAJ2000'] = 274.8924
NSs.loc[NSs.NAME=='J1819-1458', '_DEJ2000'] = -14.9676579999999
NSs.loc[NSs.NAME=='J1741-2054', '_RAJ2000'] = 265.48868
NSs.loc[NSs.NAME=='J1741-2054', '_DEJ2000'] = -20.903278
#'''

NSs['ref']= '2005AJ....129.1993M'#B/psr/psr
NSs['Class'] = 'NS_BIN'
print(len(NSs),Counter(NSs['Binary']))
NSs.loc[NSs['Binary']==' ', 'Class'] = 'NS'


new_NS_BINs = pd.read_csv(f'{data_dir}/new_NS_BIN.csv')
print(NSs.loc[NSs.NAME.isin(new_NS_BINs.name_cat.values), ['NAME','Binary','Class']])
#print(NSs.loc[NSs.NAME.isin(['J0437-4715']), ['NAME','Binary','Class']])

df_NSs = NSs[['NAME','_RAJ2000','_DEJ2000','e_Pos','Class','PSR_type','ref']].rename(columns={'NAME':'name_cat','PSR_type':'SubClass'})
print(len(df_NSs))

print(len(df_NSs),Counter(df_NSs['Class']))


3177 Counter({' ': 2844, 'ELL1': 143, 'BT': 107, 'DD': 39, 'DDH': 14, 'ELL1H': 10, 'BTX': 7, 'DDGR': 4, 'T2': 3, 'MSS': 3, 'DDS': 1, 'BT2P': 1, 'DDK': 1})
             NAME Binary   Class
6      J0023+0923   ELL1  NS_BIN
60     J0101-6422     BT  NS_BIN
313   J0737-3039B     DD  NS_BIN
560    J1124-3653             NS
644    J1231-1411     BT  NS_BIN
714    J1311-3430   ELL1  NS_BIN
921    J1514-4946   ELL1  NS_BIN
984      B1534+12     DD  NS_BIN
1107   J1614-2230  ELL1H  NS_BIN
1172   J1628-3205     BT  NS_BIN
1294   J1653-0158   ELL1  NS_BIN
1494   J1731-1847    BTX  NS_BIN
1836   J1810+1744     BT  NS_BIN
1881   J1816+4510   ELL1  NS_BIN
2545   J1909-3744   ELL1  NS_BIN
2909     B1957+20     BT  NS_BIN
2953   J2017+0603  ELL1H  NS_BIN
3002   J2043+1711   ELL1  NS_BIN
3013   J2047+1053     BT  NS_BIN
3018   J2051-0827   ELL1  NS_BIN
3100   J2214+3000   ELL1  NS_BIN
3102   J2215+5135   ELL1  NS_BIN
3106   J2222-0137     DD  NS_BIN
3125   J2241-5236     BT  NS_BIN
3133   J2256-1024   

# HMXBs, LMXBs, and CVs from INTEGRAL General Reference Catalog (IGRS) and HMXBs from Be Star catalog

Bestar = pd.read_csv('/Users/yanghui/Desktop/Research/2020/Proposal/XMM/BeSS_Epsilon/master_Simbad.csv')
HMXB_Be = Bestar[Bestar['main_type'].str.contains('HMXB', na=False)].drop_duplicates(subset=['main_id'])[['main_id','ra_Simbad','dec_Simbad','type']]
df_HMXB_Be = HMXB_Be.rename(columns={'main_id':'name_cat','ra_Simbad':'_RAJ2000','dec_Simbad':'_DEJ2000','type':'SubClass'})
df_HMXB_Be['Class'], df_HMXB_Be['e_Pos'], df_HMXB_Be['ref'] = 'HMXB', np.nan, 'Simbad'
df_HMXB_Be = df_HMXB_Be.reset_index(drop=True)
print(len(df_HMXB_Be))
df_HMXB_Be.to_csv('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/MoreSources/HMXBBeStar/HMXBBeStar.csv',index=False)

IGRS = pd.read_csv('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/versions/CSC_TD_v4.csv')[3466:].reset_index(drop=True)[['name_cat','ra_cat','dec_cat','error','Class','SubClass']]
IGRS['ref'] = 'INTEGRAL General Reference Catalog'
df_IGRS = IGRS.rename(columns={'ra_cat':'_RAJ2000','dec_cat':'_DEJ2000','error':'e_Pos'})
print(len(df_IGRS))

In [58]:
df_HMXB_Be = pd.read_csv(f'{data_dir}/raretype_BeStar_IGRS.csv')
print(Counter(df_HMXB_Be['Class']))

Counter({'HMXB': 55, 'LMXB': 8, 'CV': 5})


# Combining sources together

In [59]:
df_TD = pd.concat([df_AGNs, df_YSOs, df_LMSTARs, APOGEE_STAR, df_HMSTARs, df_WRs1, df_WRs2, df_NSs, df_HMXBs, df_LMXBs1, df_LMXBs2, df_CVs1, df_CVs2, df_HMXB_Be], ignore_index=True, sort=False)

In [60]:
df_TD.head(5)

Unnamed: 0,_RAJ2000,_DEJ2000,name_cat,SubClass,Class,e_Pos,ref
0,0.005417,-2.033333,FIRST J00000-0202,Q,AGN,,2010A&A...518A..10V
1,0.005833,-30.6075,2QZ J000001-3036,Q,AGN,,2010A&A...518A..10V
2,0.007083,-31.373889,2QZ J000001-3122,Q,AGN,,2010A&A...518A..10V
3,0.01125,-25.193611,XMM J00000-2511,Q,AGN,,2010A&A...518A..10V
4,0.011667,-35.059167,MS 23574-3520,Q,AGN,,2010A&A...518A..10V


In [61]:
print(len(df_TD), sorted(Counter(df_TD['Class']).items()))


1003439 [('AGN', 168940), ('CV', 3052), ('HM-STAR', 62468), ('HMXB', 169), ('LM-STAR', 760395), ('LMXB', 303), ('NS', 2844), ('NS_BIN', 333), ('YSO', 4935)]


In [62]:
# matching with CSCv2

TD_CSC = XMatch.query(cat1= Table.from_pandas(df_TD), #open('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/versions/CSC_TD_v5_09062021.csv'),
                      cat2='vizier:IX/57/csc2master',
                      max_distance=3*u.arcsec, colRA1='_RAJ2000',colDec1='_DEJ2000')

TD_CSC = TD_CSC.to_pandas()
print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))


11331 [('AGN', 6174), ('CV', 342), ('HM-STAR', 895), ('HMXB', 77), ('LM-STAR', 1862), ('LMXB', 166), ('NS', 222), ('NS_BIN', 148), ('YSO', 1445)]


In [63]:
TD_CSC.head(5)

TD_CSC.to_csv(f'{data_dir}/{field_name}_Xmatch_all.csv',index=False)



In [64]:
TD_CSC = pd.read_csv(f'{data_dir}/{field_name}_Xmatch_all.csv')

TD_CSC = TD_CSC.sort_values(by=['angDist']) 
#print(TD_CSC[TD_CSC.duplicated(subset=['_RAJ2000_1', '_DEJ2000_1', 'name_cat', 'SubClass', 'Class','e_Pos', 'ref'])])
TD_CSC = TD_CSC.drop_duplicates(subset=['_RAJ2000_1', '_DEJ2000_1', 'name_cat', 'SubClass', 'Class','e_Pos', 'ref']).reset_index(drop=True)

print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))


10723 [('AGN', 5978), ('CV', 275), ('HM-STAR', 767), ('HMXB', 68), ('LM-STAR', 1834), ('LMXB', 139), ('NS', 165), ('NS_BIN', 90), ('YSO', 1407)]




#Table.from_pandas(df)
TD_CSC = XMatch.query(cat1= Table.from_pandas(df_TD), #open('/Users/yanghui/Desktop/Research/2019/MUWCLASS_Project/ML/DATA/TD/versions/CSC_TD_v5_09062021.csv'),
                      cat2='vizier:IX/57/csc2master',
                      max_distance=2*u.arcsec, colRA1='_RAJ2000',colDec1='_DEJ2000')

TD_CSC = TD_CSC.to_pandas()
print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))

8815 [('AGN', 5549), ('CV', 183), ('HM-STAR', 343), ('HMXB', 75), ('LM-STAR', 864), ('LMXB', 77), ('NS', 162), ('NS_BIN', 89), ('WR', 92), ('YSO', 1381)]

In [65]:
TD_CSC['PU'] = np.sqrt(TD_CSC.e_Pos.fillna(0)*2**2+TD_CSC.r0.fillna(0)**2)

TD_CSC['name'] = TD_CSC.apply(lambda row: '2CXO '+str(row['2CXO']),axis=1)

#TD_CSC.to_csv(f'{data_dir}/TD_check_all.csv',index=False)

idx = np.where( ((TD_CSC['angDist']>TD_CSC['PU']) | (TD_CSC['PU'] >1.) )& ((TD_CSC['Class']=='AGN') | (TD_CSC['Class']=='YSO') | (TD_CSC['Class']=='HM-STAR') | (TD_CSC['Class']=='LM-STAR') ))[0]
print('Remove', len(idx), sorted(Counter(TD_CSC.loc[idx, 'Class']).items()))
TD_CSC = TD_CSC.drop(TD_CSC.index[idx])
TD_CSC = TD_CSC.reset_index(drop=True)
print(len(TD_CSC), sorted(Counter(TD_CSC['Class']).items()))

TD = TD_CSC.rename(columns={'_RAJ2000_1':'ra_cat','_DEJ2000_1':'dec_cat','angDist':'sep','RAICRS':'ra','DEICRS':'dec'})[['name_cat','ra_cat','dec_cat','e_Pos','Class','SubClass','ref','sep','name','ra','dec','r0','PU']].sort_values(by=['Class','ra']).reset_index(drop=True)


TD['remove_code'] = 0



Remove 6440 [('AGN', 4485), ('HM-STAR', 364), ('LM-STAR', 1108), ('YSO', 483)]
4283 [('AGN', 1493), ('CV', 275), ('HM-STAR', 403), ('HMXB', 68), ('LM-STAR', 726), ('LMXB', 139), ('NS', 165), ('NS_BIN', 90), ('YSO', 924)]


In [66]:
TD_simbad = XMatch.query(cat1=Table.from_pandas(TD),
                         cat2='vizier:SIMBAD',max_distance=3 * u.arcsec, colRA1='ra',colDec1='dec')
TD_simbad = TD_simbad.to_pandas().rename(columns={'ra_1':'ra', 'dec_1':'dec','ra_2':'ra_simbad', 'dec_2':'dec_simbad', 'angDist':'_r_simbad'}).sort_values(by=['_r_simbad']) 
TD_simbad = TD_simbad.drop_duplicates(subset=['name_cat','ra_cat','dec_cat','Class','name'], keep='first').reset_index(drop=True)
print(len(TD_simbad))
TD= pd.merge(TD, TD_simbad, how='outer', on = ['name_cat','ra_cat','dec_cat','e_Pos','Class','SubClass','ref', 'sep', 'name', 'ra', 'dec', 'r0','PU','remove_code'])

TD.loc[TD.name_cat.isnull(), 'name_cat'] = TD.loc[TD.name_cat.isnull(), 'main_id']
TD.loc[TD.name_cat.isnull(), 'name_cat'] = TD.loc[TD.name_cat.isnull(), 'name']
print(TD.columns[:50])


4105
Index(['name_cat', 'ra_cat', 'dec_cat', 'e_Pos', 'Class', 'SubClass', 'ref',
       'sep', 'name', 'ra', 'dec', 'r0', 'PU', 'remove_code', '_r_simbad',
       'main_id', 'ra_simbad', 'dec_simbad', 'coo_err_maj', 'coo_err_min',
       'coo_err_angle', 'nbref', 'ra_sexa', 'dec_sexa', 'coo_qual',
       'coo_bibcode', 'main_type', 'other_types', 'radvel', 'radvel_err',
       'redshift', 'redshift_err', 'sp_type', 'morph_type', 'plx', 'plx_err',
       'pmra', 'pmdec', 'pm_err_maj', 'pm_err_min', 'pm_err_pa', 'size_maj',
       'size_min', 'size_angle', 'B', 'V', 'R', 'J', 'H', 'K'],
      dtype='object')


move2YSOs = [ '[DWS84] 20','HJM C 1-25','HJM C 5-3','[HAD2004] NGC 1579 210','[HAD2004] NGC 1579 260',
    'MBO 66','MBO 62','MB045','Cl* NGC 2024 LSE 12','Cl* NGC 2024 LSE 3','Cl* NGC 2024 LSE 6',
    'Cl* NGC 2024 LSE 16','Cl* NGC 2024 LSE 2','[CCE98] 42','[CFB2003] Par-Lup3-2','ASR 107','ASR 43',
    'ASR 36','[PNJ2003] 7']
print(len(move2YSOs))
s = np.where(TD.name_cat.isin(move2YSOs))[0]

TD.loc[s, :].to_csv(f'{data_dir}/{field_name}_YSO_clean.csv',index=False)

src2YSO = ['MBO 62']
#s = np.where((TD['name_cat'].isin(src2YSO)))[0]# | (TD['ref']=='2004ApJ...610.1045S') )[0] #& (TD['name_cat'].str.contains('PNJ')))[0]
#s = np.where( (TD['ref']=='2004ApJ...610.1045S') )[0] #& (TD['name_cat'].str.contains('PNJ')))[0]
s = np.where(((TD['main_type'].str.contains('Orion_V*|TTau*|YSO')) & (TD['Class']!='YSO')))[0]

'''
ra_Orion = (5+(35+17.3/60)/60)*15
dec_Orion = (-1)*(5+(23+28/60)/60)
TD['sep_Orion'] = TD.apply(lambda row: SkyCoord(row.ra*u.deg, row.dec*u.deg, frame='icrs').separation(SkyCoord(ra_Orion*u.deg, dec_Orion*u.deg, frame='icrs')).arcsec, axis=1)
s = np.where((TD['sep_Orion']<180) & (TD['Class']!='YSO'))[0]
'''
#print(Counter(TD.loc[s, 'main_type']))
print(TD.loc[s, :])

#TD.loc[s, :].to_csv(f'{data_dir}/{field_name}_YSO_clean.csv',index=False)

In [67]:
## Re-assign those STARs which are actually YSOs 
src2YSO = ['MBO 62']
s = np.where(((TD['main_type'].str.contains('Orion_V*|TTau*|YSO')) & (TD['Class']!='YSO')) | (TD['name_cat'].isin(src2YSO)) | (TD['ref']=='2004ApJ...610.1045S') )[0] #& (TD['name_cat'].str.contains('PNJ')))[0]
print("Converting", sorted(Counter(TD['Class'][s]).items()), "sources to YSOs.")
TD.loc[s, 'Class'] = 'YSO'
print(len(TD), sorted(Counter(TD['Class']).items()))

#  check the remaining stars if they can still be YSOs


Converting [('HM-STAR', 91), ('LM-STAR', 290)] sources to YSOs.
4283 [('AGN', 1493), ('CV', 275), ('HM-STAR', 312), ('HMXB', 68), ('LM-STAR', 436), ('LMXB', 139), ('NS', 165), ('NS_BIN', 90), ('YSO', 1305)]


In [68]:
print(TD.loc[TD.name_cat.isin(['B1259-63','J1124-3653','J2032+4127','XTE J1858+034','J0737-3039B','B1534+12']), ['name_cat','Class']])

TD.loc[TD.name_cat=='B1259-63', 'Class'] = 'HMXB'
TD.loc[TD.name_cat=='J1124-3653', 'Class'] = 'NS_BIN'
TD.loc[TD.name_cat=='J2032+4127', 'Class'] = 'HMXB'
TD.loc[TD.name_cat=='XTE J1858+034', 'Class'] = 'LMXB'

double_NSs = ['J0737-3039B', 'B1534+12']
TD.loc[TD.name_cat.isin(double_NSs), 'Class'] = 'NS'

print(TD.loc[TD.name_cat.isin(['B1259-63','J1124-3653','J2032+4127','XTE J1858+034','J0737-3039B','B1534+12']), ['name_cat','Class']])

           name_cat   Class
2225  XTE J1858+034    HMXB
3152     J1124-3653      NS
3287    J0737-3039B  NS_BIN
3292       B1259-63  NS_BIN
3297       B1534+12  NS_BIN
3349     J2032+4127  NS_BIN
           name_cat   Class
2225  XTE J1858+034    LMXB
3152     J1124-3653  NS_BIN
3287    J0737-3039B      NS
3292       B1259-63    HMXB
3297       B1534+12      NS
3349     J2032+4127    HMXB



# There are sources matched to the same CSC sources due to
# 1. those sources are the same sources with the same classes from different literatures/sources (updated tables)
# 2. those sources are the same sources with different classifications from different sources
# 3. the same CSC sources are matched to different sources due to confusion
# we will keep the first case with the nearest counterpart while throwing the 2nd & 3rd cases
print("remove code = 2 duplicate CSC sources")

TD_dup = TD[TD.duplicated(subset=['name'], keep=False)].sort_values(by=['name'])


s_all = []
for name in TD_dup.name.unique():
    df_s = TD[TD['name']==name]
    df_s = df_s.reset_index(drop=True)
    if len(df_s)!=2:
        s = np.where(TD['name']==name)[0]
        s_all = np.append(s_all, s)
    else:
        df_s1 = df_s.iloc[0]
        df_s2 = df_s.iloc[1]
        if df_s1['Class'] != df_s2['Class']:
            s = np.where(TD['name']==name)[0]
            s_all = np.append(s_all, s)
        else:
            sep_max = max(df_s['sep'])
            s = np.where( (TD['name']==name) & (TD['sep']==sep_max))[0]
            if len(s)==2:
                s_all = np.append(s_all, s[0])
                #print(s)
            else:
                s_all = np.append(s_all, s)

TD.loc[s_all, 'remove_code'] = TD.loc[s_all, 'remove_code']+2
print('Remove', len(s_all), sorted(Counter(TD.loc[s_all, 'Class']).items()))
TD.loc[s_all, :].to_csv(f'{data_dir}/{field_name}_remove_code2.csv',index=False)
print('Left', len(TD[TD['remove_code']==0]), sorted(Counter(TD[TD['remove_code']==0]['Class']).items()))



In [69]:

# Different LVSs matched to the same CSC sources:
# 1. if different LVSs with the same classification are matched the same CSCv2 source, the nearest LVS is remained. 
# 2. if CSCv2 sources are matched to LVSs with different classifications, the X-ray sources will be removed. 
print("remove code = 2 duplicate CSC sources")

TD_dup = TD[TD.duplicated(subset=['name'], keep=False)].sort_values(by=['name'])


s_all = []
for name in TD_dup.name.unique():
    df_s = TD[TD['name']==name]
    df_s = df_s.reset_index(drop=True)
    if len(df_s['Class'].unique()) >1:
        s = np.where(TD['name']==name)[0]
        s_all = np.append(s_all, s)
    else:
        
        sep_min = min(df_s['sep'])
        s_src = np.where(TD['name']==name)[0]
        s_min = np.where( (TD['name']==name) & (TD['sep']==sep_min))[0]
        
        if len(s_min)>1:
            s_all = np.append(s_all, list(set(s_src) - set([s_min[0]])))
        else:
            s_all = np.append(s_all, list(set(s_src) - set(s_min)))
        

TD.loc[s_all, 'remove_code'] = TD.loc[s_all, 'remove_code']+2
print('Remove', len(s_all), sorted(Counter(TD.loc[s_all, 'Class']).items()))
TD.loc[s_all, :].to_csv(f'{data_dir}/{field_name}_remove_code2_2.csv',index=False)
print('Left', len(TD[TD['remove_code']==0]), sorted(Counter(TD[TD['remove_code']==0]['Class']).items()))



remove code = 2 duplicate CSC sources
Remove 609 [('CV', 78), ('HM-STAR', 156), ('HMXB', 9), ('LM-STAR', 113), ('LMXB', 44), ('NS', 32), ('NS_BIN', 35), ('YSO', 142)]
Left 3674 [('AGN', 1493), ('CV', 197), ('HM-STAR', 156), ('HMXB', 60), ('LM-STAR', 323), ('LMXB', 96), ('NS', 134), ('NS_BIN', 52), ('YSO', 1163)]


In [70]:
print("remove_code = 4: sources in crowded/complex environments")
viz = Vizier(row_limit=-1,  timeout=5000, columns=["**", "+_r"], catalog="J/A+A/558/A53/catalog",column_filters={"Type":"g"},)

radec = [[TD.loc[i, 'ra'], TD.loc[i, 'dec']] for i in range(len(TD))]
rd = Table(Angle(radec, 'deg'), names=('_RAJ2000', '_DEJ2000'))

start = time.time()
query_res = viz.query_region(rd, radius=1.*u.deg)[0]
df_gc = query_res.to_pandas()
df_gc = df_gc[df_gc._r < df_gc.r2].reset_index(drop=True)
df_gc['_q_index'] = df_gc['_q']-1
end = time.time() 
print(end - start)

TD['remove_4'] = 0
TD.loc[TD.index.isin(df_gc._q_index), 'remove_4'] = TD.loc[TD.index.isin(df_gc._q_index), 'remove_4'] +1


remove_code = 4: sources in crowded/complex environments
0.41849780082702637


In [71]:
TD['Gal_Long'] = TD.apply(lambda row: SkyCoord(ra=row.ra*u.degree, dec=row.dec*u.degree, frame='icrs').galactic.l.degree, axis=1)

TD['Gal_Lat'] =  TD.apply(lambda row: SkyCoord(ra=row.ra*u.degree, dec=row.dec*u.degree, frame='icrs').galactic.b.degree, axis=1)


In [72]:

#          LMC,      SMC,  Westerlund 1, M31, NGC 300, NGC 3379, Liller 1,NGC 6791, M 27/NGC 6853, Circinus Galaxy, NGC 2264, IC 348, NGC 1333      
re_ras =[80.89375, 13.1867, 251.76667,10.68458,13.7229,161.95666,263.3520,290.22083,299.901417, 213.29125,        100.25,  56.13833, 52.2971]
re_decs=[-69.75611,-72.8286,-45.85136,41.26916,-37.6844,12.58163,-33.3889,37.771667,22.721136, -65.339167,        9.8833,  32.16333, 31.31]
re_rs  =[10.75,    5.33,    3./60,    1.,      0.17,     0.04,   0.003,   16./60,  8./60,     6.9/60,             45./60,  42./60,   6./60]
### 2002AJ....123.1528R 2003ApJ...593.1093L 1994A&AS..106..165A for NGC 2264, IC 348, NGC 1333    

for ra, dec, r in zip(re_ras, re_decs, re_rs):
    #print(ra, dec, r)
    TD['remove_4'] = TD.apply(lambda row: row.remove_4 +1 if SkyCoord(row.ra*u.deg, row.dec*u.deg, frame='icrs').separation(SkyCoord(ra*u.deg, dec*u.deg, frame='icrs')).deg < r else row.remove_4,axis=1)

# crowded Galactic Center
#idx = np.where((TD.ra>266.38) & (TD.ra<266.59) & (TD.dec>-29.1) & (TD.dec<-28.5) & (TD.Class!='LM-STAR'))[0]
idx = np.where(((TD.Gal_Long > 350.) | (TD.Gal_Long < 10.)) & (TD.Gal_Lat>-5.) & (TD.Gal_Lat<5.))[0]# & (TD.Class!='LM-STAR'))[0]
TD.loc[idx, 'remove_4'] = TD.loc[idx, 'remove_4']+1

idx = np.where((TD['name'].str.strip().str[-1].str.isalpha()) & (~TD['name'].isin(['2CXO J043715.9-471509X'])))[0]
TD.loc[idx, 'remove_4'] = TD.loc[idx, 'remove_4'] + 1

# drop sources (mostly NSs) in complex exvironment, in bright PWNe 
SN_delete = ['2CXO J151355.6-590809','2CXO J174715.8-295801','2CXO J183333.5-103407','2CXO J053747.4-691019','2CXO J083520.6-451034','2CXO J195258.2+325240','2CXO J054010.8-691954','2CXO J090835.4-491305','2CXO J180150.6-085733','2CXO J184343.3-040805','2CXO J102347.6+003840','2CXO J174615.5-321400','2CXO J163905.4-464212']
#Oleg Check ['PSR J0540-6919  ','PSR J0908-4913  ','PSR J1748-2446C ','PSR J1748-2021A ','PSR J1801-0857C ','PSR J1843-0408  ','PSR J2129+1210A '] and has been removed
# 2CXO J163905.4-464212 (HMXB) drop due to its confused counterpart

# LMXBs and HMXBs that appeared in the previous TD and most of them are in the crowded environment but since they appeared in the 
# previous TD so they have gone through manual investigation so should be fine to add
LMXBs_HMXBs_save = ['2CXO J174819.2-360716', '2CXO J180632.1-221417', '2CXO J181044.4-260901', '2CXO J174502.3-285449', 
              '2CXO J174702.5-285259', '2CXO J173413.4-260518', '2CXO J174931.7-280805', '2CXO J173953.9-282946',
              '2CXO J174433.0-284426', '2CXO J174354.8-294443',  '2CXO J171419.7-340246','2CXO J174451.1-292116', 
              '2CXO J181921.6-252425', '2CXO J174621.1-284343', # LMXB so far
              '2CXO J173527.5-325554', '2CXO J175834.5-212321', '2CXO J174445.7-271344'] # HMXBs


s = np.where(((TD['remove_4']>0) | (TD.name.isin(SN_delete))) & (~TD.name.isin(LMXBs_HMXBs_save)) )[0]
TD.loc[s, 'remove_code'] = TD.loc[s, 'remove_code'] + 4

print('Remove', len(s), sorted(Counter(TD.loc[s, 'Class']).items()))
print('Left', len(TD[TD['remove_code']==0]), sorted(Counter(TD[TD['remove_code']==0]['Class']).items()))



Remove 659 [('AGN', 9), ('CV', 181), ('HM-STAR', 67), ('HMXB', 15), ('LM-STAR', 79), ('LMXB', 55), ('NS', 64), ('NS_BIN', 59), ('YSO', 130)]
Left 3216 [('AGN', 1484), ('CV', 58), ('HM-STAR', 121), ('HMXB', 45), ('LM-STAR', 266), ('LMXB', 58), ('NS', 101), ('NS_BIN', 25), ('YSO', 1058)]


In [73]:
#TD[(TD['remove_code']==4) & ((TD.Class =='HMXB') | (TD.Class =='LMXB'))].to_csv('TD_crowd.csv',index=False)

#TD.loc[(TD['remove_code']==4) & ((TD.Class =='HMXB') | (TD.Class =='LMXB')), 'remove_code'] = 0




s_all = []

for name in TD_dup.name.unique():
    df_s = TD[TD['name']==name]
    df_s = df_s.reset_index(drop=True)
    if len(df_s)!=2:
        s = np.where(TD['name']==name)[0]
        s_all = np.append(s_all, s)
    else:
        df_s1 = df_s.iloc[0]
        df_s2 = df_s.iloc[1]
        if df_s1['Class'] != df_s2['Class']:
            s = np.where(TD['name']==name)[0]
            s_all = np.append(s_all, s)

#TD[(TD.index.isin(s_all))].to_csv(f'{data_dir}/TD_check_dup.csv',index=False)
            
#TD[(TD.index.isin(s_all)) & (TD['remove_code']==2)].to_csv('TD_check_dup.csv',index=False)


src_delete = ['[DWS84] 20', 'HD 38563 C', '[RHI84] 10- 632', '[HC2000] 114','[CHG2008] C2-18','Cl* NGC 2244 John 14','V* V2361 Ori','V* V1129 Cen','HD 35914','M 51 X-7','1E 161348-5055.1','XTE J1829-098']
#print(len(src_delete))
#s = np.where(TD.name_cat.isin(src_delete))[0]
#s = np.where((TD['ref'] =='2003ApJ...593.1093L'))[0]

s = np.where(TD['name_cat'].str.contains('ASR', na=False))[0]
print(TD.loc[s, :])
'''
ra_ngc2264 = (6+41/60)*15
dec_ngc2264 = (9+53/60)
TD['sep_ngc2264'] = TD.apply(lambda row: SkyCoord(row.ra*u.deg, row.dec*u.deg, frame='icrs').separation(SkyCoord(ra_ngc2264*u.deg, dec_ngc2264*u.deg, frame='icrs')).arcsec, axis=1)
s = np.where((TD['sep_ngc2264']<45*60))[0]


ra_ic348 = (3+(44+33.2/60)/60)*15
dec_ic348 = (32+9.8/60)
print(ra_ic348,dec_ic348 )
TD['sep_ic348'] = TD.apply(lambda row: SkyCoord(row.ra*u.deg, row.dec*u.deg, frame='icrs').separation(SkyCoord(ra_ic348*u.deg, dec_ic348*u.deg, frame='icrs')).arcsec, axis=1)
s = np.where((TD['sep_ic348']<42*60))[0]
'''
ra_ngc1333 = (3+(29+11.3/60)/60)*15
dec_ngc1333 = (31+(18+36/60)/60)
print(ra_ngc1333,dec_ngc1333)
TD['sep_ngc1333'] = TD.apply(lambda row: SkyCoord(row.ra*u.deg, row.dec*u.deg, frame='icrs').separation(SkyCoord(ra_ngc1333*u.deg, dec_ngc1333*u.deg, frame='icrs')).arcsec, axis=1)
s = np.where((TD['sep_ngc1333']<6*60))[0]


TD.loc[s, :].to_csv(f'{data_dir}/{field_name}_src_ambiguous.csv',index=False)




In [74]:
print("remove_code = 8: delete sources with ambiguous classifications")

src_delete = ['[DWS84] 20', 'HD 38563 C', '[RHI84] 10- 632', '[HC2000] 114','[CHG2008] C2-18','Cl* NGC 2244 John 14','V* V2361 Ori','V* V1129 Cen','HD 35914','M 51 X-7','1E 161348-5055.1','XTE J1829-098']
#ASR_notdelete = ['ASR 107', 'ASR 43', 'ASR 36']

s = np.where( (TD['name_cat'].isin(src_delete)) | (TD['main_type'] == 'Candidate_YSO'))[0]# | (TD['name_cat'].str.contains('ASR', na=False) & (~TD['name_cat'].isin(ASR_notdelete)) ) )[0]
TD.loc[s, 'remove_code'] = TD.loc[s, 'remove_code'] + 8

print('Remove', len(s), sorted(Counter(TD.loc[s, 'Class']).items()))
print('Left', len(TD[TD['remove_code']==0]), sorted(Counter(TD[TD['remove_code']==0]['Class']).items()))


remove_code = 8: delete sources with ambiguous classifications
Remove 13 [('CV', 1), ('HM-STAR', 3), ('HMXB', 1), ('LM-STAR', 3), ('LMXB', 2), ('YSO', 3)]
Left 3207 [('AGN', 1484), ('CV', 57), ('HM-STAR', 120), ('HMXB', 44), ('LM-STAR', 264), ('LMXB', 56), ('NS', 101), ('NS_BIN', 25), ('YSO', 1056)]


In [75]:
TD.to_csv(f'{data_dir}/{field_name}.csv',index=False)

In [3]:
TD = pd.read_csv(f'{data_dir}/{field_name}.csv')
TD = TD[TD['remove_code']==0].reset_index(drop=True)

In [4]:

print(len(TD),Counter(TD.Class))

'''
Counter({'AGN': 1484, 'YSO': 1042, 'LM-STAR': 238, 'HM-STAR': 122, 'NS': 101, 'CV': 57, 'LMXB': 56, 'HMXB': 44, 'NS_BIN': 25})
'''

3207 Counter({'AGN': 1484, 'YSO': 1056, 'LM-STAR': 264, 'HM-STAR': 120, 'NS': 101, 'CV': 57, 'LMXB': 56, 'HMXB': 44, 'NS_BIN': 25})


"\nCounter({'AGN': 1484, 'YSO': 1042, 'LM-STAR': 238, 'HM-STAR': 122, 'NS': 101, 'CV': 57, 'LMXB': 56, 'HMXB': 44, 'NS_BIN': 25})\n"

In [5]:
df_pers = create_perobs_data(TD, query_dir, data_dir, name_type='CSCview', name_col='name', ra_col='ra',dec_col='dec',coord_format='deg')

df_pers.to_csv(f'{data_dir}/{field_name}_per.csv', index=False)

In [7]:
df_pers = pd.read_csv(f'{data_dir}/{field_name}_per.csv', low_memory=False)

df_pers['name'] = df_pers['name'].str.lstrip()
df_pers['per_remove_code'] = 0

df_ave, df_obs = cal_ave(df_pers, data_dir, dtype='TD',Chandratype='CSC',verb=verb)

#df_ave.update(TD)
#df_ave = pd.concat([df_ave, TD.iloc[:, np.r_[:8, 14:16, 24]]], axis=1)
#print(df_ave.columns)
df_ave = pd.merge(df_ave, TD.iloc[:, np.r_[:9, 14:16, 24]], how='inner', on='name')
#df_ave = df_ave.rename(columns={'main_type':'main_type_simbad'})
print(Counter(df_ave.Class))
df_ave.to_csv(f'{data_dir}/{field_name}_ave.csv', index=False)



There are 16046 per-obs data.
Run add_newdata......
Before adding new data:
Run stats......
 H   M   S    #    %  
--- --- --- ----- ----
  Y   Y   Y 13128 81.8
  Y   Y   N   867  5.4
  N   Y   Y   304  1.8
  Y   N   Y   248  1.5
  N   Y   N    68  0.4
  Y   N   N   377  2.3
  N   N   Y   111  0.6
  N   N   N   943  5.8
 ~Y   Y   Y  2918 18.1
-----------------
total:      16046
Only  13128  detections have valid fluxes at all bands.
After adding new  2289 s band data:
Run stats......
 H   M   S    #    %  
--- --- --- ----- ----
  Y   Y   Y 13635 84.9
  Y   Y   N   360  2.2
  N   Y   Y   336  2.0
  Y   N   Y   468  2.9
  N   Y   N    36  0.2
  Y   N   N   157  0.9
  N   N   Y   229  1.4
  N   N   N   825  5.1
 ~Y   Y   Y  2411 15.0
-----------------
total:      16046
Only  13635  detections have valid fluxes at all bands.
After adding new  1420 m band data:
Run stats......
 H   M   S    #    %  
--- --- --- ----- ----
  Y   Y   Y 13938 86.8
  Y   Y   N   361  2.2
  N   Y   Y   475  2.9

  sig_max =  np.nanmax(df.loc[idx,sig])
  sig_max =  np.nanmax(df.loc[idx,sig])
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'significance_max'] = np.nanmax(df.loc[idx,'flux_significance_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'signifi

  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)
  df_ave.loc[idx2, 'kp_prob_b_max'] = np.nanmax(df.loc[idx,'kp_prob_b'].values)


Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 243, 'HM-STAR': 117, 'NS': 87, 'CV': 44, 'LMXB': 42, 'HMXB': 26, 'NS_BIN': 25})


In [10]:
print(len(df_ave),Counter(df_ave['Class']))

3012 Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 243, 'HM-STAR': 117, 'NS': 87, 'CV': 44, 'LMXB': 42, 'HMXB': 26, 'NS_BIN': 25})


In [3]:
#'''
df_ave = pd.read_csv(f'{data_dir}/{field_name}_ave.csv')

# cross-match with MW catalogs
start = time.time()
add_MW(df_ave, data_dir, field_name, Chandratype='CSC')
end = time.time() 
print(end - start)



0.15547657012939453


  data_MW_old = pd.read_csv(f'{file_dir}/{field_name}_MW.csv')


In [4]:
df_MW = pd.read_csv(f'{data_dir}/{field_name}_MW.csv')

#df_MW['PU_max'] = df_MW[['sep', 'err_ellipse_r0']].max(axis=1)
df_MW.loc[df_MW.name == '2CXO J182615.0-145055', 'err_ellipse_r0'] = 1.2 

df_MW_cf = confusion_clean(df_MW,X_PU='err_ellipse_r0',Chandratype='CSC')
df_MW_cf.to_csv(f'{data_dir}/{field_name}_MW_clean.csv',index=False)
#'''

#df_ave = pd.concat([df_MW, TD.iloc[:, np.r_[:8]]], axis=1)

df_MW_cf = pd.read_csv(f'{data_dir}/{field_name}_MW_clean.csv')
df_ave = TD_clean(df_MW_cf, remove_codes = [1, 32]) # previousl no remove_codes =2?!

s_remove = np.where(df_ave.name_cat.isin(['Mon R2- 5b','Cl* NGC 2264    SBL    1025B','** RAT   17A','NGC 6611 374','NGC 6611 245','NGC 6611 213','2MASS J13121845-6237309','V* V700 Per','ESO-HA 1171', 'VSS II- 7']))[0]

df_ave.loc[s_remove, 'remove_code'] = df_ave.loc[s_remove, 'remove_code']+128

df_ave.to_csv(f'{data_dir}/{field_name}_MW_before_remove.csv', index=False)

df_remove = df_ave[df_ave['remove_code']==0].reset_index(drop=True)
print('Final breakdown', len(df_remove), Counter(df_remove['Class']))
#df_remove_cols = ['name_cat','ra_cat','dec_cat','error','Class','SubClass','ref','remove_code','sep','name','ra','dec','err_ellipse_r0','err_ellipse_r1','err_ellipse_ang','significance	flux_aper_b	flux_aper_h	flux_aper_m	flux_aper_s	ks_intra_prob_b	kp_intra_prob_b	var_inter_prob_b	Gmag	BPmag	RPmag	pm_gaia	pmRA_gaia	pmDE_gaia	Jmag	Hmag	Kmag	W1mag_catwise	W2mag_catwise	pmRA_catwise	pmDE_catwise	W1mag_allwise	W2mag_allwise	W3mag_allwise	W4mag_allwise	rgeo	rpgeo	main_id	main_type	W1mag_unwise	W2mag_unwise	W1mag_comb	W2mag_comb	CSC_flags]
df_remove.to_csv(f'{data_dir}/{field_name}_MW_remove.csv', index=False)




  df_MW = pd.read_csv(f'{data_dir}/{field_name}_MW.csv')


2275 counterparts matched for gaia
1854 counterparts matched for 2mass
1909 counterparts matched for catwise
2117 counterparts matched for unwise
1810 counterparts matched for allwise


  df_MW_cf = pd.read_csv(f'{data_dir}/{field_name}_MW_clean.csv')


[('LMXB', 1), ('NS_BIN', 1)]
[('AGN', 1390), ('CV', 44), ('HM-STAR', 117), ('HMXB', 26), ('LM-STAR', 243), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1038)]
Final breakdown 3002 Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 236, 'HM-STAR': 116, 'NS': 87, 'CV': 44, 'LMXB': 41, 'HMXB': 26, 'NS_BIN': 24})


In [5]:
print(len(df_MW_cf),Counter(df_MW_cf['Class']))

print(Counter(df_ave['remove_code']))

3012 Counter({'AGN': 1390, 'YSO': 1038, 'LM-STAR': 243, 'HM-STAR': 117, 'NS': 87, 'CV': 44, 'LMXB': 42, 'HMXB': 26, 'NS_BIN': 25})
Counter({0: 3002, 128: 8, 32: 1, 33: 1})


In [10]:
df = pd.read_csv(f'{data_dir}/{field_name}_MW_remove.csv')
df_final = prepare_cols(df, cp_thres=0, vphas=False,gaiadata=False,cp_conf_flag=False, TD=True, NS_MWdrop=False, STAR_classremove=['HM-STAR','LM-STAR','YSO'])


Remove 25 [('LM-STAR', 4), ('YSO', 21)]
Final breakdown 2977 [('AGN', 1390), ('CV', 44), ('HM-STAR', 116), ('HMXB', 26), ('LM-STAR', 232), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1017)]


  df = pd.read_csv(f'{data_dir}/{field_name}_MW_remove.csv')


In [7]:
df_final.head()

Unnamed: 0,name,ra,dec,PU,significance,Fcsc_s,e_Fcsc_s,Fcsc_m,e_Fcsc_m,Fcsc_h,...,W4mag,e_W4mag,cp_flag_allwise,Class,cp_flag_wise12,which_wise12,W1mag,e_W1mag,W2mag,e_W2mag
0,2CXO J000009.3+135618,0.039125,13.938494,0.79,1.95,1.671074e-15,1.098049e-15,4.844e-16,4.844e-16,1.989924e-15,...,8.827,0.462,0,AGN,0.0,allwise,15.489,0.045,14.605,0.059
1,2CXO J000230.7+004959,0.627958,0.833072,0.72,11.07,5.675297e-14,6.911736e-15,2.874798e-14,5.285001e-15,5.735159e-14,...,7.915,0.235,0,AGN,0.0,allwise,14.497,0.03,13.178,0.031
2,2CXO J000622.6-000424,1.594333,-0.073572,0.78,25.42,1.544e-13,1.22e-14,1.165798e-13,7.650074e-15,4.038e-13,...,8.398,,0,AGN,0.0,allwise,15.299,0.041,14.245,0.049
3,2CXO J000659.2-001740,1.747042,-0.294661,0.84,4.11,1.069845e-14,3.658216e-15,6.211934e-15,2.77351e-15,1.372718e-14,...,,,-8,AGN,0.0,catwise,16.549,0.037,15.65,0.057
4,2CXO J000703.6+155423,1.765,15.906575,0.72,8.63,4.66e-15,2.824e-15,1.593202e-14,5.625001e-15,5.35e-13,...,5.208,0.039,0,AGN,0.0,allwise,11.61,0.023,10.588,0.021


In [14]:
TD_0224 = pd.read_csv('../../files/CSC_TD_v5_02242022_MW_remove.csv')
TD_0224 = prepare_cols(TD_0224, cp_thres=0, vphas=False,gaiadata=False,cp_conf_flag=False, TD=True, NS_MWdrop=False, STAR_classremove=['HM-STAR','LM-STAR','YSO'])

print(len(TD_0224))

Remove 21 [('LM-STAR', 1), ('YSO', 20)]
Final breakdown 2941 [('AGN', 1390), ('CV', 44), ('HM-STAR', 118), ('HMXB', 26), ('LM-STAR', 207), ('LMXB', 41), ('NS', 87), ('NS_BIN', 24), ('YSO', 1004)]
2941


  TD_0224 = pd.read_csv('../../files/CSC_TD_v5_02242022_MW_remove.csv')


In [30]:
print(len(df_final[df_final.name.isin(TD_0224.name)]))
#print(len(TD_0224[TD_0224.name.isin(df_final.name)]))
print(df_final.loc[~df_final.name.isin(TD_0224.name), ['name','Class']])
print(TD_0224.loc[~TD_0224.name.isin(df_final.name), ['name','Class']])

TD_comb = pd.merge(df_final, TD_0224, on='name', how='inner')
print(len(TD_comb))
print(TD_comb.loc[TD_comb.Class_x != TD_comb.Class_y, ['name','Class_x','Class_y']])
#print(TD_0224.loc[(TD_0224.name.isin(df_final.name) & (TD_0224.Class == df_final.Class), ['name','Class']])


2929
                       name    Class
1443  2CXO J022608.5+620647      YSO
1446  2CXO J022715.6+613730  HM-STAR
1484  2CXO J104517.2-594701  HM-STAR
1485  2CXO J104530.2-594820  HM-STAR
1486  2CXO J104536.7-594702  HM-STAR
1505  2CXO J111901.6-613106  HM-STAR
1509  2CXO J165410.7-414747      YSO
1510  2CXO J165414.7-415111      YSO
1511  2CXO J165418.1-415016      YSO
1521  2CXO J182022.6-160833      YSO
1673  2CXO J053509.8-052338      YSO
1675  2CXO J053510.5-052216      YSO
1676  2CXO J053510.8-052240  LM-STAR
1677  2CXO J053510.9-052224  LM-STAR
1679  2CXO J053512.9-052457  LM-STAR
1680  2CXO J053513.1-052452  LM-STAR
1682  2CXO J053513.9-052319      YSO
1683  2CXO J053514.8-052315  LM-STAR
1685  2CXO J053515.0-052354  LM-STAR
1686  2CXO J053515.2-052318      YSO
1687  2CXO J053515.5-052337      YSO
1688  2CXO J053515.7-052411  LM-STAR
1689  2CXO J053515.7-052424  LM-STAR
1692  2CXO J053516.7-052316      YSO
1693  2CXO J053517.5-052324      YSO
1694  2CXO J053517.8-052440      