## TOI data from vizier

In [40]:
#so just import from another directory
import sys
sys.path.insert(0, '../code')

In [42]:
from kenmon import get_tois_data

df = get_tois_data()
df.shape

(6262, 64)

In [43]:
d = df.query("TOI==200.01").squeeze()
d

TIC ID                              410214986
TOI                                    200.01
Previous CTOI                             NaN
Master                                      3
SG1A                                        5
                                 ...         
Date TOI Updated (UTC)             2024-09-05
Date Modified             2024-10-01 12:15:24
Comments                           DS Tuc A b
ra_deg                             354.915458
dec_deg                            -69.196042
Name: 100, Length: 64, dtype: object

In [44]:
d.ra_deg, d.dec_deg

(354.9154583333333, -69.19604166666667)

In [47]:
from kenmon import Target

param = "log_R_HK_"
t = Target(ra_deg=d.ra_deg, dec_deg=d.dec_deg)
t.query_vizier_param(param, use_regex=True)



Found 2 references in Vizier using `log_R_HK_`.


{'J/A+A/537/A147/table4:log_R_HK_': -4.09,
 'J/A+A/537/A147/table4:e_log_R_HK_': 0.05}

In [49]:
t.query_vizier_param(param, use_regex=False)

Found 1 references in Vizier using `log_R_HK_`.


{'J/A+A/537/A147/table4:log_R_HK_': -4.09}

Cannot just use mean of all measurements!

## search vizier param on all TOI

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from kenmon import Target
import time
import random

regex_query = "log_R_HK_"

def process_row(row, max_retries=3, timeout=30):
    """Process each row to query Vizier with retries, rate limiting, and timeouts."""
    t = Target(ra_deg=row.ra_deg, dec_deg=row.dec_deg, verbose=False)
    
    for attempt in range(max_retries):
        try:
            # Apply rate limiting with random sleep to prevent server overload
            time.sleep(random.uniform(0.5, 2.0))  
            
            p = t.query_vizier_param(param=regex_query, use_regex=True)
            return row.TOI, p
        except Exception as e:
            print(f"Attempt {attempt+1} failed for TOI {row.TOI}: {e}")
            time.sleep(2 ** attempt)  # Exponential backoff
    return row.TOI, f"Failed after {max_retries} retries"

# Parallel execution with limited workers
data = {}
max_workers = min(4, len(df))  # Adjust based on server limits

with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_row, row): row for _, row in df.iterrows()}
    
    for future in tqdm(as_completed(futures), total=len(futures)):
        toi, result = future.result()
        data[toi] = result

In [51]:
import pandas as pd

data_df = pd.DataFrame(data)
data_df.index.unique()

Index(['J/A+A/531/A8/table4:log_R_HK_', 'J/A+A/537/A147/table4:log_R_HK_',
       'J/A+A/537/A147/table4:e_log_R_HK_', 'J/A+A/682/A136/table2:log_R_HK_N',
       'J/A+A/682/A136/table2:e_log_R_HK_N',
       'J/A+A/682/A136/table2:log_R_HK_M',
       'J/A+A/682/A136/table2:e_log_R_HK_M',
       'J/ApJS/152/261/table2:log_R_HK_'],
      dtype='object')

In [52]:
param_name = 'rhk'
data_df.to_csv(f'../data/TOI_{param_name}_vizier.csv')

In [53]:
import pandas as pd

data_df = pd.read_csv(f'../data/TOI_{param_name}_vizier.csv', index_col=0)
data_df

Unnamed: 0,101.01,103.01,104.01,105.01,102.01,106.01,107.01,109.01,108.01,110.01,...,7175.01,7177.01,7180.01,7178.01,7179.01,7182.01,7181.01,7183.01,7185.01,7184.01
J/A+A/531/A8/table4:log_R_HK_,,,,,,,,,,,...,,,,,,,,,,
J/A+A/537/A147/table4:log_R_HK_,,,,,,,,,,,...,,,,,,,,,,
J/A+A/537/A147/table4:e_log_R_HK_,,,,,,,,,,,...,,,,,,,,,,
J/A+A/682/A136/table2:log_R_HK_N,,,,,,,,,,,...,,,,,,,,,,
J/A+A/682/A136/table2:e_log_R_HK_N,,,,,,,,,,,...,,,,,,,,,,
J/A+A/682/A136/table2:log_R_HK_M,,,,,,,,,,,...,,,,,,,,,,
J/A+A/682/A136/table2:e_log_R_HK_M,,,,,,,,,,,...,,,,,,,,,,
J/ApJS/152/261/table2:log_R_HK_,,,,,,,,,,,...,,,,,,,,,,


In [15]:
refs = {}
for i in data_df.index:
    count=data_df.loc[i].dropna().shape
    refs[i] = count

In [16]:
refs = pd.Series(refs).sort_values(ascending=False)
refs.head(20)

J/A+A/460/695/stars:EWLi           (7,)
J/A+A/664/A163/tableb1:e_EWLi      (4,)
J/A+A/664/A163/tableb1:EWLi        (4,)
J/A+A/676/A129/catalog:EWLi        (3,)
J/AJ/163/156/table5:EWLi           (2,)
J/AJ/163/156/table5:r_EWLi         (2,)
J/AJ/133/2524/table1:u_EWLi        (2,)
J/AJ/133/2524/table1:EWLi          (2,)
J/A+A/612/A99/ngc2451:f_EWLi       (2,)
J/A+A/612/A99/ngc2451:EWLi         (2,)
J/A+A/612/A99/ngc2451:l_EWLi       (2,)
J/A+A/480/735/stars:EWLi           (1,)
J/AJ/164/115/table7:e_EWLi         (1,)
J/AJ/165/205/stars:EWLi            (1,)
J/A+A/685/A83/tablec1:f_EWLiunv    (1,)
J/A+A/685/A83/tablec1:EWLiunv      (1,)
J/A+A/573/A126/sample:EWLi         (1,)
J/AJ/164/115/table7:EWLi           (1,)
J/MNRAS/494/2429/table1:EWLi       (1,)
J/AJ/161/171/table7:r_EWLi         (1,)
dtype: object

https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A%2BA/460/695

In [23]:
# in mA; Search for Associations Containing Young stars (2006)
data_df.T['J/A+A/460/695/stars:EWLi'].dropna()

200.01     216.0
831.01     160.0
833.01       0.0
865.01       0.0
2383.01      0.0
4399.01    165.0
6551.01    220.0
Name: J/A+A/460/695/stars:EWLi, dtype: object

https://cdsarc.cds.unistra.fr/viz-bin/cat/J/A+A/676/A129

In [20]:
# in mA
data_df.T['J/A+A/676/A129/catalog:EWLi'].dropna()

1807.01        104.0
2076.01    89.400002
2076.02    89.400002
2076.03    89.400002
Name: J/A+A/664/A163/tableb1:EWLi, dtype: object

In [39]:
for ref in refs.index:
    x = data_df.T[ref].dropna()
    try:
        if (x.astype(float)>50).any():
            print(data_df.T[ref].dropna())
    except Exception as e:
        print(e)

200.01     216.0
831.01     160.0
833.01       0.0
865.01       0.0
2383.01      0.0
4399.01    165.0
6551.01    220.0
Name: J/A+A/460/695/stars:EWLi, dtype: object
1807.01        104.0
2076.01    89.400002
2076.02    89.400002
2076.03    89.400002
Name: J/A+A/664/A163/tableb1:EWLi, dtype: object
1227.01    513
1880.01    515
Name: J/AJ/163/156/table5:EWLi, dtype: object
could not convert string to float: 'Goodman'
could not convert string to float: '<'
4399.01    165.0
Name: J/A+A/573/A126/sample:EWLi, dtype: object
2048.01    53.099998
Name: J/AJ/164/115/table7:EWLi, dtype: object
200.01    216.0
Name: J/MNRAS/494/2429/table1:EWLi, dtype: object


## load data

In [35]:
import pandas as pd

data = pd.read_csv('../data/youth_indicators_tois.csv')
data

Unnamed: 0,TOI,bv,bv_err,li,prot,rhk,teff,ruwe
0,101.01,0.656000,0.1,,1.430321,,5648.642857,
1,102.01,0.507667,0.1,,4.410011,,6257.270417,
2,103.01,0.291000,0.1,,3.554638,,6086.159167,
3,104.01,0.545667,0.1,,4.090360,,6046.516250,
4,105.01,0.687667,0.1,,2.187803,,5741.285333,
...,...,...,...,...,...,...,...,...
4646,5378.01,1.174000,,,,,4676.843750,0.955114
4647,5379.01,0.458667,,,,,6316.347805,1.046806
4648,5380.01,0.461000,,,,,6420.325168,1.042541
4649,5382.01,0.560000,,,,,6263.408975,1.233940


In [36]:
data.li.dropna()

100     108.107498
654     160.000000
655       0.000000
682       0.000000
1217      0.015000
1394      0.166000
1857      0.000000
2291     35.200000
3706    165.000000
4115      7.361100
Name: li, dtype: float64

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from kenmon import Target

params = ["B-V", "log_R_HK_", "EWLi", "Teff", "Prot", "ruwe"]

for i,row in tqdm(df.iterrows()):
    if data.TOI.isin([row.TOI]).sum()==0:
        d = {}
        d['TOI'] = row.TOI
        t = Target(ra_deg=row.ra_deg, dec_deg=row.dec_deg, verbose=False)

        for param in params:
            p = t.query_vizier_param(param, use_regex=False)
            v = np.nanmean(list(map(float, p.values())))
            d[param] = v
        s = pd.Series(d).rename({"B-V": "bv", 
                                 "log_R_HK_": "rhk", 
                                 "EWLi": "li", 
                                 "Teff": "teff", 
                                 "Prot": "prot",
                                 #ruwe
                                }).to_frame().T
        data = pd.concat([data,s], ignore_index=True) 
        # break

In [21]:
data = data.sort_values(by='TOI')
data.tail()

Unnamed: 0,TOI,bv,bv_err,li,prot,rhk,teff,ruwe
4646,5378.01,1.174,,,,,4676.84375,0.955114
4647,5379.01,0.458667,,,,,6316.347805,1.046806
4648,5380.01,0.461,,,,,6420.325168,1.042541
4649,5382.01,0.56,,,,,6263.408975,1.23394
4650,5383.01,0.441625,,,0.85,,6700.580612,1.735051


In [22]:
data.dropna(how='any')

Unnamed: 0,TOI,bv,bv_err,li,prot,rhk,teff,ruwe


In [23]:
for c in data.columns:
    print(c,len(data[c].dropna()))

TOI 4651
bv 4290
bv_err 4000
li 10
prot 613
rhk 17
teff 4643
ruwe 332


In [25]:
data.to_csv('../data/youth_indicators_tois.csv', index=False)