<a href="https://colab.research.google.com/github/juanserrano90/codelatam/blob/main/Data/Redshift/GetSNRedshiftCatalog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook obtains the redshift value per supernova from the
https://github.com/astrocatalogs/supernovae/tree/master/output repository and
stores the values only from the supernovae in our dataset.

Author: Paula GM
Date: Oct 6th, 2024
Last modified: Oct 20th, 2024

In [2]:
import json
import pandas as pd
import numpy as np

In [3]:
# --Read and display content of json file
url = 'https://raw.githubusercontent.com/astrocatalogs/supernovae/master/output/catalog.json'
df = pd.read_json(url)
df.head()

Unnamed: 0,name,alias,discoverer,discoverdate,maxdate,maxappmag,maxabsmag,host,ra,dec,...,ebv,hostra,hostdec,hostoffsetang,hostoffsetdist,spectralink,instruments,download,radiolink,xraylink
0,AT1991bm,"[{'value': 'AT1991bm'}, {'value': 'GSC2.3 N0ZY...",[{'value': 'Filipp Romanov'}],[{'value': '1991/08/06'}],[{'value': '1991/08/05'}],[{'value': '14'}],[{'value': '-22.2727'}],[{'value': 'UGC 11180'}],[{'value': '18:15:14.404'}],[{'value': '+47:31:51.55'}],...,,,,,,,,,,
1,AT1992bv,[{'value': 'AT1992bv'}],"[{'value': 'William Keel, Judy Schmidt, Julian...",[{'value': '1992/11/19'}],[{'value': '1992/11/18'}],[{'value': '16.8'}],[{'value': '-18.64'}],[{'value': 'IC 1908'}],[{'value': '03:15:05.472'}],[{'value': '-54:49:16.40'}],...,,,,,,,,,,
2,AT1992bw,"[{'value': 'AT1992bw'}, {'value': 'GSC2.3 NB6I...",[{'value': 'Filipp Romanov'}],[{'value': '1992/08/08'}],[{'value': '1992/08/07'}],[{'value': '17.6'}],[{'value': '-16.8888'}],[{'value': 'UGC 43'}],[{'value': '00:06:02.553'}],[{'value': '+14:24:50.72'}],...,,,,,,,,,,
3,AT1999gy,[{'value': 'AT1999gy'}],[{'value': 'Claude Cornen'}],[{'value': '1999/03/21'}],[{'value': '1999/03/20'}],[{'value': '17.9'}],,,[{'value': '16:19:58.050'}],[{'value': '-01:10:29.09'}],...,[{'value': '0.092'}],,,,,,,,,
4,AT1999gz,[{'value': 'AT1999gz'}],[{'value': 'Galaxy Zoo'}],[{'value': '1999/03/20'}],[{'value': '1999/03/19'}],[{'value': '20.4'}],[{'value': '-16.835'}],,[{'value': '11:57:31.944'}],[{'value': '+00:48:59.85'}],...,,,,,,,,,,


In [4]:
cols = df.columns
cols

Index(['name', 'alias', 'discoverer', 'discoverdate', 'maxdate', 'maxappmag',
       'maxabsmag', 'host', 'ra', 'dec', 'redshift', 'velocity', 'lumdist',
       'claimedtype', 'photolink', 'references', 'ebv', 'hostra', 'hostdec',
       'hostoffsetang', 'hostoffsetdist', 'spectralink', 'instruments',
       'download', 'radiolink', 'xraylink'],
      dtype='object')

In [5]:
# --Create database with only name (and aliases) and redshift
df_redshift = df[['name', 'redshift']]
print(df_redshift.shape)
df_redshift.head()

(72145, 2)


Unnamed: 0,name,redshift
0,AT1991bm,[{'value': '0.040188'}]
1,AT1992bv,[{'value': '0.0275'}]
2,AT1992bw,[{'value': '0.017773'}]
3,AT1999gy,
4,AT1999gz,[{'value': '0.06227'}]


In [6]:
# --Count number of NaN in redshift column
print('Num. of NaNs in Redshift column:',df_redshift['redshift'].isna().sum())
# --Remove NaN from database
# df_redshift_clean = df_redshift.dropna()
df_redshift_clean = df_redshift
# print(df_redshift_clean.shape)
# df_redshift_clean.head()

Num. of NaNs in Redshift column: 43077


In [7]:
# --Extract only the redshift number from the 'redshift' column
df_redshift_clean.loc[:, 'redshift'] = df_redshift['redshift'].apply(
    lambda x: x[0]['value'] if isinstance(x, list) and x else np.nan  # Use np.nan for missing values
)

# --Convert the redshift column to float
df_redshift_clean.loc[:, 'redshift'] = pd.to_numeric(df_redshift_clean['redshift'], errors='coerce')

# --Convert all letters in name to lowercase
df_redshift_clean.loc[:, 'name'] = df_redshift_clean['name'].str.lower()

# --Display the cleaned DataFrame
print(df_redshift_clean)

           name  redshift
0      at1991bm  0.040188
1      at1992bv    0.0275
2      at1992bw  0.017773
3      at1999gy       NaN
4      at1999gz   0.06227
...         ...       ...
72140       w44       NaN
72141      w49b       NaN
72142       w50       NaN
72143       w51       NaN
72144       w63       NaN

[72145 rows x 2 columns]


In [8]:
print('Num. of NaNs in Redshift column:',df_redshift_clean['redshift'].isna().sum())

Num. of NaNs in Redshift column: 43077


In [9]:
# --Load our SN database
url2 = "https://github.com/juanserrano90/codelatam/raw/main/Data/data/sn_data.parquet"
df_raw = pd.read_parquet(url2)
df_raw = df_raw.reset_index()

# --Make a list of unique names and make all letters lowercase
names = df_raw['SN Name'].unique()
names = [name.lower() for name in names]
print('Num. of unique SN: ', len(names))

Num. of unique SN:  518


In [10]:
# check if iPTF15dtg is in SN Name of df_redshift_clean
print('iPTF15dtg' in df_redshift_clean['name'].values)

False


**Federica's Instructions for finding the missing values:**
Build a function to standarized names following the below criteria:
  - If the name start with SN but there are fewer than 4 digits in the name (exactly only 2) and the first digit is either 0,1,2 then ad 20 before the number, otherwise add 19 (this corresponds to the year)
  - If the first character is a number (digit) and there is only two digits then add 20 or 19 as above and also add SN

Find the conventional name or redshift for snls03d3bb and ptf10bzf.

In [11]:
def test(name):
  """Standardizes supernova names according to discussion with Federica.

  Args:
    name: The supernova name as a string.

  Returns:
    The standardized supernova name in lowercase.
  """

  if name.startswith("sn"):
    if name[4:6].isalpha():
      if int(name[2]) in [0, 1, 2]:
        name = "sn20" + name[2:]
      else:
        name = "sn19" + name[2:]
  elif name[0].isdigit():
    if int(name[0]) in [0, 1, 2]:
      name = "sn20" + name
    else:
      name = "sn19" + name

  return name

In [12]:
#-- Standarize names using above function and instructions

#-- Manual change of names of ptf10bzf to sn2010ah as per https://ui.adsabs.harvard.edu/abs/2013MNRAS.432.2463M/abstract
names[names.index('ptf10bzf')] = 'sn2010ah'
names[names.index('15dtg')] = 'iPTF15dtg'

# mylist = names.copy()
for i in range(len(names)):
  name = names[i]
  names[i] = test(name)
# print(mylist)

In [13]:
# --Broadcast df_redshift_clean with names to keep only the redshift from those
# --SN and add NaN if there is no record of the SN

df_names = pd.DataFrame(names, columns=['SN Name'])  # Convert to DataFrame

# Merge df_redshift_clean with df_names on 'name' (or 'SN Name' if you reset the index)
df_merged = df_names.merge(df_redshift_clean, how='left', left_on='SN Name', right_on='name').drop(columns=['name'])

# Now df_merged contains 'SN Name' and corresponding 'redshift', with NaN where there's no match
print(df_merged)

      SN Name  redshift
0    sn2008ar  0.026147
1    sn1999ac   0.00944
2    sn1998dk   0.01322
3    sn2005gj    0.0616
4    sn2004eo  0.014734
..        ...       ...
514  sn2000er     0.031
515  sn2005ek  0.016551
516  sn2007ru    0.0155
517  sn2003jd     0.019
518  sn2007uy    0.0065

[519 rows x 2 columns]


In [14]:
#-- Manually add redshift values for snls03d3bb and iPTF15dtg
#-- z = 0.244 for snls03d3bb as per https://ui.adsabs.harvard.edu/abs/2006AAS...208.0203H/abstract
df_merged.loc[df_merged['SN Name'] == 'snls03d3bb', 'redshift'] = 0.244
#-- z = 0.0524 for iPTF15dtg as per https://arxiv.org/pdf/1605.02491
df_merged.loc[df_merged['SN Name'] == 'iPTF15dtg', 'redshift'] = 0.0524

In [15]:
#--Save database as csv file
df_merged.to_csv('sn_redshift.csv', index=False)

## **Below is the data exploration with Federica that led to the specified standarization above.**

In [16]:
df_merged.isnull().sum()

Unnamed: 0,0
SN Name,0
redshift,0


In [17]:
df_merged[df_merged['redshift'].isnull()]

Unnamed: 0,SN Name,redshift


In [18]:
[df_merged['SN Name'].values[i] for i in range(len(df_merged))
if (df_merged['redshift'][i] is np.nan and not df_merged['SN Name'].values[i].startswith('sn'))]

[]

In [19]:
[df_merged['SN Name'].values[i] for i in range(len(df_merged))
if (len(df_merged['SN Name'][i]) > 9 and df_merged['redshift'][i] is np.nan)]
# Find the conventional name or redshift for this one and the ptf10bzf manually

[]