In [1]:
import pandas as pd
import numpy as np
from dataloader import import_kaepora

In [2]:
# Kaepora subsample for paper
kaepora = import_kaepora()
kaepora.info()

<class 'pandas.core.frame.DataFrame'>
Index: 311 entries, 2006td to 1992bo
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   v_siII      311 non-null    float64
 1   v_siII_err  302 non-null    float64
 2   z           291 non-null    float64
dtypes: float64(3)
memory usage: 9.7+ KB


In [3]:
# TNS
tns = pd.read_csv("data/tns_sn_before2016aqv.csv", usecols=range(8))

# Clean SN Name
tns["Name"] = (tns["Name"]
 .str.lower() # Use lowercase SN names
 .str.replace("sn", "") # Remove "sn" prefix
 .str.replace("\s", "") # Replace spaces with whitespace
 .str.strip() # Remove leading and trailing characters.
)

# Clean host name
tns["Host Name"] = (tns["Host Name"]
                    .str.upper() # Use uppercase galaxy names
                    .str.replace("\s",  "")
                    .str.strip()
                   )
tns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5256 entries, 0 to 5255
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             5256 non-null   int64  
 1   Name           5256 non-null   object 
 2   RA             5256 non-null   object 
 3   DEC            5256 non-null   object 
 4   Obj. Type      5256 non-null   object 
 5   Redshift       177 non-null    float64
 6   Host Name      5194 non-null   object 
 7   Host Redshift  54 non-null     float64
dtypes: float64(2), int64(1), object(5)
memory usage: 328.6+ KB


In [4]:
# GLADE
# File is very large, keep only what's necessary
glade_cols = ["PGC", "GWGC name", "HyperLEDA name", "2MASS name", "SDSS-DR12 name", "flag1", "RA", "dec", "dist", "dist_err", "z", "B", "B_err", "B_Abs", "J", "J_err", "H", "H_err", "K", "K_err", "flag2", "flag3"]
glade = pd.read_csv("data/GLADE_2.4.txt", sep="\s+", 
                    names=glade_cols,
                    usecols=["GWGC name", "HyperLEDA name", "2MASS name", "SDSS-DR12 name", "z", "B", "B_err", "B_Abs", "dist", "dist_err"]
                   )

# Removing rows where galaxy name aren't recorded
# Cast galaxy name columns as string because some of them are read as numerics
glade = (glade
         .dropna(subset=["GWGC name", "HyperLEDA name", "2MASS name"], how="all")
         .astype({"GWGC name": str, "HyperLEDA name": str, "2MASS name": str, "SDSS-DR12 name": str})
        )

glade[["GWGC name", "HyperLEDA name", "2MASS name"]] = glade[["GWGC name", "HyperLEDA name", "2MASS name"]].apply(
    lambda s: ((s
                .str.upper()  # Use uppercase galaxy names
                .str.replace("\s", "")
                .str.strip())
               ))

# glade = glade[
# #     (glade["GWGC name"].isin(tns["Host Name"].unique())) & 
#     (~glade["GWGC name"].isnull())
# ]

glade.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2966597 entries, 0 to 3263610
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   GWGC name       object 
 1   HyperLEDA name  object 
 2   2MASS name      object 
 3   SDSS-DR12 name  object 
 4   dist            float64
 5   dist_err        float64
 6   z               float64
 7   B               float64
 8   B_err           float64
 9   B_Abs           float64
dtypes: float64(6), object(4)
memory usage: 249.0+ MB


In [5]:
# Merge kaepora and TNS on SN name
kaepora_tns = kaepora.merge(tns, how="inner", left_index=True, right_index=False, right_on="Name", suffixes=("_kaepora", "_tns"))
kaepora_tns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304 entries, 1544 to 4493
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   v_siII         304 non-null    float64
 1   v_siII_err     296 non-null    float64
 2   z              284 non-null    float64
 3   ID             304 non-null    int64  
 4   Name           304 non-null    object 
 5   RA             304 non-null    object 
 6   DEC            304 non-null    object 
 7   Obj. Type      304 non-null    object 
 8   Redshift       1 non-null      float64
 9   Host Name      304 non-null    object 
 10  Host Redshift  0 non-null      float64
dtypes: float64(5), int64(1), object(5)
memory usage: 28.5+ KB


In [6]:
# Merge kaepora+TNS with GLADE on galaxy name
kaepora_tns_glade = kaepora_tns.merge(glade, how="left", left_on="Host Name", right_on="HyperLEDA name", suffixes=("_kaepora", "_glade"))
kaepora_tns_glade.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304 entries, 0 to 303
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   v_siII          304 non-null    float64
 1   v_siII_err      296 non-null    float64
 2   z_kaepora       284 non-null    float64
 3   ID              304 non-null    int64  
 4   Name            304 non-null    object 
 5   RA              304 non-null    object 
 6   DEC             304 non-null    object 
 7   Obj. Type       304 non-null    object 
 8   Redshift        1 non-null      float64
 9   Host Name       304 non-null    object 
 10  Host Redshift   0 non-null      float64
 11  GWGC name       128 non-null    object 
 12  HyperLEDA name  128 non-null    object 
 13  2MASS name      128 non-null    object 
 14  SDSS-DR12 name  128 non-null    object 
 15  dist            128 non-null    float64
 16  dist_err        0 non-null      float64
 17  z_glade         128 non-null    flo

In [7]:
# Save to file
(kaepora_tns_glade
    .rename(columns={"Host Redshift": "z_tns", "Name": "sn"})
    .sort_values(["HyperLEDA name", "sn"])
    [["sn", "Host Name", "HyperLEDA name", "v_siII", "v_siII_err",
        "z_kaepora", "z_glade", "z_tns", "B", "B_err", "dist", "dist_err", "RA", "DEC"]]
 ).to_csv("kaepora_tns_glade.csv", index=False)

In [12]:
# Finding NGC<1-3 numbers>
matches = tns["Host Name"].str.match("NGC\d{1,3}$", na=False)
tns["Host Name"][matches]

32      NGC175
48      NGC134
63      NGC846
91      NGC157
248     NGC910
         ...  
4991    NGC337
5000    NGC418
5020    NGC694
5187    NGC628
5229    NGC428
Name: Host Name, Length: 116, dtype: object

In [32]:
# Padding zeroes to NGC<1-3numbers> to NGC<4 numbers>

(tns["Host Name"][matches]
 .str.extract("(\d+)", expand=False) # Extract the digits
 .apply(lambda ngc_number: f"NGC{int(ngc_number):04d}") # Pad until 4 digits
)

32      NGC0175
48      NGC0134
63      NGC0846
91      NGC0157
248     NGC0910
         ...   
4991    NGC0337
5000    NGC0418
5020    NGC0694
5187    NGC0628
5229    NGC0428
Name: Host Name, Length: 116, dtype: object