# Importing Packages

Notebook used to generate all the data you find in the Catalogues in the Catalogues folder of this repo

In [1]:
#Loading needed modules and classes/functions 
import numpy as np
import pandas as pd
import math
import os, time, shutil, hickle
from tqdm import tqdm
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.8)
sns.set_style('white')
from PIL import Image

#------------------------------------------
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import ImageFolder 
from torchvision.io import read_image, decode_image
from torchvision.models import vgg19
from torchvision.utils import save_image
import torchvision.transforms as transforms
import torchvision
from torchvision import models

#-------------------------------------------
import marvin
from marvin.tools.maps import Maps
# from marvin.tools.image import Image
from marvin.utils.general.images import get_images_by_list
from marvin import config
from marvin.tools.cube import Cube

#-------------------------------------------
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

#-------------------------------------------
from PIL import Image as image_PIL
import PIL 
from PIL import ImageShow

#-------------------------------------------
from pytorch_grad_cam import GradCAM, ScoreCAM, GradCAMPlusPlus, AblationCAM, XGradCAM, EigenCAM, EigenGradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image

from astropy.io import fits

#set config attributes and turn on global downloads of Marvin data
config.setRelease('DR17')
config.mode = 'local'
config.download = True

[0;34m[INFO]: [0mNo release version set. Setting default to DR17


In [3]:
work_dir = #########Put the directory to the folder where you gitclone the directory to here########## 
data = fits.open(work_dir +'VACs/SDSS17Pipe3D_v3_1_1.fits')
data_gz = fits.open(work_dir +'VACs/MaNGA_gz-v2_0_1.fits')
data_gema = fits.open(work_dir+'VACs/GEMA_2.0.2.fits')

# Morphology Data Approach

In [4]:
table = data[1]
table_gz = data_gz[1].data
table_gema1 = data_gema[1].data
table_gema12 = data_gema[12].data
table_gema14 = data_gema[14].data


manga_id = []
log_mstar = []
log_sfr = []
redshift = []
sersic_n =[]
t50 = []
d4000 = []
Av=[]
Z=[]
RA_pipe3d=[]
DEC_pipe3d=[]

for i in range(len(table.data)):
    manga_id.append(table.data[i][4])
    log_mstar.append(table.data[i][12])
    log_sfr.append(table.data[i][7])
    redshift.append(table.data[i][177])
    sersic_n.append(table.data[i][512])
    t50.append(table.data[i][115])
    d4000.append(table.data[i][456])
    Av.append(table.data[i][173])
    Z.append(table.data[i][28])
    RA_pipe3d.append(table.data[i][5])
    DEC_pipe3d.append(table.data[i][6])

#Cleaning out repeat IDs 
manga_id_unique, unique_index = np.unique(manga_id, return_index=True)

manga_id = np.array(manga_id,dtype=str)[unique_index]
log_mstar = np.array(log_mstar)[unique_index]
log_sfr = np.array(log_sfr)[unique_index]
redshift = np.array(redshift)[unique_index]
sersic_n =np.array(sersic_n)[unique_index]
t50 = np.array(t50)[unique_index]
d4000 = np.array(d4000)[unique_index]
Av= np.array(Av)[unique_index]
Z= np.array(Z)[unique_index]
RA_pipe3d= np.array(RA_pipe3d)[unique_index]
DEC_pipe3d= np.array(DEC_pipe3d)[unique_index]



galaxy_zoo_mangaid=[]
for i in range(len(table_gz)):
  galaxy_zoo_mangaid.append(table_gz[i][1])

gema_mangaid_table1=[]
for i in range(len(table_gema1)):
  gema_mangaid_table1.append(table_gema1[i][0])

gema_mangaid_table12=[]
for i in range(len(table_gema12)):
  gema_mangaid_table12.append(table_gema12[i][0])

gema_mangaid_table14=[]
for i in range(len(table_gema14)):
  gema_mangaid_table14.append(table_gema14[i][0])



matching_mangaids_gz=[]
matching_mangaids_gema_table1=[]
matching_mangaids_gema_table12=[]
matching_mangaids_gema_table14=[]


matching_index0=[]
matching_index1=[]
matching_index2=[]
matching_index3=[]

matching_index_gz=[]
matching_index_gema_table1=[]
matching_index_gema_table12=[]
matching_index_gema_table14=[]

for i in range (len(manga_id)):
  for j in range(len(galaxy_zoo_mangaid)): 
    if manga_id[i] == galaxy_zoo_mangaid[j]: 
      matching_mangaids_gz.append(manga_id[i])
      matching_index_gz.append(j)
      matching_index0.append(i)

for i in range (len(manga_id)):
  for r in range(len(gema_mangaid_table1)):
    if manga_id[i]==gema_mangaid_table1[r]:
      matching_mangaids_gema_table1.append(manga_id[i])
      matching_index_gema_table1.append(r)
      matching_index1.append(i)

for i in range (len(manga_id)):
  for p in range(len(gema_mangaid_table12)):
    if manga_id[i]==gema_mangaid_table12[p]:
      matching_mangaids_gema_table12.append(manga_id[i])
      matching_index_gema_table12.append(p)
      matching_index2.append(i)

for i in range (len(manga_id)):
  for q in range(len(gema_mangaid_table14)):
    if manga_id[i]==gema_mangaid_table14[q]:
      matching_mangaids_gema_table14.append(manga_id[i])
      matching_index_gema_table14.append(q)
      matching_index3.append(i)


#Galaxy Zoo using weight fraction and on the positive statement (i.e yes bar and not no bar)
spirals=[]
bars=[]
irregular_features=[] #called odd_feature_irregular in data model 
edge_on=[]  
bulge=[] #Dominant bulge 
RA_gz=[]
DEC_gz=[]
redshift_gz=[]
smooth=[]
merger_gz=[]
gz_id_test=[]


i=0
for k in range (len(manga_id)):
  if manga_id[k] in galaxy_zoo_mangaid: 
    j=matching_index_gz[i]
    gz_id_test.append(table_gz[j][1])
    bars.append(table_gz[j][56])
    spirals.append(table_gz[j][70])
    bulge.append(table_gz[j][102])
    irregular_features.append(table_gz[j][166])
    edge_on.append(table_gz[j][42])
    RA_gz.append(table_gz[j][4])
    DEC_gz.append(table_gz[j][5])
    redshift_gz.append(table_gz[j][9])
    smooth.append(table_gz[j][22])
    merger_gz.append(table_gz[j][174])
    i=i+1
  else:
    gz_id_test.append('nan')
    bars.append(np.nan)
    spirals.append(np.nan)
    bulge.append(np.nan)
    irregular_features.append(np.nan)
    edge_on.append(np.nan)
    RA_gz.append(np.nan)
    DEC_gz.append(np.nan)
    redshift_gz.append(np.nan)
    smooth.append(np.nan)
    merger_gz.append(np.nan)

#GEMA VAC



i=0 
p_merger = []
over_density = []
local_density = []
z_completeness = []


for k in range(len(manga_id)):
  if manga_id[k] in gema_mangaid_table1:
    j= matching_index_gema_table1[i]
    z_completeness.append(table_gema1[j][2])
    i=i+1 
  else:
    z_completeness.append(np.nan)

i=0
for k in range(len(manga_id)):
  if manga_id[k] in gema_mangaid_table12:
    j=matching_index_gema_table12[i]
    p_merger.append(table_gema12[j][1])
    i=i+1
  else:
    p_merger.append(np.nan)
    
i=0
for k in range(len(manga_id)):
  if manga_id[k] in gema_mangaid_table14:
    j = matching_index_gema_table14[i]
    over_density.append(table_gema14[j][1])
    local_density.append(table_gema14[j][2])
    i=i+1
  else:
    over_density.append(np.nan)
    local_density.append(np.nan)




Getting t50 values from model

In [5]:
# Define the one-layer linear neural network class
class LinearNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LinearNet, self).__init__()
        self.linear_in = nn.Linear(input_size, hidden_size)
        self.linear_out = nn.Linear(hidden_size, output_size)
        self.linear = nn.Linear(hidden_size,hidden_size)
        self.activation = nn.ReLU()
    
    def forward(self, x):
        # print(x.size())
        out = self.linear_in(x)
        # print(out.size())
        out = self.activation(out)
        out = self.linear(out)
        out = self.activation(out)
        out = self.linear(out)
        out = self.activation(out)
        out = self.linear(out)
        out = self.activation(out)
        # out = self.linear(out)
        # out = self.activation(out)
        out = self.linear_out(out)
        return out

# Example usage
input_size = 2
hidden_size = 100
output_size = 1


# Create an instance of the LinearNet
d4000_to_t50 = LinearNet(input_size, hidden_size, output_size).to('cuda')


d4000_to_t50.load_state_dict(torch.load(work_dir+'models/d400_to_t50.pytorch'))


input_data_plot = torch.stack((torch.tensor(d4000,dtype=torch.float32),torch.tensor(log_sfr,dtype=torch.float32)-torch.tensor(log_mstar,dtype=torch.float32)),dim=1)
input_data_plot = input_data_plot.reshape(input_data_plot.shape[0],input_data_plot.shape[1])

t50_output = d4000_to_t50(input_data_plot.to('cuda'))
t50_output = t50_output.detach().cpu().numpy().flatten()
t50_output = (t50_output*13.6) #to make it in units of Gyr 



In [7]:
def resize_image(src_image, size=(256,256), bg_color="black"): 
    
    # resize the image so the longest dimension matches our target size
    src_image.thumbnail(size, Image.ANTIALIAS)
    
    # Create a new square background image
    new_image = Image.new("RGB", size, bg_color)
    
    # Paste the resized image into the center of the square background
    new_image.paste(src_image, (int((size[0] - src_image.size[0]) / 2), int((size[1] - src_image.size[1]) / 2)))
  
    # return the resized image
    return new_image

def load_image_data(idlist, size=(256,256), bg_color="black"):
    img = Image.open(work_dir+'images (DR17)/'+idlist+'.png')
    img_sized = resize_image(img, size=size, bg_color=bg_color)
    return img_sized

def find_file(filename, directory1, directory2):
    # Check if the file exists in the first directory
    file_path = os.path.join(directory1, filename)
    if os.path.exists(file_path):
        return np.load(file_path)

    # If the file is not found in the first directory, check the second directory
    file_path = os.path.join(directory2, filename)
    if os.path.exists(file_path):
        return np.load(file_path)

    # If the file is not found in either directory, return None
    return None

def find_file_txt(filename, directory1, directory2):
    # Check if the file exists in the first directory
    file_path = os.path.join(directory1, filename)
    if os.path.exists(file_path):
        category.append('Train')
        return np.loadtxt(file_path)

    # If the file is not found in the first directory, check the second directory
    file_path = os.path.join(directory2, filename)
    if os.path.exists(file_path):
        category.append('Test')
        return np.loadtxt(file_path)

    # If the file is not found in either directory, return None
    category.append('None')
    return None

In [None]:
# let's get images for each galaxy
gal_ids = manga_id
for i in tqdm(range(len(gal_ids))):
    try:
        im = Image(mangaid = gal_ids[i])

        # generate a new image
        # inputs are height and width in arcsec, and a arcsec/pixel scale
        im.get_new_cutout(50, 50, scale=0.089)

        # plot the new image cutout
        im.plot()
        plt.savefig('SPECIFY DIR FOR IMAGES HERE'+gal_ids[i]+'.png')
        plt.close()
        plt.close()
    except:
        print('could not get image for galaxy: ',gal_ids[i])

In [8]:
images= []
shap_map_mass =[]
shap_map_sfr = []
shap_map_d4000 = []
sfh =[]
category =[]

for i in tqdm(manga_id):
    images.append(load_image_data(i))
    sfh.append(find_file_txt(i,'/home/juanpabloalfonzo/Documents/Manga CNNs/SFH (DR17)/True/Data Arrays/Train','/home/juanpabloalfonzo/Documents/Manga CNNs/SFH (DR17)/True/Data Arrays/Test'))
    shap_map_mass.append(find_file(i+'.npy','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/Mass/Train','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/Mass/Test'))
    shap_map_sfr.append(find_file(i+'.npy','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/SFR/Train','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/SFR/Test'))
    shap_map_d4000.append(find_file(i+'.npy','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/D4000/Train','/home/juanpabloalfonzo/Documents/Manga CNNs/SHAP-Maps/D4000/Test'))

    

100%|██████████| 10081/10081 [08:43<00:00, 19.26it/s]


Getting Predictions from Chain CNNs

In [9]:
device = 'cuda'

model_used = 'ResNet50_log_t50_chain_90_10'
model_mass = models.resnet50(weights=None)
model_mass.fc = nn.Linear(2048, 1)
model_mass = nn.DataParallel(model_mass,device_ids=(0,1))
model_mass.load_state_dict(torch.load(work_dir+'models/Mass_'+model_used+'.pytorch'),strict=True)#strict is set to false since it was trained on multiple GPUs it causes an error when loaded on the model that is not on multiple GPUs yet
                                                                                                # DO NOT DO THIS!! MESSES WITH THE MODEL PREDECTIONS HEAVILY
model_mass.eval()

model_sfr = models.resnet50(weights='ResNet50_Weights.IMAGENET1K_V1')
model_sfr.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3,bias=False)
model_sfr.fc = nn.Linear(2048, 1)
model_sfr = nn.DataParallel(model_sfr,device_ids=(0,1))
model_sfr.load_state_dict(torch.load(work_dir+'models/SFR_'+model_used+'.pytorch'),strict=True)
model_sfr.eval()

model_used = 'd4000_chain'
model_d4000 = models.resnet50(weights='ResNet50_Weights.IMAGENET1K_V1')
model_d4000.conv1 = nn.Conv2d(5, 64, kernel_size=7, stride=2, padding=3,bias=False)
model_d4000.fc = nn.Linear(2048, 1)
model_d4000 = nn.DataParallel(model_d4000,device_ids=(0,1))
model_d4000.load_state_dict(torch.load(work_dir+'models/d4000_'+model_used+'.pytorch'),strict=True)
model_d4000.eval()


model_mass, model_sfr, model_d4000 = model_mass.to(device), model_sfr.to(device), model_d4000.to(device)


pred_mass = []


transform = transforms.Compose([transforms.ToTensor()])

for i in tqdm(range(len(images))):
    with torch.no_grad():
        image_tensor = torch.ones((1,3,256,256))
        image_tensor[0,:,:,:] = transform(images[i])
        image_tensor = image_tensor.float().to('cuda')
        pred = model_mass(image_tensor)
        pred = pred.detach().cpu().numpy()
        pred_mass.append(pred[0][0])

pred_sfr = []
for i in tqdm(range(len(images))):
    with torch.no_grad():
        image_tensor = torch.ones((1,3,256,256))
        image_tensor[0,:,:,:] = transform(images[i])
        pred_mass_tensor = torch.ones((1,1,256,256))*pred_mass[i]
        image_mass_tensor = torch.concat((image_tensor,pred_mass_tensor),1)
        image_mass_tensor = image_mass_tensor.float().to('cuda')
        pred = model_sfr(image_mass_tensor)
        pred = pred.detach().cpu().numpy()
        pred_sfr.append(pred[0][0])

pred_d4000 =[]
for i in tqdm(range(len(images))):
    with torch.no_grad():
        image_tensor = torch.ones((1,3,256,256))
        image_tensor[0,:,:,:] = transform(images[i])
        pred_mass_tensor = torch.ones((1,1,256,256))*pred_mass[i]
        image_mass_tensor = torch.concat((image_tensor,pred_mass_tensor),1)
        pred_sfr_tensor = torch.ones((1,1,256,256))*pred_sfr[i]
        image_mass_sfr_tensor = torch.concat((image_mass_tensor,pred_sfr_tensor),1)
        image_mass_sfr_tensor = image_mass_sfr_tensor.float().to('cuda')
        pred = model_d4000(image_mass_sfr_tensor)
        pred = pred.detach().cpu().numpy()
        pred_d4000.append(pred[0][0])

   



100%|██████████| 10081/10081 [02:24<00:00, 69.67it/s]
100%|██████████| 10081/10081 [02:26<00:00, 68.69it/s]
100%|██████████| 10081/10081 [02:26<00:00, 68.86it/s]


# Generating the Final Catalogues

In [10]:
d = {'mangaid':manga_id, 'log_mstar':log_mstar, 'log_sfr': log_sfr, 'redshift':redshift, 'redshift_gz':redshift_gz, 'sersic_n':sersic_n, 't50_Pipe3D':t50, 't50_model':t50_output,
      'd4000':d4000, 'Av':Av, 'Z':Z, 'spirals':spirals, 'bars': bars, 'irregular_features':irregular_features, 'edge_on':edge_on,'bulge':bulge,
       'RA_Pipe3D':RA_pipe3d, 'DEC_Pipe3D': DEC_pipe3d, 'RA_gz':RA_gz, 'DEC_gz':DEC_gz, 'smooth':smooth, 'merger_gz':merger_gz, 'p_merger':p_merger,'over_density':over_density,
         'z_completeness':z_completeness, 'images':images, 'shap_map_mass':shap_map_mass, 'shap_map_sfr':shap_map_sfr,'shap_map_d4000':shap_map_d4000, 
         'split':category , 'pred_mstar':pred_mass,'pred_sfr':pred_sfr, 'pred_d4000':pred_d4000, 'sfh':sfh}

manga_cat= pd.DataFrame(d)

In [11]:
hickle.dump(manga_cat,'SHAP_Map_Labels_MaNGA_extra.cat')



pandas 1.4.4


In [12]:
scalars = {'mangaid':manga_id, 
'log_mstar':log_mstar, 
'log_sfr': log_sfr, 
'redshift':redshift, 
'redshift_gz':redshift_gz, 
'sersic_n':sersic_n, 
't50_Pipe3D':t50, 
't50_model':t50_output,
'd4000':d4000, 
'Av':Av, 
'Z':Z, 
'spirals':spirals, 
'bars': bars, 
'irregular_features':irregular_features, 
'edge_on':edge_on,
'bulge':bulge,
'RA_Pipe3D':RA_pipe3d, 
'DEC_Pipe3D': DEC_pipe3d, 
'RA_gz':RA_gz, 
'DEC_gz':DEC_gz, 
'smooth':smooth, 
'merger_gz':merger_gz, 
'p_merger':p_merger,
'over_density':over_density,
'z_completeness':z_completeness,
'split':category,
'pred_mstar':pred_mass,
'pred_sfr':pred_sfr,
'pred_d4000':pred_d4000,
'sfh':sfh}

scalars = pd.DataFrame(scalars)

hickle.dump(scalars,'/home/juanpabloalfonzo/Documents/Manga CNNs/Catalogues/scalars_extra.cat')

images_cat = {'mangaid':manga_id, 'images':images}
images_cat = pd.DataFrame(images_cat)
hickle.dump(images_cat,'/home/juanpabloalfonzo/Documents/Manga CNNs/Catalogues/images_extra.cat')

shap_mass_cat = {'mangaid':manga_id, 'shap_map_mass':shap_map_mass}
shap_mass_cat = pd.DataFrame(shap_mass_cat)
hickle.dump(shap_mass_cat,'/home/juanpabloalfonzo/Documents/Manga CNNs/Catalogues/shap_mass_extra.cat')

shap_sfr_cat = {'mangaid':manga_id, 'shap_map_mass':shap_map_sfr}
shap_sfr_cat = pd.DataFrame(shap_sfr_cat)
hickle.dump(shap_sfr_cat,'/home/juanpabloalfonzo/Documents/Manga CNNs/Catalogues/shap_sfr_extra.cat')

shap_d4000_cat = {'mangaid':manga_id, 'shap_map_mass':shap_map_d4000}
shap_d4000_cat = pd.DataFrame(shap_d4000_cat)
hickle.dump(shap_d4000_cat,'/home/juanpabloalfonzo/Documents/Manga CNNs/Catalogues/shap_d4000_extra.cat')
