# Predict Single Morphology Variable (Sersic n) Based on Multiple Star-formation Variables (M*, SFR and more)


In [1]:
#Loading needed modules and classes/functions 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix,r2_score
import seaborn as sns
sns.set(font_scale=1.4)
sns.set_style('ticks')
%matplotlib qt

#3 Linear layers NN, 1 hidden 
class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize,hiddenSize):
        super(linearRegression, self).__init__()
        self.linear = torch.nn.Linear(inputSize, hiddenSize)
        self.linear1 = torch.nn.Linear(hiddenSize, hiddenSize)
        self.linear2 = torch.nn.Linear(hiddenSize,hiddenSize)
        self.linear3 = torch.nn.Linear(hiddenSize,hiddenSize)
        self.linear4= torch.nn.Linear(hiddenSize, outputSize)
        self.ReLU= torch.nn.ReLU()
        self.ReLU6= torch.nn.ReLU6()
        self.Sigmoid= torch.nn.Sigmoid()
        self.ELU=torch.nn.ELU()
        self.LeakyReLU=torch.nn.LeakyReLU()
        self.PReLU=torch.nn.PReLU()
        self.RReLU= torch.nn.RReLU()
        self.CELU=torch.nn.CELU()
        self.SELU=torch.nn.SELU()
        self.Softsign=torch.nn.Softsign()

    def forward(self, x):
        x = self.linear(x)
        x = self.Softsign(x)
        # x= self.linear1(x)
        # x= self.Sigmoid(x)
        # x= self.linear2(x)
        # x= self.Sigmoid(x)
        # x = self.linear3(x)
        # x= self.Sigmoid(x)
        x=self.linear4(x)
        return x




# Importing Data from Schema Table

In [2]:


data=pd.read_csv('CompleteTable.csv')  #Importing All MaNGA Data from DPRall Schema

galaxy_list=np.loadtxt('Query Results',dtype=str) #Pulling Manga ID's of galaxies which satisfy log(M) > 9 and 0 < z < 0.1

galaxy_index=np.zeros(len(galaxy_list)) 
for i in range (len(galaxy_list)): #Getting the index of these galaxies in the schema table
    galaxy_index[i]=np.where(data.loc[:,'mangaid']==galaxy_list[i])[0][0]

galaxy_index=np.array(galaxy_index,dtype=int) #Ensuring we have array that can be used to index, force int 

galaxies=data.iloc[galaxy_index] #DF of galaxies which satisfies the condition, contains all relevant schema data 

#Creating the arrays of the independent variables were are interested in, and dependent variable n 

mass=galaxies.loc[:,'nsa_sersic_mass']
log_mass=np.log10(mass)

SFR=galaxies.loc[:,'sfr_tot']
log_SFR=np.log10(SFR)

ha_flux=galaxies.loc[:,'emline_gflux_tot_ha_6564']

n=galaxies.loc[:,'nsa_sersic_n']
n=np.array(n,dtype=np.float32)
n=StandardScaler().fit_transform(n.reshape(-1,1))
n=torch.from_numpy(n).to('cuda:0')






# Prep the input data to go into a DataLoader 

In [3]:

inputs=[log_SFR]
inputs_transformed=[]

def data_preparer(inputs):  
    """
    Takes in a list in which each element is an input variable and then preps
    it accordingly to return it as one combined GPU pytorch tensor. 
    """
    

    #makes all inputs np arrays of np.float32
    #and makes scailing to mean 0 and std of 1
    for i in range (len(inputs)):
        inputs_transformed.append(StandardScaler().fit_transform(np.array(inputs[i],dtype=np.float32).reshape(-1,1))) 
                                                                                                    
    reshape=np.column_stack(inputs_transformed)
    out=torch.from_numpy(reshape).to('cuda:0')
    return(out)

inputs_tensor=data_preparer(inputs)

print(np.shape(inputs_tensor))
print(np.shape(n))

#Create Tensor Datasets 
train_ds, test_ds, validate_ds=torch.utils.data.random_split(TensorDataset(inputs_tensor,n),[2183,727,728]) #Better way to automate these splits? 

#Create Data Loaders
train_dl=DataLoader(train_ds,batch_size=64,shuffle=True)
test_dl=DataLoader(test_ds,batch_size=64,shuffle=True)
validate_dl=DataLoader(validate_ds,batch_size=64,shuffle=True)

print(train_ds[0])



torch.Size([3638, 1])
torch.Size([3638, 1])
(tensor([0.3884], device='cuda:0'), tensor([-0.9633], device='cuda:0'))


# The Model (N inputs, 1 output, CUDA Enabled)

In [4]:

inputDim=len(inputs)
outputDim=1 
hiddenSize=50
learningRate=0.1

model = linearRegression(inputDim, outputDim,hiddenSize)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

#Training Loop
epochs=1800

epoch_array=np.zeros(epochs)
loss_array=np.zeros(epochs)
for epoch in range(epochs): #Forward Pass and loss
    for xb,yb in train_dl:
        # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
        optimizer.zero_grad()

        # get output from the model, given the inputs
        outputs = model(xb)

        # get loss for the predicted output
        loss = criterion(outputs, yb)
        # print(loss)
        # get gradients w.r.t to parameters, (backward pass)
        loss.backward()

        # update parameters
        optimizer.step()

        epoch_array[epoch]=epoch 
        loss_array[epoch]=loss.item()

        if epoch %200==0:
            print('epoch {}, loss {}'.format(epoch, loss.item()))

with torch.no_grad(): # we don't need gradients in the testing phase
    predicted = model(inputs_tensor)
    print(predicted)




epoch 0, loss 1.1728949546813965
epoch 0, loss 1.764080286026001
epoch 0, loss 1.2699695825576782
epoch 0, loss 1.185093879699707
epoch 0, loss 0.8662426471710205
epoch 0, loss 1.2137163877487183
epoch 0, loss 1.5499790906906128
epoch 0, loss 0.95569908618927
epoch 0, loss 0.9519414901733398
epoch 0, loss 1.2367439270019531
epoch 0, loss 1.0804895162582397
epoch 0, loss 0.9680027961730957
epoch 0, loss 1.1641689538955688
epoch 0, loss 0.6662874221801758
epoch 0, loss 0.857558012008667
epoch 0, loss 1.0487252473831177
epoch 0, loss 1.096029281616211
epoch 0, loss 0.8141884207725525
epoch 0, loss 0.9368876218795776
epoch 0, loss 0.8376739025115967
epoch 0, loss 0.7166354656219482
epoch 0, loss 0.8915270566940308
epoch 0, loss 0.9804446697235107
epoch 0, loss 0.9058224558830261
epoch 0, loss 0.7509802579879761
epoch 0, loss 0.6916099190711975
epoch 0, loss 0.9202231764793396
epoch 0, loss 0.9589412808418274
epoch 0, loss 0.8109996914863586
epoch 0, loss 1.0062906742095947
epoch 0, loss 0.

In [39]:
from mpl_toolkits.mplot3d import Axes3D

%matplotlib qt
fig=plt.figure()
ax= plt.axes(projection='3d')
ax.scatter3D(inputs_tensor.cpu().detach().numpy()[:,0].flatten(),inputs_tensor.cpu().detach().numpy()[:,1].flatten(),predicted.cpu().detach().numpy().flatten(),label='Model Predictions')

ax.scatter3D(inputs_tensor.cpu().detach().numpy()[:,0],inputs_tensor.cpu().detach().numpy()[:,1],n.cpu().detach().numpy(),alpha=0.5,label='Actual Data')

ax.set_xlabel('log SFR')
ax.set_ylabel('log Mass')
ax.set_zlabel('Sersic n')
ax.set_title('Mass and SFR as Indicators of Galaxy Morphology')
ax.legend()


# plt.scatter(inputs_tensor.cpu().detach().numpy(),n.cpu().detach().numpy(),alpha=0.2)
# plt.scatter(inputs_tensor.cpu().detach().numpy(),predicted.cpu().detach().numpy())


<matplotlib.legend.Legend at 0x7fc11f1019d0>

In [5]:
batch_size=64 
device='cuda'

all_truths_train = [] 
all_preds_train = [] 
for (data,target) in train_dl:
    data, target = data.to(device), target.to(device)
    output=model(data)
    all_truths_train.append(target.cpu().detach().numpy())
    all_preds_train.append(output.cpu().detach().numpy()) 


incomplete_batch_id_train=len(all_truths_train)-1

remainder_train=len(all_truths_train[incomplete_batch_id_train])

total_values_train=(len(all_truths_train)*batch_size)-(batch_size-remainder_train)



all_truths_train_array=np.zeros(total_values_train)
all_preds_train_array=np.zeros(total_values_train)
k=0
while k < total_values_train:
    for i in range(len(all_truths_train)):
        if i<incomplete_batch_id_train:
            for j in range(batch_size):
                all_truths_train_array[k]=all_truths_train[i][j]
                all_preds_train_array[k]=all_preds_train[i][j]
                k=k+1
                


        else:
            i=incomplete_batch_id_train
            for j in range(remainder_train):
                all_truths_train_array[k]=all_truths_train[i][j]
                all_preds_train_array[k]=all_preds_train[i][j]
                k=k+1
                


all_truths_test = [] 
all_preds_test = [] 
for (data,target) in test_dl:
    data, target = data.to(device), target.to(device)
    output=model(data)
    all_truths_test.append(target.cpu().detach().numpy())
    all_preds_test.append(output.cpu().detach().numpy()) 


incomplete_batch_id_test=len(all_truths_test)-1

remainder_test=len(all_truths_test[incomplete_batch_id_test])


total_values_test=(len(all_truths_test)*batch_size)-(batch_size-remainder_test)



all_truths_test_array=np.zeros(total_values_test)
all_preds_test_array=np.zeros(total_values_test)
k=0
while k < total_values_test:
    for i in range(len(all_truths_test)):
        if i<incomplete_batch_id_test:
            for j in range(batch_size):
                all_truths_test_array[k]=all_truths_test[i][j]
                all_preds_test_array[k]=all_preds_test[i][j]
                # print(i,j,k)
                k=k+1
                
                


        else:
            i=incomplete_batch_id_test
            for j in range(remainder_test):
                all_truths_test_array[k]=all_truths_test[i][j]
                all_preds_test_array[k]=all_preds_test[i][j]
                # print(i,j,k)
                k=k+1
                
                

print(all_truths_test[3][43])
print(all_preds_test[3][43])
print(all_truths_test_array[235])
print(all_preds_test_array[235])
# all_truths_test=all_truths_test_array
# all_preds_test=all_preds_test_array



def MSE(pred,truth,n):
    return((1/n)*np.sum((pred-truth)**2))





[-0.7357753]
[-0.58383316]
-0.7357752919197083
-0.5838331580162048


In [6]:
r2_train=r2_score(all_truths_train_array,all_preds_train_array)
r2_test=r2_score(all_truths_test_array,all_preds_test_array)
MSE_train=MSE(all_preds_train_array,all_truths_train_array,len(all_preds_train_array))
MSE_test=MSE(all_preds_test_array,all_truths_test_array,len(all_truths_test_array))

plt.figure(figsize=(16,12))
plt.suptitle('Predicting Sersic n Based on log SFR',fontsize=16,weight='bold')
plt.subplot(1,2,1) 
plt.title('Test Data Set')
plt.scatter(all_truths_test_array, all_preds_test_array,color = 'b', alpha = 0.1*7/3)
plt.xlabel('Test True Values')
plt.ylabel('Test Predicted Value')
plt.text(0.9,-0.85,'R$^2$='+ str(round(r2_test,4)),fontsize=14)
plt.text(0.9,-0.90,'MSE='+ str(round(MSE_test,4)),fontsize=14)
plt.plot([-1,1],[-1,1],'r--')

plt.subplot(1,2,2)
plt.title('Train Data Set')
plt.scatter(all_truths_train_array, all_preds_train_array,color = 'k', alpha = 0.1)
plt.xlabel('Train True Values')
plt.ylabel('Train Predicted Value')
plt.plot([-1,1],[-1,1],'r--') 
plt.text(0.9,-0.85,'R$^2$='+str(round(r2_train,4)),fontsize=14)
plt.text(0.9,-0.90,'MSE='+ str(round(MSE_train,4)),fontsize=14)
plt.show()

# plt.savefig('/home/juanp/Documents/SURP-2021/Plots/Model 2/SFR and n')

# Chaning/looping over Hyperparamters below 

# Changing Epoch while Keeping Learning Rate and Hidden Size Constant 

In [62]:
inputDim=len(inputs)
outputDim=1 
hiddenSize=50
learningRate=0.1

model = linearRegression(inputDim, outputDim,hiddenSize)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

reps=50
last_loss=np.zeros(reps) #Stores final loss returned value after training loop is done for a given number of epoch
epoch_range=np.linspace(400,4000,num=reps,dtype=int)


for i in range (reps):
    #Training Loop
    epochs=epoch_range[i]
    
    # epoch_array=np.zeros(epochs)
    loss_array=np.zeros(epochs)
    for epoch in range(epochs): #Forward Pass and loss
        for xb,yb in train_dl:
            # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
            optimizer.zero_grad()

            # get output from the model, given the inputs
            outputs = model(xb)

            # get loss for the predicted output
            loss = criterion(outputs, yb)
            # print(loss)
            # get gradients w.r.t to parameters, (backward pass)
            loss.backward()

            # update parameters
            optimizer.step()

            # epoch_array[epoch]=epoch 
            loss_array[epoch]=loss.item()
    
 
    last_loss[i]=loss_array[-1] #Store final loss value
    print('Done rep ',i)

Done rep  0
Done rep  1
Done rep  2
Done rep  3
Done rep  4
Done rep  5
Done rep  6
Done rep  7
Done rep  8
Done rep  9
Done rep  10
Done rep  11
Done rep  12
Done rep  13
Done rep  14
Done rep  15
Done rep  16
Done rep  17
Done rep  18
Done rep  19
Done rep  20
Done rep  21
Done rep  22
Done rep  23
Done rep  24
Done rep  25
Done rep  26
Done rep  27
Done rep  28
Done rep  29
Done rep  30
Done rep  31
Done rep  32
Done rep  33
Done rep  34
Done rep  35
Done rep  36
Done rep  37
Done rep  38
Done rep  39
Done rep  40
Done rep  41
Done rep  42
Done rep  43
Done rep  44
Done rep  45
Done rep  46
Done rep  47
Done rep  48
Done rep  49


In [66]:
plt.title('Number of Epochs and Final Value of Loss')
plt.ylabel('Last Loss Value')
plt.xlabel('Number of Epochs Trained')
plt.plot(epoch_range,last_loss)

[<matplotlib.lines.Line2D at 0x7ff213273a50>]

# Changing Hidden Size while Keeping Learning Rate and Epoch Constant 

In [68]:
reps=50
hidden_size_range=np.linspace(10,200,num=reps,dtype=int)
last_loss_hidden_size=np.zeros(reps) #Stores final loss returned value after training loop is done for a given number of epoch

for i in range (reps):

    inputDim=len(inputs)
    outputDim=1 
    hiddenSize=hidden_size_range[i]
    learningRate=0.1

    model = linearRegression(inputDim, outputDim,hiddenSize)
    ##### For GPU #######
    if torch.cuda.is_available():
        model.cuda()

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

    #Training Loop
    epochs=1000 

    
    loss_array=np.zeros(epochs)
    for epoch in range(epochs): #Forward Pass and loss
        for xb,yb in train_dl:
            # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
            optimizer.zero_grad()

            # get output from the model, given the inputs
            outputs = model(xb)

            # get loss for the predicted output
            loss = criterion(outputs, yb)
            # print(loss)
            # get gradients w.r.t to parameters, (backward pass)
            loss.backward()

            # update parameters
            optimizer.step()

            loss_array[epoch]=loss.item()


    last_loss_hidden_size[i]=loss_array[-1] #Store final loss value
    print('Done rep ',i)


Done rep  0
Done rep  1
Done rep  2
Done rep  3
Done rep  4
Done rep  5
Done rep  6
Done rep  7
Done rep  8
Done rep  9
Done rep  10
Done rep  11
Done rep  12
Done rep  13
Done rep  14
Done rep  15
Done rep  16
Done rep  17
Done rep  18
Done rep  19
Done rep  20
Done rep  21
Done rep  22
Done rep  23
Done rep  24
Done rep  25
Done rep  26
Done rep  27
Done rep  28
Done rep  29
Done rep  30
Done rep  31
Done rep  32
Done rep  33
Done rep  34
Done rep  35
Done rep  36
Done rep  37
Done rep  38
Done rep  39
Done rep  40
Done rep  41
Done rep  42
Done rep  43
Done rep  44
Done rep  45
Done rep  46
Done rep  47
Done rep  48
Done rep  49


In [73]:
plt.title('Hidden Size and Final Value of Loss')
plt.ylabel('Last Loss Value')
plt.xlabel('Hidden Size')
plt.plot(hidden_size_range,last_loss_hidden_size)

[<matplotlib.lines.Line2D at 0x7ff18397d210>]

# Changing Hidden Size while Keeping Learning Rate and Epoch Constant 

In [84]:
reps=50
learningRate_range=np.linspace(0.01,1,num=reps,dtype=float)
last_loss_learningRate=np.zeros(reps) #Stores final loss returned value after training loop is done for a given number of epoch

for i in range (reps):

    inputDim=len(inputs)
    outputDim=1 
    hiddenSize= 50
    learningRate=learningRate_range[i]

    model = linearRegression(inputDim, outputDim,hiddenSize)
    ##### For GPU #######
    if torch.cuda.is_available():
        model.cuda()

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

    #Training Loop
    epochs= 1000 

    
    loss_array=np.zeros(epochs)
    for epoch in range(epochs): #Forward Pass and loss
        for xb,yb in train_dl:
            # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
            optimizer.zero_grad()

            # get output from the model, given the inputs
            outputs = model(xb)

            # get loss for the predicted output
            loss = criterion(outputs, yb)
            # print(loss)
            # get gradients w.r.t to parameters, (backward pass)
            loss.backward()

            # update parameters
            optimizer.step()

            loss_array[epoch]=loss.item()


    last_loss_learningRate[i]=loss_array[-1] #Store final loss value
    print('Done rep ',i)

Done rep  0
Done rep  1
Done rep  2
Done rep  3
Done rep  4
Done rep  5
Done rep  6
Done rep  7
Done rep  8
Done rep  9
Done rep  10
Done rep  11
Done rep  12
Done rep  13
Done rep  14
Done rep  15
Done rep  16
Done rep  17
Done rep  18
Done rep  19
Done rep  20
Done rep  21
Done rep  22
Done rep  23
Done rep  24
Done rep  25
Done rep  26
Done rep  27
Done rep  28
Done rep  29
Done rep  30
Done rep  31
Done rep  32
Done rep  33
Done rep  34
Done rep  35
Done rep  36
Done rep  37
Done rep  38
Done rep  39
Done rep  40
Done rep  41
Done rep  42
Done rep  43
Done rep  44
Done rep  45
Done rep  46
Done rep  47
Done rep  48
Done rep  49


In [85]:
plt.title('Learning Rate and Final Value of Loss')
plt.ylabel('Last Loss Value')
plt.xlabel('Learning Rate')
plt.plot(learningRate_range,last_loss_learningRate)



[<matplotlib.lines.Line2D at 0x7ff17a9f9b50>]