In [1]:
# importing required libraries
import numpy as np  # handles arrays, matrices, and mathematical operations
import pandas as pd # used to load and preprocess the structured dataset using data frames
from sklearn.model_selection import train_test_split # provides functions for splitting data into training and testing sets
from sklearn.preprocessing import LabelEncoder # LabelEncoder, a preprocessing function, class convert categorical labels into numerical values
from tensorflow import keras # Keras supports a wide range of pre-built layers, activation functions, and optimizers
from tensorflow.keras import layers, regularizers # used to define the layers of the model architecture
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2

In [None]:
# Load the dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Portfolio/1. Capstone Projects/Galaxy Classification(DL Project)/GalaxyZoo1_DR_table2.csv')
df.head()

Unnamed: 0,OBJID,RA,DEC,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN
0,587727178986356823,00:00:00.41,-10:22:25.7,59,0.61,0.034,0.0,0.153,0.153,0.051,0.186,0.61,0.186,0,0,1
1,587727227300741210,00:00:00.74,-09:13:20.2,18,0.611,0.0,0.167,0.222,0.0,0.0,0.389,0.203,0.797,1,0,0
2,587727225153257596,00:00:01.03,-10:56:48.0,68,0.735,0.029,0.0,0.147,0.074,0.015,0.176,0.432,0.428,0,0,1
3,587730774962536596,00:00:01.38,+15:30:35.3,52,0.885,0.019,0.0,0.058,0.019,0.019,0.077,0.885,0.077,0,1,0
4,587731186203885750,00:00:01.55,-00:05:33.3,59,0.712,0.0,0.0,0.22,0.068,0.0,0.22,0.64,0.29,0,0,1


### Column name and meaning:
<b style='color:green'>OBJID:</b> Object ID or unique identifier for each galaxy.

<b style='color:green'>RA:</b> Right Ascension, the angular distance eastward along the celestial equator from the vernal equinox to the hour circle of the object (in this case, given in hours).

<b style='color:green'>DEC:</b> Declination, the angular distance north or south of the celestial equator (in this case, given in degrees).

<b style='color:green'>NVOTE:</b> Number of votes or ratings received for each galaxy from different users.

<b style='color:green'>P_EL, P_CW, P_ACW, P_EDGE, P_DK, P_MG, P_CS:</b> Probability scores assigned by users for different classifications (e.g., probability of being elliptical, clockwise, anticlockwise, edge-on, disk, or merger).

<b style='color:green'>P_EL_DEBIASED, P_CS_DEBIASED:</b> Debiased probability scores, which account for classification biases in user voting.

<b style='color:green'>SPIRAL, ELLIPTICAL, UNCERTAIN:</b> Binary flags indicating the classifications of each galaxy based on user voting.

In [None]:
df.info()  # no null value observed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667944 entries, 0 to 667943
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   OBJID          667944 non-null  int64  
 1   RA             667944 non-null  object 
 2   DEC            667944 non-null  object 
 3   NVOTE          667944 non-null  int64  
 4   P_EL           667944 non-null  float64
 5   P_CW           667944 non-null  float64
 6   P_ACW          667944 non-null  float64
 7   P_EDGE         667944 non-null  float64
 8   P_DK           667944 non-null  float64
 9   P_MG           667944 non-null  float64
 10  P_CS           667944 non-null  float64
 11  P_EL_DEBIASED  667944 non-null  float64
 12  P_CS_DEBIASED  667944 non-null  float64
 13  SPIRAL         667944 non-null  int64  
 14  ELLIPTICAL     667944 non-null  int64  
 15  UNCERTAIN      667944 non-null  int64  
dtypes: float64(9), int64(5), object(2)
memory usage: 81.5+ MB


In [None]:
df.describe()  # no outlier in the dataset as most of them are probabilities

Unnamed: 0,OBJID,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN
count,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0,667944.0
mean,5.878143e+17,38.760986,0.535947,0.074007,0.080736,0.205066,0.072924,0.031244,0.35985,0.393508,0.483439,0.284792,0.093107,0.622102
std,179602300000000.0,13.827837,0.284315,0.17444,0.179102,0.216642,0.085788,0.089369,0.298643,0.301595,0.321747,0.451316,0.290582,0.484862
min,5.87723e+17,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.87732e+17,28.0,0.306,0.0,0.0,0.054,0.018,0.0,0.102,0.106,0.18,0.0,0.0,0.0
50%,5.877386e+17,34.0,0.581,0.016,0.021,0.13,0.053,0.0,0.267,0.347,0.462,0.0,0.0,1.0
75%,5.877428e+17,51.0,0.783,0.051,0.062,0.275,0.103,0.02,0.586,0.664,0.789,1.0,0.0,1.0
max,5.888489e+17,94.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Preprocess the data
X = df.drop(['OBJID', 'RA', 'DEC', 'SPIRAL', 'ELLIPTICAL', 'UNCERTAIN'], axis=1)  # features
y = df[['SPIRAL', 'ELLIPTICAL', 'UNCERTAIN']] # labels

In [None]:
y

Unnamed: 0,SPIRAL,ELLIPTICAL,UNCERTAIN
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1
...,...,...,...
667939,1,0,0
667940,0,0,1
667941,0,0,1
667942,0,0,1


In [None]:
# Convert labels to numerical values for training purpose
label_encoder = LabelEncoder()
y = y.apply(label_encoder.fit_transform)
y

Unnamed: 0,SPIRAL,ELLIPTICAL,UNCERTAIN
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1
...,...,...,...
667939,1,0,0
667940,0,0,1
667941,0,0,1
667942,0,0,1


In [None]:
X

Unnamed: 0,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED
0,59,0.610,0.034,0.000,0.153,0.153,0.051,0.186,0.610,0.186
1,18,0.611,0.000,0.167,0.222,0.000,0.000,0.389,0.203,0.797
2,68,0.735,0.029,0.000,0.147,0.074,0.015,0.176,0.432,0.428
3,52,0.885,0.019,0.000,0.058,0.019,0.019,0.077,0.885,0.077
4,59,0.712,0.000,0.000,0.220,0.068,0.000,0.220,0.640,0.290
...,...,...,...,...,...,...,...,...,...,...
667939,35,0.171,0.800,0.000,0.029,0.000,0.000,0.829,0.057,0.943
667940,21,0.810,0.048,0.000,0.095,0.048,0.000,0.143,0.758,0.193
667941,28,0.286,0.000,0.071,0.393,0.179,0.071,0.464,0.099,0.603
667942,23,0.391,0.000,0.043,0.000,0.130,0.435,0.043,0.390,0.045


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((534355, 10), (133589, 10), (534355, 3), (133589, 3))

In [None]:
# Define the model architecture
model = keras.Sequential([
    layers.Dense(8, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(8, activation='relu', kernel_regularizer=l2(0.001)),
    layers.Dropout(0.2),
    layers.Dense(3, activation='softmax')
])    # three layers: input layer with 128 neurons, hidden layer with 128 neuron and output layer with 3 neurons
      # dropout reduces overfitting by preventing co-adaptation of neurons;
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # compiling the model with the specified optimizer, loss function, and metrics, prepares the model to efficiently update its parameters during the training process and evaluate its performance based on the specified metrics.

# Early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True) # used to monitor the validation loss and stop the training if the loss does not improve for a certain number of epochs (patience=5)
checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True) # saves the best model based on validation loss (save_best_only=True)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_data=(X_test, y_test),
                    callbacks=[early_stopping, checkpoint])  # epochs is iteration of training dataset through neural network; batch_size is training samples used in one forward/backward pass of the neural network; verbose=1 means the training progress will be displayed on the console for each epoch


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the model
loss, accuracy = history.model.evaluate(X_test, y_test)
print("Test Loss:", loss)  # The loss function measures the discrepancy between the predicted output of the model and the true labels
print("Test Accuracy:", accuracy)

Test Loss: 0.29215502738952637
Test Accuracy: 0.8901256918907166


the above model with <b style="color:yellow">3</b> layers, <b style="color:yellow">128</b> input neurons and <b style="color:yellow">relu</b>(Rectified Linear Unit) activation function resulted a high accuracy of <b style="color:green">89.013%</b> and a very minute loss of <b style="color:green">0.2922</b>.

### Testing for new data

In [None]:
# taking random label from the dataset
n = int(input('Enter the row number'))
y_test.iloc[n:n+1,:]

Unnamed: 0,SPIRAL,ELLIPTICAL,UNCERTAIN
275644,1,0,0


In [None]:
# respective feature value array converted into list
arr = X_test.iloc[n:n+1,:].values.astype(list)
x_list = arr[0].tolist()
x_list

[58.0, 0.138, 0.034, 0.672, 0.155, 0.0, 0.0, 0.862, 0.022, 0.978]

In [None]:
# prediction is performed with the model and shape of galaxy is displayed
predict = history.model.predict([x_list])
val_list = []
for n in range(3):
    val_list.append(predict[0][n])

target = np.max(val_list)
tolerance = 1e-6

index = None
for i, val in enumerate(val_list):
    if abs(val - target) < tolerance:
        index = i
        break
if index ==0:
    print('The galaxy is spiral in shape')
elif index==1:
    print('The galaxy is elliptical in shape')
else:
    print("The shape of galaxy can't be identified")

The galaxy is spiral in shape
