In [4]:
import pandas as pd #Import pandas for operations
import matplotlib.pyplot as plt
import seaborn as sns
# from sympy.printing.pytorch import torch

In [5]:
#0. Import data

dataframe = pd.read_csv('data/sdss_100k_galaxy_form_burst.csv', low_memory=False, header=1)

print("Dataset loaded successfully!")
print(f"Dataset shape: {dataframe.shape}")


Dataset loaded successfully!
Dataset shape: (100000, 43)


In [15]:
from scipy.stats import zscore

# 0.1. Data Statistical Analysis

print(f"Dataset shape: {dataframe.shape}")
# print(f"Dataset statistics: {dataframe.describe()}")

numerical_features = dataframe.select_dtypes(include='number').columns
z_scores = dataframe[numerical_features].apply(zscore)  # compute z-score for each numeric column
threshold = 3
outliers_z = (abs(z_scores) > threshold)

print(dataframe[outliers_z.any(axis=1)])  # rows containing at least one outlier
# for col in dataframe.select_dtypes(include='number'):
#     plt.figure()
#     sns.histplot(dataframe[col], kde=True)   # histogram + smooth curve
#     plt.title(col)
#     plt.show()

# Data is somewhat skewed


Dataset shape: (100000, 43)
                     objid            specobjid          ra        dec  \
4      1237648702973149350   332154249716721664  198.706864  -1.046217   
42     1237648721763238209   314301479399745536  170.073523   0.405995   
54     1237651192432165212  8192142948407990272  131.241240  53.208364   
58     1237651251482067624  7181230230211942400  122.631878  45.281591   
72     1237648704061309661  4517153671457560576  231.533375  -0.340179   
...                    ...                  ...         ...        ...   
99928  1237662236392685639  1384880019989358592  184.933948   9.148592   
99936  1237662236929622034  1384887441692846080  185.110612   9.424278   
99957  1237654381978845196   623782117049919488  140.589226  53.606884   
99968  1237664667902214147  2354413718323881984  165.917906  37.929384   
99977  1237664669494804798  1783437871319902208  121.816368  21.689839   

                u           g         r         i         z  modelFlux_u  ...  \
4 

In [22]:
from sklearn.preprocessing import RobustScaler

#1 Data pre-processing

#1.1.0 Smoothen out outliers by means of Robust Scaler
robust_scaler = RobustScaler() # Initialize scaler
robust_scaled_values = robust_scaler.fit_transform(dataframe[numerical_features]) # Scale only numerical data from dataset.
robust_scaled_df = pd.DataFrame(robust_scaled_values, columns=numerical_features, index=dataframe.index) # Convert scaled data to pandas dataframe NB: index so it aligns with OG.
scaled_dataframe = dataframe.copy() # Create a copy of Dataframe to preserve original.
scaled_dataframe[numerical_features] = robust_scaled_df # Copy scaled values over into new copy.

#1.1.1 remove unnecessary columns
modified_dataframe = scaled_dataframe.drop(['objid', 'specobjid', 'class'], axis=1)

# #1.1.2 Encode subclass category accordingly
# modified_dataframe['subclass'] = modified_dataframe['subclass'].map({'STARBURST': 1, 'STARFORMING': 0})

#1.2.1 Split dataset into Test and Training data. (80% Training - 20% Test)
main_training_data = modified_dataframe.sample(frac=0.8)
testing_data = modified_dataframe.drop(main_training_data.index)


#1.2.2 Split Test data into Test and Validation Set. (70% Training - 30% Validation)
training_set = main_training_data.sample(frac=0.7)
validation_set = main_training_data.drop(training_set.index)


[[-1.51592129  4.32047684 -1.16853852 ... -1.04633737 -0.22823169
   0.56209323]
 [-1.51585466  4.32047755 -1.16736104 ...  0.1659419   0.24294346
  -0.09380619]
 [-1.51585466  4.32047773 -1.16865799 ... -0.55315059  1.86906363
   2.51207925]
 ...
 [ 1.11467835  0.88526498  0.63583879 ... -0.50108211  0.519648
  -0.10095447]
 [ 1.11467836  0.88526463  0.63796294 ...  0.8041712   0.38430284
   0.08256417]
 [ 1.1147441   3.33433775  0.44510893 ... -0.56856797  0.17993789
  -0.43718197]]


In [None]:
#Visualize data.
#1. Training Set Info
print(f"Training Dataset shape: {training_set.shape}")
# print(f"Training Dataset Columns: {list(training_set.columns)}")

#2. Testing Set Info
print(f"Testing Dataset shape: {testing_data.shape}")
# print(f"Testing Dataset Columns: {list(testing_data.columns)}")

#3. Validation Set Info
print(f"Validation Dataset shape: {validation_set.shape}")
# print(f"Validation Dataset Columns: {list(validation_set.columns)}")

assert(len(training_set) + len(testing_data) + len(validation_set) == len(modified_dataframe)) #Confirm that the lengths match.





In [156]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

def plot_distributions_seaborn(df, figsize=(15, 12)):
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    # Plot numerical distributions
    if len(numerical_cols) > 0:
        n_rows = (len(numerical_cols) + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=figsize)
        axes = axes.flatten()

        for i, col in enumerate(numerical_cols):
            sns.histplot(data=df, x=col, ax=axes[i], kde=True, bins=30)
            axes[i].axvline(df[col].mean(), color='red', linestyle='--', alpha=0.8)
            axes[i].axvline(df[col].median(), color='green', linestyle='--', alpha=0.8)
            axes[i].set_title(f'Distribution of {col}')

        # Hide empty subplots
        for j in range(i + 1, len(axes)):
            axes[j].set_visible(False)

        plt.tight_layout()
        plt.show()

    # Plot categorical distributions
    if len(categorical_cols) > 0:
        n_rows = (len(categorical_cols) + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=figsize)
        axes = axes.flatten()

        for i, col in enumerate(categorical_cols):
            value_counts = df[col].value_counts().head(10)
            sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i])
            axes[i].set_title(f'Distribution of {col}')
            axes[i].tick_params(axis='x', rotation=45)
            axes[i].set_ylabel('Count')

        # Hide empty subplots
        for j in range(i + 1, len(axes)):
            axes[j].set_visible(False)

        plt.tight_layout()
        plt.show()

# plot_distributions_seaborn(training_set)

In [157]:
from sklearn.preprocessing import StandardScaler
#2. Normalize data.

#2. Apply mean-centering & variance scaling to data

#2.1. Define function
def standardize_with_sklearn(df):
    """
    Use scikit-learn's StandardScaler for robust standardization
    """
    # Separate numerical and categorical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df['subclass']

    # Create a copy and only scale numerical columns
    df_normalized = df.copy()

    if len(numerical_cols) > 0:
        scaler = StandardScaler()
        df_normalized[numerical_cols] = scaler.fit_transform(df[numerical_cols])
        df_normalized.drop('subclass', axis=1, inplace=True)

    return df_normalized, categorical_cols

#2.2. Apply standardization
testing_dataset_normalized, testing_dataset_labels = standardize_with_sklearn(testing_data)
training_dataset_normalized, training_dataset_labels = standardize_with_sklearn(training_set)
validation_dataset_normalized, validation_dataset_labels = standardize_with_sklearn(validation_set)

# testing_dataset_normalized.head(5)
# testing_dataset_normalized.shape
# testing_dataset_labels.shape
# validation_dataset_normalized.shape
# validation_dataset_labels.shape
#1.1.2 Encode subclass category accordingly
testing_dataset_labels = testing_dataset_labels.map({'STARBURST': 1, 'STARFORMING': 0})
training_dataset_labels = training_dataset_labels.map({'STARBURST': 1, 'STARFORMING': 0})
validation_dataset_labels = validation_dataset_labels.map({'STARBURST': 1, 'STARFORMING': 0})


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

#3. Begin Training Model.
# print(torch.cuda.is_available())

#3.1. Setup Model with Params. TODO: Specify weights!!
class galaxy_classification_nn(nn.Module):
    def __init__(self):

        super().__init__()
        self.input_layer = nn.Linear(in_features=39, out_features=133)
        self.hidden_layer = nn.Linear(in_features=133, out_features=133)
        self.output_layer = nn.Linear(in_features=133, out_features=2)

        #Specify activation functions.
        self.activation_function1 = nn.Tanh() # Input -> Hidden 1
        self.activation_function2 = nn.ReLU() # Hidden 1 -> Hidden 2
        self.activation_function3 = nn.Softmax(dim=1) # Hidden 2 -> Output TODO: Validate dim param

    def forward(self, x):
        x = self.input_layer(x)
        x = self.activation_function1(x)

        x = self.hidden_layer(x)
        x = self.activation_function2(x)

        x = self.output_layer(x)
        x = self.activation_function3(x)

        return x

        # x = F.relu(self.fc1(x)) # TODO: Check why not linear.
        # x = F.relu(self.fc2(x))
        # x = F.softmax(self.fc3(x), dim=1)
        # x = self.fc3(x)                  # raw scores (logits)
        return x

    def get_weights(self, model_type):
        if model_type == 'traditional':
            init.xavier_normal_(self.input_layer.weight, )
            init.xavier_normal_(self.hidden_layer.weight)
            init.xavier_normal_(self.output_layer.weight)
            return
        elif model_type == 'sign_based':
            init.xavier_normal_(self.input_layer.weight)

In [None]:
from torch.utils.data import TensorDataset, DataLoader


training_features = training_dataset_normalized.values

training_labels = training_dataset_labels.values

testing_features = testing_dataset_normalized.values

testing_labels = testing_dataset_labels.values


# TODO: Add Validation tensors
training_tensor_features = torch.tensor(training_features, dtype=torch.float32)

training_tensor_labels = torch.tensor(training_labels, dtype=torch.long)

testing_tensor_features = torch.tensor(testing_features, dtype=torch.float32)

testing_tensor_labels = torch.tensor(testing_labels, dtype=torch.long)

# Create Datasets
training_dataset = TensorDataset(training_tensor_features, training_tensor_labels)
testing_dataset = TensorDataset(testing_tensor_features, testing_tensor_labels)

training_dataset_loader = DataLoader(training_dataset, batch_size=64, shuffle=True)

testing_dataset_loader = DataLoader(testing_dataset, batch_size=64, shuffle=True)

print(training_tensor_features.dtype)
# print(training_dataset_labels)

# training_dataset_normalized.head(5)
#
# train_tensor_features = torch.tensor(training_dataset_normalized.values, dtype=torch.float32)
#
# train_tensor_labels = torch.tensor(training_dataset_labels.values, dtype=torch.float32)
#
# training_dataset = TensorDataset(train_tensor_features, train_tensor_labels)
#

In [None]:
import torch.optim as optim

#3.2. Init. model
device = torch.device("cuda:0" if torch.cuda.is_available () else "cpu")

galaxy_classification_model = galaxy_classification_nn().to(device)

#3.2. Setup Optimizer
loss_function = nn.CrossEntropyLoss()
error_optimizer = optim.SGD(galaxy_classification_model.parameters(), lr=0.01)


In [163]:
for epoch in range(5):
    print(f"Epoch: {epoch + 1}")
    # galaxy_classification_model.train()
    for  features, labels in training_dataset_loader:
        # 1. Forward Pass
        outputs = galaxy_classification_model(features) # Feed batches into NN -> Classifications
        loss = loss_function(outputs, labels) # Evaluate loss

        print(f"Loss: {loss.item()}")

        # 2. Backward Pass
        error_optimizer.zero_grad() #Calculate derivatives
        loss.backward() # Adjust weights
        error_optimizer.step()  # ??

        # 3. Display statistics
        galaxy_classification_model.eval() # Put model into evaluation mode.

        galaxy_classification_model.eval()
        correct = 0
        total   = 0


        with torch.no_grad():                    # no gradients needed for evaluation
            # for features, labels in testing_dataset_loader:
                # features, labels = features.to(device), labels.to(device)

            _, predicted = torch.max(outputs, 1)  # index of largest logit = predicted class

            total   += labels.size(0)         # number of samples in this batch
            correct += (predicted == labels).sum().item()

        accuracy = correct / total
        print(f"Classification accuracy: {accuracy:.2%}")


    print(f"Epoch: {epoch + 1} completed.")

    #     batch_X, batch_y = batch_X.to(device), batch_y.to(device)
    #
    #
    # error_optimizer.zero_grad()
    #
    # logits = galaxy_classification_model(batch_X)
    # loss = loss_function(logits, batch_y)
    # loss.backward()
    # error_optimizer.step()
    # print(f"Epoch {epoch+1} completed")



Epoch: 1
Loss: 0.6982377767562866
Classification accuracy: 34.38%
Loss: 0.7007230520248413
Classification accuracy: 18.75%
Loss: 0.7049838304519653
Classification accuracy: 18.75%
Loss: 0.7026772499084473
Classification accuracy: 28.12%
Loss: 0.6988763809204102
Classification accuracy: 32.81%
Loss: 0.6977540254592896
Classification accuracy: 29.69%
Loss: 0.6956086754798889
Classification accuracy: 37.50%
Loss: 0.698710560798645
Classification accuracy: 37.50%
Loss: 0.6968329548835754
Classification accuracy: 42.19%
Loss: 0.6921966075897217
Classification accuracy: 45.31%
Loss: 0.6974154114723206
Classification accuracy: 45.31%
Loss: 0.6922080516815186
Classification accuracy: 43.75%
Loss: 0.6941035985946655
Classification accuracy: 51.56%
Loss: 0.6922063231468201
Classification accuracy: 60.94%
Loss: 0.691697359085083
Classification accuracy: 57.81%
Loss: 0.6901007890701294
Classification accuracy: 64.06%
Loss: 0.6897386312484741
Classification accuracy: 68.75%
Loss: 0.6907904148101807