In [1]:
import sklearnex
sklearnex.patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [34]:
# ALL IMPORTS
import os
import pickle
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

In [23]:
# Choose one of the ways to import data below.

# My local path
df = pd.read_csv('/Users/pratik/Github/MScDataScienceThesis/Skyserver_SQL5_24_2023 12_41_33 PM.csv')

# Global Path
# df = pd.read_csv("https://raw.githubusercontent.com/iamstarstuff/MScDataScienceThesis/main/Skyserver_SQL5_24_2023%2

In [24]:
class_mapping = {class_label: idx for idx, class_label in enumerate(df['class'].unique())}
df['Encoded_Class'] = df['class'].map(class_mapping)

In [25]:
X = df[['u', 'g', 'r', 'i', 'z', 'redshift']]
y = df['Encoded_Class']

In [26]:
# Scale the features
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [27]:
def split_data(X, y, train_size=0.6, test_size=0.2, validation_size=0.2, random_state=None):
    """
    Splits the data into train, test, and validation datasets.

    Parameters:
        X : array-like, shape (n_samples, n_features)
            The input feature matrix.

        y : array-like, shape (n_samples,)
            The target labels.

        train_size : float, optional (default=0.6)
            The percentage of data to be used for training.

        test_size : float, optional (default=0.2)
            The percentage of data to be used for testing.

        validation_size : float, optional (default=0.2)
            The percentage of data to be used for validation.

        random_state : int or RandomState instance, optional (default=None)
            Seed used by the random number generator.

    Returns:
        X_train : array-like, shape (n_train_samples, n_features)
            The training feature matrix.

        X_test : array-like, shape (n_test_samples, n_features)
            The testing feature matrix.

        X_val : array-like, shape (n_validation_samples, n_features)
            The validation feature matrix.

        y_train : array-like, shape (n_train_samples,)
            The training target labels.

        y_test : array-like, shape (n_test_samples,)
            The testing target labels.

        y_val : array-like, shape (n_validation_samples,)
            The validation target labels.
    """

    # Split data into train+val and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Calculate the percentage for validation data based on the remaining data after test split
    remaining_size = 1.0 - test_size
    val_size = validation_size / remaining_size

    # Split train+val data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=val_size, random_state=random_state)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [28]:
X_train, X_test, X_val, y_train, y_test, y_val = split_data(X, y)

In [29]:
random_seed = 42

np.random.seed(random_seed)
random.seed(random_seed)

In [32]:
def plot_heatmap(matrix, title, labels):
    """
    This function plots the heatmap.
    
    Parameters
    ----------
    `matrix`: 2D array
    `title`: title
    `labels`: target values
    
    Returns none.
    """
    sns.heatmap(data=matrix, annot=True, fmt='.2f', linewidths=0.1,
                xticklabels=labels, yticklabels=labels)
    plt.xlabel(xlabel='Predicted Class')
    plt.ylabel(ylabel='Actual Class')
    plt.title(label=title, fontsize=10)
    
def plot_confusion_matrix(y_true, y_pred, labels):
    """
    This function plots:
        1. Confusion matrix
        2. Precision matrix
        3. Recall matrix
    
    Parameters
    ----------
    `y_true`: ground truth (or actual) values
    `y_pred`: predicted values
    `labels`: target values
    
    Returns none.
    """
    cmat = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    pmat = cmat / cmat.sum(axis=0)
    print("Column sum of precision matrix: {}".format(pmat.sum(axis=0)))
    rmat = ((cmat.T) / (cmat.sum(axis=1).T)).T
    print("Row sum of recall matrix:       {}".format(rmat.sum(axis=1)))
    
    plt.figure(figsize=(15, 3))
    plt.subplot(131)
    plot_heatmap(matrix=cmat, title='Confusion Matrix', labels=labels)
    plt.subplot(132)
    plot_heatmap(matrix=pmat, title='Precision Matrix', labels=labels)
    plt.subplot(133)
    plot_heatmap(matrix=rmat, title='Recall Matrix', labels=labels)
    plt.show()

In [33]:
def reporter(clf, X, y, title, labels, best=None):
    """
    This functions generates the report.
    
    Parameters
    ----------
    `clf`: classifier object
    `X`: features
    `y`: target
    `title`: title of the report
    `labels`: target values
    `best`: best parameters which are learned
    
    Returns logloss.
    """
    pred = clf.predict_proba(X=X)
    
    log_loss = log_loss(y_true=y, y_pred=pred)
    log_loss = np.round(a=log_loss, decimals=3)
    
    cm_pred = clf.predict(X=X)
    
    print(title)
    if best is None:
        print("Logloss: {}".format(log_loss))
    else:
        print("Logloss: {}".format(log_loss))
        print("Best parameters: {}".format(best))
    
    plot_confusion_matrix(y_true=y, y_pred=cm_pred, labels=labels)
    
    print(classification_report(y_true=y, y_pred=cm_pred))
    
    return log_loss

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
clf = LogisticRegressionCV(Cs=10, cv=cv, penalty='l2',solver='saga',n_jobs=-1,random_state=random_seed)
clf.fit(X=X_train, y=y_train)

Epoch 21, change: 0.00915812
Epoch 24, change: 0.00751045
Epoch 19, change: 0.01056419
Epoch 24, change: 0.00750127
Epoch 25, change: 0.00703730
Epoch 22, change: 0.00855703
Epoch 25, change: 0.00704083
Epoch 20, change: 0.00983252
Epoch 25, change: 0.00703131
Epoch 26, change: 0.00660750
Epoch 23, change: 0.00799832
Epoch 26, change: 0.00661053
Epoch 21, change: 0.00917035
Epoch 26, change: 0.00660148
Epoch 27, change: 0.00620923
Epoch 24, change: 0.00749338
Epoch 27, change: 0.00621244
Epoch 22, change: 0.00856184
Epoch 28, change: 0.00584192
Epoch 27, change: 0.00620417
Epoch 25, change: 0.00702645
Epoch 28, change: 0.00584442
Epoch 23, change: 0.00800909
Epoch 28, change: 0.00583797
Epoch 29, change: 0.00550325
Epoch 26, change: 0.00659256
Epoch 29, change: 0.00550407
Epoch 24, change: 0.00749716
Epoch 30, change: 0.00518435
Epoch 29, change: 0.00549481
Epoch 27, change: 0.00619642
Epoch 30, change: 0.00518941
Epoch 25, change: 0.00703035
Epoch 31, change: 0.00489112
Epoch 30, chan

KeyboardInterrupt: 

Epoch 50, change: 0.00177712
Epoch 49, change: 0.00186274
Epoch 47, change: 0.00205691
Epoch 50, change: 0.00177525
Epoch 45, change: 0.00228299
Epoch 51, change: 0.00169029
Epoch 50, change: 0.00177178
Epoch 51, change: 0.00168877
Epoch 48, change: 0.00195543
Epoch 46, change: 0.00216821
Epoch 52, change: 0.00160864
Epoch 51, change: 0.00168557
Epoch 52, change: 0.00160648
Epoch 49, change: 0.00185918
Epoch 47, change: 0.00206093
Epoch 53, change: 0.00153083
Epoch 52, change: 0.00160306
Epoch 53, change: 0.00152927
Epoch 50, change: 0.00176765
Epoch 48, change: 0.00195869
Epoch 54, change: 0.00145740
Epoch 53, change: 0.00152613
Epoch 54, change: 0.00145583
Epoch 51, change: 0.00168140
Epoch 49, change: 0.00186254
Epoch 55, change: 0.00138739
Epoch 54, change: 0.00145227
Epoch 55, change: 0.00138582
Epoch 52, change: 0.00159937
Epoch 50, change: 0.00177148
Epoch 56, change: 0.00132172
Epoch 55, change: 0.00138279
Epoch 56, change: 0.00131981
Epoch 53, change: 0.00152253
Epoch 51, chan



Epoch 1, change: 1.00000000
Epoch 96, change: 0.00020071
Epoch 1, change: 1.00000000
Epoch 100, change: 0.00016683
max_iter reached after 37 seconds
Epoch 98, change: 0.00018270




Epoch 97, change: 0.00019163
Epoch 2, change: 0.01281116
Epoch 2, change: 0.01281070
Epoch 1, change: 1.00000000
Epoch 99, change: 0.00017452
Epoch 98, change: 0.00018301
Epoch 3, change: 0.01225581
Epoch 3, change: 0.01225667
Epoch 2, change: 0.01281042
max_iter reached after 37 seconds
Epoch 100, change: 0.00016666




Epoch 99, change: 0.00017481
Epoch 4, change: 0.01174754
Epoch 4, change: 0.01174654
Epoch 3, change: 0.01225803
Epoch 1, change: 1.00000000
max_iter reached after 37 secondsEpoch 100, change: 0.00016700

Epoch 5, change: 0.01126848
Epoch 5, change: 0.01127044
Epoch 2, change: 0.01280715
Epoch 4, change: 0.01175007




Epoch 1, change: 1.00000000
Epoch 6, change: 0.01083135
Epoch 6, change: 0.01083136
Epoch 3, change: 0.01225590
Epoch 5, change: 0.01126672
Epoch 2, change: 0.01281179
Epoch 7, change: 0.01041397
Epoch 7, change: 0.01041786
Epoch 4, change: 0.01174392
Epoch 6, change: 0.01083026
Epoch 3, change: 0.01225674
Epoch 8, change: 0.01003152
Epoch 5, change: 0.01126572
Epoch 7, change: 0.01041755
Epoch 8, change: 0.01003428
Epoch 4, change: 0.01174651
Epoch 9, change: 0.00967052
Epoch 6, change: 0.01082689
Epoch 8, change: 0.01002839
Epoch 9, change: 0.00967148
Epoch 5, change: 0.01127111
Epoch 10, change: 0.00933050
Epoch 7, change: 0.01041396
Epoch 9, change: 0.00967018
Epoch 10, change: 0.00933493
Epoch 6, change: 0.01082893
Epoch 11, change: 0.00901277
Epoch 8, change: 0.01002892
Epoch 10, change: 0.00933230
Epoch 11, change: 0.00901542
Epoch 7, change: 0.01041876
Epoch 12, change: 0.00871359
Epoch 11, change: 0.00901516
Epoch 9, change: 0.00966982
Epoch 12, change: 0.00871734
Epoch 8, cha