# Model Making

### Description

This notebook file must be run after taking the output from datasetPreparation.ipynb

Training of the model will be done ONLY. Manipulation of the dataset must NOT be done here. Only importing of dataset is allowed. 

Train Test Splits will be done on datasetPrepation.ipynb

This notebook will output the top performing ZERO-SHOT models. No hyperparameter tuning will be done in this notebook.

Technique used will be the following:

In [29]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

import joblib
import os

### Importing of dataset: train, validation, and test sets

In [30]:
X_train = pd.read_csv('data/cleaned/X_train.csv')
X_test = pd.read_csv('data/cleaned/X_test.csv')
y_train = pd.read_csv('data/cleaned/y_train.csv')
y_test = pd.read_csv('data/cleaned/y_test.csv')

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

model_performances = []
models = []

X_train shape: (69999, 6)
X_test shape: (30000, 6)
y_train shape: (69999, 1)
y_test shape: (30000, 1)


### Functions

In [31]:
def get_performance_report(model, X_test, y_test):
    y_pred = model.predict(X_test)

    return classification_report(y_test, y_pred)

In [39]:
def load_model(filename, X_test, y_test):
    if os.path.exists(filename):
        print("Model Found: Loading...")
        model = joblib.load(filename)
        performance = get_performance_report(model, X_test, y_test)
        print(performance)

        return model, performance, True
    
    else:
        print(f"Model {filename} not found")

        return None, None, False

### Training of model

##### Naive Bayes

In [56]:
def gaussian_naive_bayes(param_grid, X_train, X_test, y_train, y_test):
    # Reshape y_train and y_test to be 1D arrays
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # Initialize the Gaussian Naive Bayes model
    gnb = GaussianNB()
    
    # Perform hyperparameter tuning with GridSearchCV
    grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, scoring='accuracy', cv=5, verbose=3)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_gnb = grid_search.best_estimator_
    
    # Train the best model on the training set
    best_gnb.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_gnb.predict(X_test)
    
    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    
    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    return best_gnb, class_report

In [57]:
# Define parameters
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

filename = "models/gnb_model.joblib"

gnb_model, gnb_performance, isLoaded = load_model(filename, X_test, y_test)

# Train Model
if not(isLoaded):
    print("Training GNB model")
    gnb_model, gnb_performance = gaussian_naive_bayes(param_grid, X_train, X_test, y_train, y_test)

    # Export model
    joblib.dump(gnb_model, filename)

# Put model in array
models.append(gnb_model)
model_performances.append(gnb_performance)

Model Found: Loading...
              precision    recall  f1-score   support

      GALAXY       0.96      0.91      0.93     17834
         QSO       0.78      0.88      0.82      5688
        STAR       0.98      1.00      0.99      6478

    accuracy                           0.92     30000
   macro avg       0.90      0.93      0.92     30000
weighted avg       0.93      0.92      0.92     30000



##### K-Nearest Neighbors

In [58]:
def k_nearest_neighbors(param_grid, X_train, X_test, y_train, y_test):
    # Reshape y_train and y_test to be 1D arrays if necessary
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    # Initialize the K-Nearest Neighbors model
    knn = KNeighborsClassifier()
    
    # Perform hyperparameter tuning with GridSearchCV
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5, verbose=3, n_jobs = -1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_knn = grid_search.best_estimator_
    
    # Train the best model on the training set
    best_knn.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_knn.predict(X_test)
    
    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    
    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    return best_knn, class_report

In [59]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

filename = "models/knn_model.joblib"

knn_model, knn_performance, isLoaded = load_model(filename, X_test, y_test)

# Train Model
if not(isLoaded):
    print("Training KNN model")
    knn_model, knn_performance = k_nearest_neighbors(param_grid, X_train, X_test, y_train, y_test)

    # Export model
    joblib.dump(knn_model, filename)

# Put model in array
models.append(knn_model)
model_performances.append(knn_performance)

Model Found: Loading...
              precision    recall  f1-score   support

      GALAXY       0.96      0.96      0.96     17834
         QSO       0.95      0.91      0.93      5688
        STAR       0.93      0.96      0.94      6478

    accuracy                           0.95     30000
   macro avg       0.95      0.94      0.95     30000
weighted avg       0.95      0.95      0.95     30000



##### Decision Trees

In [60]:
def decision_tree(param_grid, X_train, X_test, y_train, y_test):
    # Reshape y_train and y_test to be 1D arrays if necessary
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    # Initialize the Decision Tree Classifier model
    dt = DecisionTreeClassifier()
    
    # Perform hyperparameter tuning with GridSearchCV
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='accuracy', cv=5, verbose = 3, n_jobs = -1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_dt = grid_search.best_estimator_
    
    # Train the best model on the training set
    best_dt.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_dt.predict(X_test)
    
    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    
    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    return best_dt, class_report

In [61]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

filename = "models/dt_model.joblib"

dt_model, dt_performance, isLoaded = load_model(filename, X_test, y_test)

# Train Model
if not(isLoaded):
    print("Training DT model")
    dt_model, dt_performance = decision_tree(param_grid, X_train, X_test, y_train, y_test)

    # Export model
    joblib.dump(dt_model, filename)

# Put model in array
models.append(dt_model)
model_performances.append(dt_performance)

Model Found: Loading...
              precision    recall  f1-score   support

      GALAXY       0.97      0.98      0.98     17834
         QSO       0.95      0.92      0.93      5688
        STAR       1.00      1.00      1.00      6478

    accuracy                           0.97     30000
   macro avg       0.97      0.97      0.97     30000
weighted avg       0.97      0.97      0.97     30000



##### XGBoost

In [62]:
def extreme_gradient_boosting(param_grid, X_train, X_test, y_train, y_test):
    # Reshape y_train and y_test to be 1D arrays if necessary
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    # Initialize the XGBoost Classifier model
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    
    # Perform hyperparameter tuning with GridSearchCV
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_xgb = grid_search.best_estimator_
    
    # Train the best model on the training set
    best_xgb.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_xgb.predict(X_test)
    
    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    
    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    return best_xgb, class_report

In [64]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

filename = "models/xgb_model.joblib"

xgb_model, xgb_performance, isLoaded = load_model(filename, X_test, y_test)

# Train Model
if not(isLoaded):
    print("Training DT model")
    xgb_model, xgb_performance = extreme_gradient_boosting(param_grid, X_train, X_test, y_train, y_test)

    # Export model
    joblib.dump(xgb_model, filename)

# Put model in array
models.append(xgb_model)
model_performances.append(xgb_performance)

Model models/xgb_model.joblib not found
Training DT model
Fitting 5 folds for each of 6561 candidates, totalling 32805 fits


ValueError: 
All the 32805 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32805 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\PC\AppData\Roaming\Python\Python311\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\PC\AppData\Roaming\Python\Python311\site-packages\xgboost\sklearn.py", line 1471, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['GALAXY' 'QSO' 'STAR']


##### LightGBM

In [None]:
def light_gradient_boosting_machine(param_grid, X_train, X_test, y_train, y_test):
    # Reshape y_train and y_test to be 1D arrays if necessary
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    # Initialize the LightGBM Classifier model
    lgbm = LGBMClassifier()
    
    # Perform hyperparameter tuning with GridSearchCV
    grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, scoring='accuracy', cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_lgbm = grid_search.best_estimator_
    
    # Train the best model on the training set
    best_lgbm.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_lgbm.predict(X_test)
    
    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")
    
    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)
    
    return best_lgbm, class_report

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 125505, number of used features: 9
[LightGBM] [Info] Start training from score -0.693394
[LightGBM] [Info] Start training from score -1.417870
[LightGBM] [Info] Start training from score -1.355206
LightGBM Model Accuracy: 0.98


In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20, 30],
    'min_data_in_leaf': [20, 50, 100],
    'feature_fraction': [0.6, 0.8, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0],
    'bagging_freq': [0, 5, 10],
    'lambda_l1': [0, 0.01, 0.1],
    'lambda_l2': [0, 0.01, 0.1]
}

filename = "models/lgbm_model.joblib"

lgbm_model, lgbm_performance, isLoaded = load_model(filename, X_test, y_test)

# Train Model
if not(isLoaded):
    print("Training DT model")
    lgbm_model, lgbm_performance = light_gradient_boosting_machine(param_grid, X_train, X_test, y_train, y_test)

    # Export model
    joblib.dump(lgbm_model, filename)

# Put model in array
models.append(lgbm_model)
model_performances.append(lgbm_performance)



