In [30]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb

In [31]:
# Relative path of the catalogue CSV that the rest of the notebook works from.
catalogue_csv = "../../../Data/MIGHTEE/Classification/final_gaussian_radio.csv"
# Read the full table into a DataFrame; later cells filter rows and pick columns.
mightee_data = pd.read_csv(catalogue_csv)

In [32]:
# Keep only rows that carry a class label: a missing 'Classification' value
# cannot serve as a supervised training target.
has_label = mightee_data['Classification'].notna()
mightee_data = mightee_data.loc[has_label]
print("Amount of rows:", mightee_data.shape[0])

Amount of rows: 4370


In [33]:
# Separate the target column from every other column of the table.
target_column = 'Classification'
y = mightee_data[target_column]
X = mightee_data.loc[:, mightee_data.columns != target_column]

In [36]:
# Show how many sources fall into each class. The recorded output is strongly
# imbalanced (2790 in the largest class vs. 246 in the smallest), which should
# be kept in mind when reading overall accuracy later on.
y.value_counts()

star-forming galaxy                                     2790
jet-mode radio AGN/low-excitation radio galaxy           924
radio-quiet AGN                                          410
quasar-like radio AGN / high-excitation radio galaxy     246
Name: Classification, dtype: int64

In [37]:
# Restrict the feature matrix to the columns listed here, in this order;
# every other catalogue column is dropped from X.
feature_columns = [
    'Z_BEST',
    'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
    'F_MIPS_24', 'F_PACS_100', 'F_PACS_160',
    'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500',
    'Ks_flux_corr', 'H_flux_corr', 'J_flux_corr',
    'i_flux_corr', 'r_flux_corr', 'u_flux_corr', 'z_flux_corr', 'y_flux_corr',
    'NUV_flux_corr', 'FUV_flux_corr',
    'Total_flux', 'Peak_flux', 'S_INT14', 'S_PEAK14',
]
X = X.loc[:, feature_columns]

In [38]:
# Encode the string class names as integer codes for the classifier.
#
# The encoder is fitted on the untouched 'Classification' column instead of on
# `y`, because `y` is overwritten with the integer codes below. Fitting on `y`
# made this cell non-idempotent: on a second run `y` already holds integers,
# so the display names would silently become '0', '1', '2', '3'.
le = LabelEncoder()
y = le.fit_transform(mightee_data['Classification'])
# Take the display names from the fitted encoder so that labels[i] is, by
# construction, the class encoded as integer i (used as report row names).
labels = le.classes_.astype(str)

## XGBoost

In [39]:
# Two-stage stratified split with a fixed seed:
#   1) 80% of the rows go to training, 20% are held out;
#   2) the held-out part is split 80/20 again into test and validation,
#      i.e. roughly 16% test and 4% validation of the full data set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, train_size=0.8, stratify=y_holdout, random_state=42
)

In [50]:
# Hyper-parameters of the multi-class XGBoost model, collected in one mapping
# so individual settings are easy to toggle between experiments.
xgb_params = dict(
    use_label_encoder=False,
    max_depth=6,
    reg_lambda=5,
    eta=0.3,  # high learning rate for quick runs; reduce to 0.1 or 0.05 for better results
    # NOTE(review): the previous comment on this line contrasted 'exact' (more
    # precise) with a faster method -- confirm 'exact' is the intended choice.
    tree_method='exact',
    gpu_id=1,
    objective='multi:softprob',
    eval_metric=['merror'],  # multi-class error rate, reported on the eval set during fit
    nthread=8,
    n_estimators=500,  # upper bound on boosting rounds
)
# Settings tried earlier and currently switched off:
#   reg_alpha=1, min_child_weight=0.5, subsample=0.5
model = XGBClassifier(**xgb_params)

In [51]:
# Train on the training split while evaluating the model's eval metric on the
# validation split after every boosting round; training stops early once that
# metric has not improved for 20 consecutive rounds.
validation_sets = [(X_val, y_val)]
bst = model.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    early_stopping_rounds=20,
    verbose=True,  # print the validation metric for each round
    # sample_weight=classes_weights,  # class weighting, currently disabled
)

[0]	validation_0-merror:0.24571
[1]	validation_0-merror:0.25143
[2]	validation_0-merror:0.24571
[3]	validation_0-merror:0.22857
[4]	validation_0-merror:0.21714
[5]	validation_0-merror:0.22286
[6]	validation_0-merror:0.22857
[7]	validation_0-merror:0.22857
[8]	validation_0-merror:0.22286
[9]	validation_0-merror:0.22286
[10]	validation_0-merror:0.23429
[11]	validation_0-merror:0.22286
[12]	validation_0-merror:0.21714
[13]	validation_0-merror:0.21143
[14]	validation_0-merror:0.21714
[15]	validation_0-merror:0.21143
[16]	validation_0-merror:0.22286
[17]	validation_0-merror:0.22286
[18]	validation_0-merror:0.21714
[19]	validation_0-merror:0.22286
[20]	validation_0-merror:0.21143
[21]	validation_0-merror:0.21143
[22]	validation_0-merror:0.21714
[23]	validation_0-merror:0.22286
[24]	validation_0-merror:0.22286
[25]	validation_0-merror:0.22857
[26]	validation_0-merror:0.22286
[27]	validation_0-merror:0.22286
[28]	validation_0-merror:0.22857
[29]	validation_0-merror:0.24000
[30]	validation_0-me

In [52]:
# Predict on the held-out test split and print per-class precision, recall and
# F1 (four decimals), using the class names as row labels.
y_pred = model.predict(X_test)
test_report = classification_report(y_test, y_pred, target_names=labels, digits=4)
print(test_report)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.6613    0.5541    0.6029       148
quasar-like radio AGN / high-excitation radio galaxy     0.4615    0.1538    0.2308        39
                                     radio-quiet AGN     0.7812    0.3788    0.5102        66
                                 star-forming galaxy     0.7981    0.9484    0.8668       446

                                            accuracy                         0.7668       699
                                           macro avg     0.6755    0.5088    0.5527       699
                                        weighted avg     0.7488    0.7668    0.7418       699



In [53]:
# Scratch arithmetic left over from exploration: the difference of two numbers.
# NOTE(review): presumably a comparison of two accuracy percentages -- confirm
# what they refer to, or delete this cell before sharing the notebook.
83-77

6