In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense

from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

In [2]:
features_num = [
    'Total_flux', 'Peak_flux', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'z_Subaru_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST', 'Mass_median', 'Mass_l68', 'Mass_u68',
        'z_rcs_flux_corr', 'z_hsc_flux_corr', 'i_hsc_flux_corr',
       'i_rcs_flux_corr', 'i_flux_corr', 'y_hsc_flux_corr', 'r_flux_corr',
       'r_hsc_flux_corr', 'r_rcs_flux_corr', 'ch1_swire_flux_corr',
       'ch2_swire_flux_corr', 'ch3_swire_flux_corr', 'ch4_swire_flux_corr',
       'ch1_servs_flux_corr', 'ch2_servs_flux_corr', 'g_flux_corr',
       'g_hsc_flux_corr', 'nb921_hsc_flux_corr', 'g_rcs_flux_corr'
]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [3]:
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/combined_filled_preprocessed.csv")

In [4]:
data.columns

Index(['Source_Name', 'Total_flux', 'Peak_flux', 'S_Code', 'EBV',
       'FUV_flux_corr', 'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr',
       'R_flux_corr', 'I_flux_corr', 'z_flux_corr', 'z_Subaru_flux_corr',
       'y_flux_corr', 'J_flux_corr', 'H_flux_corr', 'K_flux_corr',
       'Ks_flux_corr', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250',
       'F_SPIRE_350', 'F_SPIRE_500', 'Z_BEST', 'Mass_median', 'Mass_l68',
       'Mass_u68', 'Source', 'AGN_final', 'RadioAGN_final', 'Classification',
       'Radio_excess', 'AGNfrac_af', 'AGNfrac_af_16', 'AGNfrac_cg_s_16',
       'Xray', 'Opt_spec', 'Extended_radio', 'z_rcs_flux_corr',
       'z_hsc_flux_corr', 'i_hsc_flux_corr', 'i_rcs_flux_corr', 'i_flux_corr',
       'y_hsc_flux_corr', 'r_flux_corr', 'r_hsc_flux_corr', 'r_rcs_flux_corr',
       'ch1_swire_flux_corr', 'ch2_swire_flux_corr', 'ch3_swire_flux_corr',
       'ch4_swire_flux_corr', 'ch1_servs_flu

In [5]:
data.isna().sum()

Source_Name            0
Total_flux             0
Peak_flux              0
S_Code                 0
EBV                    0
                      ..
ch2_servs_flux_corr    0
g_flux_corr            0
g_hsc_flux_corr        0
nb921_hsc_flux_corr    0
g_rcs_flux_corr        0
Length: 62, dtype: int64

In [6]:
X = data[features_num]
y = data[np.append(y_column, ['Xray', 'Opt_spec', 'Extended_radio'])]

In [7]:
le = LabelEncoder()
labels = np.unique(y[y_column].astype(str))
y[y_column] = le.fit_transform(y[y_column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y_column] = le.fit_transform(y[y_column])


In [8]:
# Creating training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

y_train_flags = y_train[['Xray', 'Opt_spec', 'Extended_radio']]
y_test_flags = y_test[['Xray', 'Opt_spec', 'Extended_radio']]

y_train = y_train.drop(columns=['Xray', 'Opt_spec', 'Extended_radio'])
y_test = y_test.drop(columns=['Xray', 'Opt_spec', 'Extended_radio'])

In [12]:
def one_hot(df, columns):
    """
    One-hot encoding a certain column in a dataframe
    """
    one_hot = pd.get_dummies(df[columns])
    df = df.drop(columns=columns, axis = 1)
    df = df.join(one_hot)
    return df

In [13]:
# Imputing missing categorical features
imp = SimpleImputer(strategy="most_frequent")

# Standard scaling the numerical data
scaler = MinMaxScaler()

# Transforming the categorical data
X_train[features_num] = scaler.fit_transform(X_train[features_num])
X_test[features_num] = scaler.transform(X_test[features_num])

In [14]:
from sklearn.utils import class_weight
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

# Autoencoder

In [23]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=0).fit(X_train)

In [40]:
outlier = clf.predict(X_train)

In [41]:
model = XGBClassifier(use_label_encoder=False, 
                      max_depth=8, 
                      reg_alpha=3, 
                      min_child_weight=1, 
                      reg_lambda=5,
                      gamma=0,
                     eta=0.3, 
                      tree_method='hist',
                     n_estimators=1000,
                     class_weight='balanced')
bst = model.fit(X_train[outlier==1], y_train[outlier==1], early_stopping_rounds=20, eval_set=[(X_test, y_test)], verbose=True,)



Parameters: { "class_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mlogloss:1.02726
[1]	validation_0-mlogloss:0.82670
[2]	validation_0-mlogloss:0.69657
[3]	validation_0-mlogloss:0.60553
[4]	validation_0-mlogloss:0.54042
[5]	validation_0-mlogloss:0.49261
[6]	validation_0-mlogloss:0.45589
[7]	validation_0-mlogloss:0.42801
[8]	validation_0-mlogloss:0.40616
[9]	validation_0-mlogloss:0.38836
[10]	validation_0-mlogloss:0.37394
[11]	validation_0-mlogloss:0.36192
[12]	validation_0-mlogloss:0.35293
[13]	validation_0-mlogloss:0.34499
[14]	validation_0-mlogloss:0.33880
[15]	validation_0-mlogloss:0.33308
[16]	validation_0-mlogloss:0.32845
[17]	validation_0-mlogloss:0.32410
[18]	validation_0-mlogloss:0.32000
[19]	validation_0-mlogloss

In [43]:
outlier2 = clf.predict(X_test)

In [45]:
y_pred = model.predict(X_test[outlier2==1])
print(classification_report(y_test[outlier2==1], y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8558    0.7959    0.8248      2342
quasar-like radio AGN / high-excitation radio galaxy     0.6239    0.3174    0.4207       230
                                     radio-quiet AGN     0.7664    0.6810    0.7212      1185
                                 star-forming galaxy     0.9233    0.9591    0.9408     10556

                                            accuracy                         0.8990     14313
                                           macro avg     0.7924    0.6883    0.7269     14313
                                        weighted avg     0.8945    0.8990    0.8953     14313



In [39]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8592    0.7943    0.8254      2581
quasar-like radio AGN / high-excitation radio galaxy     0.6714    0.3481    0.4585       270
                                     radio-quiet AGN     0.7800    0.7035    0.7398      1356
                                 star-forming galaxy     0.9223    0.9596    0.9406     11315

                                            accuracy                         0.8991     15522
                                           macro avg     0.8082    0.7014    0.7411     15522
                                        weighted avg     0.8950    0.8991    0.8955     15522

