In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb

In [2]:
features_num = [
    'Total_flux', 'Peak_flux', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST', 'r_rcs_flux_corr',
       'g_flux_corr', 'nb921_hsc_flux_corr'
    ]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [3]:
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/combined_using_similar_columns.csv")

In [4]:
data.Classification.value_counts()

star-forming galaxy                                     56640
jet-mode radio AGN/low-excitation radio galaxy          12767
radio-quiet AGN                                          6870
quasar-like radio AGN / high-excitation radio galaxy     1332
Name: Classification, dtype: int64

In [5]:
X = data[features_num]
y = data[np.append(y_column, ['Xray', 'Opt_spec', 'Extended_radio', 'IRAGN'])]

In [6]:
le = LabelEncoder()
labels = np.unique(y[y_column].astype(str))
y[y_column] = le.fit_transform(y[y_column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y_column] = le.fit_transform(y[y_column])


In [7]:
labels

array(['jet-mode radio AGN/low-excitation radio galaxy',
       'quasar-like radio AGN / high-excitation radio galaxy',
       'radio-quiet AGN', 'star-forming galaxy'], dtype=object)

# Weights

In [9]:
from sklearn.utils import class_weight
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

# XGBoost

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y['Classification'], train_size=0.8, stratify=y['Classification'], random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=0.8, stratify=y_test, random_state=42)

In [32]:
model = XGBClassifier(use_label_encoder=False, 
                      max_depth=5, 
                      #reg_alpha=1,   
                      #min_child_weight=0.5, 
                      reg_lambda=5,
                      #subsample=0.5,
                      eta=0.1, # Remember to reduce to 0.1 or 0.05 for better results
                      tree_method='hist', # exact is more precies, but this is much faster
                      gpu_id=1,
                      objective='multi:softprob',
                      eval_metric =['merror'],
                      nthread=8,
                      n_estimators=500,
                      )

In [33]:
bst = model.fit(X_train, y_train, 
                eval_set=[
                    (X_val, y_val),
                ], 
                early_stopping_rounds=20,
                verbose=True, 
                #sample_weight=classes_weights,
               ) #

[0]	validation_0-merror:0.16586
[1]	validation_0-merror:0.15974
[2]	validation_0-merror:0.15813
[3]	validation_0-merror:0.15781
[4]	validation_0-merror:0.15652
[5]	validation_0-merror:0.15588
[6]	validation_0-merror:0.15394
[7]	validation_0-merror:0.15362
[8]	validation_0-merror:0.15394
[9]	validation_0-merror:0.15040
[10]	validation_0-merror:0.14911
[11]	validation_0-merror:0.14718
[12]	validation_0-merror:0.14718
[13]	validation_0-merror:0.14718
[14]	validation_0-merror:0.14686
[15]	validation_0-merror:0.14622
[16]	validation_0-merror:0.14622
[17]	validation_0-merror:0.14461
[18]	validation_0-merror:0.14396
[19]	validation_0-merror:0.14396
[20]	validation_0-merror:0.14267
[21]	validation_0-merror:0.14300
[22]	validation_0-merror:0.14203
[23]	validation_0-merror:0.14010
[24]	validation_0-merror:0.14106
[25]	validation_0-merror:0.13978
[26]	validation_0-merror:0.13688
[27]	validation_0-merror:0.13559
[28]	validation_0-merror:0.13559
[29]	validation_0-merror:0.13365
[30]	validation_0-me

[242]	validation_0-merror:0.10016
[243]	validation_0-merror:0.09984
[244]	validation_0-merror:0.10016
[245]	validation_0-merror:0.09984
[246]	validation_0-merror:0.10016
[247]	validation_0-merror:0.09984
[248]	validation_0-merror:0.09984
[249]	validation_0-merror:0.09984
[250]	validation_0-merror:0.09984
[251]	validation_0-merror:0.09984
[252]	validation_0-merror:0.09952
[253]	validation_0-merror:0.10016
[254]	validation_0-merror:0.10016
[255]	validation_0-merror:0.09952
[256]	validation_0-merror:0.09984
[257]	validation_0-merror:0.09920
[258]	validation_0-merror:0.09952
[259]	validation_0-merror:0.09952
[260]	validation_0-merror:0.09887
[261]	validation_0-merror:0.09952
[262]	validation_0-merror:0.09920
[263]	validation_0-merror:0.09791
[264]	validation_0-merror:0.09759
[265]	validation_0-merror:0.09759
[266]	validation_0-merror:0.09791
[267]	validation_0-merror:0.09791
[268]	validation_0-merror:0.09726
[269]	validation_0-merror:0.09726
[270]	validation_0-merror:0.09759
[271]	validati

In [34]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8676    0.7763    0.8194      2043
quasar-like radio AGN / high-excitation radio galaxy     0.6154    0.3380    0.4364       213
                                     radio-quiet AGN     0.7683    0.6606    0.7104      1099
                                 star-forming galaxy     0.9148    0.9617    0.9377      9062

                                            accuracy                         0.8939     12417
                                           macro avg     0.7915    0.6842    0.7260     12417
                                        weighted avg     0.8889    0.8939    0.8895     12417



# Now try rescaling to MIGHTEE size

In [68]:
LERG = y[y['Classification']==0].sample(924).index
HERG = y[y['Classification']==1].sample(246).index
RQ = y[y['Classification']==2].sample(410).index
SFG = y[y['Classification']==3].sample(2790).index

all_index = LERG.append([HERG, RQ, SFG])

In [69]:
small_y = y.iloc[all_index]
small_X = X.iloc[all_index]

In [70]:
len(small_y)

4370

In [74]:
X_train, X_test, y_train, y_test = train_test_split(small_X, small_y['Classification'], train_size=0.8, stratify=small_y['Classification'], random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=0.8, stratify=y_test, random_state=42)

In [104]:
model = XGBClassifier(use_label_encoder=False, 
                      max_depth=6, 
                      #reg_alpha=1,   
                      #min_child_weight=0.5, 
                      reg_lambda=5,
                      #subsample=0.5,
                      eta=0.05, # Remember to reduce to 0.1 or 0.05 for better results
                      tree_method='hist', # exact is more precies, but this is much faster
                      gpu_id=1,
                      objective='multi:softprob',
                      eval_metric =['merror'],
                      nthread=8,
                      n_estimators=500,
                      )

In [105]:
bst = model.fit(X_train, y_train, 
                eval_set=[
                    (X_val, y_val),
                ], 
                early_stopping_rounds=20,
                verbose=True, 
                #sample_weight=classes_weights,
               ) #

[0]	validation_0-merror:0.19429
[1]	validation_0-merror:0.18857
[2]	validation_0-merror:0.17143
[3]	validation_0-merror:0.17714
[4]	validation_0-merror:0.16000
[5]	validation_0-merror:0.17143
[6]	validation_0-merror:0.17143
[7]	validation_0-merror:0.17143
[8]	validation_0-merror:0.16571
[9]	validation_0-merror:0.17143
[10]	validation_0-merror:0.17143
[11]	validation_0-merror:0.17714
[12]	validation_0-merror:0.17143
[13]	validation_0-merror:0.17143
[14]	validation_0-merror:0.17143
[15]	validation_0-merror:0.17143
[16]	validation_0-merror:0.16571
[17]	validation_0-merror:0.17143
[18]	validation_0-merror:0.16000
[19]	validation_0-merror:0.16000
[20]	validation_0-merror:0.16000
[21]	validation_0-merror:0.16571
[22]	validation_0-merror:0.16000
[23]	validation_0-merror:0.14857
[24]	validation_0-merror:0.14857
[25]	validation_0-merror:0.15429
[26]	validation_0-merror:0.15429
[27]	validation_0-merror:0.15429
[28]	validation_0-merror:0.15429
[29]	validation_0-merror:0.15429
[30]	validation_0-me

In [106]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8346    0.7500    0.7900       148
quasar-like radio AGN / high-excitation radio galaxy     0.6957    0.4103    0.5161        39
                                     radio-quiet AGN     0.6905    0.4394    0.5370        66
                                 star-forming galaxy     0.8503    0.9552    0.8997       446

                                            accuracy                         0.8326       699
                                           macro avg     0.7678    0.6387    0.6857       699
                                        weighted avg     0.8233    0.8326    0.8208       699

