In [100]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb

## Importing data

In [87]:
features_num = [
    'Total_flux', 'Peak_flux', 'FUV_flux_corr', 'NUV_flux_corr', 'g_flux_corr', 'r_flux_corr', 'i_flux_corr',
    'z_flux_corr', 'y_flux_corr', 'g_hsc_flux_corr', 'r_hsc_flux_corr', 'i_hsc_flux_corr', 'z_hsc_flux_corr',
    'y_hsc_flux_corr', 'nb921_hsc_flux_corr', 'J_flux_corr', 'K_flux_corr', 'ch1_swire_flux_corr', 
    'ch2_swire_flux_corr', 'ch3_swire_flux_corr', 'ch4_swire_flux_corr', 'ch1_servs_flux_corr', 'ch2_servs_flux_corr',
    'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500', 'Z_BEST', 'EBV'
    ]
statmorph_features = ['xc_centroid', 'yc_centroid', 'ellipticity_centroid',
       'elongation_centroid', 'orientation_centroid', 'xc_asymmetry',
       'yc_asymmetry', 'ellipticity_asymmetry', 'elongation_asymmetry',
       'orientation_asymmetry', 'r20', 'r50', 'r80', 'rhalf_circ',
       'rhalf_ellip', 'rmax_circ', 'rmax_ellip', 'rpetro_circ', 'rpetro_ellip',
       'C', 'A', 'S', 'M20', 'Gini', 'F(G,M20)', 'S(G,M20)', 'deviation',
       'shape_asymmetry', 'outer_asymmetry', 'multimode', 'sn_per_pixel',
       'flux_circ', 'flux_ellip', 'intensity', 'sersic_xc', 'sersic_yc',
       'sersic_amplitude', 'sersic_ellip', 'sersic_n', 'sersic_rhalf',
       'sersic_theta', 'sky_mean', 'sky_median', 'sky_sigma', 'nx_stamp',
       'ny_stamp', 'xmax_stamp', 'xmin_stamp', 'ymax_stamp', 'ymin_stamp',
       'flag', 'flag_sersic']
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [90]:
# Loading data, Only selecting ELais-N1, since we do not have statmorph data on the others
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/Elais-N1.csv")

# Loading statmorph data
statmorph_data = pd.read_csv("../../../General/Fangyou/HSC_images/statmorph.dat", sep=' ')
statmorph_data.ID = statmorph_data.ID.str[0:-5]

# Only selecting ids where we have statmorph data for
data = data[data['Source_Name'].isin(statmorph_data.ID)]

In [94]:
# Normal data without statmorph features
X_normal = data[features_num]
y = data[y_column]

# Label encoding y
le = LabelEncoder()
labels = np.unique(y.astype(str))
y = le.fit_transform(y)

X_normal_train, X_normal_test, y_train, y_test =\
    train_test_split(X_normal, y, train_size=0.75, stratify=y, random_state=42)

In [81]:
combined_data = data.merge(statmorph_data, left_on='Source_Name', right_on='ID')

Unnamed: 0,Source_Name,RA,DEC,Total_flux,E_Total_flux,Peak_flux,E_Peak_flux,S_Code,EBV,FUV_flux_corr,...,sky_median,sky_sigma,nx_stamp,ny_stamp,xmax_stamp,xmin_stamp,ymax_stamp,ymin_stamp,flag,flag_sersic
0,ILTJ155957.58+550052.4,239.989902,55.014560,395.834521,0.000054,324.796395,0.000027,S,0.008792,,...,0.031428,0.414262,48.0,48.0,53.0,6.0,59.0,12.0,1.0,1.0
1,ILTJ155958.25+550105.3,239.992724,55.018160,735.574774,0.000059,557.217365,0.000028,S,0.008780,0.144002,...,-0.014147,0.099554,47.0,48.0,46.0,0.0,52.0,5.0,1.0,1.0
2,ILTJ155958.68+550534.6,239.994484,55.092971,196.988095,0.000054,145.972469,0.000024,S,0.008420,1.259824,...,-0.020424,0.087857,48.0,48.0,59.0,12.0,53.0,6.0,0.0,0.0
3,ILTJ160000.65+550723.3,240.002697,55.123141,195.907800,0.000056,140.977168,0.000025,S,0.008466,0.346457,...,0.001740,0.085209,48.0,48.0,53.0,6.0,54.0,7.0,1.0,1.0
4,ILTJ160001.30+550713.3,240.005426,55.120372,170.575808,0.000043,168.803311,0.000025,S,0.008482,0.043996,...,-0.014750,0.092218,48.0,48.0,50.0,3.0,53.0,6.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20150,ILTJ162211.32+550612.5,245.547149,55.103485,480.598455,0.000076,304.736496,0.000032,S,0.008023,0.699567,...,-0.123571,0.109558,59.0,58.0,60.0,2.0,59.0,2.0,1.0,0.0
20151,ILTJ162213.96+550907.3,245.558179,55.152042,582.153970,0.000051,499.840837,0.000027,S,0.008016,3.220747,...,-0.046317,0.139436,61.0,61.0,60.0,0.0,60.0,0.0,0.0,0.0
20152,ILTJ162214.42+550646.2,245.560101,55.112845,1603.159656,0.000082,863.365329,0.000030,M,0.007905,3.183800,...,-2.782889,0.241846,61.0,60.0,60.0,0.0,59.0,0.0,1.0,1.0
20153,ILTJ162218.30+545744.2,245.576261,54.962279,153.220502,0.000049,130.011489,0.000024,S,0.008061,0.603581,...,0.000016,0.130863,48.0,48.0,56.0,9.0,54.0,7.0,1.0,1.0


# Rnning model

In [95]:
model = XGBClassifier(use_label_encoder=False, 
                      max_depth=7, 
                      #reg_alpha=1,   
                      #min_child_weight=0.5, 
                      #reg_lambda=5,
                      #subsample=0.5,
                      eta=0.1, # Remember to reduce to 0.1 or 0.05 for better results
                      tree_method='gpu_hist', # exact is more precies, but this is much faster
                      gpu_id=0,
                      objective='multi:softprob',
                      eval_metric =['merror', 'mlogloss'],
                      nthread=8,
                      n_estimators=500,
                      )

In [96]:
bst = model.fit(X_normal_train, y_train, 
                eval_set=[(X_normal_train, y_train)], 
                #early_stopping_rounds=50,
                verbose=True, 
                #eval_metric=xgb_recall
                #sample_weight=classes_weights,
               ) # Maybe remove weight

  from pandas import MultiIndex, Int64Index


[0]	validation_0-merror:0.08541	validation_0-mlogloss:1.23317
[1]	validation_0-merror:0.07978	validation_0-mlogloss:1.10791
[2]	validation_0-merror:0.07800	validation_0-mlogloss:1.00248
[3]	validation_0-merror:0.07581	validation_0-mlogloss:0.91230
[4]	validation_0-merror:0.07390	validation_0-mlogloss:0.83450
[5]	validation_0-merror:0.07211	validation_0-mlogloss:0.76569
[6]	validation_0-merror:0.06933	validation_0-mlogloss:0.70509
[7]	validation_0-merror:0.06781	validation_0-mlogloss:0.65129
[8]	validation_0-merror:0.06576	validation_0-mlogloss:0.60374
[9]	validation_0-merror:0.06424	validation_0-mlogloss:0.56113
[10]	validation_0-merror:0.06285	validation_0-mlogloss:0.52311
[11]	validation_0-merror:0.06133	validation_0-mlogloss:0.48877
[12]	validation_0-merror:0.05967	validation_0-mlogloss:0.45728
[13]	validation_0-merror:0.05868	validation_0-mlogloss:0.42916
[14]	validation_0-merror:0.05749	validation_0-mlogloss:0.40372
[15]	validation_0-merror:0.05597	validation_0-mlogloss:0.38069
[1

[130]	validation_0-merror:0.00093	validation_0-mlogloss:0.03227
[131]	validation_0-merror:0.00093	validation_0-mlogloss:0.03191
[132]	validation_0-merror:0.00079	validation_0-mlogloss:0.03149
[133]	validation_0-merror:0.00073	validation_0-mlogloss:0.03098
[134]	validation_0-merror:0.00066	validation_0-mlogloss:0.03072
[135]	validation_0-merror:0.00066	validation_0-mlogloss:0.03031
[136]	validation_0-merror:0.00066	validation_0-mlogloss:0.03004
[137]	validation_0-merror:0.00066	validation_0-mlogloss:0.02971
[138]	validation_0-merror:0.00060	validation_0-mlogloss:0.02941
[139]	validation_0-merror:0.00053	validation_0-mlogloss:0.02909
[140]	validation_0-merror:0.00053	validation_0-mlogloss:0.02876
[141]	validation_0-merror:0.00053	validation_0-mlogloss:0.02844
[142]	validation_0-merror:0.00053	validation_0-mlogloss:0.02821
[143]	validation_0-merror:0.00053	validation_0-mlogloss:0.02796
[144]	validation_0-merror:0.00053	validation_0-mlogloss:0.02764
[145]	validation_0-merror:0.00053	valida

[259]	validation_0-merror:0.00000	validation_0-mlogloss:0.01061
[260]	validation_0-merror:0.00000	validation_0-mlogloss:0.01055
[261]	validation_0-merror:0.00000	validation_0-mlogloss:0.01048
[262]	validation_0-merror:0.00000	validation_0-mlogloss:0.01041
[263]	validation_0-merror:0.00000	validation_0-mlogloss:0.01034
[264]	validation_0-merror:0.00000	validation_0-mlogloss:0.01029
[265]	validation_0-merror:0.00000	validation_0-mlogloss:0.01022
[266]	validation_0-merror:0.00000	validation_0-mlogloss:0.01013
[267]	validation_0-merror:0.00000	validation_0-mlogloss:0.01004
[268]	validation_0-merror:0.00000	validation_0-mlogloss:0.00996
[269]	validation_0-merror:0.00000	validation_0-mlogloss:0.00990
[270]	validation_0-merror:0.00000	validation_0-mlogloss:0.00983
[271]	validation_0-merror:0.00000	validation_0-mlogloss:0.00978
[272]	validation_0-merror:0.00000	validation_0-mlogloss:0.00973
[273]	validation_0-merror:0.00000	validation_0-mlogloss:0.00967
[274]	validation_0-merror:0.00000	valida

[388]	validation_0-merror:0.00000	validation_0-mlogloss:0.00520
[389]	validation_0-merror:0.00000	validation_0-mlogloss:0.00518
[390]	validation_0-merror:0.00000	validation_0-mlogloss:0.00516
[391]	validation_0-merror:0.00000	validation_0-mlogloss:0.00514
[392]	validation_0-merror:0.00000	validation_0-mlogloss:0.00512
[393]	validation_0-merror:0.00000	validation_0-mlogloss:0.00510
[394]	validation_0-merror:0.00000	validation_0-mlogloss:0.00509
[395]	validation_0-merror:0.00000	validation_0-mlogloss:0.00507
[396]	validation_0-merror:0.00000	validation_0-mlogloss:0.00505
[397]	validation_0-merror:0.00000	validation_0-mlogloss:0.00502
[398]	validation_0-merror:0.00000	validation_0-mlogloss:0.00500
[399]	validation_0-merror:0.00000	validation_0-mlogloss:0.00498
[400]	validation_0-merror:0.00000	validation_0-mlogloss:0.00497
[401]	validation_0-merror:0.00000	validation_0-mlogloss:0.00495
[402]	validation_0-merror:0.00000	validation_0-mlogloss:0.00494
[403]	validation_0-merror:0.00000	valida

In [97]:
y_pred = model.predict(X_normal_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8518    0.7077    0.7731       609
quasar-like radio AGN / high-excitation radio galaxy     0.5769    0.2381    0.3371        63
                                     radio-quiet AGN     0.8088    0.7314    0.7682       376
                                 star-forming galaxy     0.9345    0.9757    0.9546      3991

                                            accuracy                         0.9159      5039
                                           macro avg     0.7930    0.6632    0.7082      5039
                                        weighted avg     0.9106    0.9159    0.9111      5039



In [104]:
import xgboost

# Putting the train data into a DMatrix
data = xgboost.DMatrix(X_normal_train, y_train, nthread=4)

parameters = {'num_class':4,
             'max_depth': 7, 
             'eta': 0.1,
             'tree_method':'hist',
              #'gpu_id':0,
             'nthread': 8,
             'objective':'multi:softprob',   
              'eval_metric':'merror'
             }

bst = xgboost.cv(dtrain=data, params=parameters, 
           num_boost_round=10**4, stratified=True, 
           nfold=16, early_stopping_rounds=50,
           verbose_eval=True, metrics=['merror'])

  from pandas import MultiIndex, Int64Index


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\jespe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\jespe\AppData\Local\Temp\ipykernel_17076\577583340.py", line 16, in <cell line: 16>
    bst = xgboost.cv(dtrain=data, params=parameters,
  File "C:\Users\jespe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\training.py", line 487, in cv
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
  File "C:\Users\jespe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\training.py", line 379, in mknfold
    dtrain = dall.slice(in_idset[k])
  File "C:\Users\jespe\AppData\Local\Packages\PythonSoftwareFoundation.Pytho