In [318]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

import matplotlib.pyplot as plt

from xgboost import XGBClassifier

import lightgbm as lgb

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#from imblearn.over_sampling import RandomOverSampler, SMOTE 

# Necessary to get names out
SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                       self.feature_names_in_)

In [319]:
features_num = [
    'Total_flux', 'Peak_flux', 'EBV', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'z_Subaru_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST', 'Mass_median', 'Mass_l68', 'Mass_u68'
]
    

features_cat = [
   "S_Code"
]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [320]:
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/Bootes_preprocessed.csv")

In [321]:
data.columns

Index(['Source_Name', 'Total_flux', 'Peak_flux', 'S_Code', 'EBV',
       'FUV_flux_corr', 'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr',
       'R_flux_corr', 'I_flux_corr', 'z_flux_corr', 'z_Subaru_flux_corr',
       'y_flux_corr', 'J_flux_corr', 'H_flux_corr', 'K_flux_corr',
       'Ks_flux_corr', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250',
       'F_SPIRE_350', 'F_SPIRE_500', 'Z_BEST', 'Mass_median', 'Mass_l68',
       'Mass_u68', 'Classification'],
      dtype='object')

In [322]:
#data = data.drop(columns="FUV_flux_corr")

In [323]:
data.isna().sum()

Source_Name              0
Total_flux               0
Peak_flux                0
S_Code                   0
EBV                      0
FUV_flux_corr            0
NUV_flux_corr            0
u_flux_corr              0
Bw_flux_corr             0
R_flux_corr              0
I_flux_corr              0
z_flux_corr              0
z_Subaru_flux_corr       0
y_flux_corr              0
J_flux_corr              0
H_flux_corr              0
K_flux_corr              0
Ks_flux_corr             0
ch1_flux_corr            0
ch2_flux_corr            0
ch3_flux_corr            0
ch4_flux_corr            0
F_MIPS_24                0
F_PACS_100               0
F_PACS_160               0
F_SPIRE_250              0
F_SPIRE_350              0
F_SPIRE_500              0
Z_BEST                   0
Mass_median           7510
Mass_l68              7654
Mass_u68              7671
Classification           0
dtype: int64

In [324]:
#data = data.drop(columns="FUV_flux_corr").dropa()

In [325]:
# Creating the pipeline
transformer_num = make_pipeline(
    SimpleImputer(strategy="mean"),  # there are a few missing values
    StandardScaler(),
)
transformer_cat = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse=False, handle_unknown='ignore')
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

In [326]:
# Dropping required columns
X = data
y = data[y_column]

In [327]:
# Transforming y to labels
le = preprocessing.LabelEncoder()
labels = np.unique(y.astype(str))
y = le.fit_transform(y)

# Creating training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [328]:
# Transforming the data, data that isn't in any of the pipelines is dropped automatically
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [329]:
clf = RandomForestClassifier(n_estimators=500, class_weight="balanced")
clf.fit(X_train, y_train)

In [330]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.8392    0.5976    0.6981       594
quasar-like radio AGN / high-excitation radio galaxy     0.5000    0.1026    0.1702        78
                                     radio-quiet AGN     0.7549    0.4835    0.5895       395
                                 star-forming galaxy     0.8385    0.9653    0.8974      2479

                                            accuracy                         0.8311      3546
                                           macro avg     0.7332    0.5373    0.5888      3546
                                        weighted avg     0.8219    0.8311    0.8137      3546



In [331]:
lightbm = lgb.LGBMClassifier(num_class=4, 
                             objective="multiclass",  
                             n_estimators=100000, 
                             class_weight="balanced",
                             max_depth=8, 
                             num_leaves=32, 
                             min_child_samples=5,
                             reg_alpha=1,
                             reg_lambda=2)

In [332]:
lightbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(stopping_rounds=20)])

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[256]	valid_0's multi_logloss: 0.448806


In [333]:
y_pred = lightbm.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels, digits=4))

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.7361    0.7795    0.7572       594
quasar-like radio AGN / high-excitation radio galaxy     0.5152    0.4359    0.4722        78
                                     radio-quiet AGN     0.6145    0.7063    0.6572       395
                                 star-forming galaxy     0.9086    0.8786    0.8934      2479

                                            accuracy                         0.8331      3546
                                           macro avg     0.6936    0.7001    0.6950      3546
                                        weighted avg     0.8383    0.8331    0.8350      3546

