In [None]:
from astroquery.sdss import SDSS
from astropy import coordinates as coords
import astropy.units as u
import pandas as pd
import os
from astropy.io import fits
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
# Spectroscopic classes sampled from SDSS; each one gets its own query so the
# three classes end up with the same number of rows.
SDSS_CLASSES = ("STAR", "GALAXY", "QSO")


def build_class_query(object_class, n_rows=20000, g_min=14, g_max=20):
    """Build the SkyServer SQL that samples one spectroscopic class.

    The query joins SpecObj (class label, redshift) with PhotoObj (position
    and u/g/r/i/z magnitudes) and keeps objects whose g magnitude lies in
    [g_min, g_max].

    Parameters
    ----------
    object_class : str
        One of SDSS_CLASSES.
    n_rows : int
        Maximum number of rows to request (SQL ``TOP``).
    g_min, g_max : float
        Inclusive bounds applied to ``p.g``.

    Returns
    -------
    str
        The SQL text to pass to ``SDSS.query_sql``.

    Raises
    ------
    ValueError
        If ``object_class`` is not one of SDSS_CLASSES. The value is
        interpolated into the SQL text, so only known literals are accepted.
    """
    if object_class not in SDSS_CLASSES:
        raise ValueError(
            f"Unknown class {object_class!r}; expected one of {SDSS_CLASSES}"
        )
    # NOTE: there is no ORDER BY, so which rows TOP returns is left to the
    # server and the sample is not guaranteed to be identical between runs.
    return f"""
SELECT TOP {int(n_rows)}
    s.specobjid,
    s.class,
    p.objid,
    p.ra,
    p.dec,
    p.u, p.g, p.r, p.i, p.z,
    s.z AS redshift
FROM SpecObj AS s
JOIN PhotoObj AS p ON s.bestobjid = p.objid
WHERE s.class = '{object_class}'
    AND p.g BETWEEN {g_min} AND {g_max}
"""


def run_sdss_query(sql):
    """Run ``sql`` against SDSS and return the result as a pandas DataFrame.

    Raises
    ------
    RuntimeError
        If the service returns no table, instead of failing later with an
        AttributeError on ``None.to_pandas()``.
    """
    table = SDSS.query_sql(sql)
    if table is None:
        raise RuntimeError(f"SDSS query returned no rows:\n{sql}")
    return table.to_pandas()


query_stars = build_class_query("STAR")
stars = run_sdss_query(query_stars)

query_galaxies = build_class_query("GALAXY")
galaxies = run_sdss_query(query_galaxies)

query_qsos = build_class_query("QSO")
qsos = run_sdss_query(query_qsos)

# Stack the three per-class samples into one frame with a fresh 0..N-1 index.
df = pd.concat([stars, galaxies, qsos], ignore_index=True)
df



Unnamed: 0,specobjid,class,objid,ra,dec,u,g,r,i,z,redshift
0,1417553661621463040,STAR,1237646381542606068,94.503047,0.475851,18.96574,17.49392,16.80027,16.58460,16.32055,0.000022
1,1417555036010997760,STAR,1237646381542606175,94.513355,0.566979,18.87810,17.18946,16.36462,16.03709,15.80739,0.000102
2,1417555585766811648,STAR,1237646381542670881,94.597311,0.500819,18.80745,17.31809,17.08153,17.01195,16.92199,0.000252
3,1417550912842393600,STAR,1237646381542671165,94.641890,0.453088,20.07667,18.26777,17.52306,17.19679,16.90737,-0.000049
4,1417551462598207488,STAR,1237646381542671295,94.662293,0.559813,17.99740,16.56571,16.34072,16.27827,16.17252,0.000177
...,...,...,...,...,...,...,...,...,...,...,...
59995,528170508584249344,QSO,1237654604240847044,134.635981,2.291095,19.50961,19.37271,19.30165,19.18989,19.02949,1.927736
59996,4296661731869218816,QSO,1237654604240978249,135.023253,2.284548,20.30885,19.77832,19.33124,19.28784,19.16355,0.983246
59997,528203219055175680,QSO,1237654604240978399,134.969922,2.376171,20.20005,19.91301,19.73539,19.47443,19.55004,1.460657
59998,528190299793549312,QSO,1237654604240978421,135.026127,2.260372,20.35191,19.93885,19.91948,19.63856,19.40280,0.616836


In [None]:
# Encode the string labels into integers WITHOUT overwriting df["class"].
# Mutating df in place made this cell non-idempotent: a second run would fit
# the encoder on integers and le.classes_ would no longer hold the class names.
le = LabelEncoder()
y = pd.Series(
    le.fit_transform(df["class"]), index=df.index, name="class"
)  # LabelEncoder sorts its classes: 0 -> GALAXY, 1 -> QSO, 2 -> STAR

# Integer ids and readable names, in matching order, for the reports below.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

# Features: the five photometric magnitudes plus redshift.
# NOTE(review): redshift is read from the same SpecObj row that supplies the
# class label — confirm it would be available wherever this model is applied.
X = df[['u', 'g', 'r', 'i', 'z', 'redshift']]

# Stratified 80/20 split so every class keeps the same share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline classifier with library defaults. random_state is fixed so that
# this model, and any clone of it that enables row/column subsampling, gives
# repeatable results. num_class follows the encoder instead of a literal 3.
model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(class_ids),
    random_state=42
)

model.fit(X_train, y_train)
preds = model.predict(X_test)

# Report with class names rather than the opaque 0/1/2 ids; rows/columns of
# the confusion matrix follow the order of class_names.
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(
    y_test, preds, labels=class_ids, target_names=class_names
))
print(confusion_matrix(y_test, preds, labels=class_ids))

Accuracy: 0.9868333333333333
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4000
           1       0.99      0.98      0.98      4000
           2       0.99      1.00      1.00      4000

    accuracy                           0.99     12000
   macro avg       0.99      0.99      0.99     12000
weighted avg       0.99      0.99      0.99     12000

[[3938   40   22]
 [  90 3909    1]
 [   5    0 3995]]


In [20]:
# Candidate values for each XGBoost hyper-parameter explored by the search.
params = dict(
    n_estimators=[200, 400, 600],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    min_child_weight=[1, 3, 5],
    reg_alpha=[0, 0.001, 0.01],
    reg_lambda=[0.5, 1.0, 2.0],
)

# Randomized search over the grid above: 40 sampled settings, each scored by
# 5-fold cross-validated accuracy on the training split, using all CPU cores.
random = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=40,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random.fit(X_train, y_train)

# Score the best configuration found on the held-out test split.
best_model = random.best_estimator_
preds_tuned = best_model.predict(X_test)
acc_tuned = accuracy_score(y_test, preds_tuned)

print("Accuracy after tunning:", acc_tuned)
tuned_report = classification_report(y_test, preds_tuned)
print(tuned_report)
tuned_confusion = confusion_matrix(y_test, preds_tuned)
print(tuned_confusion)

Accuracy after tunning: 0.9876666666666667
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4000
           1       0.99      0.98      0.98      4000
           2       0.99      1.00      1.00      4000

    accuracy                           0.99     12000
   macro avg       0.99      0.99      0.99     12000
weighted avg       0.99      0.99      0.99     12000

[[3939   39   22]
 [  83 3916    1]
 [   3    0 3997]]
