## 별, 은하, 퀘이사 구별
출처 : https://www.kaggle.com/datasets/lucidlenn/sloan-digital-sky-survey


|종류|설명|
|------|---|
|별(star)|스스로 빛과 열을 내는 우주상의 천체|
|은하(galaxy)|천구 위에 구름 띠 모양으로 길게 분포되어 있는 수많은 천체의 무리.|
|퀘이사(quasar)|블랙홀이 주변 물질을 집어삼키는 에너지에 의해 형성되는 거대 발광체|


<img src="./image/sirius_star.webp" alt="drawing" width="200"/>
<img src="./image/galaxy.jpg" alt="drawing" width="240"/>
<img src="./image/quasar.jpg" alt="drawing" width="265"/>


In [2]:
# 모듈 로딩
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV    
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Suppress warning messages for the rest of the session
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the Sloan Digital Sky Survey export into a DataFrame and preview the first 10 rows
df = pd.read_csv("Skyserver_SQL2_27_2018 6_51_39_PM.csv")
df.head(10)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.59837,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512
5,1.23765e+18,183.847174,0.173694,19.43133,18.46779,18.16451,18.01475,18.04155,752,301,4,269,3.64955e+17,STAR,0.000315,324,51666,594
6,1.23765e+18,183.864379,0.019201,19.38322,17.88995,17.10537,16.66393,16.36955,752,301,4,269,3.23287e+17,GALAXY,0.100242,287,52023,559
7,1.23765e+18,183.900081,0.187473,18.97993,17.84496,17.38022,17.20673,17.07071,752,301,4,269,3.72237e+18,STAR,0.000315,3306,54922,515
8,1.23765e+18,183.924588,0.097246,17.90616,16.97172,16.67541,16.53776,16.47596,752,301,4,270,3.63829e+17,STAR,8.9e-05,323,51615,595
9,1.23765e+18,183.973498,0.081626,18.67249,17.71375,17.49362,17.28284,17.22644,752,301,4,270,3.24369e+17,GALAXY,0.040508,288,52000,400


In [4]:
# List the column names of the loaded frame
df.columns

Index(['objid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run', 'rerun', 'camcol',
       'field', 'specobjid', 'class', 'redshift', 'plate', 'mjd', 'fiberid'],
      dtype='object')

## 전처리
- 17개의 수치형 컬럼 + 1개의 target 컬럼(class 컬럼)


|컬럼|설명|컬럼|설명|
|------|---|---|---|
|objid|Object Identifier|rerun|Rerun Number to specify how the image was processed|
|ra, dec|astronomical coordinates|camcol|Camera column to identify the scanline within the run|
|u|Ultraviolet filter in the photometric system|field|Field number to identify each field|
|g|Green filter in the photometric system|specobjid|Spectroscopic Object Identifier|
|r|Red filter in the photometric system |redshift|Final Redshift(적색편이)|
|i|Near Infrared filter in the photometric system|plate|plate number|
|z|Infrared filter in the photometric system|mjd|Modified Julian Date (SDSS데이터 받은 날짜)|
|run|Run Number used to identify the specific scan|fiberid|SDSS 분광기의 광섬유(fiber) ID|


In [5]:
# Inspect the target column: distinct labels and the number of rows per label
print(f"class 컬럼 : {df['class'].unique()}")
print(f"class 컬럼별 개수:\n{df['class'].value_counts()}")
# Count missing values in every column
print(f"결측값 확인 : \n{df.isnull().sum()}")

class 컬럼 : ['STAR' 'GALAXY' 'QSO']
class 컬럼별 개수:
GALAXY    4998
STAR      4152
QSO        850
Name: class, dtype: int64
결측값 확인 : 
objid        0
ra           0
dec          0
u            0
g            0
r            0
i            0
z            0
run          0
rerun        0
camcol       0
field        0
specobjid    0
class        0
redshift     0
plate        0
mjd          0
fiberid      0
dtype: int64


In [6]:
# Check for duplicate rows (none found).
# NOTE: the result is not assigned back, so `df` itself is left unchanged;
# this cell only displays the de-duplicated frame.
df.drop_duplicates()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.237650e+18,183.531326,0.089693,19.47406,17.04240,15.94699,15.50342,15.22531,752,301,4,267,3.722360e+18,STAR,-0.000009,3306,54922,491
1,1.237650e+18,183.598370,0.135285,18.66280,17.21449,16.67637,16.48922,16.39150,752,301,4,267,3.638140e+17,STAR,-0.000055,323,51615,541
2,1.237650e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.232740e+17,GALAXY,0.123111,287,52023,513
3,1.237650e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.722370e+18,STAR,-0.000111,3306,54922,510
4,1.237650e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.722370e+18,STAR,0.000590,3306,54922,512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.237650e+18,131.316413,51.539547,18.81777,17.47053,16.91508,16.68305,16.50570,1345,301,3,161,5.033450e+17,GALAXY,0.027583,447,51877,246
9996,1.237650e+18,131.306083,51.671341,18.27255,17.43849,17.07692,16.71661,16.69897,1345,301,3,162,5.033400e+17,GALAXY,0.117772,447,51877,228
9997,1.237650e+18,131.552562,51.666986,18.75818,17.77784,17.51872,17.43302,17.42048,1345,301,3,162,8.222620e+18,STAR,-0.000402,7303,57013,622
9998,1.237650e+18,131.477151,51.753068,18.88287,17.91068,17.53152,17.36284,17.13988,1345,301,3,163,5.033400e+17,GALAXY,0.014019,447,51877,229


In [7]:
# Remove the 'objid' and 'rerun' columns from df in place.
df.drop(columns=['objid', 'rerun'], inplace=True)

In [8]:
# Feature columns = every remaining column except the target.  Selecting by
# name (instead of deleting a hard-coded position) keeps this correct even if
# the column order or the set of dropped columns changes.
col_list = [col for col in df.columns if col != 'class']

In [9]:
# Feature matrix: the columns named in col_list, as a NumPy array
data = df[col_list].to_numpy()
# Target vector: the raw 'class' labels, as a NumPy array
target = df['class'].to_numpy()

In [10]:
# Encode the string class labels as integer codes.
# NOTE(review): codes presumably follow sorted label order (GALAXY=0, QSO=1,
# STAR=2), which the hard-coded target_names in modeling() rely on --
# confirm via label.classes_.
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
target = label.fit_transform(target)

In [11]:
from sklearn.model_selection import train_test_split

# Stratified split: 20% of the rows are held out as the final evaluation set
# (predict_*), the remaining 80% are used for training (train_*).
train_data, predict_data, train_target, predict_target = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

### 표준화 작업

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the hold-out split.
st = StandardScaler()
train_scaled = st.fit_transform(train_data)
predict_scaled = st.transform(predict_data)

In [13]:
# Split the scaled training set once more (stratified) into a train / validation
# pair used for the estimator screening below.  No test_size is passed, so the
# train_test_split default proportion applies.
train2_data, test_data, train2_target, test_target = train_test_split(train_scaled, train_target, random_state=42, stratify=train_target)

In [14]:
def get_all_estimators(train_data, test_data, train_target, test_target, type_filter='classifier'):
    """Fit every scikit-learn estimator of one kind with defaults and score it.

    Each estimator class returned by ``sklearn.utils.all_estimators`` is
    instantiated without arguments, fitted on the training split and scored
    on both splits.  Estimators that cannot be built, fitted or scored this
    way are skipped, so the screening is best-effort.

    Args:
        train_data: Feature matrix used for fitting.
        test_data: Feature matrix used for evaluation.
        train_target: Labels belonging to ``train_data``.
        test_target: Labels belonging to ``test_data``.
        type_filter: Estimator kind forwarded to ``all_estimators``.

    Returns:
        DataFrame with the columns ``name``, ``train`` and ``test``; one row
        per estimator that succeeded, scores rounded to 4 decimals.
    """
    from sklearn.utils import all_estimators
    import warnings
    # Process-wide filter: warnings stay silenced after this function returns.
    warnings.filterwarnings('ignore')

    scores = []
    for name, model in all_estimators(type_filter=type_filter):
        try:
            md = model()
            # Train
            md.fit(train_data, train_target)
            # Evaluate on both splits
            train_score = np.round(md.score(train_data, train_target), 4)
            test_score = np.round(md.score(test_data, test_target), 4)
        except Exception:
            # Deliberately best-effort: skip estimators that fail with default
            # construction.  Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit still abort this long loop.
            continue
        scores.append([name, train_score, test_score])

    return pd.DataFrame(scores, columns=['name', 'train', 'test'])

### 7500개 시간이 너무 오래 소요 -> 1800개 정도로 데이터 샘플을 만든 후 수행

In [15]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression (max_iter raised to 10000) fitted on the
# screening split and scored on both the screening-train and validation data.
lg = LogisticRegression(max_iter=10000)
lg.fit(train2_data, train2_target)
print(lg.score(train2_data, train2_target))
print(lg.score(test_data, test_target))

0.9743333333333334
0.9785


In [16]:
# Screen all classifier-type estimators on the screening split and display the score table
scores = get_all_estimators(train2_data, test_data, train2_target, test_target, type_filter='classifier')
scores

Unnamed: 0,name,train,test
0,AdaBoostClassifier,0.877,0.886
1,BaggingClassifier,0.9985,0.991
2,BernoulliNB,0.8623,0.86
3,CalibratedClassifierCV,0.9728,0.977
4,DecisionTreeClassifier,1.0,0.9875
5,DummyClassifier,0.4997,0.5
6,ExtraTreeClassifier,1.0,0.9
7,ExtraTreesClassifier,1.0,0.976
8,GaussianNB,0.935,0.943
9,GaussianProcessClassifier,0.9457,0.9245


In [17]:
# Top 10 models ranked by test score (descending)
scores.sort_values(['test'], ascending=False).head(10)

Unnamed: 0,name,train,test
18,LogisticRegressionCV,0.9907,0.996
11,HistGradientBoostingClassifier,1.0,0.9915
1,BaggingClassifier,0.9985,0.991
24,RandomForestClassifier,1.0,0.9905
10,GradientBoostingClassifier,0.9992,0.99
19,MLPClassifier,0.9905,0.989
4,DecisionTreeClassifier,1.0,0.9875
16,LinearSVC,0.974,0.979
17,LogisticRegression,0.9743,0.978
3,CalibratedClassifierCV,0.9728,0.977


### 괜찮은 모델링 3개정도(시간없으면 2개) 찾아서 교차검증 + 그리드서치

In [38]:
def modeling(model):
    """Train a default-configured estimator and report its hold-out performance.

    ``model`` is an estimator class (not an instance).  It is instantiated
    with no arguments, fitted on the module-level ``train_scaled`` /
    ``train_target`` and evaluated on ``predict_scaled`` / ``predict_target``.
    The training score, the hold-out score and a per-class classification
    report are printed, and the fitted estimator is returned.
    """
    lr = model()
    lr.fit(train_scaled, train_target)

    print(f'train score: {lr.score(train_scaled, train_target)}')
    print(f'test score: {lr.score(predict_scaled, predict_target)}')

    # Display names for the report rows, in the order of the encoded labels.
    class_names = ['galaxy', 'quasar', 'star']
    predicted = lr.predict(predict_scaled)
    report = classification_report(predict_target, predicted, target_names=class_names)
    print(report)

    return lr

def modeling_grid(model, params):
    """Grid-search an estimator class on the scaled training data and print a summary.

    Args:
        model: Estimator class; it is instantiated with no arguments.
        params: dict mapping hyperparameter names to lists of candidate
            values, e.g.
            {'penalty' : ['l1', 'l2', 'elasticnet'],'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

    Returns:
        The fitted ``GridSearchCV`` object (3-fold CV, ``refit=True``).
    """
    lrg = model()
    grid_lrg = GridSearchCV(lrg, param_grid=params, cv=3, refit=True, return_train_score=True)
    grid_lrg.fit(train_scaled, train_target)

    # Row of cv_results_ that belongs to the winning parameter set.  Using
    # best_index_ avoids locating it through a float-equality mask on the
    # scores, which fails when no score compares equal (e.g. NaN).
    best_row = grid_lrg.best_index_

    print('best parameters : ', grid_lrg.best_params_)
    print('best train score : ', grid_lrg.cv_results_['mean_train_score'][best_row])
    print('best test score : ', grid_lrg.best_score_)
    print('best model : ' , grid_lrg.best_estimator_)
    return grid_lrg

### 1. LogisticRegressionCV (교차검증 포함)

In [27]:
# LogisticRegressionCV with default settings, evaluated on the hold-out set
lr = modeling(LogisticRegressionCV)

train score: 0.991875
test score: 0.9925
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       1.00      0.95      0.98       170
        star       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



### 1-1. LogisticRegressionCV + GridSearchCV 

In [39]:
# Grid over penalty / solver combinations for LogisticRegressionCV.
# NOTE(review): presumably not every penalty/solver pair is supported by
# scikit-learn; with warnings suppressed, failed fits would go unnoticed --
# confirm which combinations actually ran.
param = {'penalty' : ['l1', 'l2', 'elasticnet'],'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_lrg = modeling_grid(LogisticRegressionCV, param)

best parameters :  {'penalty': 'l2', 'solver': 'newton-cg'}
best train score :  0.9920624374418976
best test score :  0.9909999997187149
best model :  LogisticRegressionCV(solver='newton-cg')


In [43]:
# Per-class metrics of the grid-searched model on the hold-out set
# (no target_names passed, so classes appear as their integer codes)
print(classification_report(predict_target, grid_lrg.predict(predict_scaled)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       1.00      0.96      0.98       170
           2       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



### 2. HistGradientBoostingClassifier

In [44]:
# HistGradientBoostingClassifier with default settings, evaluated on the hold-out set
hgb = modeling(HistGradientBoostingClassifier)

train score: 1.0
test score: 0.9915
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.99      0.95      0.97       170
        star       1.00      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 2-1. HistGradientBoostingClassifier + GridSearchCV

In [46]:
# Grid over loss function and learning rate for HistGradientBoostingClassifier.
# NOTE(review): 'binary_crossentropy' presumably cannot be used with this
# 3-class target, and 'auto' is presumably a deprecated alias -- confirm
# against the installed scikit-learn version.
param = {'loss' : ['log_loss', 'auto', 'binary_crossentropy'], 'learning_rate': [0.01,0.02,0.05, 0.1, 0.2, 0.5, 1, 10]}
grid_hgb = modeling_grid(HistGradientBoostingClassifier, param)

best parameters :  {'learning_rate': 0.1, 'loss': 'log_loss'}
best train score :  1.0
best test score :  0.9897503434726324
best model :  HistGradientBoostingClassifier()


In [47]:
# Per-class metrics of the grid-searched model on the hold-out set
# (no target_names passed, so classes appear as their integer codes)
print(classification_report(predict_target, grid_hgb.predict(predict_scaled)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.95      0.97       170
           2       1.00      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 3. BaggingClassifier

In [48]:
# BaggingClassifier with default settings, evaluated on the hold-out set
bc = modeling(BaggingClassifier)

train score: 0.998625
test score: 0.99
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.98      0.93      0.95       170
        star       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 3-1. BaggingClassifier(교차검증)

In [49]:
# Cross-validate the bagging model and report the mean train / test score over the folds.
# NOTE(review): this passes the unscaled train_data, whereas modeling() fits on
# train_scaled -- confirm the difference is intended.
result = cross_validate(bc, train_data, train_target, return_train_score=True, n_jobs=-1)
print(f"train score: {np.mean(result['train_score'])}")
print(f"test score: {np.mean(result['test_score'])}")

train score: 0.9988125
test score: 0.987125


### 4. RandomForestClassifier

In [50]:
# RandomForestClassifier with default settings, evaluated on the hold-out set
rf = modeling(RandomForestClassifier)

train score: 0.999875
test score: 0.9915
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.98      0.95      0.97       170
        star       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



### 4-1. RandomForestClassifier + GridSearchCV

In [51]:
# Grid over split criterion and minimum impurity decrease for RandomForestClassifier
param = {'criterion' : ["gini", "entropy", "log_loss"],'min_impurity_decrease':[0, 0.0001, 0.0002,0.0003, 0.0004, 0.0005, 0.001, 0.01, 0.1] }
grid_rf = modeling_grid(RandomForestClassifier, param)

best parameters :  {'criterion': 'entropy', 'min_impurity_decrease': 0.0001}
best train score :  1.0
best test score :  0.9885000777753493
best model :  RandomForestClassifier(criterion='entropy', min_impurity_decrease=0.0001)


In [52]:
# Per-class metrics of the grid-searched model on the hold-out set
# (no target_names passed, so classes appear as their integer codes)
print(classification_report(predict_target, grid_rf.predict(predict_scaled)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.98      0.93      0.95       170
           2       0.99      1.00      0.99       830

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 5. GradientBoostingClassifier

In [53]:
# GradientBoostingClassifier with default settings, evaluated on the hold-out set
gb = modeling(GradientBoostingClassifier)

train score: 0.99925
test score: 0.991
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.97      0.95      0.96       170
        star       1.00      0.99      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 5-1. GradientBoostingClassifier + GridSearchCV

In [55]:
# Grid over loss function and learning rate for GradientBoostingClassifier.
# The result is stored in grid_bc even though the model is gradient boosting, not bagging.
# NOTE(review): 'exponential' presumably supports binary targets only and
# 'deviance' is presumably a deprecated alias of 'log_loss' -- confirm
# against the installed scikit-learn version.
param = {'loss' : ['log_loss', 'deviance', 'exponential'],'learning_rate':[0.01,0.02,0.05, 0.1, 0.2, 0.5, 1, 10] }
grid_bc = modeling_grid(GradientBoostingClassifier, param)

best parameters :  {'learning_rate': 0.05, 'loss': 'deviance'}
best train score :  0.9955000702729522
best test score :  0.9895001871952802
best model :  GradientBoostingClassifier(learning_rate=0.05, loss='deviance')


In [56]:
# Per-class metrics of the grid-searched model on the hold-out set
# (no target_names passed, so classes appear as their integer codes)
print(classification_report(predict_target, grid_bc.predict(predict_scaled)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1000
           1       0.98      0.94      0.96       170
           2       1.00      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### 6. xgboost

In [57]:
# XGBClassifier with default settings, evaluated on the hold-out set
xg = modeling(XGBClassifier)

train score: 1.0
test score: 0.991
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.98      0.94      0.96       170
        star       1.00      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



In [58]:
# Cross-validate the XGBoost model and report the mean train / test score over the folds.
# NOTE(review): this passes the unscaled train_data, whereas modeling() fits on
# train_scaled -- confirm the difference is intended.
result = cross_validate(xg, train_data, train_target, return_train_score=True, n_jobs=-1)
print(f"train score: {np.mean(result['train_score'])}")
print(f"test score: {np.mean(result['test_score'])}")

train score: 1.0
test score: 0.99025


In [65]:
# Grid over booster type and eta (learning rate) for XGBClassifier.
# NOTE(review): 'rf' is presumably not a valid XGBoost booster value --
# confirm against the XGBoost parameter documentation.
param = {'booster' : ['gbtree', 'dart', 'gblinear', 'rf'], 'eta' : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]}
grid_xg = modeling_grid(XGBClassifier, param)

best parameters :  {'booster': 'gbtree', 'eta': 0.1}
best train score :  0.9999375078115236
best test score :  0.9900002184648149
best model :  XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.1,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, ...)


### 7. lightgbm

In [59]:
# LGBMClassifier with default settings, evaluated on the hold-out set
light = modeling(LGBMClassifier)

train score: 1.0
test score: 0.9915
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.99      0.95      0.97       170
        star       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [60]:
# Cross-validate the LightGBM model and report the mean train / test score over the folds.
# NOTE(review): this passes the unscaled train_data, whereas modeling() fits on
# train_scaled -- confirm the difference is intended.
result = cross_validate(light, train_data, train_target, return_train_score=True, n_jobs=-1)
print(f"train score: {np.mean(result['train_score'])}")
print(f"test score: {np.mean(result['test_score'])}")

train score: 1.0
test score: 0.9907499999999999


In [64]:
# Grid over boosting type and learning rate for LGBMClassifier
param = {'boosting_type' : ['gbdt', 'dart', 'goss', 'rf'], 'learning_rate' : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]}
grid_light = modeling_grid(LGBMClassifier,param)

best parameters :  {'boosting_type': 'goss', 'learning_rate': 0.2}
best train score :  1.0
best test score :  0.9895002809570034
best model :  LGBMClassifier(boosting_type='goss', learning_rate=0.2)


### 8. CatBoost

In [61]:
# CatBoostClassifier with default settings; it logs training progress per iteration
cat = modeling(CatBoostClassifier)

Learning rate set to 0.087979
0:	learn: 0.9414659	total: 172ms	remaining: 2m 51s
1:	learn: 0.8198167	total: 183ms	remaining: 1m 31s
2:	learn: 0.7217552	total: 193ms	remaining: 1m 4s
3:	learn: 0.6436326	total: 203ms	remaining: 50.6s
4:	learn: 0.5753946	total: 215ms	remaining: 42.8s
5:	learn: 0.5164593	total: 226ms	remaining: 37.4s
6:	learn: 0.4677159	total: 232ms	remaining: 33s
7:	learn: 0.4243342	total: 243ms	remaining: 30.2s
8:	learn: 0.3852960	total: 261ms	remaining: 28.7s
9:	learn: 0.3514414	total: 274ms	remaining: 27.2s
10:	learn: 0.3210089	total: 289ms	remaining: 26s
11:	learn: 0.2944596	total: 302ms	remaining: 24.8s
12:	learn: 0.2704200	total: 313ms	remaining: 23.8s
13:	learn: 0.2491558	total: 327ms	remaining: 23s
14:	learn: 0.2296108	total: 338ms	remaining: 22.2s
15:	learn: 0.2129480	total: 348ms	remaining: 21.4s
16:	learn: 0.1973942	total: 360ms	remaining: 20.8s
17:	learn: 0.1837439	total: 370ms	remaining: 20.2s
18:	learn: 0.1712617	total: 382ms	remaining: 19.7s
19:	learn: 0.16

In [62]:
# Print the fitted CatBoost model's train / hold-out scores and per-class report
# (the same metrics modeling() prints)
print(f'train score: {cat.score(train_scaled, train_target)}')
print(f'test score: {cat.score(predict_scaled, predict_target)}')
print(classification_report(predict_target, cat.predict(predict_scaled), target_names=['galaxy', 'quasar', 'star']))

train score: 0.9995
test score: 0.9915
              precision    recall  f1-score   support

      galaxy       0.99      0.99      0.99      1000
      quasar       0.99      0.94      0.96       170
        star       0.99      1.00      1.00       830

    accuracy                           0.99      2000
   macro avg       0.99      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000



# 결론
|모델|train|test|
|------|---|---|
|__LogisticRegressionCV__|0.9919|__0.9925__|
|LogisticRegressionCV+gridSearchCV|0.9921|0.9909|
|HistGradientBoostingClassifier|1.0|0.9915|
|HistGradientBoostingClassifier+gridSearchCV|1.0|0.9898|
|BaggingClassifier|0.9986|0.99|
|BaggingClassifier+교차검증|0.9988|0.9872|
|RandomForestClassifier|0.9999|0.9915|
|RandomForestClassifier+gridSearchCV|1.0|0.9885|
|GradientBoostingClassifier|0.9993|0.991|
|GradientBoostingClassifier+gridSearchCV|0.9955|0.9895|
|xgboost|1.0|0.991|
|xgboost+gridSearchCV|0.9999|0.99|
|lightgbm|1.0|0.9915|
|lightgbm+gridSearchCV|1.0|0.9895|
|catboost|0.9995|0.9915|