1. data pre-processing (Train data & Test data)

data를 전처리 하는 단계로 본 프로젝트에선 raw data를 활용하여 원하는 열을 생성하는 것부터 encoding까지의 과정이다.

In [1]:
# 별의 일생을 예측하기 위해 정답지 생성
# 예측 요소에 관한 열이 없어 물리식을 통해 구함.(질량, 절대등급, 유형)
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# train data load
df = pd.read_csv('/content/Star9999_raw.csv')

In [3]:
# train data 확인
df.head()

Unnamed: 0,Vmag,Plx,B-V,SpType
0,7.64,5.93,0.056,A0/A1V
1,9.99,-0.41,1.044,A0:Ia
2,11.42,-6.63,0.041,A0Ia
3,11.18,3.24,0.117,A0Ia:
4,10.95,0.62,0.134,A0Ia:


In [4]:
# test data load
ds = pd.read_csv('/content/STELLARHOSTS_2024.04.01_09.19.43_수정.csv')

In [5]:
# test data 확인
ds.head()

Unnamed: 0,st_spectype,st_teff,st_mass
0,A1IV,9360.0,2.07
1,A2V,8720.0,1.76
2,A2V,8840.0,1.96
3,A7V,7800.0,1.75
4,A8V,7500.0,1.47


 1) Train data pre-processing

 *  data의 결측치 or 오류 발생 값 제거  



In [6]:
# 색지수를 통해 절대등급을 계산하기 위해 열 이름 변경
df.rename(columns={'B-V':'B_V'}, inplace = True)
df

Unnamed: 0,Vmag,Plx,B_V,SpType
0,7.64,5.93,0.056,A0/A1V
1,9.99,-0.41,1.044,A0:Ia
2,11.42,-6.63,0.041,A0Ia
3,11.18,3.24,0.117,A0Ia:
4,10.95,0.62,0.134,A0Ia:
...,...,...,...,...
4638,9.38,3.36,0.187,O8V
4639,8.26,-0.19,0.306,O9.5Ib
4640,8.72,0.80,0.390,O9.5IV
4641,6.11,0.57,-0.151,O9IIInn


In [7]:
# 결측치 확인
df.isnull().sum()

Vmag      0
Plx       0
B_V       0
SpType    0
dtype: int64

In [8]:
# data type 변경
df = df.astype({'Vmag':'float32', 'Plx':'float32', 'B_V':'float32', 'SpType':'object'})
df.dtypes

Vmag      float32
Plx       float32
B_V       float32
SpType     object
dtype: object

In [9]:
# 후반 파라미터를 설정할때 log가 0이 되버리는 경우가 생겨 cell의 모든값을 날려버리고 다시 계산
df_missing_removed = df.dropna()
df_missing_removed.shape

(4643, 4)

In [10]:
# 결측치 확인
df_missing_removed.isnull().sum()

Vmag      0
Plx       0
B_V       0
SpType    0
dtype: int64

In [11]:
# 결측치 or 값이 0인 행 제거
dropped_rows = df.shape[0] - df_missing_removed.shape[0]
print(dropped_rows)

0


In [12]:
df_missing_removed.reset_index(drop=True, inplace=True)
df_missing_removed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4643 entries, 0 to 4642
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    4643 non-null   float32
 1   Plx     4643 non-null   float32
 2   B_V     4643 non-null   float32
 3   SpType  4643 non-null   object 
dtypes: float32(3), object(1)
memory usage: 90.8+ KB


In [13]:
df_missing_removed.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,4643.0,8.179234,1.355639,0.45,7.43,8.45,9.11,12.49
Plx,4643.0,8.397532,12.802938,-6.82,2.99,5.55,9.6,280.269989
B_V,4643.0,0.765008,0.429365,-0.236,0.46,0.676,1.078,2.266


In [14]:
# Plx의 값이 0인 행을 찾아 특정 쿼리에 저장
df_missing_removed.query("Plx == 0")

Unnamed: 0,Vmag,Plx,B_V,SpType
390,7.63,0.0,0.267,B1II...
3996,8.68,0.0,1.29,K2III


In [15]:
# 그 외 나머지를 같은 쿼리에 저장하여 data 덮어쓰기 진행
df_missing_removed = df_missing_removed.query('Plx != 0')

In [16]:
df_missing_removed.shape

(4641, 4)

In [17]:
# 다시 확인시 Plx=0인 행이 완전히 제거된 모습을 볼 수 있음.
df_missing_removed.query('Plx == 0')

Unnamed: 0,Vmag,Plx,B_V,SpType


In [19]:
# reindexing the dataframe
df_missing_removed.reset_index(drop = True, inplace = True)
df_missing_removed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4641 entries, 0 to 4640
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    4641 non-null   float32
 1   Plx     4641 non-null   float32
 2   B_V     4641 non-null   float32
 3   SpType  4641 non-null   object 
dtypes: float32(3), object(1)
memory usage: 90.8+ KB


In [20]:
# df_missing_removed를 복사해서 dt로 정의
dt = df_missing_removed.copy()



*  model 학습하는데 필요한 열 계산
   (거리, 절대등급, 온도, 밝기, 질량)



In [21]:
# star predict parameter setting
# Plx : 지구에서 항성까지의 거리
dt['Plx'] = dt.Plx.map(lambda x : x * 0.00099999995874704)

# Amag : 절대등급
dt['Amag'] = dt.Vmag + 5 *(np.log10(((dt.Plx))) + 1)

# temperature(K)
dt['Temp(K)'] = 7090/(df.B_V + 0.72)

# Lum : 밝기 (태양 밝기 비례)
dt['Lum'] = dt.Amag.map(lambda x : 3.828e+26 / 10**((x-4.8)/2.5))

# Lum(sun=1) : 태양 밝기를 1로 놓았을때의 밝기
dt['Lum(sun=1)'] = dt.Amag.map(lambda x : 1 / 10**((x-4.8)/2.5))

# Mass: 별의 질량 - 별의 일생 측정에 중요, 밝기-질량 법칙에 의해 계산
dt['Mass(sun=1)'] = dt['Lum(sun=1)']**2/7
# data확인
dt.describe().T

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,4641.0,8.179244,1.355887,0.45,7.43,8.45,9.11,12.49
Plx,4641.0,0.00840115,0.01280451,-0.00682,0.00299,0.00556,0.009609999,0.28027
B_V,4641.0,0.7650025,0.4293258,-0.236,0.46,0.676,1.078,2.266
Amag,4534.0,1.815603,2.30722,-12.18,0.4516343,1.848968,3.361009,14.24917
Temp(K),4641.0,5239.925,1750.69,2374.414,3943.27,5078.796,6008.474,14648.76
Lum,4534.0,1.348973e+30,4.637931e+31,6.357785e+22,1.440682e+27,5.799421e+27,2.100478e+28,2.37122e+33
Lum(sun=1),4534.0,3523.964,121158.1,0.0001660863,3.763538,15.15,54.87143,6194411.0
Mass(sun=1),4534.0,2098351000.0,97668440000.0,3.940668e-09,2.02346,32.78894,430.1249,5481532000000.0


In [22]:
# 결측치가 있는 행 제거
dt = dt.dropna(axis=0)
dt

Unnamed: 0,Vmag,Plx,B_V,SpType,Amag,Temp(K),Lum,Lum(sun=1),Mass(sun=1)
0,7.64,0.00593,0.056,A0/A1V,1.505273,9136.597656,7.959076e+27,20.791735,6.175661e+01
3,11.18,0.00324,0.117,A0Ia:,3.732725,8470.728516,1.023015e+27,2.672452,1.020286e+00
4,10.95,0.00062,0.134,A0Ia:,-0.088042,8302.107422,3.452937e+28,90.202116,1.162346e+03
6,6.07,0.00010,0.389,A0Iab,-8.930000,6393.146973,1.188425e+32,310455.926090,1.376898e+10
7,8.19,0.00128,0.818,A0Iab,-1.273951,4609.882812,1.029324e+29,268.893487,1.032910e+04
...,...,...,...,...,...,...,...,...,...
4634,10.58,0.00282,1.316,M7III,2.831245,3148.312744,2.346788e+27,6.130584,5.369152e+00
4635,7.70,0.00329,1.467,MIII,0.285979,3938.888672,2.446697e+28,63.915814,5.836045e+02
4636,9.38,0.00336,0.187,O8V,2.011696,3482.318359,4.992216e+27,13.041317,2.429657e+01
4638,8.72,0.00080,0.390,O9.5IV,-1.764550,7816.978516,1.617305e+29,422.493413,2.550010e+04




*   one-hot encoding작업



* Train data

In [23]:
# 여키스 항성분류를 통해 현재 항성의 형태 파악(MKK)
def type(star):
  main = ['V', 'Va', 'Vb']
  dwarf = ['VI']
  white = ['VII']
  giant = ['II', 'IIa', 'IIab', 'IIb', 'III','IIIa','IIIab',
            'IIIb','IV','IVa','IVb']
  for i in main:
    if i in star:
      return 'Main Sequence'
  for i in dwarf:
    if i in star:
      return 'Dwarf'
  for i in white:
    if i in star:
      return 'White Dwarf'
  for i in giant:
    if i in star:
      return 'Giant'
  return 'Super Giant'

# 열에 적용
dt['cur_Type'] = dt.SpType.apply(type)
dt

Unnamed: 0,Vmag,Plx,B_V,SpType,Amag,Temp(K),Lum,Lum(sun=1),Mass(sun=1),cur_Type
0,7.64,0.00593,0.056,A0/A1V,1.505273,9136.597656,7.959076e+27,20.791735,6.175661e+01,Main Sequence
3,11.18,0.00324,0.117,A0Ia:,3.732725,8470.728516,1.023015e+27,2.672452,1.020286e+00,Super Giant
4,10.95,0.00062,0.134,A0Ia:,-0.088042,8302.107422,3.452937e+28,90.202116,1.162346e+03,Super Giant
6,6.07,0.00010,0.389,A0Iab,-8.930000,6393.146973,1.188425e+32,310455.926090,1.376898e+10,Super Giant
7,8.19,0.00128,0.818,A0Iab,-1.273951,4609.882812,1.029324e+29,268.893487,1.032910e+04,Super Giant
...,...,...,...,...,...,...,...,...,...,...
4634,10.58,0.00282,1.316,M7III,2.831245,3148.312744,2.346788e+27,6.130584,5.369152e+00,Giant
4635,7.70,0.00329,1.467,MIII,0.285979,3938.888672,2.446697e+28,63.915814,5.836045e+02,Giant
4636,9.38,0.00336,0.187,O8V,2.011696,3482.318359,4.992216e+27,13.041317,2.429657e+01,Main Sequence
4638,8.72,0.00080,0.390,O9.5IV,-1.764550,7816.978516,1.617305e+29,422.493413,2.550010e+04,Main Sequence


In [26]:
# 분광형을 통한 색 파악
def color(star):
  blue = ['O', 'B']
  white = ['A']
  yellow = ['F', 'G']
  red = ['K', 'M']
  for i in blue:
    if i in star:
      return 'Blue'
  for i in white:
    if i in star:
      return 'White'
  for i in yellow:
    if i in star:
      return 'Yellow'
  for i in red:
    if i in star:
      return 'Red'

# 열에 적용
dt['Color'] = dt.SpType.apply(color)
dt

Unnamed: 0,Vmag,Plx,B_V,SpType,Amag,Temp(K),Lum,Lum(sun=1),Mass(sun=1),cur_Type,Color
0,7.64,0.00593,0.056,A0/A1V,1.505273,9136.597656,7.959076e+27,20.791735,6.175661e+01,Main Sequence,White
3,11.18,0.00324,0.117,A0Ia:,3.732725,8470.728516,1.023015e+27,2.672452,1.020286e+00,Super Giant,White
4,10.95,0.00062,0.134,A0Ia:,-0.088042,8302.107422,3.452937e+28,90.202116,1.162346e+03,Super Giant,White
6,6.07,0.00010,0.389,A0Iab,-8.930000,6393.146973,1.188425e+32,310455.926090,1.376898e+10,Super Giant,White
7,8.19,0.00128,0.818,A0Iab,-1.273951,4609.882812,1.029324e+29,268.893487,1.032910e+04,Super Giant,White
...,...,...,...,...,...,...,...,...,...,...,...
4634,10.58,0.00282,1.316,M7III,2.831245,3148.312744,2.346788e+27,6.130584,5.369152e+00,Giant,Red
4635,7.70,0.00329,1.467,MIII,0.285979,3938.888672,2.446697e+28,63.915814,5.836045e+02,Giant,Red
4636,9.38,0.00336,0.187,O8V,2.011696,3482.318359,4.992216e+27,13.041317,2.429657e+01,Main Sequence,Blue
4638,8.72,0.00080,0.390,O9.5IV,-1.764550,7816.978516,1.617305e+29,422.493413,2.550010e+04,Main Sequence,Blue


In [27]:
# 앞의 분광형 색과 현재 유형에 관한 열 병합
dt['Type'] = dt['Color'].map(str) +  " " + dt['cur_Type'].map(str)
dt

Unnamed: 0,Vmag,Plx,B_V,SpType,Amag,Temp(K),Lum,Lum(sun=1),Mass(sun=1),cur_Type,Color,Type
0,7.64,0.00593,0.056,A0/A1V,1.505273,9136.597656,7.959076e+27,20.791735,6.175661e+01,Main Sequence,White,White Main Sequence
3,11.18,0.00324,0.117,A0Ia:,3.732725,8470.728516,1.023015e+27,2.672452,1.020286e+00,Super Giant,White,White Super Giant
4,10.95,0.00062,0.134,A0Ia:,-0.088042,8302.107422,3.452937e+28,90.202116,1.162346e+03,Super Giant,White,White Super Giant
6,6.07,0.00010,0.389,A0Iab,-8.930000,6393.146973,1.188425e+32,310455.926090,1.376898e+10,Super Giant,White,White Super Giant
7,8.19,0.00128,0.818,A0Iab,-1.273951,4609.882812,1.029324e+29,268.893487,1.032910e+04,Super Giant,White,White Super Giant
...,...,...,...,...,...,...,...,...,...,...,...,...
4634,10.58,0.00282,1.316,M7III,2.831245,3148.312744,2.346788e+27,6.130584,5.369152e+00,Giant,Red,Red Giant
4635,7.70,0.00329,1.467,MIII,0.285979,3938.888672,2.446697e+28,63.915814,5.836045e+02,Giant,Red,Red Giant
4636,9.38,0.00336,0.187,O8V,2.011696,3482.318359,4.992216e+27,13.041317,2.429657e+01,Main Sequence,Blue,Blue Main Sequence
4638,8.72,0.00080,0.390,O9.5IV,-1.764550,7816.978516,1.617305e+29,422.493413,2.550010e+04,Main Sequence,Blue,Blue Main Sequence


In [28]:
# 시각성을 위해 1차 열 제거
dt = dt.drop(['Plx', 'B_V', 'Lum', 'Lum(sun=1)', 'cur_Type', 'Color'], axis=1)
dt

Unnamed: 0,Vmag,SpType,Amag,Temp(K),Mass(sun=1),Type
0,7.64,A0/A1V,1.505273,9136.597656,6.175661e+01,White Main Sequence
3,11.18,A0Ia:,3.732725,8470.728516,1.020286e+00,White Super Giant
4,10.95,A0Ia:,-0.088042,8302.107422,1.162346e+03,White Super Giant
6,6.07,A0Iab,-8.930000,6393.146973,1.376898e+10,White Super Giant
7,8.19,A0Iab,-1.273951,4609.882812,1.032910e+04,White Super Giant
...,...,...,...,...,...,...
4634,10.58,M7III,2.831245,3148.312744,5.369152e+00,Red Giant
4635,7.70,MIII,0.285979,3938.888672,5.836045e+02,Red Giant
4636,9.38,O8V,2.011696,3482.318359,2.429657e+01,Blue Main Sequence
4638,8.72,O9.5IV,-1.764550,7816.978516,2.550010e+04,Blue Main Sequence


In [29]:
# encoding과정
type_encoder = LabelEncoder()
dt['Type_Encoded'] = type_encoder.fit_transform(dt['Type'])
class_mapping = dict(zip(type_encoder.classes_, type_encoder.transform(type_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)

카테고리와 인코딩된 숫자:
{'Blue Giant': 0, 'Blue Main Sequence': 1, 'Blue Super Giant': 2, 'Red Giant': 3, 'Red Main Sequence': 4, 'Red Super Giant': 5, 'White Giant': 6, 'White Main Sequence': 7, 'White Super Giant': 8, 'Yellow Giant': 9, 'Yellow Main Sequence': 10, 'Yellow Super Giant': 11}


In [36]:
# 정답지 열 생성
dt.loc[(dt.Type_Encoded == 0) & ((1.989 * 10e+30)*8 <= dt['Mass(sun=1)']), 'end_Type'] = 'Super nova'
dt.loc[(dt.Type_Encoded == 0), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 1), 'end_Type'] = 'Neutron Star'
dt.loc[(dt.Type_Encoded == 2), 'end_Type'] = 'Super nova'
dt.loc[(dt.Type_Encoded == 3) & ((1.989 * 10e+30)*0.25 <= dt['Mass(sun=1)']), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 3)& (dt['Mass(sun=1)'] <= (1.989 * 10e+30)*8), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 3), 'end_Type'] = 'Wolf Rayet Star'
dt.loc[(dt.Type_Encoded == 4), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 5) & ((1.989 * 10e+30)*9 <= dt['Mass(sun=1)']), 'end_Type'] = 'Type II supernova'
dt.loc[(dt.Type_Encoded == 5) & (dt['Mass(sun=1)'] <= (1.989 * 10e+30)*45), 'end_Type'] = 'Type II supernova'
dt.loc[(dt.Type_Encoded == 5), 'end_Type'] = 'Wolf Rayet Star'
dt.loc[(dt.Type_Encoded == 6) & ((1.989 * 10e+30)*8 <= dt['Mass(sun=1)']), 'end_Type'] = 'Super nova'
dt.loc[(dt.Type_Encoded == 6), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 7), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 8), 'end_Type'] = 'Super nova or Hyper nova'
dt.loc[(dt.Type_Encoded == 9), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 10), 'end_Type'] = 'White Dwarf'
dt.loc[(dt.Type_Encoded == 11), 'end_Type'] = 'White Dwarf & Neutron star'

# 열에 적용
dt

Unnamed: 0,Vmag,SpType,Amag,Temp(K),Mass(sun=1),Type,Type_Encoded,end_Type
0,7.64,A0/A1V,1.505273,9136.597656,6.175661e+01,White Main Sequence,7,White Dwarf
3,11.18,A0Ia:,3.732725,8470.728516,1.020286e+00,White Super Giant,8,Super nova or Hyper nova
4,10.95,A0Ia:,-0.088042,8302.107422,1.162346e+03,White Super Giant,8,Super nova or Hyper nova
6,6.07,A0Iab,-8.930000,6393.146973,1.376898e+10,White Super Giant,8,Super nova or Hyper nova
7,8.19,A0Iab,-1.273951,4609.882812,1.032910e+04,White Super Giant,8,Super nova or Hyper nova
...,...,...,...,...,...,...,...,...
4634,10.58,M7III,2.831245,3148.312744,5.369152e+00,Red Giant,3,Wolf Rayet Star
4635,7.70,MIII,0.285979,3938.888672,5.836045e+02,Red Giant,3,Wolf Rayet Star
4636,9.38,O8V,2.011696,3482.318359,2.429657e+01,Blue Main Sequence,1,Neutron Star
4638,8.72,O9.5IV,-1.764550,7816.978516,2.550010e+04,Blue Main Sequence,1,Neutron Star


In [37]:
end_type_encoder = LabelEncoder()
dt['end_Type_Encoded'] = end_type_encoder.fit_transform(dt['end_Type'])
class_mapping = dict(zip(end_type_encoder.classes_, end_type_encoder.transform(end_type_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)

카테고리와 인코딩된 숫자:
{'Neutron Star': 0, 'Super nova': 1, 'Super nova or Hyper nova': 2, 'White Dwarf': 3, 'White Dwarf & Neutron star': 4, 'Wolf Rayet Star': 5}


In [38]:
# 최종 열 제거
dt = dt.drop(['Amag', 'Type', 'Vmag', 'end_Type'], axis=1)
dt

Unnamed: 0,SpType,Temp(K),Mass(sun=1),Type_Encoded,end_Type_Encoded
0,A0/A1V,9136.597656,6.175661e+01,7,3
3,A0Ia:,8470.728516,1.020286e+00,8,2
4,A0Ia:,8302.107422,1.162346e+03,8,2
6,A0Iab,6393.146973,1.376898e+10,8,2
7,A0Iab,4609.882812,1.032910e+04,8,2
...,...,...,...,...,...
4634,M7III,3148.312744,5.369152e+00,3,5
4635,MIII,3938.888672,5.836045e+02,3,5
4636,O8V,3482.318359,2.429657e+01,1,0
4638,O9.5IV,7816.978516,2.550010e+04,1,0


In [42]:
# 1차 파일 저장
dt.to_csv('/content/train_data.csv')

* Test data

In [30]:
# 여키스 항성분류를 통해 현재 항성의 형태 파악(MKK)
def type(star):
  main = ['V', 'Va', 'Vb']
  dwarf = ['VI']
  white = ['VII']
  giant = ['II', 'IIa', 'IIab', 'IIb', 'III','IIIa','IIIab',
            'IIIb','IV','IVa','IVb']
  for i in main:
    if i in star:
      return 'Main Sequence'
  for i in dwarf:
    if i in star:
      return 'Dwarf'
  for i in white:
    if i in star:
      return 'White Dwarf'
  for i in giant:
    if i in star:
      return 'Giant'
  return 'Super Giant'

# 열에 적용
ds['cur_Type'] = ds.st_spectype.apply(type)
ds

Unnamed: 0,st_spectype,st_teff,st_mass,cur_Type
0,A1IV,9360.0,2.07,Main Sequence
1,A2V,8720.0,1.76,Main Sequence
2,A2V,8840.0,1.96,Main Sequence
3,A7V,7800.0,1.75,Main Sequence
4,A8V,7500.0,1.47,Main Sequence
...,...,...,...,...
827,M5V,3050.0,0.12,Main Sequence
828,M5V,2900.0,0.12,Main Sequence
829,M5V,3185.0,0.26,Main Sequence
830,M6V,2850.0,0.12,Main Sequence


In [31]:
# 분광형을 통한 색 파악
def color(star):
  blue = ['O', 'B']
  white = ['A']
  yellow = ['F', 'G']
  red = ['K', 'M']
  for i in blue:
    if i in star:
      return 'Blue'
  for i in white:
    if i in star:
      return 'White'
  for i in yellow:
    if i in star:
      return 'Yellow'
  for i in red:
    if i in star:
      return 'Red'

# 열에 적용
ds['Color'] = ds.st_spectype.apply(color)
ds

Unnamed: 0,st_spectype,st_teff,st_mass,cur_Type,Color
0,A1IV,9360.0,2.07,Main Sequence,White
1,A2V,8720.0,1.76,Main Sequence,White
2,A2V,8840.0,1.96,Main Sequence,White
3,A7V,7800.0,1.75,Main Sequence,White
4,A8V,7500.0,1.47,Main Sequence,White
...,...,...,...,...,...
827,M5V,3050.0,0.12,Main Sequence,Red
828,M5V,2900.0,0.12,Main Sequence,Red
829,M5V,3185.0,0.26,Main Sequence,Red
830,M6V,2850.0,0.12,Main Sequence,Red


In [32]:
# 앞의 분광형 색과 현재 유형에 관한 열 병합
ds['Type'] = ds['Color'].map(str) +  " " + ds['cur_Type'].map(str)
ds

Unnamed: 0,st_spectype,st_teff,st_mass,cur_Type,Color,Type
0,A1IV,9360.0,2.07,Main Sequence,White,White Main Sequence
1,A2V,8720.0,1.76,Main Sequence,White,White Main Sequence
2,A2V,8840.0,1.96,Main Sequence,White,White Main Sequence
3,A7V,7800.0,1.75,Main Sequence,White,White Main Sequence
4,A8V,7500.0,1.47,Main Sequence,White,White Main Sequence
...,...,...,...,...,...,...
827,M5V,3050.0,0.12,Main Sequence,Red,Red Main Sequence
828,M5V,2900.0,0.12,Main Sequence,Red,Red Main Sequence
829,M5V,3185.0,0.26,Main Sequence,Red,Red Main Sequence
830,M6V,2850.0,0.12,Main Sequence,Red,Red Main Sequence


In [33]:
# 필요럾는 열 제거
ds = ds.drop(['cur_Type', 'Color'], axis=1)
ds

Unnamed: 0,st_spectype,st_teff,st_mass,Type
0,A1IV,9360.0,2.07,White Main Sequence
1,A2V,8720.0,1.76,White Main Sequence
2,A2V,8840.0,1.96,White Main Sequence
3,A7V,7800.0,1.75,White Main Sequence
4,A8V,7500.0,1.47,White Main Sequence
...,...,...,...,...
827,M5V,3050.0,0.12,Red Main Sequence
828,M5V,2900.0,0.12,Red Main Sequence
829,M5V,3185.0,0.26,Red Main Sequence
830,M6V,2850.0,0.12,Red Main Sequence


In [34]:
# encoding과정
type_encoder = LabelEncoder()
ds['Type_Encoded'] = type_encoder.fit_transform(ds['Type'])
class_mapping = dict(zip(type_encoder.classes_, type_encoder.transform(type_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)

카테고리와 인코딩된 숫자:
{'Blue Main Sequence': 0, 'Red Giant': 1, 'Red Main Sequence': 2, 'White Main Sequence': 3, 'Yellow Giant': 4, 'Yellow Main Sequence': 5}


In [39]:
# 정답지 열 생성
ds.loc[(ds.Type_Encoded == 0) & ((1.989 * 10e+30)*8 <= ds['st_mass']), 'end_Type'] = 'Super nova'
ds.loc[(ds.Type_Encoded == 0), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 1), 'end_Type'] = 'Neutron Star'
ds.loc[(ds.Type_Encoded == 2), 'end_Type'] = 'Super nova'
ds.loc[(ds.Type_Encoded == 3) & ((1.989 * 10e+30)*0.25 <= ds['st_mass']), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 3)& (ds['st_mass'] <= (1.989 * 10e+30)*8), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 3), 'end_Type'] = 'Wolf Rayet Star'
ds.loc[(ds.Type_Encoded == 4), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 5) & ((1.989 * 10e+30)*9 <= ds['st_mass']), 'end_Type'] = 'Type II supernova'
ds.loc[(ds.Type_Encoded == 5) & (ds['st_mass'] <= (1.989 * 10e+30)*45), 'end_Type'] = 'Type II supernova'
ds.loc[(ds.Type_Encoded == 5), 'end_Type'] = 'Wolf Rayet Star'
ds.loc[(ds.Type_Encoded == 6) & ((1.989 * 10e+30)*8 <= ds['st_mass']), 'end_Type'] = 'Super nova'
ds.loc[(ds.Type_Encoded == 6), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 7), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 8), 'end_Type'] = 'Super nova or Hyper nova'
ds.loc[(ds.Type_Encoded == 9), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 10), 'end_Type'] = 'White Dwarf'
ds.loc[(ds.Type_Encoded == 11), 'end_Type'] = 'White Dwarf & Neutron star'

# 열에 적용
ds

Unnamed: 0,st_spectype,st_teff,st_mass,Type,Type_Encoded,end_Type
0,A1IV,9360.0,2.07,White Main Sequence,3,Wolf Rayet Star
1,A2V,8720.0,1.76,White Main Sequence,3,Wolf Rayet Star
2,A2V,8840.0,1.96,White Main Sequence,3,Wolf Rayet Star
3,A7V,7800.0,1.75,White Main Sequence,3,Wolf Rayet Star
4,A8V,7500.0,1.47,White Main Sequence,3,Wolf Rayet Star
...,...,...,...,...,...,...
827,M5V,3050.0,0.12,Red Main Sequence,2,Super nova
828,M5V,2900.0,0.12,Red Main Sequence,2,Super nova
829,M5V,3185.0,0.26,Red Main Sequence,2,Super nova
830,M6V,2850.0,0.12,Red Main Sequence,2,Super nova


In [40]:
end_type_encoder = LabelEncoder()
ds['end_Type_Encoded'] = end_type_encoder.fit_transform(ds['end_Type'])
class_mapping = dict(zip(end_type_encoder.classes_, end_type_encoder.transform(end_type_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)

카테고리와 인코딩된 숫자:
{'Neutron Star': 0, 'Super nova': 1, 'White Dwarf': 2, 'Wolf Rayet Star': 3}


In [41]:
# 최종 열 제거
ds = ds.drop(['Type', 'end_Type'], axis=1)
ds

Unnamed: 0,st_spectype,st_teff,st_mass,Type_Encoded,end_Type_Encoded
0,A1IV,9360.0,2.07,3,3
1,A2V,8720.0,1.76,3,3
2,A2V,8840.0,1.96,3,3
3,A7V,7800.0,1.75,3,3
4,A8V,7500.0,1.47,3,3
...,...,...,...,...,...
827,M5V,3050.0,0.12,2,1
828,M5V,2900.0,0.12,2,1
829,M5V,3185.0,0.26,2,1
830,M6V,2850.0,0.12,2,1


In [43]:
# 1차 test data 저장
ds.to_csv('/content/test_data.csv')

2. Final Encoded
*   2차 encoding과정
*   spectrum 유형에 대한 encoding을 끝으로
    data-preprocessing 완료





In [44]:
# library load
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [46]:
# Sp_Type에 대해 encoding 실행
# 저장된 파일 불러오기
td = pd.read_csv('/content/train_data.csv')
te = pd.read_csv('/content/test_data.csv')

*   Train data

In [47]:
# SpType중 숫자는 같은 분광형에서의 세부 온도를 나타냄.(train)
# 프로젝트 과정을 수행하는데, 큰 영향을 주지 않으므로 제거
td['SpType'] = td['SpType'].str.replace(pat=r'[0-9]', repl= r' ', regex=True)
td['SpType'] = td['SpType'].str.replace(pat=r'[^\uAC00-\uD7A30-9a-zA-Z\s]', repl= r' ', regex=True)
td

Unnamed: 0.1,Unnamed: 0,SpType,Temp(K),Mass(sun=1),Type_Encoded,end_Type_Encoded
0,0,A A V,9136.5980,6.175661e+01,7,3
1,3,A Ia,8470.7290,1.020286e+00,8,2
2,4,A Ia,8302.1070,1.162346e+03,8,2
3,6,A Iab,6393.1470,1.376898e+10,8,2
4,7,A Iab,4609.8830,1.032910e+04,8,2
...,...,...,...,...,...,...
4529,4634,M III,3148.3127,5.369152e+00,3,5
4530,4635,MIII,3938.8887,5.836045e+02,3,5
4531,4636,O V,3482.3184,2.429657e+01,1,0
4532,4638,O IV,7816.9785,2.550010e+04,1,0


In [48]:
# 띄어쓰기를 기준으로 각 단어 분할하여 인코딩 처리(train)
td['SpType'].str.split(" ")
td['SpType'].str.split(" ", expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,A,,A,V,,,,,
1,A,Ia,,,,,,,
2,A,Ia,,,,,,,
3,A,Iab,,,,,,,
4,A,Iab,,,,,,,
...,...,...,...,...,...,...,...,...,...
4529,M,III,,,,,,,
4530,MIII,,,,,,,,
4531,O,V,,,,,,,
4532,O,,,IV,,,,,


In [49]:
td['Spec'] = td['SpType'].str.split(" ", expand=True)[0]
td['MMK'] = td['SpType'].str.split(" ", expand=True)[1]
td.head()

Unnamed: 0.1,Unnamed: 0,SpType,Temp(K),Mass(sun=1),Type_Encoded,end_Type_Encoded,Spec,MMK
0,0,A A V,9136.598,61.75661,7,3,A,
1,3,A Ia,8470.729,1.020286,8,2,A,Ia
2,4,A Ia,8302.107,1162.346,8,2,A,Ia
3,6,A Iab,6393.147,13768980000.0,8,2,A,Iab
4,7,A Iab,4609.883,10329.1,8,2,A,Iab


In [50]:
# 인코딩(train)
spec_encoder = LabelEncoder()
mmk_encoder = LabelEncoder()
td['Spec_Encoded'] = spec_encoder.fit_transform(td['Spec'])
td['MMK_Encoded'] = mmk_encoder.fit_transform(td['MMK'])
class_mapping = dict(zip(spec_encoder.classes_, spec_encoder.transform(spec_encoder.classes_)))
class_mapping1 = dict(zip(mmk_encoder.classes_, mmk_encoder.transform(mmk_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)
print(class_mapping1)

카테고리와 인코딩된 숫자:
{'A': 0, 'B': 1, 'F': 2, 'G': 3, 'GIII': 4, 'K': 5, 'M': 6, 'MIII': 7, 'O': 8}
{'': 0, 'II': 1, 'IICNp': 2, 'III': 3, 'IIICN': 4, 'IIICNII': 5, 'IIICNIV': 6, 'IIIMNp': 7, 'IIIb': 8, 'IIIe': 9, 'IIIm': 10, 'IIIn': 11, 'IIIne': 12, 'IIInn': 13, 'IIIp': 14, 'IIIsp': 15, 'IIIvar': 16, 'IIIwe': 17, 'IIvar': 18, 'IV': 19, 'IVCN': 20, 'IVn': 21, 'IVne': 22, 'IVsvar': 23, 'IVws': 24, 'Ia': 25, 'Iab': 26, 'Ib': 27, 'Ibp': 28, 'Ibpev': 29, 'Ibvar': 30, 'V': 31, 'VCN': 32, 'Vawvar': 33, 'Ve': 34, 'Vm': 35, 'Vn': 36, 'Vne': 37, 'Vnn': 38, 'Vp': 39, 'Vpe': 40, 'Vs': 41, 'Vv': 42, 'Vvar': 43, 'Vw': 44, 'Vws': 45, 'p': 46, None: 47}


In [51]:
# 열 제거(train)
td = td.drop(['SpType', 'Spec', 'MMK'], axis=1)
td

Unnamed: 0.1,Unnamed: 0,Temp(K),Mass(sun=1),Type_Encoded,end_Type_Encoded,Spec_Encoded,MMK_Encoded
0,0,9136.5980,6.175661e+01,7,3,0,0
1,3,8470.7290,1.020286e+00,8,2,0,25
2,4,8302.1070,1.162346e+03,8,2,0,25
3,6,6393.1470,1.376898e+10,8,2,0,26
4,7,4609.8830,1.032910e+04,8,2,0,26
...,...,...,...,...,...,...,...
4529,4634,3148.3127,5.369152e+00,3,5,6,3
4530,4635,3938.8887,5.836045e+02,3,5,7,47
4531,4636,3482.3184,2.429657e+01,1,0,8,31
4532,4638,7816.9785,2.550010e+04,1,0,8,0


*   Test data

In [None]:
# spectrum 유형의 숫자 제거
te['st_spectype'] = te['st_spectype'].str.replace(pat=r'[0-9]', repl= r' ', regex=True)
te['st_spectype'] = te['st_spectype'].str.replace(pat=r'[^\uAC00-\uD7A30-9a-zA-Z\s]', repl= r' ', regex=True)
te

Unnamed: 0,st_spectype,st_teff,st_mass
0,A IV,9360.0,2.07
1,A V,8720.0,1.76
2,A V,8840.0,1.96
3,A V,7800.0,1.75
4,A V,7500.0,1.47
...,...,...,...
827,M V,3050.0,0.12
828,M V,2900.0,0.12
829,M V,3185.0,0.26
830,M V,2850.0,0.12


In [None]:
# 띄어쓰기를 기준으로 문자 분리
te['st_spectype'].str.split(" ")
te['st_spectype'].str.split(" ", expand=True)

Unnamed: 0,0,1,2
0,A,IV,
1,A,V,
2,A,V,
3,A,V,
4,A,V,
...,...,...,...
827,M,V,
828,M,V,
829,M,V,
830,M,V,


In [None]:
te['Spec'] = te['st_spectype'].str.split(" ", expand=True)[0]
te['MMK'] = te['st_spectype'].str.split(" ", expand=True)[1]
te.head()

Unnamed: 0,st_spectype,st_teff,st_mass,Spec,MMK
0,A IV,9360.0,2.07,A,IV
1,A V,8720.0,1.76,A,V
2,A V,8840.0,1.96,A,V
3,A V,7800.0,1.75,A,V
4,A V,7500.0,1.47,A,V


In [None]:
# one-hot encoding 진행
spec_encoder = LabelEncoder()
mmk_encoder = LabelEncoder()
te['Spec_Encoded'] = spec_encoder.fit_transform(te['Spec'])
te['MMK_Encoded'] = mmk_encoder.fit_transform(te['MMK'])
class_mapping = dict(zip(spec_encoder.classes_, spec_encoder.transform(spec_encoder.classes_)))
class_mapping1 = dict(zip(mmk_encoder.classes_, mmk_encoder.transform(mmk_encoder.classes_)))
print("카테고리와 인코딩된 숫자:")
print(class_mapping)
print(class_mapping1)

카테고리와 인코딩된 숫자:
{'A': 0, 'B': 1, 'F': 2, 'G': 3, 'K': 4, 'M': 5}
{'II': 0, 'III': 1, 'IV': 2, 'V': 3}


In [None]:
# 열 제거(test)
te = te.drop(['st_spectype', 'Spec', 'MMK'], axis=1)
te

Unnamed: 0,st_teff,st_mass,Spec_Encoded,MMK_Encoded
0,9360.0,2.07,0,2
1,8720.0,1.76,0,3
2,8840.0,1.96,0,3
3,7800.0,1.75,0,3
4,7500.0,1.47,0,3
...,...,...,...,...
827,3050.0,0.12,5,3
828,2900.0,0.12,5,3
829,3185.0,0.26,5,3
830,2850.0,0.12,5,3


In [None]:
# 최종 파일 저장
td.to_csv('/content/train_data.csv')
te.to_csv('/content/test_data.csv')