In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

#### 데이터 준비

In [2]:
df = pd.read_csv('C:/Users/EL087/Desktop/MS_MachineLearning/data/RocketLaunchDataCompleted2.csv', encoding='cp949')
df.head()

Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Condition,target
0,Uncrewed,N,75.0,68.0,71.0,75.0,55.0,65.0,0.0,0.08,E,16.0,15.0,Cloudy,0
1,Uncrewed,N,78.0,70.0,73.39,75.0,55.0,65.0,0.0,0.09,E,14.0,10.0,Cloudy,0
2,Uncrewed,Y,73.0,0.0,60.21,75.0,55.0,65.0,0.0,0.09,NE,15.0,10.0,Cloudy,1
3,Uncrewed,N,76.0,57.0,66.04,75.0,55.0,65.0,0.0,0.08,N,10.0,10.0,Partly Cloudy,0
4,Uncrewed,N,79.0,60.0,70.52,75.0,55.0,65.0,0.0,0.09,E,12.0,10.0,Partly Cloudy,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            300 non-null    object 
 1   Launched?                     300 non-null    object 
 2   High Temp                     300 non-null    float64
 3   Low Temp                      300 non-null    float64
 4   Ave Temp                      300 non-null    float64
 5   Hist High Temp                300 non-null    float64
 6   Hist Low Temp                 300 non-null    float64
 7   Hist Ave Temp                 300 non-null    float64
 8   Percipitation at Launch Time  300 non-null    float64
 9   Hist Ave Percipitation        300 non-null    float64
 10  Wind Direction                300 non-null    object 
 11  Max Wind Speed                300 non-null    float64
 12  Visibility                    300 non-null    float64
 13  Condi

### 독립변수, 종속변수 지정

In [4]:
X = df.drop(['Launched?', 'target'], axis=1)
y = df['target']

In [5]:
# -----------------
# 데이터 유형 파악
# -----------------
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include='number').columns

### 훈련/테스트세트 분할

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((225, 13), (75, 13), (225,), (75,))

### 데이터 전처리

In [7]:
# ------------
# 스케일링
# ------------

# 스케일러 객체 생성
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# 훈련 데이터 학습 및 변환 / 테스트 데이터 변환
train_scaled = scaler.fit_transform(X_train[num_cols])
test_scaled = scaler.transform(X_test[num_cols])

# DataFrame으로 변환
X_train_scaled = pd.DataFrame(train_scaled, columns=num_cols)
X_test_scaled = pd.DataFrame(test_scaled, columns=num_cols)

표준화 기법을 사용하여 훈련/테스트 데이터의 숫자형 변수들을 머신러닝 모델이 더 잘 학습할 수 있도록 전처리
- 표준화: 각 특성(컬럼)의 평균을 0으로, 표준편차를 1로 조정하여 데이터의 분포를 표준 정규 분포에 가깝게 만듦

<br>

| 데이터셋 | 메서드 | 이유 |
| :---: | :---: | :--- |
| **훈련 데이터** | **`fit_transform()`** | 모델이 학습할 기준 통계량($\mu, \sigma$)을 정하고 변환까지 한 번에 수행 |
| **테스트 데이터** | **`transform()`** | 훈련 데이터에서 학습된 통계량($\mu, \sigma$)만을 사용하여 변환 <br> 테스트 데이터의 정보(평균, 표준편차)가 학습에 유출되는 것을 방지 (**Data Leakage 방지**) |

In [8]:
# ------------
# 인코딩
# ------------

# 인코더 객체 생성 및 설정
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# 훈련 데이터 학습 및 변환 / 테스트 데이터 변환
train_encoded = encoder.fit_transform(X_train[cat_cols])
test_encoded = encoder.transform(X_test[cat_cols])

# 새로운 컬럼 이름 생성 및 DataFrame 변환
col_names = encoder.get_feature_names_out(cat_cols)
X_train_encoded = pd.DataFrame(train_encoded, columns=col_names)
X_test_encoded = pd.DataFrame(test_encoded, columns=col_names)



원-핫 인코딩을 사용하여 훈련 데이터와 테스트 데이터의 범주형 변수를 머신러닝 모델이 이해할 수 있는 숫자 형태로 변환

- drop='first': 다중 공선성(Multicollinearity) 문제를 피하기 위해 각 범주형 변수의 첫 번째 카테고리에 해당하는 열을 삭제
- encoder.get_feature_names_out(cat_cols): 인코딩된 결과(새로운 열)에 부여될 의미 있는 컬럼 이름을 생성

<br>

| 데이터셋 | 메서드 | 이유 |
| :---: | :---: | :--- |
| **훈련 데이터** | **`fit_transform()`** | 훈련 데이터에 존재하는 **모든 고유 카테고리**를 학습하여, <br> 인코딩된 **새로운 열의 종류와 개수**를 정의하고 변환까지 수행 |
| **테스트 데이터** | **`transform()`** | 훈련 데이터에서 **학습된 카테고리 목록**을 그대로 사용하여 변환 <br>테스트 데이터의 새로운 카테고리 정보가 훈련에 유출되거나 열의 구조를 바꾸는 것을 방지 (**데이터 누수 방지**) |

In [9]:
# ------------
# 최종 훈련/테스트 데이터셋 생성
# ------------
X_train_preprocessed = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_test_preprocessed = pd.concat([X_test_scaled, X_test_encoded], axis=1)

X_train_preprocessed

Unnamed: 0,High Temp,Low Temp,Ave Temp,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Max Wind Speed,Visibility,...,Condition_Fail,Condition_Fair,Condition_Heavy T-Storm,Condition_Light Rain,Condition_Mostly Cloudy,Condition_Partly Cloudly,Condition_Partly Cloudy,Condition_Rain,Condition_T-Storm,Condition_Thunder
0,0.984930,-1.178614,-0.380270,0.768842,1.083569,0.995520,-0.296502,-0.043031,1.173852,-0.927043,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.932935,0.239343,-0.927798,-1.075149,-1.287454,-1.195732,-0.096613,-0.078353,0.592097,-0.460758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.028828,-1.178614,-1.079133,-1.075149,-1.390542,-1.195732,-0.296502,-0.078353,-0.377494,-0.460758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.083653,-0.002011,-1.772896,-1.075149,-1.287454,-1.195732,-0.296502,-0.076494,-0.183576,0.316383,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.549362,0.872898,0.138352,-0.729401,-0.772014,-0.757482,-0.296502,-0.078353,-0.183576,0.316383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,0.505464,1.144422,0.992802,1.114590,1.083569,1.105082,-0.296502,-0.054185,-1.541004,1.093524,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221,-0.261682,-1.178614,-1.245772,-0.729401,-0.978190,-0.867044,-0.296502,-0.072775,-0.183576,-0.460758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
222,0.697250,-1.178614,0.575354,0.999341,1.083569,1.105082,-0.296502,-0.046749,0.592097,-0.460758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223,-0.837041,0.269512,-0.762009,-0.844650,-1.081278,-0.976607,-0.196557,-0.078353,-0.571412,1.093524,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### 모델 생성

1. 🚀 로지스틱 회귀 (Logistic Regression)

    가장 기본이 되는 선형 분류 모델 

In [10]:
# 로지스틱 회귀 모델 생성 및 학습
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 모델 객체 생성 
logreg_model = LogisticRegression(random_state=42)

# 모델 학습
logreg_model.fit(X_train_preprocessed, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


2. 🌳 랜덤 포레스트 (Random Forest)

    앙상블 기법을 사용하며, 과적합에 강하고 성능이 뛰어난 모델

In [13]:
# 랜덤 포레스트 모델 생성 및 학습
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 모델 객체 생성 (n_estimators: 트리의 개수, 보통 100~500 사이로 설정)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 모델 학습
rf_model.fit(X_train_preprocessed, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


3. 📉 XGBoost (eXtreme Gradient Boosting)

    최고 성능을 목표로 할 때 자주 사용되는 그래디언트 부스팅 모델

In [None]:
# !pip install xgboost <- 처음 사용할 때 설치 필요

In [16]:
# 그래디언트 부스팅 계열(XGBoost) 모델 생성
import xgboost as xgb
from sklearn.metrics import accuracy_score

# 모델 객체 생성 (objective='binary:logistic'는 이진 분류를 의미)
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)

# 모델 학습
xgb_model.fit(X_train_preprocessed, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### 성능평가

In [11]:
# 로지스틱 회귀 모델 성능 평가

# 테스트 데이터 예측
y_pred_logreg = logreg_model.predict(X_test_preprocessed)

# 정확도 계산
accuracy_score = accuracy_score(y_test, y_pred_logreg)

print(f"로지스틱 회귀 정확도: {accuracy_score:.4f}")

로지스틱 회귀 정확도: 0.8133


In [12]:
from sklearn.metrics import classification_report
print(" ### 로지스틱 회귀 (Logistic Regression) 성능 리포트 ###\n")
print(classification_report(y_test, y_pred_logreg))


 ### 로지스틱 회귀 (Logistic Regression) 성능 리포트 ###

              precision    recall  f1-score   support

           0       0.81      1.00      0.90        60
           1       1.00      0.07      0.12        15

    accuracy                           0.81        75
   macro avg       0.91      0.53      0.51        75
weighted avg       0.85      0.81      0.74        75



In [14]:
# 랜덤포레스트 모델 성능 평가

# 테스트 데이터 예측
y_pred_rf = rf_model.predict(X_test_preprocessed)

# 정확도 계산
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"랜덤 포레스트 정확도: {accuracy_rf:.4f}")

랜덤 포레스트 정확도: 0.8000


In [15]:
print("   ### 랜덤 포레스트 (Random Forest) 성능 리포트 ###\n")
print(classification_report(y_test, y_pred_rf))

   ### 랜덤 포레스트 (Random Forest) 성능 리포트 ###

              precision    recall  f1-score   support

           0       0.81      0.98      0.89        60
           1       0.50      0.07      0.12        15

    accuracy                           0.80        75
   macro avg       0.65      0.53      0.50        75
weighted avg       0.75      0.80      0.73        75



In [17]:
# XGBoost 모델 성능 평가

# 테스트 데이터 예측
y_pred_xgb = xgb_model.predict(X_test_preprocessed)

# 정확도 계산
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost 정확도: {accuracy_xgb:.4f}")

XGBoost 정확도: 0.7733


In [18]:
print("             ### XGBoost 성능 리포트 ###\n")
print(classification_report(y_test, y_pred_xgb))

             ### XGBoost 성능 리포트 ###

              precision    recall  f1-score   support

           0       0.81      0.93      0.87        60
           1       0.33      0.13      0.19        15

    accuracy                           0.77        75
   macro avg       0.57      0.53      0.53        75
weighted avg       0.72      0.77      0.73        75

