## 로켓 발사 예측 - 3가지 모델 비교

### 데이터 불러오기 및 살펴보기

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [27]:
# TODO: 데이터 불러와서 확인하기
rocket = pd.read_csv('./data/RocketLaunchDataCSV.csv')
rocket.sample(5)

Unnamed: 0,Name,Date,Time (East Coast),Location,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,...,Max Wind Speed,Visibility,Wind Speed at Launch Time,Hist Ave Max Wind Speed,Hist Ave Visibility,Sea Level Pressure,Hist Ave Sea Level Pressure,Day Length,Condition,Notes
165,,17-Jan-06,,Cape Canaveral,,,79.0,56.0,68.67,,...,18.0,10.0,,,,30.02,,10:35,Cloudy,
164,,07-Aug-11,,Cape Canaveral,,,95.0,77.0,84.19,,...,15.0,10.0,,,,29.96,,13:20,Cloudy,
107,Mariner 9,30-May-71,23:23,Cape Canaveral,Uncrewed,Y,90.0,75.0,82.88,75.0,...,17.0,7.0,9.0,,,29.95,,13:47,Fair,
222,Apollo 7,11-Oct-68,16:02,Cape Canaveral,Crewed,Y,80.0,61.0,70.82,73.0,...,13.0,15.0,10.0,,,30.03,,11:36,Fair,
184,,09-Feb-99,,Cape Canaveral,,,79.0,61.0,68.91,,...,12.0,7.0,,,,30.16,,11:03,Rain,


In [4]:
rocket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          60 non-null     object 
 1   Date                          300 non-null    object 
 2   Time (East Coast)             59 non-null     object 
 3   Location                      300 non-null    object 
 4   Crewed or Uncrewed            60 non-null     object 
 5   Launched?                     60 non-null     object 
 6   High Temp                     299 non-null    float64
 7   Low Temp                      299 non-null    float64
 8   Ave Temp                      299 non-null    float64
 9   Temp at Launch Time           59 non-null     float64
 10  Hist High Temp                299 non-null    float64
 11  Hist Low Temp                 299 non-null    float64
 12  Hist Ave Temp                 299 non-null    float64
 13  Perci

In [13]:
rocket[rocket['Notes'].notna()]['Notes']

29     Looks like we didn’t capture temp at 7pm - pro...
291                                     T-Storm at night
297                                   Rain before launch
Name: Notes, dtype: object

In [14]:
rocket.isnull().sum().sort_values(ascending=False)

Hist Ave Sea Level Pressure     300
Hist Ave Visibility             300
Hist Ave Max Wind Speed         300
Notes                           297
Temp at Launch Time             241
Wind Speed at Launch Time       241
Time (East Coast)               241
Launched?                       240
Crewed or Uncrewed              240
Name                            240
Condition                         2
Day Length                        2
Visibility                        1
Ave Temp                          1
Low Temp                          1
Max Wind Speed                    1
High Temp                         1
Hist High Temp                    1
Percipitation at Launch Time      1
Hist Ave Temp                     1
Wind Direction                    1
Hist Ave Percipitation            1
Hist Low Temp                     1
Sea Level Pressure                1
Location                          0
Date                              0
dtype: int64

### 데이터 전처리

In [None]:
# TODO: 결측치 처리및 문자열데이터 인코딩

전체가 누락된 컬럼 : 'Hist Ave Max Wind Speed','Hist Ave Visibility','Hist Ave Sea Level Pressure'
날씨와 무관한 컬럼 : 'Name','Date',Time (East Coast)','Location','Day Length','Notes', 'Sea Level Pressure'
종속변수 : 'Launched?'

#### 누락/무관 컬럼 제거

In [None]:
cols_all_na = ['Hist Ave Sea Level Pressure', 'Hist Ave Visibility', 'Hist Ave Max Wind Speed']
cols_irrelevant = ['Name','Date', 'Time (East Coast)','Location', 'Day Length','Notes', 'Sea Level Pressure']

rocket = rocket.drop( columns= cols_all_na + cols_irrelevant, axis=1)
rocket.head(1)


Unnamed: 0,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition
0,,,75.0,68.0,71.0,,75.0,55.0,65.0,0.0,0.08,E,16.0,15.0,,Cloudy


In [None]:
rocket.info() # 16개 컬럼 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Crewed or Uncrewed            60 non-null     object 
 1   Launched?                     60 non-null     object 
 2   High Temp                     299 non-null    float64
 3   Low Temp                      299 non-null    float64
 4   Ave Temp                      299 non-null    float64
 5   Temp at Launch Time           59 non-null     float64
 6   Hist High Temp                299 non-null    float64
 7   Hist Low Temp                 299 non-null    float64
 8   Hist Ave Temp                 299 non-null    float64
 9   Percipitation at Launch Time  299 non-null    float64
 10  Hist Ave Percipitation        299 non-null    float64
 11  Wind Direction                299 non-null    object 
 12  Max Wind Speed                299 non-null    float64
 13  Visib

#### 종속변수의 null 처리 : NaN -> 'N'

In [None]:
# Launched? = N
rocket['Launched?'].value_counts()

Launched?
Y    59
N     1
Name: count, dtype: int64

In [18]:
rocket['Launched?'] = rocket['Launched?'].fillna('N')
rocket['Launched?'].value_counts()


Launched?
N    241
Y     59
Name: count, dtype: int64

#### 적당한 값으로 결측치 처리

In [32]:
# crewed = Uncrewed로
rocket['Crewed or Uncrewed'].value_counts()

Crewed or Uncrewed
Uncrewed    44
Crewed      16
Name: count, dtype: int64

In [None]:
# Wind Direction = Unknown
rocket[]

In [None]:
# Condition = Cloudy


In [None]:
# 기타는 중앙값 또는 최빈값

In [None]:
# 문자데이터 인코딩


### 데이터 준비 및 모델 학습

In [None]:
# TODO: 데이터 분리하기


In [None]:
# TODO: 모델 만들고 성능 평가하기
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


### 성능 평가 및 비교

In [154]:
out = f'''
compare DT ,RF, LR
DT : {gr_search.best_score_}
RF : {rf.score(X_test, y_test)}
LR : {lr.score(X_test, y_test)}

DT confusion matrix : 
{classification_report(y_test, gr_search.best_estimator_.predict(X_test))}

RF confusion matrix : 
{classification_report(y_test, y_pred_rf)}

LR confusion matrix : 
{classification_report(y_test, y_pred_lr)}
'''
print(out)


compare DT ,RF, LR
DT : 0.9833333333333332
RF : 0.9833333333333333
LR : 0.8666666666666667

DT confusion matrix : 
              precision    recall  f1-score   support

           N       1.00      0.98      0.99        49
           Y       0.92      1.00      0.96        11

    accuracy                           0.98        60
   macro avg       0.96      0.99      0.97        60
weighted avg       0.98      0.98      0.98        60


RF confusion matrix : 
              precision    recall  f1-score   support

           N       1.00      0.98      0.99        49
           Y       0.92      1.00      0.96        11

    accuracy                           0.98        60
   macro avg       0.96      0.99      0.97        60
weighted avg       0.98      0.98      0.98        60


LR confusion matrix : 
              precision    recall  f1-score   support

           N       0.87      0.98      0.92        49
           Y       0.80      0.36      0.50        11

    accuracy      