# Taurus Project

## 우주선 타이타닉

우주선 타이타닉호가 시공간 이상 현상과 충돌했을 때 어떤 승객이 다른 차원으로 이송(Transported)되었는지 예측

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import itertools
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

## 데이터 확인
- 결측치 처리 완료
- One-Hot Encoding

In [2]:
# Load the pre-processed table and discard the index column that was
# written out as 'Unnamed: 0' when the CSV was saved.
df = pd.read_csv('data/aries_processed_data.csv').drop(columns='Unnamed: 0')
df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,1,0,0,0,0,0,0,1,0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,1,...,0,0,0,0,0,1,0,0,0,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0,...,1,0,0,0,0,0,0,0,0,1
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0,...,1,0,0,0,0,0,0,0,0,1
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,...,0,0,0,0,0,0,1,0,0,1
12966,0.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,,1,...,0,0,0,0,0,0,1,0,1,0
12967,1.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,,0,...,0,0,0,1,0,0,0,0,1,0
12968,0.0,34.0,0.0,0.0,2680.0,0.0,0.0,523.0,,0,...,0,0,0,1,0,0,0,0,1,0


In [3]:
# Separate the features from the target column.
y_data = df['Transported']
X_data = df.drop(columns='Transported')

# Number of labelled rows. It is used below as the boundary between the
# training rows and the unlabelled rows.
# NOTE(review): this assumes every labelled row precedes the unlabelled ones.
train_idx = len(y_data[~y_data.isna()])

In [4]:
# List the feature columns that remain once the target has been dropped.
X_data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinDeck_A',
       'CabinDeck_B', 'CabinDeck_C', 'CabinDeck_D', 'CabinDeck_E',
       'CabinDeck_F', 'CabinDeck_G', 'CabinDeck_T', 'CabinSide_P',
       'CabinSide_S'],
      dtype='object')

In [5]:
# Show the target column; rows without a label hold NaN.
y_data

0        0.0
1        1.0
2        0.0
3        0.0
4        1.0
        ... 
12965    NaN
12966    NaN
12967    NaN
12968    NaN
12969    NaN
Name: Transported, Length: 12970, dtype: float64

## 초기 성능

- `StandardScaler`로 데이터 표준화 (평균 0, 분산 1)
- 모든 파라미터를 default로 설정하여 학습 및 예측

![aries_result](data/image/aries_result.png)

LogisticRegression : 0.79074  

RandomForestClassifier : 0.78653

In [6]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on the labelled and unlabelled rows
# together, so test-set statistics influence the training features.
standard_scaler = StandardScaler()
scaled_X_data = standard_scaler.fit_transform(X_data)
scaled_X_data

array([[-0.75449851,  0.7180162 , -0.14663268, ..., -0.0291347 ,
         0.99523111, -0.99523111],
       [-0.75449851, -0.33506412, -0.14663268, ..., -0.0291347 ,
        -1.00479174,  1.00479174],
       [-0.75449851,  2.05191795,  6.81976228, ..., -0.0291347 ,
        -1.00479174,  1.00479174],
       ...,
       [ 1.32538367,  0.01596265, -0.14663268, ..., -0.0291347 ,
         0.99523111, -0.99523111],
       [-0.75449851,  0.36698943, -0.14663268, ..., -0.0291347 ,
         0.99523111, -0.99523111],
       [ 1.32538367,  0.99883763, -0.14663268, ..., -0.0291347 ,
        -1.00479174,  1.00479174]])

In [7]:
# The first `train_idx` rows carry labels; the remainder is the test set.
train_X, test_X = scaled_X_data[:train_idx], scaled_X_data[train_idx:]
train_y = y_data.iloc[:train_idx]

# Sanity check: features and labels must have the same number of rows.
(len(train_X), len(train_y))

(8693, 8693)

## Plan 1 Modeling

Algorithm

- LogisticRegression

- GaussianNB

- RandomForest

In [8]:
# Plan 1 candidates: each estimator is paired by position with its grid below.
plan1_algorithms = [LogisticRegression(), GaussianNB(), RandomForestClassifier()]
plan1_c_params = [0.001, 0.1, 1.0, 10.0, 20.0, 100.0]

plan1_params = [
    # LogisticRegression: two sub-grids, because each penalty is paired with
    # its own solver.
    [
        {"solver": ["saga"], "penalty": ["l1"], "C": plan1_c_params},
        {"solver": ["liblinear"], "penalty": ["l2"], "C": plan1_c_params},
    ],
    # GaussianNB: an empty grid, i.e. a single candidate with default settings.
    {},
    # RandomForestClassifier
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
        "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    },
]

plan1_params

[[{'solver': ['saga'],
   'penalty': ['l1'],
   'C': [0.001, 0.1, 1.0, 10.0, 20.0, 100.0]},
  {'solver': ['liblinear'],
   'penalty': ['l2'],
   'C': [0.001, 0.1, 1.0, 10.0, 20.0, 100.0]}],
 {},
 {'criterion': ['gini', 'entropy'],
  'max_depth': [10, 8, 7, 6, 5, 4, 3, 2],
  'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}]

In [9]:
# Run one 5-fold grid search per Plan 1 algorithm and keep the fitted search
# objects in the same order as `plan1_algorithms`.
plan1_estimator_results = []

for estimator, params in zip(plan1_algorithms, plan1_params):
    gs_estimator = GridSearchCV(
        estimator=estimator,
        param_grid=params,
        scoring=['accuracy'],
        refit="accuracy",
        cv=5,
        verbose=1,
        n_jobs=4,
    )
    # `fit` returns the search object itself.
    plan1_estimator_results.append(gs_estimator.fit(train_X, train_y))

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [10]:
# LogisticRegression: best mean cross-validated accuracy found by its grid search.
plan1_estimator_results[0].best_score_

0.7891415446492204

In [11]:
# GaussianNB: mean cross-validated accuracy of its single (default) candidate.
plan1_estimator_results[1].best_score_

0.707241837729314

In [12]:
# RandomForest: best mean cross-validated accuracy found by its grid search.
plan1_estimator_results[2].best_score_

0.8012220824501999

In [13]:
# Take the random forest that the Plan 1 search refit with its best
# parameters and predict the unlabelled rows.
plan1_best_RF = plan1_estimator_results[2].best_estimator_
test_y = plan1_best_RF.predict(test_X)
test_y


array([1., 0., 1., ..., 1., 1., 1.])

In [14]:
# Start from the sample submission, overwrite its target column with the
# predictions (cast from 0./1. to bool), and save the file to upload.
submission_df = pd.read_csv('data/sample_submission.csv')
submission_df['Transported'] = test_y.astype(bool)
submission_df.to_csv('data/plan1_best_RF.csv', index=False)

## Plan 1 Result

- Score : 0.79074 → 0.79424

- Leaderboard : 1740위 → 1499위

![plan1_best_RF](data/image/plan1_best_RF.png)

In [15]:
# Flatten every candidate from the three Plan 1 grid searches into one table:
# one row per candidate, one column per hyper-parameter.
plan1_attributes = ["model", "accuracy", "penalty", "solver", "C", "criterion", "max_depth", "min_samples_leaf"]
# Everything after "model" and "accuracy" is a hyper-parameter column.
plan1_param_names = plan1_attributes[2:]
plan1_dict = defaultdict(list)

algorithm_name = ["LogisticRegression", "GaussianNB", "RandomForestClassifier"]

for model_name, search in zip(algorithm_name, plan1_estimator_results):
    cv_results = search.cv_results_
    for accuracy, candidate_params in zip(cv_results["mean_test_accuracy"], cv_results["params"]):
        plan1_dict["model"].append(model_name)
        plan1_dict["accuracy"].append(accuracy)
        # Fill every parameter column for this row. A parameter the candidate
        # does not use becomes None, so all columns grow in lock-step and
        # cannot drift out of alignment, even when candidates of the same
        # search carry different keys.
        for param_name in plan1_param_names:
            plan1_dict[param_name].append(candidate_params.get(param_name))

plan1_result_df = pd.DataFrame(plan1_dict, columns=plan1_attributes)
plan1_result_df

Unnamed: 0,model,accuracy,penalty,solver,C,criterion,max_depth,min_samples_leaf
0,LogisticRegression,0.718396,l1,saga,0.001,,,
1,LogisticRegression,0.787876,l1,saga,0.100,,,
2,LogisticRegression,0.789141,l1,saga,1.000,,,
3,LogisticRegression,0.789142,l1,saga,10.000,,,
4,LogisticRegression,0.789142,l1,saga,20.000,,,
...,...,...,...,...,...,...,...,...
152,RandomForestClassifier,0.739333,,,,entropy,2.0,5.0
153,RandomForestClassifier,0.740598,,,,entropy,2.0,6.0
154,RandomForestClassifier,0.738643,,,,entropy,2.0,7.0
155,RandomForestClassifier,0.738529,,,,entropy,2.0,8.0


In [16]:
# Top 30 candidates across all Plan 1 models, ranked by mean CV accuracy.
plan1_result_df.sort_values('accuracy', ascending=False)[:30]

Unnamed: 0,model,accuracy,penalty,solver,C,criterion,max_depth,min_samples_leaf
14,RandomForestClassifier,0.801222,,,,gini,10.0,2.0
13,RandomForestClassifier,0.800187,,,,gini,10.0,1.0
15,RandomForestClassifier,0.799842,,,,gini,10.0,3.0
16,RandomForestClassifier,0.799841,,,,gini,10.0,4.0
90,RandomForestClassifier,0.799152,,,,entropy,10.0,6.0
87,RandomForestClassifier,0.799152,,,,entropy,10.0,3.0
85,RandomForestClassifier,0.799036,,,,entropy,10.0,1.0
88,RandomForestClassifier,0.798806,,,,entropy,10.0,4.0
86,RandomForestClassifier,0.798001,,,,entropy,10.0,2.0
20,RandomForestClassifier,0.797772,,,,gini,10.0,8.0


In [17]:
# LogisticRegression candidates only, best accuracy first.
plan1_result_df[plan1_result_df['model'] == 'LogisticRegression'].sort_values('accuracy', ascending=False)

Unnamed: 0,model,accuracy,penalty,solver,C,criterion,max_depth,min_samples_leaf
3,LogisticRegression,0.789142,l1,saga,10.0,,,
4,LogisticRegression,0.789142,l1,saga,20.0,,,
9,LogisticRegression,0.789142,l2,liblinear,10.0,,,
2,LogisticRegression,0.789141,l1,saga,1.0,,,
5,LogisticRegression,0.789026,l1,saga,100.0,,,
10,LogisticRegression,0.789026,l2,liblinear,20.0,,,
11,LogisticRegression,0.789026,l2,liblinear,100.0,,,
8,LogisticRegression,0.788681,l2,liblinear,1.0,,,
1,LogisticRegression,0.787876,l1,saga,0.1,,,
7,LogisticRegression,0.786381,l2,liblinear,0.1,,,


In [18]:
# LogisticRegression candidates only, best accuracy first.
# NOTE(review): this repeats the LogisticRegression query verbatim; it may
# have been meant to filter on a different model - confirm.
plan1_result_df[plan1_result_df['model'] == 'LogisticRegression'].sort_values('accuracy', ascending=False)

Unnamed: 0,model,accuracy,penalty,solver,C,criterion,max_depth,min_samples_leaf
3,LogisticRegression,0.789142,l1,saga,10.0,,,
4,LogisticRegression,0.789142,l1,saga,20.0,,,
9,LogisticRegression,0.789142,l2,liblinear,10.0,,,
2,LogisticRegression,0.789141,l1,saga,1.0,,,
5,LogisticRegression,0.789026,l1,saga,100.0,,,
10,LogisticRegression,0.789026,l2,liblinear,20.0,,,
11,LogisticRegression,0.789026,l2,liblinear,100.0,,,
8,LogisticRegression,0.788681,l2,liblinear,1.0,,,
1,LogisticRegression,0.787876,l1,saga,0.1,,,
7,LogisticRegression,0.786381,l2,liblinear,0.1,,,


In [19]:
# GaussianNB candidates only (a single row, since its grid is empty).
plan1_result_df[plan1_result_df['model'] == 'GaussianNB'].sort_values('accuracy', ascending=False)

Unnamed: 0,model,accuracy,penalty,solver,C,criterion,max_depth,min_samples_leaf
12,GaussianNB,0.707242,,,,,,


## Plan 1 Feedback

1. `RandomForestClassifier`의 `max_depth`를 높이자 !  
`max_depth`가 작아질수록 성능이 떨어짐  
그 외 다른 파라미터에서는 큰 의미가 없었음

2. `LogisticRegression`의 `C`는 너무 작지도, 너무 크지도 않은 10이 적절  
`LogisticRegression`은 `C`가 10일 때 `penalty`, `solver`에 관계없이 가장 좋은 성능을 보임  
`C`가 매우 작을 때(0.1, 0.001) 제약이 커지므로 성능이 떨어지는 것을 확인

3. `GaussianNB`의 성능은 0.707242  
조절할 파라미터가 없음

4. 다른 ML 모델로 학습해보자 !  
`LogisticRegression`, `GaussianNB` 모델로 성능을 높이는 데 한계가 있음  
다른 앙상블 모델 `XGBoost`, `LightGBM`으로 예측해보기

## Plan 2 Modeling

In [34]:
# Plan 2: repeat the random-forest search with deeper trees than in Plan 1.
estimator = RandomForestClassifier()
plan2_rf_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [25, 20, 18, 15, 12, 10, 9],
    "min_samples_leaf": list(range(1, 10)),
}
plan2_gs_estimator = GridSearchCV(
    estimator=estimator,
    param_grid=plan2_rf_params,
    scoring=['accuracy'],
    refit="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
plan2_gs_estimator.fit(train_X, train_y)

Fitting 5 folds for each of 126 candidates, totalling 630 fits


In [35]:
# Best mean cross-validated accuracy found by the Plan 2 search.
plan2_gs_estimator.best_score_

0.8030625513254115

In [36]:
# Tabulate the Plan 2 search: one row per candidate, holding its mean CV
# accuracy and the parameter values that produced it.
plan2_attributes = ["accuracy", "criterion", "max_depth", "min_samples_leaf"]
plan2_cv_results = plan2_gs_estimator.cv_results_
number_of_estimators = len(plan2_cv_results["mean_fit_time"])

plan2_dict = defaultdict(list)
plan2_dict["accuracy"].extend(plan2_cv_results["mean_test_accuracy"])

# Every candidate comes from the same grid, so each one contributes exactly
# one value to each parameter column.
for candidate in plan2_cv_results["params"]:
    for name, value in candidate.items():
        plan2_dict[name].append(value)

plan2_result_df = pd.DataFrame(plan2_dict, columns=plan2_attributes)
plan2_result_df

Unnamed: 0,accuracy,criterion,max_depth,min_samples_leaf
0,0.787762,gini,25,1
1,0.799842,gini,25,2
2,0.799612,gini,25,3
3,0.799037,gini,25,4
4,0.800072,gini,25,5
...,...,...,...,...
121,0.796621,entropy,9,5
122,0.796046,entropy,9,6
123,0.796276,entropy,9,7
124,0.796046,entropy,9,8


In [37]:
# Top 20 Plan 2 candidates, ranked by mean CV accuracy.
plan2_result_df.sort_values('accuracy', ascending=False)[:20]

Unnamed: 0,accuracy,criterion,max_depth,min_samples_leaf
102,0.803063,entropy,12,4
74,0.802717,entropy,20,3
31,0.802603,gini,15,5
101,0.802488,entropy,12,3
32,0.801798,gini,15,6
75,0.801683,entropy,20,4
87,0.801568,entropy,18,7
65,0.801568,entropy,25,3
108,0.801566,entropy,10,1
13,0.801338,gini,20,5


In [39]:
# Top 10 candidates that use the entropy criterion.
plan2_result_df[plan2_result_df['criterion'] == 'entropy'].sort_values('accuracy', ascending=False)[:10]

Unnamed: 0,accuracy,criterion,max_depth,min_samples_leaf
102,0.803063,entropy,12,4
74,0.802717,entropy,20,3
101,0.802488,entropy,12,3
75,0.801683,entropy,20,4
87,0.801568,entropy,18,7
65,0.801568,entropy,25,3
108,0.801566,entropy,10,1
103,0.801222,entropy,12,5
76,0.801108,entropy,20,5
79,0.801108,entropy,20,8


`criterion`이 `entropy`이면 최고 점수는 `max_depth` 12에서 나왔지만, 상위 10개 중 가장 많이 등장하는 `max_depth` 20이 안정적으로 적절

In [40]:
# Top 10 candidates that use the gini criterion.
plan2_result_df[plan2_result_df['criterion'] == 'gini'].sort_values('accuracy', ascending=False)[:10]

Unnamed: 0,accuracy,criterion,max_depth,min_samples_leaf
31,0.802603,gini,15,5
32,0.801798,gini,15,6
13,0.801338,gini,20,5
6,0.800992,gini,25,7
38,0.800762,gini,12,3
37,0.800762,gini,12,2
24,0.800532,gini,18,7
23,0.800418,gini,18,6
30,0.800418,gini,15,4
34,0.800417,gini,15,8


`criterion`이 `gini`이면 `max_depth` 15가 적절

In [24]:
# Predict the unlabelled rows with the forest that the Plan 2 search refit
# using its best parameters, then write the submission file.
plan2_best_RF = plan2_gs_estimator.best_estimator_
test_y = plan2_best_RF.predict(test_X)
submission_df['Transported'] = test_y.astype(bool)
submission_df.to_csv('data/plan2_best_RF.csv', index=False)

![plan2_best_RF](data/image/plan2_best_RF.png)

### best RF 예측 결과
Overfitting이 일어나 성능이 오히려 더 떨어졌음

In [42]:
# Train one forest with the best-scoring gini configuration from the Plan 2
# table (max_depth=15, min_samples_leaf=5) and write its predictions.
plan2_gini_estimator = RandomForestClassifier(criterion='gini', max_depth=15, min_samples_leaf=5)
plan2_gini_estimator.fit(train_X, train_y)
test_y = plan2_gini_estimator.predict(test_X)
submission_df['Transported'] = test_y.astype(bool)
submission_df.to_csv('data/plan2_gini_RF.csv', index=False)

![plan2_gini_RF](data/image/plan2_gini_RF.png)

### XGB, LGBM

In [43]:
# Baseline the two gradient-boosting libraries with default parameters and
# write one submission file per model.
new_algorithm = [XGBClassifier(), LGBMClassifier()]
new_algorithm_name = ["XGB", "LGBM"]

for estimator_name, estimator in zip(new_algorithm_name, new_algorithm):
    estimator.fit(train_X, train_y)
    test_y = estimator.predict(test_X)
    # The submission expects booleans, so cast the predicted labels.
    submission_df['Transported'] = test_y.astype(bool)
    output_path = f'data/plan2_{estimator_name}.csv'
    submission_df.to_csv(output_path, index=False)

![plan2_XGB_LGBM](data/image/plan2_XGBoost_LGBM.png)

아무리 다른 모델의 파라미터를 조절해도 LGBM, XGB의 default만도 못하다 ,,

## Plan 2 Result

- Score : 0.79424 → 0.79448 → 0.79775

- Leaderboard : 1499위 → 1228위

![plan2_leaderboard](data/image/plan2_Leaderboard.png)

## Plan 2 Feedback

1. `RandomForestClassifier`로 최대 성능을 내보자 !  
`LightGBM`으로 더 좋은 성능을 낼 수 있지만 알고리즘에 대한 이해 부족  
성능 자체보다는 파라미터 조절과 모델 외 다양한 접근을 시도하는 것에 초점 (그래도 `XGB` default만 넘어보자 ,,)  

2. Overfitting 문제를 PCA로 해결할 수 있나 ?

## Plan 3 Modeling

In [44]:
# Plan 3: a finer random-forest grid over the depth range 12-24.
estimator = RandomForestClassifier()
plan3_rf_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [24, 22, 20, 18, 17, 16, 15, 14, 12],
    "min_samples_leaf": list(range(1, 9)),
}
plan3_gs_estimator = GridSearchCV(
    estimator=estimator,
    param_grid=plan3_rf_params,
    scoring=['accuracy'],
    refit="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
plan3_gs_estimator.fit(train_X, train_y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [45]:
# Predict the unlabelled rows with the forest that the Plan 3 search refit
# using its best parameters, then write the submission file.
plan3_RF_best = plan3_gs_estimator.best_estimator_
test_y = plan3_RF_best.predict(test_X)
submission_df['Transported'] = test_y.astype(bool)
submission_df.to_csv('data/plan3_RF_best.csv', index=False)

![plan3_best_RF](data/image/plan3_best_RF.png)

### PCA

In [46]:
# Fit a PCA that keeps every component (as many as there are feature
# columns), so the variance of each component can be inspected.
pca = PCA(n_components=scaled_X_data.shape[1])
Z = pca.fit_transform(scaled_X_data)

In [47]:
# Variance captured by each principal component.
pca.explained_variance_

array([3.44094472e+00, 2.59497686e+00, 2.00019733e+00, 1.62671833e+00,
       1.55274326e+00, 1.25643229e+00, 1.12165707e+00, 1.09813438e+00,
       1.07161579e+00, 1.03805488e+00, 1.00221583e+00, 9.42893028e-01,
       9.12446880e-01, 8.51993762e-01, 8.37885503e-01, 7.96992480e-01,
       7.34269067e-01, 6.72248843e-01, 3.57112814e-01, 9.23174511e-02,
       2.74966937e-29, 1.78879568e-29, 8.38472233e-30, 4.92426735e-30])

In [48]:
# Share of the total variance captured by each principal component.
pca.explained_variance_ratio_

array([1.43361642e-01, 1.08115699e-01, 8.33351299e-02, 6.77747044e-02,
       6.46926476e-02, 5.23473092e-02, 4.67321077e-02, 4.57520715e-02,
       4.46472151e-02, 4.32489517e-02, 4.17557734e-02, 3.92841804e-02,
       3.80156887e-02, 3.54970030e-02, 3.49092042e-02, 3.32054596e-02,
       3.05921856e-02, 2.80082088e-02, 1.48785533e-02, 3.84626389e-03,
       1.14560724e-30, 7.45274068e-31, 3.49336494e-31, 2.05161987e-31])

In [49]:
# Plot the running total of the explained-variance ratio against the number
# of components kept.
evr_cumsum = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(evr_cumsum) + 1)

px.area(
    x=component_counts,
    y=evr_cumsum,
    labels={"x": "Components", "y": "Explained Variance"},
)

In [50]:
from sklearn.base import clone

# For several PCA sizes: project the scaled features, rerun the Plan 3 grid
# search on the projection, and write one submission file per size.
#
# Everything inside the loop uses its own names. Rebinding the notebook-wide
# `train_X` / `test_X` / `pca` / `Z`, or refitting `plan3_gs_estimator`
# itself, would leave PCA-reduced state behind, and any cell re-run
# afterwards would silently pick it up.
component_num = [14, 15, 16, 17, 18]
for n in component_num:
    pca_n = PCA(n_components=n)
    Z_n = pca_n.fit_transform(scaled_X_data)
    pca_train_X = Z_n[:train_idx]
    pca_test_X = Z_n[train_idx:]

    # Unfitted copy with the same estimator, grid and CV settings.
    pca_gs_estimator = clone(plan3_gs_estimator)
    pca_gs_estimator.fit(pca_train_X, train_y)

    test_y = pca_gs_estimator.best_estimator_.predict(pca_test_X)
    submission_df['Transported'] = test_y.astype(bool)
    submission_df.to_csv(f'data/plan3_pca{n}.csv', index=False)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits


## Plan 3 Result

0.79424 이상 높이기 실패 ,,  
PCA로 오버피팅을 해결할 수 없음

![pca](data/image/plan3_pca16.png)

![pca](data/image/plan3_pca18.png)

## Feature Importances

In [52]:
# Feature importances of the best Plan 1 random forest, one value per input column.
plan1_estimator_results[2].best_estimator_.feature_importances_

array([0.17381741, 0.05087004, 0.0019723 , 0.12717207, 0.08787112,
       0.08095788, 0.1515969 , 0.13233866, 0.03676861, 0.03749969,
       0.01130208, 0.00762314, 0.00363619, 0.00749324, 0.00174099,
       0.00782112, 0.00751644, 0.00267467, 0.01343428, 0.0185913 ,
       0.01483496, 0.        , 0.01132337, 0.01114354])

In [56]:
# Column names, in the same order as the feature columns the models were trained on.
X_data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinDeck_A',
       'CabinDeck_B', 'CabinDeck_C', 'CabinDeck_D', 'CabinDeck_E',
       'CabinDeck_F', 'CabinDeck_G', 'CabinDeck_T', 'CabinSide_P',
       'CabinSide_S'],
      dtype='object')

## Backlog
- `RandomForestClassifier` 외 다른 앙상블 모델로 파라미터 조절
- 모델 학습을 위한 Pipeline 구축