In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from xgboost import XGBClassifier as XGB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
#Cabinを分割且つ、欠損値補完
train["deck"] = train["Cabin"].apply(lambda x :x.split("/")[0] if isinstance(x, str) else "unknown")
train["num"] = train["Cabin"].apply(lambda x :x.split("/")[1] if isinstance(x, str) else "unknown")
train["side"] = train["Cabin"].apply(lambda x :x.split("/")[2] if isinstance(x, str) else "unknown")

test["deck"] = test["Cabin"].apply(lambda x :x.split("/")[0] if isinstance(x, str) else "unknown")
test["num"] = test["Cabin"].apply(lambda x :x.split("/")[1] if isinstance(x, str) else "unknown")
test["side"] = test["Cabin"].apply(lambda x :x.split("/")[2] if isinstance(x, str) else "unknown")

In [5]:
#Nameを姓と名に分割且つ、欠損値補完
train["first"] = train["Name"].apply(lambda x :x.split(" ")[0] if isinstance(x, str) else "unknown")
train["last"] = train["Name"].apply(lambda x :x.split(" ")[1] if isinstance(x, str) else "unknown")

test["first"] = test["Name"].apply(lambda x :x.split(" ")[0] if isinstance(x, str) else "unknown")
test["last"] = test["Name"].apply(lambda x :x.split(" ")[1] if isinstance(x, str) else "unknown")

In [6]:
#欠損値補完
train["Destination"] = train["Destination"].fillna("missingPL")
test["Destination"] = test["Destination"].fillna("missingPL")

In [7]:
train["VIP"].value_counts()

VIP
False    8291
True      199
Name: count, dtype: int64

In [8]:
#Age欠損値をtrain平均値で補完
train["Age"] = train["Age"].fillna(train["Age"].mean())
test["Age"] = test["Age"].fillna(train["Age"].mean())

In [9]:
train[train["VRDeck"] >= 1]["VIP"].value_counts()

VIP
False    2829
True      118
Name: count, dtype: int64

In [10]:
#HomePlanet欠損値補完
train["HomePlanet"] = train["HomePlanet"].fillna("missing")
test["HomePlanet"] = test["HomePlanet"].fillna("missing")

In [11]:
train["CryoSleep"].value_counts()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

In [12]:
#欠損値補完
train["CryoSleep"] = train["CryoSleep"].fillna("None")
test["CryoSleep"] = test["CryoSleep"].fillna("None")
train["VIP"] = train["VIP"].fillna("None")
test["VIP"] = test["VIP"].fillna("None")

train["RoomService"] = train["RoomService"].fillna(0)
train["FoodCourt"] = train["FoodCourt"].fillna(0)
train["ShoppingMall"] = train["ShoppingMall"].fillna(0)
train["Spa"] = train["Spa"].fillna(0)
train["VRDeck"] = train["VRDeck"].fillna(0)

test["RoomService"] = test["RoomService"].fillna(0)
test["FoodCourt"] = test["FoodCourt"].fillna(0)
test["ShoppingMall"] = test["ShoppingMall"].fillna(0)
test["Spa"] = test["Spa"].fillna(0)
test["VRDeck"] = test["VRDeck"].fillna(0)

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
 14  deck          8693 non-null   object 
 15  num           8693 non-null   object 
 16  side          8693 non-null   object 
 17  first         8693 non-null   object 
 18  last          8693 non-null 

In [14]:
#PassengerIdの乗客IDを取り出す
train["PassengerId_gg"] = train["PassengerId"].apply(lambda x :x.split("_")[0] if isinstance(x, str) else "unknown")

# PassengerId_ggとlastの組み合わせで同じ人が何人いるか数える
train["Family_num"] = train.groupby(["PassengerId_gg", "last"])["PassengerId"].transform("count")

# Family_num：2人以上なら1、1人だけなら0
train["Family"] = train["Family_num"].apply(lambda x: 1 if x > 1 else 0)

# Family_num削除
train.drop(columns=["Family_num"], inplace=True)

#テストデータも同様に処理
test["PassengerId_gg"] = test["PassengerId"].apply(lambda x :x.split("_")[0] if isinstance(x, str) else "unknown")
test["Family_num"] = test.groupby(["PassengerId_gg", "last"])["PassengerId"].transform("count")
test["Family"] = test["Family_num"].apply(lambda x: 1 if x > 1 else 0)
test.drop(columns=["Family_num"], inplace=True)

In [15]:
#特徴量をdrop、目的変数設定
trainX = train.drop(["Cabin","Name","PassengerId","Transported","PassengerId_gg","first","last",],axis=1)
y = train["Transported"]
testX = test.drop(["Cabin","Name","PassengerId","PassengerId_gg","first","last",],axis=1)

In [16]:
trainX["num"].value_counts()

num
unknown    199
82          28
86          22
19          22
56          21
          ... 
1644         1
1515         1
1639         1
1277         1
1894         1
Name: count, Length: 1818, dtype: int64

In [17]:
#numカラム内補完した"unknown"を0に変換
trainX["num"] = trainX["num"].replace("unknown", 0).astype(int)
testX["num"] = testX["num"].replace("unknown", 0).astype(int)

In [18]:
#trainX,testXをダミー変数化
trainX = pd.get_dummies(trainX,dtype=int)
testX = pd.get_dummies(testX,dtype=int)

In [19]:
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   num                        8693 non-null   int32  
 7   Family                     8693 non-null   int64  
 8   HomePlanet_Earth           8693 non-null   int32  
 9   HomePlanet_Europa          8693 non-null   int32  
 10  HomePlanet_Mars            8693 non-null   int32  
 11  HomePlanet_missing         8693 non-null   int32  
 12  CryoSleep_False            8693 non-null   int32  
 13  CryoSleep_True             8693 non-null   int32

In [20]:
#必要のないカラムを削除
trainX.drop(["HomePlanet_missing","CryoSleep_None","Destination_missingPL","VIP_None","deck_unknown","side_unknown"],axis=1,inplace=True)
testX.drop(["HomePlanet_missing","CryoSleep_None","Destination_missingPL","VIP_None","deck_unknown","side_unknown"],axis=1,inplace=True)

In [21]:
#目的変数を数値化
y = train["Transported"].astype(int)

In [22]:
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   num                        8693 non-null   int32  
 7   Family                     8693 non-null   int64  
 8   HomePlanet_Earth           8693 non-null   int32  
 9   HomePlanet_Europa          8693 non-null   int32  
 10  HomePlanet_Mars            8693 non-null   int32  
 11  CryoSleep_False            8693 non-null   int32  
 12  CryoSleep_True             8693 non-null   int32  
 13  Destination_55 Cancri e    8693 non-null   int32

In [23]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

In [24]:
#XGBモデル
xgb = XGB(use_label_encoder=False, eval_metric="auc", random_state=42)

param_grid = {'max_depth': [3, 5],'learning_rate': [0.1, 0.3,0.5],'n_estimators': [50, 100,150]}

gcv_xgb = GridSearchCV(xgb, param_grid,scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

In [25]:
gcv_xgb.fit(trainX,y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [26]:
pred_xgb = gcv_xgb.predict(testX)

In [27]:
xgb_best = gcv_xgb.best_estimator_

In [28]:
pred_xgb = pred_xgb.astype(bool)

In [29]:
pred_y_xgb = xgb_best.predict(trainX)

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
#CVスコア_xgb
cv_accuracy = accuracy_score(y, pred_y_xgb)
print(f"Stacking CV Accuracy: {cv_accuracy:.4f}")

Stacking CV Accuracy: 0.8197


In [None]:
sample["Transported"] = pred_xgb
sample.to_csv("submission_xgb_a.csv",index=None)
#LBスコア：0.80360

In [31]:
trainX.corrwith(y)

Age                         -0.074249
RoomService                 -0.241124
FoodCourt                    0.045583
ShoppingMall                 0.009391
Spa                         -0.218545
VRDeck                      -0.204874
num                         -0.043832
Family                       0.100146
HomePlanet_Earth            -0.169019
HomePlanet_Europa            0.176916
HomePlanet_Mars              0.019544
CryoSleep_False             -0.451744
CryoSleep_True               0.460132
Destination_55 Cancri e      0.108722
Destination_PSO J318.5-22    0.000092
Destination_TRAPPIST-1e     -0.094700
VIP_False                    0.024602
VIP_True                    -0.037261
deck_A                      -0.002623
deck_B                       0.144733
deck_C                       0.108193
deck_D                      -0.034046
deck_E                      -0.097965
deck_F                      -0.087753
deck_G                       0.016269
deck_T                      -0.014568
side_P      

In [32]:
#LGBMモデル
model_LG = LGBMClassifier(use_label_encoder=False, eval_metric="auc", random_state=42)
gcv_LG = GridSearchCV(model_LG, param_grid,scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

In [33]:
gcv_LG.fit(trainX,y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1650
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [34]:
gcv_LG.best_params_

{'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 50}

In [35]:
LG_best = gcv_LG.best_estimator_

In [36]:
pred_y_LG = LG_best.predict(trainX)



In [37]:
#CVスコア_LGBM
cv_accuracy = accuracy_score(y, pred_y_LG)
print(f"Stacking CV Accuracy: {cv_accuracy:.4f}")

Stacking CV Accuracy: 0.8240


In [38]:
pred_LG = LG_best.predict(testX).astype(bool)



In [None]:
sample["Transported"] = pred_LG
sample.to_csv("submission_LGBM_a.csv",index=None)
#LBスコア：0.80336

In [39]:
#CatBoostモデル
model_cat = CatBoostClassifier(verbose=0)
gcv_cat = GridSearchCV(model_cat, param_grid,scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

In [40]:
gcv_cat.fit(trainX,y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [41]:
cat_best = gcv_cat.best_estimator_

In [42]:
pred_y_cat = cat_best.predict(trainX)

In [43]:
#CVスコア_CAT
cv_accuracy = accuracy_score(y, pred_y_cat)
print(f"Stacking CV Accuracy: {cv_accuracy:.4f}")

Stacking CV Accuracy: 0.8149


In [44]:
pred_cat = cat_best.predict(testX).astype(bool)

In [None]:
sample["Transported"] = pred_cat
sample.to_csv("submission_cat_a.csv",index=None)
#LBスコア：0.80360

In [45]:
#スタッキング（XGB,LGBM,Cat）再グリッドサーチあり
estimators = [
    ('xgb', gcv_xgb),
    ('lgb', gcv_LG),
    ('cat', gcv_cat)
]

stacked_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)


In [46]:
stacked_model.fit(trainX,y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1650
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for 

In [47]:
pred_stacked = stacked_model.predict(testX).astype(bool)



In [48]:
#CVスコア_stacked
pred_y_staced = stacked_model.predict(trainX)



In [49]:
cv_accuracy = accuracy_score(y, pred_y_staced)
print(f"Stacking CV Accuracy: {cv_accuracy:.4f}")

Stacking CV Accuracy: 0.8192


In [None]:
sample["Transported"] = pred_stacked
sample.to_csv("submission_stacked.csv",index=None)
#LBスコア：0.80710(ベストスコア)

In [50]:
#スタッキング（oof_predsを特徴量とする）
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

base_models = [
    XGB(verbosity=0),
    CatBoostClassifier(verbose=0),
    LGBMClassifier()
]

meta_model = LogisticRegression()

# 各ベースモデルの out-of-fold 予測値
oof_preds = np.zeros((trainX.shape[0], len(base_models)))
test_preds = np.zeros((testX.shape[0], len(base_models)))

for i, model in enumerate(base_models):
    test_preds_i = np.zeros((testX.shape[0], kf.n_splits))
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(trainX)):
        X_train, X_val = trainX.iloc[train_idx], trainX.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        oof_preds[val_idx, i] = model.predict(X_val)
        test_preds_i[:, fold] = model.predict(testX)
    
    # 各foldの平均予測
    test_preds[:, i] = test_preds_i.mean(axis=1)

# メタモデルを学習（oof_predsを特徴量として）
meta_model.fit(oof_preds, y)

# メタモデルの予測
meta_preds = meta_model.predict(oof_preds)

# CVスコア（accuracy）を計算
cv_accuracy = accuracy_score(y, meta_preds)
print(f"Stacking CV Accuracy: {cv_accuracy:.4f}")

# テストデータに対する最終予測
final_test_preds = meta_model.predict(test_preds)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1649
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
[LightGBM] [Info] Number of positive: 3492, number of negative: 3462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1649
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 27
[LightGBM] [Info] [binary:

In [None]:
final_test_preds = final_test_preds.astype(bool)
sample["Transported"] = final_test_preds
sample.to_csv("submission_stacked_oof.csv",index=None)
#LBスコア：0.80383