In [140]:
# 套件
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

In [141]:
# 輸入資料集
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [142]:
print(train.head())  # 查看前幾行

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [143]:
# 處理缺失值：數值欄位
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in num_cols:
    train[col].fillna(train[col].median(), inplace=True) # 用中位數填補缺失值
    test[col].fillna(test[col].median(), inplace=True) # 用中位數填補缺失值

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True) # 用中位數填補缺失值
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].median(), inplace=True) # 用中位數填補缺失值
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj

In [144]:
# 處理缺失值：類別欄位
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for col in cat_cols:
    train[col].fillna(train[col].mode()[0], inplace=True) # 用眾數填補缺失值
    test[col].fillna(test[col].mode()[0], inplace=True) # 用眾數填補缺失值

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True) # 用眾數填補缺失值
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True) # 用眾數填補缺失值
  train[col].fillna(train[col].mode()[0], inplace=True) # 用眾數填補缺失值
  test[col].fillna(test[col].mode()[0

In [145]:
# 拆分 Cabin 為 Deck, Num, Side
train['Deck'] = train['Cabin'].str.split('/').str[0]
train['Num'] = train['Cabin'].str.split('/').str[1]
train['Side'] = train['Cabin'].str.split('/').str[2]

test['Deck'] = test['Cabin'].str.split('/').str[0]
test['Num'] = test['Cabin'].str.split('/').str[1]
test['Side'] = test['Cabin'].str.split('/').str[2]

# 填補 Cabin 的缺失值
train[['Deck', 'Num', 'Side']].fillna(train[['Deck', 'Num', 'Side']].mode().iloc[0], inplace=True)
test[['Deck', 'Num', 'Side']].fillna(test[['Deck', 'Num', 'Side']].mode().iloc[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[['Deck', 'Num', 'Side']].fillna(train[['Deck', 'Num', 'Side']].mode().iloc[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[['Deck', 'Num', 'Side']].fillna(test[['Deck', 'Num', 'Side']].mode().iloc[0], inplace=True)


In [146]:
# 將 Num 轉為數值型
train['Num'] = pd.to_numeric(train['Num'], errors='coerce').fillna(0).astype(int)
test['Num'] = pd.to_numeric(test['Num'], errors='coerce').fillna(0).astype(int)

In [147]:
# 編碼類別欄位
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

In [148]:
# 特徵工程：計算總消費，將 RoomService, FoodCourt, ShoppingMall, Spa, VRDeck 合併為總消費
train['TotalSpend'] = train[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test['TotalSpend'] = test[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

In [149]:
# 移除無用欄位
train.drop(['Name', 'Cabin'], axis=1, inplace=True)
test.drop(['Name', 'Cabin'], axis=1, inplace=True)

In [150]:
# 準備訓練與測試資料
X = train.drop(['PassengerId', 'Transported'], axis=1)
y = train['Transported']
X_test = test.drop(['PassengerId'], axis=1)

In [151]:
# 檢查資料類型
print(X.dtypes)

HomePlanet        int32
CryoSleep         int32
Destination       int32
Age             float64
VIP               int32
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Deck              int32
Num               int32
Side              int32
TotalSpend      float64
dtype: object


In [152]:
# 切分訓練與驗證集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [156]:
# 訓練模型
#model = LGBMClassifier(random_state=42)
#model.fit(X_train, y_train)
from lightgbm import early_stopping

model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train, 
          eval_set=[(X_val, y_val)], 
          callbacks=[early_stopping(stopping_rounds=50)])

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1885
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[71]	valid_0's binary_logloss: 0.389173


In [159]:
# 驗證
y_pred = model.predict(X_val)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')

Validation Accuracy: 0.8068


In [160]:
# 預測測試集
predictions = model.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': predictions
})
submission.to_csv('submission.csv', index=False)