In [17]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [19]:
# Impute missing values.
#
# Every fill value is computed from the training frame only and then applied to
# both frames, so a missing entry is replaced by the same value in train and
# test (the model is fitted on the training distribution).

# Numeric columns: fill with the training median.
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in num_cols:
    fill_value = train[col].median()
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# Categorical columns: fill with the most frequent training value.
# mode() returns every tied value in sorted order; [0] takes the first one.
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for col in cat_cols:
    fill_value = train[col].mode()[0]
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)


In [20]:
# Split Cabin ("deck/num/side") into three separate columns.
cabin_parts = ['Deck', 'Num', 'Side']
for frame in (train, test):
    # Split once per frame; rows with a missing Cabin stay NaN in every part.
    pieces = frame['Cabin'].str.split('/')
    for position, part in enumerate(cabin_parts):
        frame[part] = pieces.str[position]

# Fill missing cabin parts with the most frequent *training* value, applied to
# both frames so train and test are imputed consistently.
cabin_fill = train[cabin_parts].mode().iloc[0]
train[cabin_parts] = train[cabin_parts].fillna(cabin_fill)
test[cabin_parts] = test[cabin_parts].fillna(cabin_fill)

# Convert Num to an integer; anything that does not parse as a number becomes 0.
for frame in (train, test):
    frame['Num'] = pd.to_numeric(frame['Num'], errors='coerce').fillna(0).astype(int)

In [21]:
# Integer-encode the categorical columns.
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
for col in cat_cols:
    # The label -> integer mapping is learned from the training values only
    # (cast to str first) and then reused unchanged for the test frame.
    le = LabelEncoder().fit(train[col].astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

In [22]:
# Feature engineering: total spend = row-wise sum of the five amenity columns.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train, test):
    frame['TotalSpend'] = frame[spend_cols].sum(axis=1)

In [23]:
# Assemble the model inputs: y is the target column, X / X_test are the
# remaining columns once the identifier, raw Cabin and Name are removed.
non_feature_cols = ['PassengerId', 'Cabin', 'Name']
y = train['Transported']
X = train.drop(columns=non_feature_cols + ['Transported'])
X_test = test.drop(columns=non_feature_cols)

# Report how many features there are and what they are called.
print(f"特徵數量: {X.shape[1]}")
print(f"特徵列表: {X.columns.tolist()}")

特徵數量: 14
特徵列表: ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Num', 'Side', 'TotalSpend']


In [24]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [25]:
# Registries keyed by model label: `models` holds the fitted estimators and
# `scores` holds their validation accuracy.
models, scores = {}, {}

In [26]:
# Version 1: the original baseline, LightGBM with default hyper-parameters.
print("🚀 版本1: 原始LightGBM（默認參數）")
model1 = LGBMClassifier(random_state=42).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[],  # no callbacks: training always runs the full number of rounds
)
models['Original'] = model1

# Score on the hold-out split and register the result.
pred1 = model1.predict(X_val)
scores['Original'] = score1 = accuracy_score(y_val, pred1)
print(f"驗證準確率: {score1:.4f}")

🚀 版本1: 原始LightGBM（默認參數）
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
驗證準確率: 0.8091


In [27]:
# Version 2: default LightGBM with early stopping actually enabled.
# (With an empty callbacks list this model was an exact duplicate of version 1.)
print("\n🚀 版本2: 調整early stopping")
model2 = LGBMClassifier(random_state=42)
model2.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    # Stop adding trees once the validation metric has not improved for 20
    # consecutive rounds; verbose=False keeps the callback from logging.
    callbacks=[early_stopping(stopping_rounds=20, verbose=False)],
)

# NOTE(review): the same hold-out split drives early stopping and is scored
# below, so this accuracy is somewhat optimistic compared with the other models.
pred2 = model2.predict(X_val)
score2 = accuracy_score(y_val, pred2)
models['EarlyStopping'] = model2
scores['EarlyStopping'] = score2
print(f"驗證準確率: {score2:.4f}")


🚀 版本2: 調整early stopping
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
驗證準確率: 0.8091


In [28]:
# Version 3: default LightGBM with a different random seed.
# NOTE(review): the recorded run scored exactly the same as the seed-42 models;
# presumably the seed has no effect unless row/feature subsampling is enabled
# - confirm before counting this as an independent ensemble member.
print("\n🚀 版本3: 不同隨機種子")
model3 = LGBMClassifier(random_state=2024).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[],
)
models['DiffSeed'] = model3

# Score on the hold-out split and register the result.
pred3 = model3.predict(X_val)
scores['DiffSeed'] = score3 = accuracy_score(y_val, pred3)
print(f"驗證準確率: {score3:.4f}")


🚀 版本3: 不同隨機種子
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
驗證準確率: 0.8091


In [29]:
# Version 4: light tuning (kept deliberately simple).
print("\n🚀 版本4: 輕微調參")
tuned_params = dict(
    random_state=42,
    n_estimators=150,    # a few more trees
    learning_rate=0.08,  # learn slightly slower
)
model4 = LGBMClassifier(**tuned_params).fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[],
)
models['Tuned'] = model4

# Score on the hold-out split and register the result.
pred4 = model4.predict(X_val)
scores['Tuned'] = score4 = accuracy_score(y_val, pred4)
print(f"驗證準確率: {score4:.4f}")


🚀 版本4: 輕微調參
[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
驗證準確率: 0.8079


In [30]:
# Pick the best single model by validation accuracy (first one wins a tie).
best_single = max(scores, key=scores.get)
print(f"\n🏆 最佳單一模型: {best_single} ({scores[best_single]:.4f})")

# Simple ensemble: average the three best-scoring models.
print("\n🤝 嘗試簡單ensemble...")
sorted_models = sorted(scores.items(), key=lambda item: item[1], reverse=True)
top3_names = [name for name, _ in sorted_models[:3]]

# Average the predicted probability of the second class (column 1) across the
# selected models, then threshold at 0.5 to get a 0/1 vote.
val_probs = [models[name].predict_proba(X_val)[:, 1] for name in top3_names]
ensemble_probs = np.sum(val_probs, axis=0) / len(top3_names)
ensemble_pred = (ensemble_probs > 0.5).astype(int)

ensemble_score = accuracy_score(y_val, ensemble_pred)
print(f"Ensemble驗證準確率: {ensemble_score:.4f}")
print(f"使用模型: {top3_names}")


🏆 最佳單一模型: Original (0.8091)

🤝 嘗試簡單ensemble...
Ensemble驗證準確率: 0.8091
使用模型: ['Original', 'EarlyStopping', 'DiffSeed']


In [31]:
# Decide what to submit: the ensemble is used only if it strictly beats the
# best single model on the validation split; a tie goes to the single model.
use_ensemble = bool(ensemble_score > scores[best_single])
if use_ensemble:
    print(f"\n✅ 使用Ensemble (提升 {ensemble_score - scores[best_single]:.4f})")
    final_score = ensemble_score
else:
    print(f"\n✅ 使用單一模型 {best_single}")
    final_score = scores[best_single]

# Generate the final predictions.
print(f"\n🎯 生成最終預測...")

if use_ensemble:
    # Refit a fresh copy of each selected model on all labelled rows and
    # average the predicted probability of the second class (classes_[1]).
    final_probs = np.zeros(len(X_test))
    for name in top3_names:
        # clone() copies hyper-parameters only, so the validation-fitted
        # estimators stored in `models` (and the scores reported for them)
        # are not overwritten by this refit.
        refit_model = clone(models[name]).fit(X, y)
        final_probs += refit_model.predict_proba(X_test)[:, 1]
    final_probs /= len(top3_names)
    # Map the thresholded vote back onto the classifier's own labels so this
    # branch emits the same label type as predict() in the branch below,
    # instead of raw 0/1 integers.
    final_predictions = refit_model.classes_[(final_probs > 0.5).astype(int)]
    filename = 'submission_ensemble.csv'
else:
    # Single model: refit a fresh copy on all labelled rows, then predict.
    final_model = clone(models[best_single]).fit(X, y)
    final_predictions = final_model.predict(X_test)
    filename = f'submission_{best_single.lower()}.csv'


✅ 使用單一模型 Original

🎯 生成最終預測...
[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1884
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [32]:
# Build the submission table (one row per test passenger) and write it to disk
# without the index column.
submission_columns = {
    'PassengerId': test['PassengerId'],
    'Transported': final_predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(filename, index=False)

# Summary of the run: every model's validation accuracy (best first), the
# ensemble's accuracy, the final choice and where the predictions were saved.
ranked_names = sorted(scores, key=scores.get, reverse=True)

print(f"\n{'='*60}")
print(f"基於你的0.807成功版本的改進結果")
print(f"{'='*60}")
print(f"所有模型分數:")
for name in ranked_names:
    score = scores[name]
    print(f"  {name}: {score:.4f}")
print(f"Ensemble: {ensemble_score:.4f}")
print(f"")
print(f"最終選擇: {'Ensemble' if use_ensemble else best_single}")
print(f"最終分數: {final_score:.4f}")
print(f"預測 True 比例: {final_predictions.mean():.3f}")
print(f"保存文件: {filename}")
print(f"{'='*60}")
print(f"期望: 基於你的成功基礎，可能提升到 0.81+")
print(f"{'='*60}")


基於你的0.807成功版本的改進結果
所有模型分數:
  Original: 0.8091
  EarlyStopping: 0.8091
  DiffSeed: 0.8091
  Tuned: 0.8079
Ensemble: 0.8091

最終選擇: Original
最終分數: 0.8091
預測 True 比例: 0.520
保存文件: submission_original.csv
期望: 基於你的成功基礎，可能提升到 0.81+
