# 資料探勘作業二 - 吸菸預測

## 1. 導入必要的套件

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier

# Directory used to store intermediate results (pickled checkpoints).
import os
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs('model_checkpoints', exist_ok=True)

## 2. 載入資料

In [None]:
# Load the training set, the test set and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

print("Initial shapes:")
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Checkpoint the untouched frames so later stages can be re-run from here.
raw_snapshot = {'train': train, 'test': test}
with open('model_checkpoints/raw_data.pkl', 'wb') as checkpoint:
    pickle.dump(raw_snapshot, checkpoint)

## 3. 資料預處理

In [None]:
# Stack train and test into one frame so imputation and the later feature
# transforms are fitted on both; test rows get a NaN placeholder target.
test['smoking'] = np.nan
data = pd.concat([train, test], ignore_index=True)
print("Combined data shape:", data.shape)

# Replace spaces in column names with underscores so the names used below
# (e.g. 'Urine_protein', 'dental_caries') match.
data.columns = data.columns.str.replace(' ', '_')

# Median-impute the feature columns only. The target is deliberately left
# out: imputing it would fill the test rows' placeholder with the median
# training label, which is a fabricated value rather than a real label.
feature_columns = [col for col in data.columns if col != 'smoking']
imputer = SimpleImputer(strategy='median')
data[feature_columns] = imputer.fit_transform(data[feature_columns])

# Identify categorical and numerical feature columns. The two lists must be
# disjoint: categorical columns are one-hot encoded later and must not be
# power-transformed/scaled first, so they are excluded here together with
# the target.
# NOTE(review): an identifier column (e.g. `id`), if present in the CSVs, is
# still treated as a numerical feature here - confirm whether that is intended.
categorical_columns = ['hearing(left)', 'hearing(right)', 'Urine_protein', 'dental_caries']
non_numerical = {'smoking', *categorical_columns}
numerical_columns = [
    col
    for col in data.select_dtypes(include=['float64', 'int64']).columns
    if col not in non_numerical
]

print("\nNumerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

# Checkpoint the preprocessed frame and the two column lists.
with open('model_checkpoints/preprocessed_data.pkl', 'wb') as f:
    pickle.dump({'data': data, 'numerical_columns': numerical_columns,
                 'categorical_columns': categorical_columns}, f)

## 4. 特徵工程

In [None]:
# Numerical features: Yeo-Johnson power transform first, then min-max scaling.
# Each transformer is fitted on the frame produced by the previous step.
scaler = MinMaxScaler()
power_transformer = PowerTransformer(method='yeo-johnson')
for numeric_step in (power_transformer, scaler):
    data[numerical_columns] = numeric_step.fit_transform(data[numerical_columns])

# One-hot encode the categorical columns, dropping the first level of each.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(data[categorical_columns])
encoded_names = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded, columns=encoded_names, index=data.index)

# Swap the raw categorical columns for their encoded counterparts.
data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

# Add a KMeans cluster id as an extra feature; the target column is kept out
# of the clustering input.
kmeans = KMeans(n_clusters=5, random_state=42)
clustering_input = data.drop(columns=['smoking'])
data['kmeans_cluster'] = kmeans.fit_predict(clustering_input)

# Checkpoint the engineered frame together with the fitted transformers.
feature_checkpoint = {
    'data': data,
    'scaler': scaler,
    'power_transformer': power_transformer,
    'encoder': encoder,
    'kmeans': kmeans
}
with open('model_checkpoints/feature_engineering.pkl', 'wb') as checkpoint:
    pickle.dump(feature_checkpoint, checkpoint)

## 5. 資料分割

In [None]:
# Row counts of the original frames; the combined frame holds the training
# rows first, followed by the test rows.
train_length, test_length = len(train), len(test)

# Cut the combined frame back into its train and test parts.
train_rows = data.iloc[:train_length]
test_rows = data.iloc[train_length:]

X_train = train_rows.drop(columns=['smoking'])
X_test = test_rows.drop(columns=['smoking'])
y_train = train_rows['smoking'].astype(int)

print("Final shapes:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)

# Checkpoint the split so model training can be re-run on its own.
split_checkpoint = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train
}
with open('model_checkpoints/split_data.pkl', 'wb') as checkpoint:
    pickle.dump(split_checkpoint, checkpoint)

## 6. 模型訓練與交叉驗證

In [None]:
# Set up the stratified 5-fold splitter and the three base models.
# NOTE(review): `use_label_encoder` is reported as deprecated/unused by recent
# XGBoost releases - confirm against the installed version before removing it.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
xgb_model = XGBClassifier(tree_method='hist', eval_metric='logloss', use_label_encoder=False, base_score=0.5)
lgbm_model = LGBMClassifier(objective='binary')
catboost_model = CatBoostClassifier(verbose=0)

# (result key, display name, estimator) for each base model. All three are
# trained the same way, so one loop replaces three copy-pasted blocks.
model_specs = [
    ('xgb', 'XGBoost', xgb_model),
    ('lgbm', 'LightGBM', lgbm_model),
    ('catboost', 'CatBoost', catboost_model),
]

# Running SUM of test-set probabilities per model. It is divided by the
# number of folds that actually succeeded after the loop, so a failed fold
# cannot silently shrink the averaged predictions.
test_pred_sums = {key: np.zeros(len(X_test)) for key, _, _ in model_specs}

# Validation score of every successfully trained fold, per model.
validation_scores = {key: [] for key, _, _ in model_specs}

# Cross-validated training.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"\nFold {fold}")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    for key, label, model in model_specs:
        # Best effort: a failure in one model/fold is reported and skipped so
        # the remaining models and folds still run.
        try:
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
            fold_test_proba = model.predict_proba(X_test)[:, 1]
            fold_score = model.score(X_val, y_val)
        except Exception as e:
            print(f"{label} training error: {e}")
            continue

        # Record the fold only once every step succeeded, keeping the
        # prediction sum and the score list in sync.
        test_pred_sums[key] += fold_test_proba
        validation_scores[key].append(fold_score)
        print(f"{label} validation score: {fold_score:.4f}")

    # Checkpoint the models as fitted on this fold, plus the scores so far.
    with open(f'model_checkpoints/fold_{fold}_models.pkl', 'wb') as f:
        pickle.dump({
            'xgb_model': xgb_model,
            'lgbm_model': lgbm_model,
            'catboost_model': catboost_model,
            'validation_scores': validation_scores
        }, f)

# Average each model's test predictions over the folds that succeeded.
successful_folds = {key: len(scores) for key, scores in validation_scores.items()}
for key, label, _ in model_specs:
    if successful_folds[key] < skf.n_splits:
        print(f"Warning: {label} completed only {successful_folds[key]} of {skf.n_splits} folds")

# max(..., 1) keeps an all-zero vector (instead of dividing by zero) when a
# model failed on every fold.
xgb_preds = test_pred_sums['xgb'] / max(successful_folds['xgb'], 1)
lgbm_preds = test_pred_sums['lgbm'] / max(successful_folds['lgbm'], 1)
catboost_preds = test_pred_sums['catboost'] / max(successful_folds['catboost'], 1)

# Checkpoint the averaged predictions and all validation scores.
with open('model_checkpoints/predictions.pkl', 'wb') as f:
    pickle.dump({
        'xgb_preds': xgb_preds,
        'lgbm_preds': lgbm_preds,
        'catboost_preds': catboost_preds,
        'validation_scores': validation_scores
    }, f)

## 7. 模型融合與輸出結果

In [None]:
# Blend the three models' test probabilities with fixed weights.
xgb_weight, lgbm_weight, catboost_weight = 0.34, 0.33, 0.33
final_preds = xgb_weight * xgb_preds + lgbm_weight * lgbm_preds + catboost_weight * catboost_preds

# Write the blended probabilities into the submission template and save it.
sample_submission['smoking'] = final_preds
sample_submission.to_csv('submission.csv', index=False)

# Report each model's validation score averaged over the folds.
print("\nFinal validation scores (mean):")
xgb_mean_score = np.mean(validation_scores['xgb'])
print(f"XGBoost: {xgb_mean_score:.4f}")
lgbm_mean_score = np.mean(validation_scores['lgbm'])
print(f"LightGBM: {lgbm_mean_score:.4f}")
catboost_mean_score = np.mean(validation_scores['catboost'])
print(f"CatBoost: {catboost_mean_score:.4f}")