# Load Processed Data

In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# 2. Load data
# NOTE(review): these are absolute, machine-specific paths, so this cell only
# runs where this exact directory layout exists.
train = pd.read_csv(r"C:\Users\ghwns\HJ_git\titanic-survival-prediction\data\train_processed.csv")
test = pd.read_csv(r"C:\Users\ghwns\HJ_git\titanic-survival-prediction\data\test_processed.csv")

# 3. Define features and target
# `drop` returns a new frame, so the column assignments below do not touch `train`.
X = train.drop(columns=['Survived'])
y = train['Survived']

# 4. Encode categorical variables
# Each encoder is fitted on the values of train and test together. Fitting on
# train alone makes `transform` raise ValueError as soon as the test file holds
# a label that never occurs in train. LabelEncoder sorts its classes, so when
# test introduces no new labels the resulting codes are the same as before.
for col in ['Sex', 'Title']:
    le = LabelEncoder()
    le.fit(pd.concat([X[col], test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    test[col] = le.transform(test[col])

# 5. Split training and validation sets
# 80/20 hold-out with a fixed seed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6-8. Hyperparameter tuning
# All three searches share the same protocol, so it lives in one helper.
def _fit_grid_search(estimator, param_grid, features, target):
    """Fit a 5-fold, accuracy-scored grid search and return the fitted search object."""
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
    search.fit(features, target)
    return search


# 6. Hyperparameter tuning: RandomForest
rf_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
)
rf_search = _fit_grid_search(RandomForestClassifier(random_state=42), rf_grid, X_train, y_train)
best_rf = rf_search.best_estimator_
print("Best RF:", rf_search.best_params_)

# 7. Hyperparameter tuning: GradientBoosting
gb_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
gb_search = _fit_grid_search(GradientBoostingClassifier(random_state=42), gb_grid, X_train, y_train)
best_gb = gb_search.best_estimator_
print("Best GB:", gb_search.best_params_)

# 8. Hyperparameter tuning: XGBoost
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_search = _fit_grid_search(xgb_base, xgb_grid, X_train, y_train)
best_xgb = xgb_search.best_estimator_
print("Best XGB:", xgb_search.best_params_)

# 9. Define Soft VotingClassifier
# Combines the three tuned models; 'soft' voting is requested, and the ensemble
# is fitted on the training split only so X_val stays unseen for step 10.
voting_clf = VotingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('xgb', best_xgb)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

# 10. Print validation accuracy
val_preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, val_preds)
print(f'Validation Accuracy (Tuned Voting): {acc:.4f}')

# 11. Predict on test set and save submission
# Select the test columns by name, in the order used for fitting, instead of
# relying on the CSV happening to have the same column set and order. A missing
# feature now fails here with a KeyError that names the column.
test_features = test[X_train.columns]
test_preds = voting_clf.predict(test_features)
# NOTE(review): PassengerId is generated as consecutive integers starting at 892
# rather than read from the test file - assumes the test rows are still in their
# original order; confirm against the raw data.
submission = pd.DataFrame({
    'PassengerId': np.arange(892, 892 + len(test_preds)),
    'Survived': test_preds
})
submission.to_csv(r"C:\Users\ghwns\HJ_git\titanic-survival-prediction\submissions\submission_v4_soft_voting_tuned.csv", index=False)
print("✅ Tuned submission saved as submission_v4_soft_voting_tuned.csv")


Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/d9/dd/0b593d1a5ee431b33a1fdf4ddb5911c312ed3bb598ef9e17457af2ee7b34/optuna-4.3.0-py3-none-any.whl.metadata
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Obtaining dependency information for alembic>=1.5.0 from https://files.pythonhosted.org/packages/41/18/d89a443ed1ab9bcda16264716f809c663866d4ca8de218aa78fd50b38ead/alembic-1.15.2-py3-none-any.whl.metadata
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Obtaining dependency information for colorlog from https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl.metadata
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Obtaining dependency information for Mako from https://files.python

[I 2025-04-30 15:06:21,339] A new study created in memory with name: no-name-e4b854fd-9f2f-4713-b42f-8ce8b04c6395
[I 2025-04-30 15:06:22,307] Trial 0 finished with value: 0.7808988764044944 and parameters: {'n_estimators': 287, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7808988764044944.
[I 2025-04-30 15:06:22,763] Trial 1 finished with value: 0.8146067415730337 and parameters: {'n_estimators': 142, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.8146067415730337.
[I 2025-04-30 15:06:23,341] Trial 2 finished with value: 0.8146067415730337 and parameters: {'n_estimators': 193, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.8146067415730337.
[I 2025-04-30 15:06:24,037] Trial 3 finished with value: 0.8146067415730337 and parameters: {'n_estimators': 215, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 1 with value: 0

Best Parameters: {'n_estimators': 142, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 1}
Best Validation Accuracy: 0.8146067415730337


In [4]:
# 10. Retrain best model on full data and predict test set
# NOTE(review): `study` is not defined in the cells shown here; it has to come
# from an Optuna cell run earlier in the same kernel session.
# A fixed seed makes the retrained forest (and so the submission) reproducible,
# matching the random_state=42 used for the other models in this notebook. It is
# passed as a default so a `random_state` in `study.best_params` would still win.
rf_params = {'random_state': 42, **study.best_params}
best_rf = RandomForestClassifier(**rf_params)
best_rf.fit(X, y)
# Select the test columns by name, in the order used for fitting, instead of
# relying on the CSV happening to have the same column set and order.
test_preds = best_rf.predict(test[X.columns])

# 11. Generate submission file
# NOTE(review): PassengerId is generated as consecutive integers starting at 892
# rather than read from the test file - assumes the test rows are still in their
# original order; confirm against the raw data.
submission = pd.DataFrame({
    'PassengerId': np.arange(892, 892 + len(test_preds)),
    'Survived': test_preds
})
submission.to_csv(r"C:\Users\ghwns\HJ_git\titanic-survival-prediction\submissions\submission_v5_optuna_rf.csv", index=False)
print("✅ Optuna tuned RF submission saved!")

✅ Optuna 튜닝 RF 결과 저장 완료!
