# Titanic Dataset - 模型优化

这个notebook包含多种模型优化策略：
1. 高级特征工程
2. 集成学习方法
3. 特征选择
4. 处理类别不平衡
5. 模型堆叠

In [3]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# 设置随机种子
np.random.seed(42)

## 1. 高级特征工程

In [4]:
# 加载预处理后的数据
train_data = pd.read_csv('data/processed_train.csv')
test_data = pd.read_csv('data/processed_test.csv')

def create_advanced_features(df):
    # 创建新特征的副本
    data = df.copy()
    
    # 1. 票价分箱
    data['Fare_Bin'] = pd.qcut(data['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])
    
    # 2. 年龄分组
    data['Age_Group'] = pd.cut(data['Age'], 
                              bins=[0, 12, 18, 35, 50, 100],
                              labels=['Child', 'Teenager', 'Young Adult', 'Adult', 'Elder'])
    
    # 3. 家庭规模分类
    data['Family_Category'] = pd.cut(data['FamilySize'],
                                    bins=[0, 1, 4, 20],
                                    labels=['Single', 'Small', 'Large'])
    
    # 4. 创建交互特征
    data['Age_Class'] = data['Age'] * data['Pclass']
    data['Fare_Per_Person'] = data['Fare'] / data['FamilySize']
    
    # 5. 对数变换
    data['Log_Fare'] = np.log1p(data['Fare'])
    
    # 对新的分类变量进行独热编码
    data = pd.get_dummies(data, columns=['Fare_Bin', 'Age_Group', 'Family_Category'])
    
    return data

# 应用高级特征工程
train_advanced = create_advanced_features(train_data)
test_advanced = create_advanced_features(test_data)

# 准备训练数据
X = train_advanced.drop('Survived', axis=1)
y = train_advanced['Survived']

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## 2. 特征选择

In [5]:
# 使用随机森林进行特征选择
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
selector.fit(X_train, y_train)

# 获取选中的特征
selected_features = X_train.columns[selector.get_support()].tolist()
print("选中的特征数量:", len(selected_features))
print("选中的特征:", selected_features)

# 使用选中的特征
X_train_selected = X_train[selected_features]
X_val_selected = X_val[selected_features]

选中的特征数量: 9
选中的特征: ['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male', 'Title_Mr', 'Age_Class', 'Fare_Per_Person', 'Log_Fare']


## 3. 处理类别不平衡

In [6]:
# 创建采样pipeline
sampler = Pipeline([
    ('over', SMOTE(random_state=42)),
    ('under', RandomUnderSampler(random_state=42))
])

# 重采样数据
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_selected, y_train)

print("原始类别分布:\n", pd.Series(y_train).value_counts())
print("\n重采样后类别分布:\n", pd.Series(y_train_resampled).value_counts())

原始类别分布:
 Survived
0    444
1    268
Name: count, dtype: int64

重采样后类别分布:
 Survived
0    444
1    444
Name: count, dtype: int64


## 4. 集成学习

In [7]:
# 定义基础模型
rf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
svm = SVC(probability=True, random_state=42)
lr = LogisticRegression(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# 创建投票分类器
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('svm', svm),
        ('lr', lr)
    ],
    voting='soft'
)

# 创建堆叠分类器
estimators = [
    ('rf', rf),
    ('svm', svm),
    ('knn', knn)
]

stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

In [8]:
# 使用分层K折交叉验证评估模型
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 评估投票分类器
voting_scores = cross_val_score(voting_clf, X_train_resampled, y_train_resampled, cv=skf)
print("投票分类器得分: %0.3f (+/- %0.3f)" % (voting_scores.mean(), voting_scores.std() * 2))

# 评估堆叠分类器
stack_scores = cross_val_score(stack_clf, X_train_resampled, y_train_resampled, cv=skf)
print("堆叠分类器得分: %0.3f (+/- %0.3f)" % (stack_scores.mean(), stack_scores.std() * 2))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

投票分类器得分: 0.838 (+/- 0.050)
堆叠分类器得分: 0.840 (+/- 0.055)


## 5. 生成最终预测

In [9]:
# 使用表现最好的模型进行预测
best_model = stack_clf  # 或 voting_clf，取决于上面的评估结果

# 在完整训练集上训练模型
best_model.fit(X_train_resampled, y_train_resampled)

# 对测试集进行预测
test_selected = test_advanced[selected_features]
final_predictions = best_model.predict(test_selected)

# 创建提交文件
submission = pd.DataFrame({
    'PassengerId': range(892, 892 + len(final_predictions)),
    'Survived': final_predictions
})

# 保存预测结果
submission.to_csv('data/optimized_submission.csv', index=False)
print("优化后的预测结果已保存到 'optimized_submission.csv'")

优化后的预测结果已保存到 'optimized_submission.csv'
