In [1]:
import numpy as np
import pandas as pd

In [5]:
import os
# Lazily build the full path of every file found under the raw-data
# directory (recursing into sub-directories), then print one path per line.
raw_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/Users/coco/Desktop/專案/titanic/raw')
    for file_name in file_names
)
for raw_file_path in raw_file_paths:
    print(raw_file_path)

/Users/coco/Desktop/專案/titanic/raw/test.csv
/Users/coco/Desktop/專案/titanic/raw/train.csv
/Users/coco/Desktop/專案/titanic/raw/gender_submission.csv


In [19]:
# Read the training CSV into a DataFrame and preview its first rows
# (the bare last expression is what the notebook renders).
train_csv_path = "/Users/coco/Desktop/專案/titanic/raw/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
# Read the test CSV into a DataFrame and preview its first rows
# (the bare last expression is what the notebook renders).
test_csv_path = "/Users/coco/Desktop/專案/titanic/raw/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Survival flags of the female passengers in the training set.
# A single .loc[row_mask, column] lookup replaces the chained
# .loc[...][...] indexing of the earlier version.
# NOTE: this cell needs the raw `Sex` column, so it must run before the
# cell that one-hot encodes `train_data`.
women = train_data.loc[train_data["Sex"] == 'female', "Survived"]

# `Survived` is a 0/1 flag, so its mean is the fraction of survivors.
# Series.mean() is vectorised and returns NaN for an empty selection,
# whereas sum(...)/len(...) would raise ZeroDivisionError.
rate_women = women.mean()

# The printed value is a fraction in [0, 1], not a percentage.
print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [9]:
# Survival flags of the male passengers in the training set.
# A single .loc[row_mask, column] lookup replaces the chained
# .loc[...][...] indexing of the earlier version.
# NOTE: this cell needs the raw `Sex` column, so it must run before the
# cell that one-hot encodes `train_data`.
men = train_data.loc[train_data["Sex"] == 'male', "Survived"]

# `Survived` is a 0/1 flag, so its mean is the fraction of survivors.
# Series.mean() is vectorised and returns NaN for an empty selection,
# whereas sum(...)/len(...) would raise ZeroDivisionError.
rate_men = men.mean()

# The printed value is a fraction in [0, 1], not a percentage.
print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Preprocess both splits, tune a random forest with a grid search, and write
# the predictions for the test split to `submission.csv`.
# ---------------------------------------------------------------------------


def _extract_title(full_name):
    """Return the text between the first comma and the next period of a name.

    For "Braund, Mr. Owen Harris" this is "Mr".
    """
    return full_name.split(',')[1].split('.')[0].strip()


# Impute missing values. The fill values are learned from the training split
# only and reused for the test split, so both splits are preprocessed the
# same way and no test-set statistic enters the pipeline.
# `Fare` is a model feature as well, so it is imputed too in case either
# split has gaps in it (not verifiable from the previews shown here).
imputation_values = {
    'Age': train_data['Age'].median(),
    'Fare': train_data['Fare'].median(),
    'Embarked': 'S',
}
train_data = train_data.fillna(imputation_values)
test_data = test_data.fillna(imputation_values)

# FamilySize = siblings/spouses + parents/children + the passenger.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Title (honorific) parsed out of the Name column.
train_data['Title'] = train_data['Name'].map(_extract_title)
test_data['Title'] = test_data['Name'].map(_extract_title)

# One-hot encoding.
# drop_first=True drops one level per variable to avoid the dummy variable
# trap (perfectly collinear indicator columns).
# Both splits are first cast to the SAME categorical dtype, built from the
# training levels. Encoding each split on its own would let drop_first drop a
# different baseline level per split; the alignment step below would then
# zero-fill the column the test split had dropped, silently encoding those
# test rows as the training baseline. Levels that occur only in the test
# split become NaN and therefore all-zero dummies.
# The guard keeps the step from failing when the cell is re-run after the raw
# columns have already been replaced by their dummies.
categorical_columns = ["Sex", "Embarked", "Title"]
raw_columns_present = all(
    column in frame.columns
    for frame in (train_data, test_data)
    for column in ("Sex", "Embarked")
)
if raw_columns_present:
    shared_dtypes = {
        column: pd.CategoricalDtype(categories=sorted(train_data[column].dropna().unique()))
        for column in categorical_columns
    }
    train_data = pd.get_dummies(
        train_data.astype(shared_dtypes), columns=categorical_columns, drop_first=True
    )
    test_data = pd.get_dummies(
        test_data.astype(shared_dtypes), columns=categorical_columns, drop_first=True
    )

# Give the test split exactly the training columns; columns it lacks
# (e.g. the `Survived` label) are created and filled with 0.
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# Feature selection: numeric features plus every dummy column generated from
# Sex, Embarked and Title. A prefix match is used so that only the generated
# dummy columns are picked up.
numeric_features = ["Pclass", "SibSp", "Parch", "FamilySize", "Fare"]
dummy_prefixes = ("Sex_", "Embarked_", "Title_")
features = numeric_features + [
    column for column in train_data.columns if column.startswith(dummy_prefixes)
]
X = train_data[features]
y = train_data["Survived"]
X_test = test_data[features]

# Random forest with a fixed seed so results are reproducible.
rf_model = RandomForestClassifier(random_state=1)

# Hyper-parameter grid: 3 values for each of 5 parameters = 243 candidates.
param_grid = {
    'n_estimators': [100, 150, 200],  # number of trees
    'max_depth': [3, 5, 7],  # maximum tree depth
    'min_samples_split': [2, 5, 10],  # minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # minimum samples required at a leaf
    'max_features': ['sqrt', 'log2', None]  # features considered at each split
}

# Exhaustive grid search with 5-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X, y)

# Report the best parameters and the best mean cross-validation score.
print("最佳參數：", grid_search.best_params_)
print("最佳交叉驗證分數：", grid_search.best_score_)

# With the default refit=True, GridSearchCV has already refitted the best
# estimator on all of X, y, so no further .fit() call is needed here.
best_rf_model = grid_search.best_estimator_

# Predict survival for the test split.
predictions = best_rf_model.predict(X_test)

# Write the submission file (PassengerId, Survived) without the index.
output = pd.DataFrame({'PassengerId': test_data["PassengerId"], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  _data = np.array(data, dtype=dtype, copy=copy,


最佳參數： {'max_depth': 7, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}
最佳交叉驗證分數： 0.8383842822170611
Your submission was successfully saved!
