In [1]:
import pandas as pd
# Read the train and test CSV files (both decoded as UTF-8) into data frames.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in ("train.csv", "test.csv")
)
# Bare last expression: render the training frame as the cell output.
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
# Print the (rows, columns) shape of each frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(891, 12)
(418, 11)


In [3]:
# Count missing values per column of the training frame, then show only the
# columns that have at least one.
s = train_df.isnull().sum()
s.loc[s.gt(0)]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [4]:
# Count missing values per column of the test frame, then show only the
# columns that have at least one.
s = test_df.isnull().sum()
s.loc[s.gt(0)]

Age       86
Fare       1
Cabin    327
dtype: int64

In [5]:
# Numeric columns with missing values: fill them with the median
# (median() and mean() are the usual candidates).
# Pclass is stored as a number but is really a category, so its median is
# dropped and never used as a fill value.
# numeric_only=True restricts the reduction to numeric columns. The frame also
# holds string columns (Name, Sex, Ticket, ...), and pandas >= 2.0 raises a
# TypeError for them instead of silently skipping them.
med = train_df.median(numeric_only=True).drop(["Pclass"])
train_df = train_df.fillna(med)
# Do NOT recompute the medians on the test set: the training medians are
# reused so that both frames are imputed with the same values.
test_df = test_df.fillna(med)
# Count the remaining missing values per column of the test frame and show
# only the columns that still have some.
s = test_df.isna().sum()
s[s > 0]

Cabin    327
dtype: int64

In [6]:
# Categorical column: fill missing values with the most frequent category,
# found with value_counts() followed by idxmax().
most = train_df["Embarked"].value_counts().idxmax()
train_df["Embarked"] = train_df["Embarked"].where(
    train_df["Embarked"].notna(), most
)
# Show which columns of the training frame still contain missing values.
s = train_df.isnull().sum()
s.loc[s.gt(0)]

Cabin    687
dtype: int64

In [7]:
# Try the title extraction on the name in row 0: keep the text after the last
# comma, cut it at the first period, then trim surrounding whitespace.
n = train_df.loc[0, "Name"]
n = n.rpartition(",")[2].partition(".")[0]
n.strip()

'Mr'

In [8]:
def middle(n):
    """Extract the title from a name such as "Braund, Mr. Owen Harris".

    The title is the text after the last comma and before the first period
    that follows it, with surrounding whitespace removed.
    """
    after_comma = n.rpartition(",")[2]
    title = after_comma.partition(".")[0]
    return title.strip()
# Extract the title from every training name and count how often each occurs.
train_df["Name"].map(middle).value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Major             2
Mlle              2
Jonkheer          1
the Countess      1
Lady              1
Mme               1
Capt              1
Sir               1
Don               1
Ms                1
Name: Name, dtype: int64

In [9]:
def middle(n):
    """Return the title found in a name if it is a common one, else None.

    The title is the text after the last comma and before the first period
    that follows it (e.g. "Braund, Mr. Owen Harris" -> "Mr"). Only "Mr",
    "Mrs", "Miss" and "Master" are kept; every other title yields None.

    Non-string input (None, NaN) also yields None. Once this function has
    been applied to the Name column, that column contains None for the rare
    titles, so without this guard applying it a second time (re-running the
    cell) would raise AttributeError.
    """
    reserved = ("Mr", "Mrs", "Miss", "Master")
    if not isinstance(n, str):
        return None
    title = n.split(",")[-1].split(".")[0].strip()
    return title if title in reserved else None
# Replace the Name column of both frames with the value returned by middle(),
# then count the values left in the training frame.
train_df = train_df.assign(Name=train_df["Name"].map(middle))
test_df = test_df.assign(Name=test_df["Name"].map(middle))
train_df["Name"].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Name: Name, dtype: int64

In [10]:
# Separate the label from the features. "Survived" is the target column;
# "PassengerId", "Ticket" and "Cabin" are removed from the feature frames.
y_train = train_df["Survived"]
x_train = train_df.drop(columns=["PassengerId", "Ticket", "Cabin", "Survived"])
# The test-set passenger ids are kept aside before they are dropped from the
# features.
predict_id = test_df["PassengerId"]
x_predict = test_df.drop(columns=["PassengerId", "Ticket", "Cabin"])
print(x_train.shape, x_predict.shape, sep="\n")

(891, 8)
(418, 8)


In [11]:
# Categorical features: one-hot encode them.
x_train = pd.get_dummies(x_train)
x_predict = pd.get_dummies(x_predict)
# The two frames are encoded independently, so a category present in only one
# of them would give different column sets. Align the prediction frame to the
# training columns: same columns, same order, 0 for any indicator absent from
# the test data.
x_predict = x_predict.reindex(columns=x_train.columns, fill_value=0)
print(x_train.shape)
print(x_predict.shape)
x_train

(891, 14)
(418, 14)


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name_Master,Name_Miss,Name_Mr,Name_Mrs,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,0,0,0,1,1,0,1,0,0
2,3,26.0,0,0,7.9250,0,1,0,0,1,0,0,0,1
3,1,35.0,1,0,53.1000,0,0,0,1,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,0,0,0,1,0,0,1
887,1,19.0,0,0,30.0000,0,1,0,0,1,0,0,0,1
888,3,28.0,1,2,23.4500,0,1,0,0,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,0,1,0,0,1,1,0,0


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Fix the forest's seed so the search picks the same parameters and reports
# the same score on every run. An unseeded forest makes both change between
# executions.
clf = RandomForestClassifier(random_state=42)
# Search grid: 10 tree counts (25, 27, ..., 43) x 6 maximum depths (5..10).
params = {
    "n_estimators":range(25, 45, 2),
    "max_depth":range(5, 11)
}
# Exhaustive search with 10-fold cross-validation on 4 parallel jobs.
search = GridSearchCV(clf, params, cv=10, n_jobs=4)
search.fit(x_train, y_train)
# The printed labels read "best parameters" and "best score".
print("最佳參數:", search.best_params_)
print("最佳分數:", search.best_score_)

最佳參數: {'max_depth': 10, 'n_estimators': 27}
最佳分數: 0.8395505617977529


In [43]:
import numpy as np
from sklearn.model_selection import cross_val_score
# Re-evaluate one fixed configuration with 10-fold cross-validation.
# random_state pins the forest's randomness so the fold scores are
# reproducible.
# NOTE(review): the grid-search output recorded in this notebook reports
# max_depth=10 as best, while max_depth=8 is used here. Confirm this is
# intentional.
clf = RandomForestClassifier(n_estimators=27,
                             max_depth=8,
                             random_state=42)
scores = cross_val_score(clf, x_train, y_train, cv=10, n_jobs=4)
# The printed labels read "ten scores" and "average".
print("十次分數:", scores)
print("平均:", np.average(scores))

十次分數: [0.78888889 0.84269663 0.74157303 0.88764045 0.8988764  0.84269663
 0.84269663 0.76404494 0.85393258 0.84269663]
平均: 0.8305742821473159


In [45]:
# Fit the final model on the full training set, predict the test rows and
# write the predictions to titanic_rf.csv.
# random_state pins the forest's randomness so re-running the cell produces
# the same predictions and the same output file.
# NOTE(review): the grid-search output recorded in this notebook reports
# max_depth=10 as best, while max_depth=8 is used here. Confirm this is
# intentional.
clf = RandomForestClassifier(n_estimators=27,
                             max_depth=8,
                             random_state=42)
clf.fit(x_train, y_train)
pre = clf.predict(x_predict)
# One row per test passenger: its id and the predicted Survived value.
result = pd.DataFrame({
    "PassengerId":predict_id,
    "Survived":pre
})
# index=False keeps the row index out of the CSV file.
result.to_csv("titanic_rf.csv", encoding="utf-8", index=False)
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
