In [1]:
import pandas as pd
import numpy as np

In [11]:
data = pd.DataFrame(pd.read_csv('train.csv'))
data_test = pd.DataFrame(pd.read_csv('test.csv'))
data_test = data_test[["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]]
x = data[["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]]
y = data[["Survived"]]
print(x.describe())
print(x.describe(include=['O']))
print(data_test.describe())
print(data_test.describe(include=['O']))

           Pclass         Age       SibSp       Parch        Fare
count  891.000000  714.000000  891.000000  891.000000  891.000000
mean     2.308642   29.699118    0.523008    0.381594   32.204208
std      0.836071   14.526497    1.102743    0.806057   49.693429
min      1.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000   20.125000    0.000000    0.000000    7.910400
50%      3.000000   28.000000    0.000000    0.000000   14.454200
75%      3.000000   38.000000    1.000000    0.000000   31.000000
max      3.000000   80.000000    8.000000    6.000000  512.329200
                           Name   Sex  Ticket    Cabin Embarked
count                       891   891     891      204      889
unique                      891     2     681      147        3
top     Braund, Mr. Owen Harris  male  347082  B96 B98        S
freq                          1   577       7        4      644
           Pclass         Age       SibSp       Parch        Fare
count  418.000000  3

# 2、数据清洗

## 2、1缺失值处理

Age和Embarked 列存在少量缺失值，分别处理

In [12]:
# 用众数填充缺失值
data['Embarked'] = data['Embarked'].fillna('S')
# 用平均数填充Age缺失值
data["Age"] = data['Age'].fillna(data["Age"].mean())

## 2.1 删除缺失率较大的列（初步处理）

Cabin列的缺失率达到了75%,删除列

In [14]:
data = data.drop(["Cabin"], axis=1)

## 3 特征处理

特征处理是基于具体的数据的，所以在特征处理之前要对数据做充分的理解。特征处理没有固定方法之说，
主要靠个人的经验与观察，通过不断的尝试和变换，以期望挖掘出较好的特征变量。所以说，
特征处理是模型建立过程中最耗时和耗神的工作

### 3.1 单变量特征提取

In [15]:
# 根据name的长度,抽象出name_len特征
data['name_len'] = data['Name'].apply(len)

In [17]:
data['name_class'] = data["Name"].apply(lambda x : x.split(",")[1]).apply(lambda x: x.split()[0])

3.2 多变量的组合

sibsp 代表兄弟姐妹和配偶的数量
parch 代表父母和子女的数量
因此 可以将sibsp和parch结合获得家庭成员的数量

In [19]:
data['family_num'] = data['Parch'] + data['SibSp'] + 1

### 3.3 名义变量转数值变量

In [20]:
# Embarked
data['Embarked'] = data['Embarked'].map({'S':1, 'C':2,'Q':3}).astype(int)
# Sex
data['Sex'] = data['Sex'].apply(lambda x: 0 if x =='male' else 1)

### 3.4 数据分段

根据统计信息和经验分段

In [26]:
def cutFeature(datalist:list, x):
    for i in range(1,len(datalist)):
        if x < datalist[i]:
            break
    return i

In [27]:
#[7.91,14.45,31.0]根据Fare的统计信息进行分段
data["Fare"] = data["Fare"].apply(lambda x:cutFeature([7.91,14.45,31.0],x))
#[18,48,64]按照经验分段
data["Age"] = data["Age"].apply(lambda x:cutFeature([18,48,64],x))

## 4. 模型选择与测试

RandomForestClassifier

ExtraTreesClassifier

AdaBoostClassifier

GradientBoostingClassifier

SVC

模型参数：

In [30]:
#随机森林
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': True,
    # 'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}
# Extra Trees 随机森林
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # 'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost 
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# GBDT
gb_params = {
    'n_estimators': 500,
    # 'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# SVC
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

模型选择代码：

In [32]:
import matplotlib.pyplot as plt

In [47]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [49]:
classifiers=[
    ("rf_model", RandomForestClassifier(**rf_params)),
    ("et_model", ExtraTreesClassifier(**et_params)),
    ("ada_model", AdaBoostClassifier(**ada_params)),
    ("gb_model", GradientBoostingClassifier(**gb_params)),
    ("svc_model", SVC(**svc_params)),
]
holdout = [0.95, 0.90, 0.75, 0.50, 0.01]
rounds = 20
x_train = data[:,0]
labels_train =  train_np[:,1:]
xx = 1. - np.array(holdout)
for name, clf in classifiers:
    print("training %s" % name)
    rng = np.random.RandomState(42)
    yy = []
    for i in holdout:
        yy_ = []
        for r in range(rounds):
            X_train_turn, X_test_turn, y_train_turn, y_test_turn = \
                train_test_split(x_train, labels_train, test_size=i, random_state=rng)
            clf.fit(X_train_turn, y_train_turn)
            y_pred = clf.predict(X_test_turn)
            yy_.append(1 - np.mean(y_pred == y_test_turn))
        yy.append(np.mean(yy_))
    plt.plot(xx, yy, label=name)

plt.legend(loc="upper right")
plt.xlabel("Proportion train")
plt.ylabel("Test Error Rate")
plt.show()

InvalidIndexError: (slice(None, None, None), 0)