加载模块

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from xgboost import XGBRegressor
import pandas as pd
import numpy as np

path_data = './all/'
df_train = pd.read_csv(path_data + 'train.csv')
df_test = pd.read_csv(path_data + 'test.csv')
df_data = pd.concat([df_train, df_test])


数据清洗

In [2]:
df_data['Embarked'].fillna(df_data['Embarked'].mode()[0], inplace=True)
df_data['Fare'].fillna(df_data['Fare'].median(), inplace=True)
df_data['Cabin'] = df_data['Cabin'].apply(lambda x:x[0] if x is not np.nan else 'X')
cabin_counts = df_data['Cabin'].value_counts()
df_data['Cabin'] = df_data['Cabin'].apply((lambda x:'X' if cabin_counts[x] < 10 else x))

构造特征

In [3]:
#是否有着家人一起
df_data['FamilySize'] = df_data['SibSp'] + df_data['Parch'] + 1

In [4]:
#是否独自一人
df_data['IsAlone'] = 1
df_data['IsAlone'].loc[df_data['FamilySize'] > 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [5]:
#称谓表示身份的不同
df_data['Title'] = df_data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
title_counts = df_data['Title'].value_counts()
df_data['Title'] = list(map(lambda x:'Rare' if title_counts[x] < 10 else x, df_data['Title'])) 

In [6]:
#有着相同姓氏的人，可能是一个家庭
df_data['Family_Name'] = df_data['Name'].apply(lambda x: str.split(x, ",")[0])

In [7]:
#构建家庭存活率特征
DEFAULT_SURVIVAL_VALUE = 0.5
df_data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in df_data.groupby(['Family_Name', 'Fare']):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                df_data.loc[df_data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                df_data.loc[df_data['PassengerId'] == passID, 'Family_Survival'] = 0

for _, grp_df in df_data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    df_data.loc[df_data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    df_data.loc[df_data['PassengerId'] == passID, 'Family_Survival'] = 0

In [8]:
#回归预测缺失的age
def predict_age(x_train, y_train, x_test):
    param_grid = {
        'learning_rate':[.001, .005, .01, .05, .1],
        'max_depth':[2, 4, 6, 8],
        'n_estimators':[50, 100, 300, 500, 1000],
        'seed':[2018]
    }
    cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0) 
    tune_model = model_selection.GridSearchCV(XGBRegressor(nthread=-1), param_grid=param_grid, 
                                              scoring = 'neg_mean_squared_error', cv = cv_split)
    tune_model.fit(x_train, y_train)
    print(tune_model.best_params_)
    y_test = tune_model.best_estimator_.predict(x_test)

    return y_test

data_p = df_data.drop(['Cabin', 'Embarked', 'Fare', 'Name', 'PassengerId',
                       'Sex', 'Survived', 'Ticket', 'Title', 'Family_Name'], 1)
x_train = data_p.loc[~data_p['Age'].isnull(), :].drop('Age', 1)
y_train = data_p.loc[~data_p['Age'].isnull(), :]['Age']
x_test = data_p.loc[data_p['Age'].isnull(), :].drop('Age', 1)
df_data.loc[df_data['Age'].isnull(), 'Age'] = predict_age(x_train, y_train, x_test)

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 1000, 'seed': 2018}


In [9]:
#特征转换
label = LabelEncoder()
df_data['Sex_Code'] = label.fit_transform(df_data['Sex'])  # female为0, male为1

df_data = pd.concat([df_data, pd.get_dummies(df_data[['Embarked', 'Title', 'Cabin']])], axis=1)

In [10]:
#提出特征
drop_columns = ['Sex', 'Name', 'Embarked', 'Cabin', 'Ticket', 'Title', 'Family_Name']
df_data = df_data.drop(drop_columns, 1)
df_data.to_csv(path_data + 'fe_data.csv', index=False)

In [11]:
df_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Family_Survival,...,Title_Mr,Title_Mrs,Title_Rare,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_X
413,1305,,3,28.005854,0,0,8.05,1,1,0.5,...,1,0,0,0,0,0,0,0,0,1
414,1306,,1,39.0,0,0,108.9,1,1,1.0,...,0,0,1,0,0,1,0,0,0,0
415,1307,,3,38.5,0,0,7.25,1,1,0.5,...,1,0,0,0,0,0,0,0,0,1
416,1308,,3,28.005854,0,0,8.05,1,1,0.5,...,1,0,0,0,0,0,0,0,0,1
417,1309,,3,17.898249,1,1,22.3583,3,0,1.0,...,0,0,0,0,0,0,0,0,0,1


进行训练

In [12]:
#训练集和目标集
train = df_data.iloc[:len(df_train),:]
test = df_data.iloc[len(df_train):,:]

In [13]:
train_X = train.drop(columns=['PassengerId','Survived'])
train_Y = train['Survived']
test_X = test.drop(columns=['PassengerId','Survived'])

In [14]:
#加载模型
from sklearn.preprocessing import MinMaxScaler 
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=1)

In [16]:
# 网格搜索训练
def train_test_model(X_train, y_train, X_test, y_test, model_name, model, param_range):
    
    print('训练{}中'.format(model_name))   
    
    clf = GridSearchCV(estimator = model,
                       param_grid = param_range,
                       cv = 6,
                       scoring = 'roc_auc',
                       refit = True, verbose = 1, n_jobs = 4)
    
    clf.fit(X_train, y_train)   
    
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    
    print('train score：{:.3f}'.format(train_score))
    print('test score：{:.3f}'.format(test_score))
    print('parameter：{}'.format(clf.best_params_))
    
    print('###########################################')
    
    return clf

model_name_param_dict = {
                        'XG': (XGBClassifier(),
# TODO
# 对参数进行调整来得到最优结果                                   
                        {'n_estimators':[200,400,800],'max_depth':[5,10,15],'learning_rate':[0.001,0.01,0.1]}),
    'GBDT':(GradientBoostingClassifier(),
            {'n_estimators':[200,400,800],'max_depth':[5,10,15],'learning_rate':[0.001,0.01,0.1]}),
    'SVC':(SVC(),
          {'C':[1,0.1,0.01]}),
                         }
gscv_ls = []
for model_name, (model, param_range) in model_name_param_dict.items():
    gscv = train_test_model(X_train, y_train, X_test, y_test,model_name, model, param_range)
    gscv_ls.append(gscv)
print('训练{}中'.format('GNB'))   
gnb = GaussianNB()
gnb.fit(X_train,y_train)
print('test score：{:.3f}'.format(gnb.score(X_test, y_test)))

训练XG中
Fitting 6 folds for each of 27 candidates, totalling 162 fits




train score：0.965
test score：0.881
parameter：{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400}
###########################################
训练GBDT中
Fitting 6 folds for each of 27 candidates, totalling 162 fits
train score：0.984
test score：0.874
parameter：{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400}
###########################################
训练SVC中
Fitting 6 folds for each of 3 candidates, totalling 18 fits
train score：0.791
test score：0.712
parameter：{'C': 1}
###########################################
训练GNB中
test score：0.726


In [17]:
gscv = gscv_ls[0]
#结果保存
id=test['PassengerId']
id = id.values
result=list(zip(id,gscv.predict(test_X)))
result=np.array(result)

df = pd.DataFrame(result, columns=['PassengerId','Survived'])
df['PassengerId'] = df['PassengerId'].astype(np.int32)
df['Survived'] = df['Survived'].astype(np.int32)
df.to_csv('result.csv',index=False)