In [286]:
# 分析数据中是否存在异常样本
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path

import re 
import numpy as np
import pandas as pd

from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [250]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
data = train.append(test,ignore_index=True)
data 

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
5,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0.0,330877
6,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0.0,17463
7,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0.0,349909
8,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1.0,347742
9,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1.0,237736


In [251]:
# 观察数据缺失值
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [252]:
# 数据预处理
# 用平均年龄填充年龄字段的空值
data['Age'] = data['Age'].fillna(data['Age'].mean())
# 用平均票价填充票价字段的空值
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
# 用'U'填充客舱号的空值
data['Cabin'] = data['Cabin'].fillna('U')
print(data['Cabin'].value_counts(),'\n')
# 查看乘客从哪个登陆港口次数最多
print(data['Embarked'].value_counts())
# 使用登陆港口次数最多的值填充登陆港口的空值
data['Embarked'] = data['Embarked'].fillna('S')
print(data.info())

U                  1014
C23 C25 C27           6
G6                    5
B57 B59 B63 B66       5
D                     4
C22 C26               4
F4                    4
F33                   4
F2                    4
B96 B98               4
C78                   4
E101                  3
C101                  3
B58 B60               3
E34                   3
A34                   3
B51 B53 B55           3
D35                   2
D33                   2
C123                  2
D19                   2
C7                    2
F G73                 2
C2                    2
C55 C57               2
C65                   2
B69                   2
C116                  2
C85                   2
C32                   2
                   ... 
C53                   1
B38                   1
B42                   1
B102                  1
C28                   1
C104                  1
A16                   1
B3                    1
D9                    1
C132                  1
A23             

In [253]:
# 由于数据特征中包含类别数据，因此使用哑变量转换
# 使用get_dummies进行one-hot编码
embarkedDf = pd.DataFrame()
embarkedDf = pd.get_dummies(data['Embarked'],prefix='Embarked')
# print(embarkedDf.head())

data = pd.concat([data,embarkedDf],axis=1)
data.drop('Embarked',axis=1,inplace=True)
print(data.head())

    Age Cabin     Fare                                               Name  \
0  22.0     U   7.2500                            Braund, Mr. Owen Harris   
1  38.0   C85  71.2833  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2  26.0     U   7.9250                             Heikkinen, Miss. Laina   
3  35.0  C123  53.1000       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4  35.0     U   8.0500                           Allen, Mr. William Henry   

   Parch  PassengerId  Pclass     Sex  SibSp  Survived            Ticket  \
0      0            1       3    male      1       0.0         A/5 21171   
1      0            2       1  female      1       1.0          PC 17599   
2      0            3       3  female      0       1.0  STON/O2. 3101282   
3      0            4       1  female      1       1.0            113803   
4      0            5       3    male      0       0.0            373450   

   Embarked_C  Embarked_Q  Embarked_S  
0           0           0           1  


In [254]:
# 客舱等级(Pclass)  1,2,3表示1,2,3等舱
pclassDf = pd.DataFrame()
pclassDf = pd.get_dummies(data['Pclass'],prefix='Pclass')
# print(pclassDf.head())

data = pd.concat([data,pclassDf],axis=1)
data.drop('Pclass',axis=1,inplace=True)
data.head()


Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,Sex,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,male,1,0.0,A/5 21171,0,0,1,0,0,1
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,female,1,1.0,PC 17599,1,0,0,1,0,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,0,0,1,0,0,1
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,female,1,1.0,113803,0,0,1,1,0,0
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,male,0,0.0,373450,0,0,1,0,0,1


In [255]:
# 将性别的值映射为数值 男(male)对应的值 1 女(female)对应的值0
sexDf = pd.DataFrame()
sexDf = pd.get_dummies(data['Sex'],prefix='Sex')
# print(pclassDf.head())

data = pd.concat([data,sexDf],axis=1)
data.drop('Sex',axis=1,inplace=True)
data.head()


Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,0,1,0,0,1,0,1
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,0,0,1,0,0,1,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,0,1,0,0,1,1,0
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,0,1,1,0,0,1,0
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,0,1,0,0,1,0,1


In [256]:
"""
家庭信息
"""
familyDf = pd.DataFrame()

"""
家庭人数=同代直系亲属数(Parch)+不同代直系亲属数(SibSp)+乘客自己
"""
familyDf['FamilySize'] = data['Parch'] + data['SibSp'] + 1

"""
家庭类别：
小家庭Family_Small:人数=1
中家庭Family_Medium: 2<= 人数 <=4
大家庭Family_Large: 人数>=5
"""
familyDf['Family_Small'] = familyDf['FamilySize'].map(lambda s: 1 if s == 1 else 0)
familyDf['Family_Medium'] = familyDf['FamilySize'].map(lambda s: 1 if s >= 2  and s <= 4else 0)
familyDf['Family_Large'] = familyDf['FamilySize'].map(lambda s: 1 if s >= 5 else 0)
# print(familyDf.head())

data = pd.concat([data,familyDf],axis=1)
data.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,FamilySize,Family_Small,Family_Medium,Family_Large
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,1,0,0,1,0,1,2,0,1,0
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,0,1,0,0,1,0,2,0,1,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,1,0,0,1,1,0,1,1,0,0
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,1,1,0,0,1,0,2,0,1,0
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,1,0,0,1,0,1,1,1,0,0


In [257]:
Counter(data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0]))

Counter({'Mr': 757,
         'Mrs': 197,
         'Miss': 260,
         'Master': 61,
         'Don': 1,
         'Rev': 8,
         'Dr': 8,
         'Mme': 1,
         'Ms': 2,
         'Major': 2,
         'Lady': 1,
         'Sir': 1,
         'Mlle': 2,
         'Col': 4,
         'Capt': 1,
         'the Countess': 1,
         'Jonkheer': 1,
         'Dona': 1})

In [258]:
data['Title'] = data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
data.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,FamilySize,Family_Small,Family_Medium,Family_Large,Title
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,0,0,1,0,1,2,0,1,0,Mr
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,1,0,0,1,0,2,0,1,0,Mrs
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,0,0,1,1,0,1,1,0,0,Miss
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,1,0,0,1,0,2,0,1,0,Mrs
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,0,0,1,0,1,1,1,0,0,Mr


In [259]:
# 
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))

data['Title'] = data['Title'].map(title_Dict)
data.head()

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,FamilySize,Family_Small,Family_Medium,Family_Large,Title
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,0,0,1,0,1,2,0,1,0,Mr
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,1,0,0,1,0,2,0,1,0,Mrs
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,0,0,1,1,0,1,1,0,0,Miss
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,1,0,0,1,0,2,0,1,0,Mrs
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,0,0,1,0,1,1,1,0,0,Mr


In [260]:

data['Title'] = pd.factorize(data['Title'])[0]

title_df = pd.get_dummies(data['Title'], prefix=data[['Title']].columns[0])
# print(title_df)
data = pd.concat([data, title_df], axis=1)
data.head()


Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Family_Small,Family_Medium,Family_Large,Title,Title_0,Title_1,Title_2,Title_3,Title_4,Title_5
0,22.0,U,7.25,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,0,1,0,0,1,0,0,0,0,0
1,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,0,1,0,1,0,1,0,0,0,0
2,26.0,U,7.925,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,1,0,0,2,0,0,1,0,0,0
3,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,0,1,0,1,0,1,0,0,0,0
4,35.0,U,8.05,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,1,0,0,0,1,0,0,0,0,0


In [261]:
# 添加新特征
# 均值差
fare = data['Fare'].mean()
fare_mean = list()
for i in range(len(data.index)): 
    if data['Fare'][i] - fare > 0:
        fare_mean.append(1)
    else:
        fare_mean.append(0)     
data['fare_mean'] = fare_mean
data


Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Family_Medium,Family_Large,Title,Title_0,Title_1,Title_2,Title_3,Title_4,Title_5,fare_mean
0,22.000000,U,7.2500,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,1,0,0,1,0,0,0,0,0,0
1,38.000000,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,1,0,1,0,1,0,0,0,0,1
2,26.000000,U,7.9250,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,0,0,2,0,0,1,0,0,0,0
3,35.000000,C123,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,1,0,1,0,1,0,0,0,0,1
4,35.000000,U,8.0500,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,0,0,0,1,0,0,0,0,0,0
5,29.881138,U,8.4583,"Moran, Mr. James",0,6,0,0.0,330877,0,...,0,0,0,1,0,0,0,0,0,0
6,54.000000,E46,51.8625,"McCarthy, Mr. Timothy J",0,7,0,0.0,17463,0,...,0,0,0,1,0,0,0,0,0,1
7,2.000000,U,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,0.0,349909,0,...,0,1,3,0,0,0,1,0,0,0
8,27.000000,U,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,0,1.0,347742,0,...,1,0,1,0,1,0,0,0,0,0
9,14.000000,U,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,1,1.0,237736,1,...,1,0,1,0,1,0,0,0,0,0


In [262]:
# 添加新特征
# 标准差
fare = data['Fare'].std()
fare_std = list()
for i in range(len(data.index)): 
    if data['Fare'][i] - fare > 0:
        fare_std.append(1)
    else:
        fare_std.append(0)     
data['fare_std'] = fare_std
data

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Family_Large,Title,Title_0,Title_1,Title_2,Title_3,Title_4,Title_5,fare_mean,fare_std
0,22.000000,U,7.2500,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,0,0,1,0,0,0,0,0,0,0
1,38.000000,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,0,1,0,1,0,0,0,0,1,1
2,26.000000,U,7.9250,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,0,2,0,0,1,0,0,0,0,0
3,35.000000,C123,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,0,1,0,1,0,0,0,0,1,1
4,35.000000,U,8.0500,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,0,0,1,0,0,0,0,0,0,0
5,29.881138,U,8.4583,"Moran, Mr. James",0,6,0,0.0,330877,0,...,0,0,1,0,0,0,0,0,0,0
6,54.000000,E46,51.8625,"McCarthy, Mr. Timothy J",0,7,0,0.0,17463,0,...,0,0,1,0,0,0,0,0,1,1
7,2.000000,U,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,0.0,349909,0,...,1,3,0,0,0,1,0,0,0,0
8,27.000000,U,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,0,1.0,347742,0,...,0,1,0,1,0,0,0,0,0,0
9,14.000000,U,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,1,1.0,237736,1,...,0,1,0,1,0,0,0,0,0,0


In [263]:
print(len(data.index))
print(len(data))

1309
1309


In [264]:
# 添加新特征
# 中位数差
fare = data['Fare'].median()
fare_median = list()
for i in range(len(data.index)): 
    if data['Fare'][i] - fare > 0:
        fare_median.append(1)
    else:
        fare_median.append(0)     
data['fare_median'] = fare_median
data

Unnamed: 0,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Survived,Ticket,Embarked_C,...,Title,Title_0,Title_1,Title_2,Title_3,Title_4,Title_5,fare_mean,fare_std,fare_median
0,22.000000,U,7.2500,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,0,...,0,1,0,0,0,0,0,0,0,0
1,38.000000,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,1,...,1,0,1,0,0,0,0,1,1,1
2,26.000000,U,7.9250,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,0,...,2,0,0,1,0,0,0,0,0,0
3,35.000000,C123,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,0,...,1,0,1,0,0,0,0,1,1,1
4,35.000000,U,8.0500,"Allen, Mr. William Henry",0,5,0,0.0,373450,0,...,0,1,0,0,0,0,0,0,0,0
5,29.881138,U,8.4583,"Moran, Mr. James",0,6,0,0.0,330877,0,...,0,1,0,0,0,0,0,0,0,0
6,54.000000,E46,51.8625,"McCarthy, Mr. Timothy J",0,7,0,0.0,17463,0,...,0,1,0,0,0,0,0,1,1,1
7,2.000000,U,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,0.0,349909,0,...,3,0,0,0,1,0,0,0,0,1
8,27.000000,U,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,0,1.0,347742,0,...,1,0,1,0,0,0,0,0,0,0
9,14.000000,U,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,1,1.0,237736,1,...,1,0,1,0,0,0,0,0,0,1


In [265]:
# 查看各个特征与（Survived）的相关系数
corrDf = data.corr()
print(corrDf['Survived'].sort_values(ascending=False))

Survived         1.000000
Sex_female       0.543351
Title            0.378807
Title_1          0.344935
Title_2          0.332795
fare_std         0.286121
Pclass_1         0.285904
Family_Medium    0.279855
fare_median      0.274981
Fare             0.257307
fare_mean        0.240466
Embarked_C       0.168240
Pclass_2         0.093349
Parch            0.081629
Title_3          0.079996
Title_4          0.050561
FamilySize       0.016639
Embarked_Q       0.003650
PassengerId     -0.005007
Title_5         -0.031316
SibSp           -0.035322
Age             -0.070323
Family_Large    -0.125147
Embarked_S      -0.149683
Family_Small    -0.203367
Pclass_3        -0.322308
Sex_male        -0.543351
Title_0         -0.549199
Name: Survived, dtype: float64


In [276]:
# 取相关性大于0的特征作为训练数据
data_X = data[['Sex_female'   
,'Title'       
,'Title_1'          
,'Title_2'          
,'Pclass_1'         
,'fare_std'         
,'fare_median'      
,'Family_Medium'    
,'Fare'             
,'fare_mean'        
,'Embarked_C'       
,'Pclass_2'         
,'Parch'            
,'Title_3'          
,'Title_4'         
,'FamilySize'       
,'Embarked_Q']].values

train_size = len(train)

# data_1 = data.drop(['Ticket','Name','Cabin'],axis=1)
# 训练数据和标签
# train_X = data.drop(['Survived'],axis=1).loc[0:train_size-1,:]
# train_y = data.loc[0:train_size-1,'Survived']
train_X = data_X[0:train_size,:]
train_y = data.loc[0:train_size-1,'Survived']
print(train_X.shape)
print(train_y.shape)

# 预测数据集
pred_X = data_X[train_size:,:]
print(len(pred_X))


(891, 17)
(891,)
418


In [277]:
# 训练模型
model = LogisticRegression()
model.fit(train_X,train_y)

print('模型详细情况: ', model)
print('训练集分类精度: ', model.score(train_X,train_y))

模型详细情况:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
测试集分类精度:  0.8249158249158249


In [278]:
rfg = RandomForestClassifier()
rfg.fit(train_X,train_y)

print('模型详细情况: ', rfg)
print('训练集分类精度: ', rfg.score(train_X,train_y))

模型详细情况:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
测试集分类精度:  0.920314253647587


In [None]:
xgb = XGBClassifier(learning_rate=0.1,silent=True,objective='binary:logistic')
param_test = {
    'n_estimators': range(1,101,10),
    'max_depth': range(1,11,1),
    'max_feature':range(1,21,1)
}
grid_search = GridSearchCV(estimator=xgb,param_grid=param_test,scoring='accuracy',cv=10)
grid_search.fit(train_X,train_y)
#print(grid_search.grid_scores_)
print(grid_search.best_params_)
print(grid_search.best_score_)

pred_hat = grid_search.predict(pred_X)
print(predict_data,'\n')
print(grid_search.grid_scores_)
print(grid_search.best_params_)
print(grid_search.best_score_)

In [282]:
pred_Y = model.predict(pred_X)
pred_Y = pred_Y.astype(int)
# 乘客id
passemger_id = data.loc[train_size:,'PassengerId']
# 数据框: 乘客id,预测生存情况
predDf = pd.DataFrame({
    'PassengerId':passemger_id,
    'Survived':pred_Y
})
print(predDf.shape)
print(predDf.head())

# 保存结果
predDf.to_csv('titanic_pred_xgb.csv',index=False)


(418, 2)
     PassengerId  Survived
891          892         0
892          893         1
893          894         0
894          895         0
895          896         1
