In [326]:
# 引入库包
%matplotlib inline
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

# 训练集和测试集提取特征

In [327]:
#PassengerId:乘客编号
#Survived：存活情况（存活：1，死亡：0）
#Pclass：客舱等级
#Name：乘客姓名
#Sex：性别
#Age：年龄
#SibSp：同乘的兄弟姐妹/配偶数
#Parch：同乘的父母/小孩数
#Ticket：船票编号
#Fare：船票价格
#Cabin：客舱号
#Embarked：登船港口
df_train = pd.read_csv('data/train.csv',header=None)
df_train.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age',"SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
df_train.drop(index=0,inplace=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [328]:
df_test = pd.read_csv('data/test.csv',header=None)
df_test.columns = ['PassengerId','Pclass','Name','Sex','Age',"SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
df_test.drop(index=0,inplace=True)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [329]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
PassengerId    891 non-null object
Survived       891 non-null object
Pclass         891 non-null object
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null object
SibSp          891 non-null object
Parch          891 non-null object
Ticket         891 non-null object
Fare           891 non-null object
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: object(12)
memory usage: 90.5+ KB


In [330]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 1 to 418
Data columns (total 11 columns):
PassengerId    418 non-null object
Pclass         418 non-null object
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null object
SibSp          418 non-null object
Parch          418 non-null object
Ticket         418 non-null object
Fare           417 non-null object
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: object(11)
memory usage: 39.2+ KB


In [331]:
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,891,891,680,1,0.0
Survived,891,2,0,549,0.0
Pclass,891,3,3,491,0.0
Name,891,891,"Minahan, Miss. Daisy E",1,0.0
Sex,891,2,male,577,0.0
Age,714,88,24,30,0.198653
SibSp,891,7,0,608,0.0
Parch,891,7,0,678,0.0
Ticket,891,681,CA. 2343,7,0.0
Fare,891,248,8.05,43,0.0


训练集：
* 年龄缺失20%
* 客舱号缺失77%
* 登船港口缺失0.2%

In [332]:
df_test.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,418,418,1275,1,0.0
Pclass,418,3,3,218,0.0
Name,418,418,"Fortune, Mrs. Mark (Mary McDougald)",1,0.0
Sex,418,2,male,266,0.0
Age,332,79,24,17,0.205742
SibSp,418,7,0,283,0.0
Parch,418,8,0,324,0.0
Ticket,418,363,PC 17608,5,0.0
Fare,417,169,7.75,21,0.002392
Cabin,91,76,B57 B59 B63 B66,3,0.782297


测试集：
* 年龄缺失20%
* 客舱号缺失78%
* 船票价格缺失0.2%

训练集删除:
* 乘客ID
* 名字
* 船票编号
* 客舱号（缺失值太多）

剩余特征：
* 存活情况（存活：1，死亡：0）
* 客舱等级
* 性别
* 年龄
* 同乘的兄弟姐妹/配偶数
* 同乘的父母/小孩数
* 船票价格
* 登船港口

In [333]:
#df_train.drop("PassengerId",1,inplace=True)#乘客ID
df_train.drop("Name",1,inplace=True)#名字
df_train.drop("Ticket",1,inplace=True)#船票编号
df_train.drop("Cabin",1,inplace=True)#客舱号（缺失值太多）

测试集删除:
* 乘客ID
* 名字
* 船票编号
* 客舱号（缺失值太多）

剩余特征：
* 存活情况（存活：1，死亡：0）
* 客舱等级
* 性别
* 年龄
* 同乘的兄弟姐妹/配偶数
* 同乘的父母/小孩数
* 船票价格
* 登船港口

In [334]:
#df_test.drop("PassengerId",1,inplace=True)#乘客ID
df_test.drop("Name",1,inplace=True)#名字
df_test.drop("Ticket",1,inplace=True)#船票编号
df_test.drop("Cabin",1,inplace=True)#客舱号（缺失值太多）

训练集删除：
* 登船港口（空行样本）
* 年龄（空行样本）

剩余特征：
* 存活情况（存活：1，死亡：0）
* 客舱等级
* 性别
* 年龄
* 同乘的兄弟姐妹/配偶数
* 同乘的父母/小孩数
* 船票价格
* 登船港口

In [335]:
df_train['Embarked'] = df_train['Embarked'].fillna("999")#登船港口
embarked_none_index = df_train[(df_train.Embarked == "999")].index.tolist()
df_train = df_train.drop(embarked_none_index)
df_train.shape

(889, 9)

In [336]:
df_train['Age'] = df_train['Age'].fillna("999")#登船港口
age_none_index = df_train[(df_train.Age == "999")].index.tolist()
df_train = df_train.drop(age_none_index)
df_train.shape

(712, 9)

In [337]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 1 to 891
Data columns (total 9 columns):
PassengerId    712 non-null object
Survived       712 non-null object
Pclass         712 non-null object
Sex            712 non-null object
Age            712 non-null object
SibSp          712 non-null object
Parch          712 non-null object
Fare           712 non-null object
Embarked       712 non-null object
dtypes: object(9)
memory usage: 55.6+ KB


测试集缺失值处理：
* 按照一二三等舱各自的均价来填充

In [338]:
df_test['Fare'] = df_test['Fare'].astype("float")
df_test['Fare'] = df_test[['Fare']].fillna(df_test.groupby('Pclass').transform(np.mean))

In [339]:
df_test.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,418,418,1275,1,0.0
Pclass,418,3,3,218,0.0
Sex,418,2,male,266,0.0
Age,332,79,24,17,0.205742
SibSp,418,7,0,283,0.0
Parch,418,8,0,324,0.0
Embarked,418,3,S,270,0.0


In [340]:
df_train['Fare'] = df_train['Fare'].astype("float")
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,712,712,423,1,0.0
Survived,712,2,0,424,0.0
Pclass,712,3,3,355,0.0
Sex,712,2,male,453,0.0
Age,712,88,24,30,0.0
SibSp,712,6,0,469,0.0
Parch,712,7,0,519,0.0
Embarked,712,3,S,554,0.0


In [341]:
df_train_temp = df_train.copy()
df_test_temp = df_test.copy()

In [342]:
def fun_SibSp(x):
    x = int(x)
    if x == 8:
        return 5
    else:
        return x

df_test['SibSp']= df_test['SibSp'].apply(lambda x: fun_SibSp(x))

In [343]:
def fun_Parch(x):
    x = int(x)
    if x == 9:
        return 6
    else:
        return x
df_test['Parch']= df_test['Parch'].apply(lambda x: fun_Parch(x))

In [344]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 1 to 418
Data columns (total 8 columns):
PassengerId    418 non-null object
Pclass         418 non-null object
Sex            418 non-null object
Age            332 non-null object
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           418 non-null float64
Embarked       418 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 29.4+ KB


In [345]:
df_test.select_dtypes(include="int64").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
SibSp,418.0,0.433014,0.793596,0.0,0.0,0.0,1.0,5.0,0.0
Parch,418.0,0.37799,0.870896,0.0,0.0,0.0,0.0,6.0,0.0


* 删除训练集标签

In [346]:
df_train_temp.drop("Survived",1,inplace=True)#删除训练集的标签

* 合并训练集和测试集提取特征

In [347]:
frames = [df_train_temp, df_test]
df_all = pd.concat(frames)

In [348]:
df_all.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,3,male,22,1,0,7.25,S
2,2,1,female,38,1,0,71.2833,C
3,3,3,female,26,0,0,7.925,S
4,4,1,female,35,1,0,53.1,S
5,5,3,male,35,0,0,8.05,S


In [349]:
df_all.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,1130,1130,680,1,0.0
Pclass,1130,3,3,573,0.0
Sex,1130,2,male,719,0.0
Age,1044,98,24,47,0.076106
SibSp,1130,12,0,469,0.0
Parch,1130,14,0,519,0.0
Embarked,1130,3,S,824,0.0


In [350]:
df_all.select_dtypes(include="float").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Fare,1130.0,34.938832,54.012205,0.0,8.05,14.75,31.6344,512.3292,0.0


## 提取单一特征：
* 客舱等级比例
* 性别比例
* 年龄分段
* 年龄分段比例
* 同乘的兄弟姐妹/配偶数比例
* 同乘的父母/小孩数比例
* 船票价格分段
* 船票价格分段比例
* 登船港口比例

In [351]:

def category_feature_proportion(df,feature,new_feature):
    def computer_p(x):
        return x/df.shape[0]
    f1 = df[[feature]] 
    f1[new_feature] = 1
    f1 = f1.groupby(feature).agg('sum').reset_index()
    f1[new_feature] = f1[new_feature].apply(computer_p)
    return f1


* 客舱等级比例

In [352]:
pclass_proportion_feature = category_feature_proportion(df_all,"Pclass","Pclass_proportion")
pclass_proportion_feature

Unnamed: 0,Pclass,Pclass_proportion
0,1,0.257522
1,2,0.235398
2,3,0.50708


* 性别比例

In [353]:
sex_proportion_feature = category_feature_proportion(df_all,"Sex","Sex_proportion")
sex_proportion_feature

Unnamed: 0,Sex,Sex_proportion
0,female,0.363717
1,male,0.636283


* 同乘的兄弟姐妹/配偶数比例

In [354]:
sibSp_proportion_feature = category_feature_proportion(df_all,"SibSp","SibSp_proportion")
sibSp_proportion_feature

Unnamed: 0,SibSp,SibSp_proportion
0,0,0.250442
1,1,0.097345
2,2,0.012389
3,3,0.00354
4,4,0.00354
5,5,0.002655
6,0,0.415044
7,1,0.161947
8,2,0.022124
9,3,0.010619


* 同乘的父母/小孩数比例

In [355]:
parch_proportion_feature = category_feature_proportion(df_all,"Parch","Parch_proportion")
parch_proportion_feature

Unnamed: 0,Parch,Parch_proportion
0,0,0.286726
1,1,0.046018
2,2,0.029204
3,3,0.002655
4,4,0.00177
5,5,0.000885
6,6,0.002655
7,0,0.459292
8,1,0.097345
9,2,0.060177


* 登船港口比例

In [356]:
embarked_proportion_feature = category_feature_proportion(df_all,"Embarked","Embarked_proportion")
embarked_proportion_feature

Unnamed: 0,Embarked,Embarked_proportion
0,C,0.20531
1,Q,0.065487
2,S,0.729204


* 年龄分段

In [357]:
#年龄处理，切段
def age_process(age):
    age = int(age)
    if age<10:
        return 1
    elif age<20:
        return 2
    elif age<30:
        return 3
    elif age<40:
        return 4
    elif age<50:
        return 5
    elif age<60:
        return 6
    elif age<70:
        return 7
    else:
        return 8
    
#df_all['Age'] = df_all['Age'].astype('float')
#df_all['Age_section'] = df_all['Age'].apply(age_process)

* 年龄分段比例

In [358]:
#age_section_proportion_feature = category_feature_proportion(df_all,"Age_section","Age_section_proportion")
#age_section_proportion_feature

* 船票价格分段

In [359]:
#船票价格处理，切段
def fare_process(fare):
    fare = float(fare)
    if fare<64:
        return 1
    elif fare<128:
        return 2
    elif fare<192:
        return 3
    elif fare<256:
        return 4
    elif fare<320:
        return 5
    elif fare<448:
        return 6
    elif fare<512:
        return 7
    else:
        return 8
df_all['Fare'] = df_all['Fare'].astype('float')
df_all['Fare_section'] = df_all['Fare'].apply(fare_process)

* 船票价格分段比例

In [360]:
fare_section_proportion_feature = category_feature_proportion(df_all,"Fare_section","Fare_section_proportion")
fare_section_proportion_feature

Unnamed: 0,Fare_section,Fare_section_proportion
0,1,0.859292
1,2,0.084956
2,3,0.023894
3,4,0.016814
4,5,0.011504
5,8,0.00354


In [361]:
df_all.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,1130,1130,680,1,0.0
Pclass,1130,3,3,573,0.0
Sex,1130,2,male,719,0.0
Age,1044,98,24,47,0.076106
SibSp,1130,12,0,469,0.0
Parch,1130,14,0,519,0.0
Embarked,1130,3,S,824,0.0


In [362]:
df_all.select_dtypes(include="float").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Fare,1130.0,34.938832,54.012205,0.0,8.05,14.75,31.6344,512.3292,0.0


In [363]:
df_all.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Fare_section
1,1,3,male,22,1,0,7.25,S,1
2,2,1,female,38,1,0,71.2833,C,2
3,3,3,female,26,0,0,7.925,S,1
4,4,1,female,35,1,0,53.1,S,1
5,5,3,male,35,0,0,8.05,S,1


In [364]:
df_all = pd.merge(df_all, pclass_proportion_feature, on=["Pclass"], how='left')
df_all = pd.merge(df_all, sex_proportion_feature, on=["Sex"], how='left')
df_all = pd.merge(df_all, sibSp_proportion_feature, on=["SibSp"], how='left')
df_all = pd.merge(df_all, parch_proportion_feature, on=["Parch"], how='left')
df_all = pd.merge(df_all, embarked_proportion_feature, on=["Embarked"], how='left')
#df_all = pd.merge(df_all, age_section_proportion_feature, on=["Age_section"], how='left')
df_all = pd.merge(df_all, fare_section_proportion_feature, on=["Fare_section"], how='left')

In [365]:
df_all.select_dtypes(include="float").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Fare,1130.0,34.938832,54.012205,0.0,8.05,14.75,31.6344,512.3292,0.0
Pclass_proportion,1130.0,0.37886,0.130337,0.235398,0.257522,0.50708,0.50708,0.50708,0.0
Sex_proportion,1130.0,0.537146,0.131181,0.363717,0.363717,0.636283,0.636283,0.636283,0.0
SibSp_proportion,1130.0,0.271747,0.136243,0.002655,0.161947,0.250442,0.415044,0.415044,0.0
Parch_proportion,1130.0,0.309299,0.161492,0.000885,0.097345,0.286726,0.459292,0.459292,0.0
Embarked_proportion,1130.0,0.578178,0.24989,0.065487,0.20531,0.729204,0.729204,0.729204,0.0
Fare_section_proportion,1130.0,0.746599,0.278888,0.00354,0.859292,0.859292,0.859292,0.859292,0.0


In [366]:
df_all.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_all.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,1130,1130,680,1,0.0
Pclass,1130,3,3,573,0.0
Sex,1130,2,male,719,0.0
Age,1044,98,24,47,0.076106
SibSp,1130,12,0,469,0.0
Parch,1130,14,0,519,0.0
Embarked,1130,3,S,824,0.0


In [367]:
df_all.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Fare_section', 'Pclass_proportion', 'Sex_proportion',
       'SibSp_proportion', 'Parch_proportion', 'Embarked_proportion',
       'Fare_section_proportion'],
      dtype='object')

* 归一化+标准化

In [368]:
from sklearn import preprocessing

#归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
X_temp = min_max_scaler.fit_transform(df_all[['SibSp', 'Parch', 'Fare',
        'Pclass_proportion', 'Sex_proportion',
        'SibSp_proportion', 'Parch_proportion', 'Embarked_proportion',
        'Fare_section_proportion']])
#标准化处理
df_all[['SibSp', 'Parch', 'Fare',
        'Pclass_proportion', 'Sex_proportion',
        'SibSp_proportion', 'Parch_proportion', 'Embarked_proportion',
        'Fare_section_proportion']] = preprocessing.scale(X_temp)

* One-Hot

In [369]:
#One-Hot Encoding
df_all = pd.get_dummies(df_all,columns = ['Pclass', 'Sex', 'Fare_section', 'Embarked'])

In [370]:
df_all.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_proportion,Sex_proportion,SibSp_proportion,Parch_proportion,Embarked_proportion,...,Sex_male,Fare_section_1,Fare_section_2,Fare_section_3,Fare_section_4,Fare_section_5,Fare_section_8,Embarked_C,Embarked_Q,Embarked_S
0,1,22,0.584568,-0.4795,-0.512867,0.98419,0.75606,-0.806271,0.929207,0.604633,...,1,1,0,0,0,0,0,0,0,1
1,2,38,0.584568,-0.4795,0.673192,-0.931363,-1.322646,-0.806271,0.929207,-1.49279,...,0,0,1,0,0,0,0,1,0,0
2,3,26,-0.548472,-0.4795,-0.500365,0.98419,-1.322646,1.05224,0.929207,0.604633,...,0,1,0,0,0,0,0,0,0,1
3,4,35,0.584568,-0.4795,0.336391,-0.931363,-1.322646,-0.806271,0.929207,0.604633,...,0,1,0,0,0,0,0,0,0,1
4,5,35,-0.548472,-0.4795,-0.498049,0.98419,0.75606,1.05224,0.929207,0.604633,...,1,1,0,0,0,0,0,0,0,1


* 划分训练集和测试集

In [371]:
age_not_null = df_all[df_all['Age'].notnull()]
age_is_null = df_all[df_all['Age'].isnull()]

In [372]:
age_not_null.shape

(1044, 25)

In [373]:
age_is_null.shape

(86, 25)

In [374]:
Y = age_not_null.Age
X = age_not_null.drop('Age',1,inplace=False)
X_test = age_is_null.drop('Age',1,inplace=False)

In [375]:
X.columns

Index(['PassengerId', 'SibSp', 'Parch', 'Fare', 'Pclass_proportion',
       'Sex_proportion', 'SibSp_proportion', 'Parch_proportion',
       'Embarked_proportion', 'Fare_section_proportion', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Fare_section_1',
       'Fare_section_2', 'Fare_section_3', 'Fare_section_4', 'Fare_section_5',
       'Fare_section_8', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [376]:
X_train = X[['SibSp', 'Parch', 'Fare', 'Pclass_proportion',
       'Sex_proportion', 'SibSp_proportion', 'Parch_proportion',
       'Embarked_proportion', 'Fare_section_proportion', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Fare_section_1',
       'Fare_section_2', 'Fare_section_3', 'Fare_section_4', 'Fare_section_5',
       'Fare_section_8', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
X_pred = X_test[['SibSp', 'Parch', 'Fare', 'Pclass_proportion',
       'Sex_proportion', 'SibSp_proportion', 'Parch_proportion',
       'Embarked_proportion', 'Fare_section_proportion', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Fare_section_1',
       'Fare_section_2', 'Fare_section_3', 'Fare_section_4', 'Fare_section_5',
       'Fare_section_8', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

In [377]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()#多元线性回归模型
linreg.fit(X_train, Y)#数据拟合
y_pred = linreg.predict(X_pred)

In [378]:
X_test['Age'] = y_pred

In [379]:
X_test.head()

Unnamed: 0,PassengerId,SibSp,Parch,Fare,Pclass_proportion,Sex_proportion,SibSp_proportion,Parch_proportion,Embarked_proportion,Fare_section_proportion,...,Fare_section_1,Fare_section_2,Fare_section_3,Fare_section_4,Fare_section_5,Fare_section_8,Embarked_C,Embarked_Q,Embarked_S,Age
722,902,-0.548472,-0.4795,-0.500905,0.98419,0.75606,-0.156442,-0.139843,0.604633,0.404259,...,1,0,0,0,0,0,0,0,1,26.667409
734,914,-0.548472,-0.4795,-0.060301,-0.931363,-1.322646,-0.156442,-0.139843,0.604633,0.404259,...,1,0,0,0,0,0,0,0,1,40.606351
741,921,1.717609,-0.4795,-0.245602,0.98419,0.75606,-1.904482,-0.139843,-1.49279,0.404259,...,1,0,0,0,0,0,1,0,0,19.118879
745,925,0.584568,1.845972,-0.212802,0.98419,-1.322646,-1.280646,-1.735194,0.604633,0.404259,...,1,0,0,0,0,0,0,0,1,21.706078
748,928,-0.548472,-0.4795,-0.498049,0.98419,-1.322646,-0.156442,-0.139843,0.604633,0.404259,...,1,0,0,0,0,0,0,0,1,24.157826


In [380]:
pred_age_feature = X_test[["PassengerId","Age"]]
pred_age_feature.shape

(86, 2)

In [381]:
pred_age_feature.describe().T.assign(
  missing_pct=pred_age_feature.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Age,86.0,27.919041,6.581149,1.986194,24.941301,26.701318,29.824201,46.323559,0.0


In [382]:
age_is_null = df_test_temp[df_test_temp['Age'].isnull()]
age_not_null = df_test_temp[df_test_temp['Age'].notnull()]

print(age_not_null.shape)

age_is_null.drop('Age',1,inplace=True)
print(age_is_null.shape)

age_is_null = pd.merge(age_is_null, pred_age_feature, on=["PassengerId"], how='left')

frames = [age_is_null, age_not_null]
df_test = pd.concat(frames)


(332, 8)
(86, 7)


In [383]:
df_test['Age'] = df_test['Age'].astype('float')
df_test.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
Embarked,418,3,S,270,0.0
Parch,418,8,0,324,0.0
PassengerId,418,418,1275,1,0.0
Pclass,418,3,3,218,0.0
Sex,418,2,male,266,0.0
SibSp,418,7,0,283,0.0


In [384]:
df_test.head()

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp
0,26.667409,S,7.8958,0,902,3,male,0
1,40.606351,S,31.6833,0,914,1,female,0
2,19.118879,C,21.6792,0,921,3,male,2
3,21.706078,S,23.45,2,925,3,female,1
4,24.157826,S,8.05,0,928,3,female,0


In [385]:
df_train['Age'] = df_train['Age'].astype('float')
df_train.select_dtypes(include="object").describe().T.assign(
  missing_pct=df_train.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,unique,top,freq,missing_pct
PassengerId,712,712,423,1,0.0
Survived,712,2,0,424,0.0
Pclass,712,3,3,355,0.0
Sex,712,2,male,453,0.0
SibSp,712,6,0,469,0.0
Parch,712,7,0,519,0.0
Embarked,712,3,S,554,0.0


## 提取组合特征-幸存率：
* 客舱等级&幸存比例
* 性别&幸存比例
* 分段年龄&幸存比例
* 同乘的兄弟姐妹/配偶数&幸存率比例
* 同乘的父母/小孩数&幸存率比例
* 船票价格分段&幸存率比例
* 登船港口&幸存率比例

In [386]:
def extrac_group_feature(df,feature1,feature2,new_feature):
    def computer_p(x):
        return x/df.shape[0]
    t1 = df[[feature1,feature2]]
    t1[new_feature] = 1
    t1 = t1.groupby([feature1,feature2]).agg('sum').reset_index()
    t1[new_feature] = t1[new_feature].apply(computer_p)
    return t1

* 客舱等级&幸存比例

In [387]:
pclass_survived_proportion_feature= extrac_group_feature(df_train,"Pclass","Survived","pclass_survived_proportion")
pclass_survived_proportion_feature.drop("Survived",1,inplace=True)
pclass_survived_proportion_feature = pclass_survived_proportion_feature.groupby(["Pclass"]).agg('sum').reset_index()
pclass_survived_proportion_feature

Unnamed: 0,Pclass,pclass_survived_proportion
0,1,0.258427
1,2,0.242978
2,3,0.498596


* 性别&幸存比例

In [388]:
sex_survived_proportion_feature= extrac_group_feature(df_train,"Sex","Survived","sex_survived_proportion")
sex_survived_proportion_feature.drop("Survived",1,inplace=True)
sex_survived_proportion_feature = sex_survived_proportion_feature.groupby(["Sex"]).agg('sum').reset_index()
sex_survived_proportion_feature

Unnamed: 0,Sex,sex_survived_proportion
0,female,0.363764
1,male,0.636236


* 分段年龄&幸存比例

In [389]:
df_train['Age'] = df_train['Age'].astype('float')
df_train['Age_section'] = df_train['Age'].apply(age_process)

In [390]:
age_section_survived_proportion_feature= extrac_group_feature(df_train,"Age_section","Survived","age_section_survived_proportion")
age_section_survived_proportion_feature.drop("Survived",1,inplace=True)
age_section_survived_proportion_feature = age_section_survived_proportion_feature.groupby(["Age_section"]).agg('sum').reset_index()
age_section_survived_proportion_feature

Unnamed: 0,Age_section,age_section_survived_proportion
0,1,0.087079
1,2,0.143258
2,3,0.308989
3,4,0.233146
4,5,0.125
5,6,0.067416
6,7,0.025281
7,8,0.009831


* 同乘的兄弟姐妹/配偶数&幸存率比例

In [391]:
sibSp_survived_proportion_feature = extrac_group_feature(df_train,"SibSp","Survived","sibSp_survived_proportion")
sibSp_survived_proportion_feature.drop("Survived",1,inplace=True)
sibSp_survived_proportion_feature = sibSp_survived_proportion_feature.groupby(["SibSp"]).agg('sum').reset_index()
sibSp_survived_proportion_feature

Unnamed: 0,SibSp,sibSp_survived_proportion
0,0,0.658708
1,1,0.257022
2,2,0.035112
3,3,0.016854
4,4,0.025281
5,5,0.007022


* 同乘的父母/小孩数&幸存率比例

In [392]:
parch_survived_proportion_feature = extrac_group_feature(df_train,"Parch","Survived","parch_survived_proportion")
parch_survived_proportion_feature.drop("Survived",1,inplace=True)
parch_survived_proportion_feature = parch_survived_proportion_feature.groupby(["Parch"]).agg('sum').reset_index()
parch_survived_proportion_feature

Unnamed: 0,Parch,parch_survived_proportion
0,0,0.728933
1,1,0.154494
2,2,0.095506
3,3,0.007022
4,4,0.005618
5,5,0.007022
6,6,0.001404


* 船票价格分段&幸存率比例

In [393]:
df_train['Fare'] = df_train['Fare'].astype('float')
df_train['Fare_section'] = df_train['Fare'].apply(fare_process)

In [394]:
fare_section_survived_proportion_feature = extrac_group_feature(df_train,"Fare_section","Survived","fare_section_survived_proportion")
fare_section_survived_proportion_feature.drop("Survived",1,inplace=True)
fare_section_survived_proportion_feature = fare_section_survived_proportion_feature.groupby(["Fare_section"]).agg('sum').reset_index()
fare_section_survived_proportion_feature

Unnamed: 0,Fare_section,fare_section_survived_proportion
0,1,0.858146
1,2,0.094101
2,3,0.022472
3,4,0.01264
4,5,0.008427
5,8,0.004213


* 登船港口&幸存率比例

In [395]:
embarked_survived_proportion_feature = extrac_group_feature(df_train,"Embarked","Survived","embarked_survived_proportion")
embarked_survived_proportion_feature.drop("Survived",1,inplace=True)
embarked_survived_proportion_feature = embarked_survived_proportion_feature.groupby(["Embarked"]).agg('sum').reset_index()
embarked_survived_proportion_feature

Unnamed: 0,Embarked,embarked_survived_proportion
0,C,0.182584
1,Q,0.039326
2,S,0.77809


* 客舱等级&幸存比例
* 性别&幸存比例
* 分段年龄&幸存比例
* 同乘的兄弟姐妹/配偶数&幸存率比例
* 同乘的父母/小孩数&幸存率比例
* 船票价格分段&幸存率比例
* 登船港口&幸存率比例

In [396]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Age_section', 'Fare_section'],
      dtype='object')

In [398]:
df_train.shape

(712, 11)

In [399]:
df_test.columns

Index(['Age', 'Embarked', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex',
       'SibSp'],
      dtype='object')

In [400]:
df_test.shape

(418, 8)

In [401]:
#提取单一特征
df_train = pd.merge(df_train, pclass_proportion_feature, on=["Pclass"], how='left')
df_train = pd.merge(df_train, sex_proportion_feature, on=["Sex"], how='left')
df_train = pd.merge(df_train, sibSp_proportion_feature, on=["SibSp"], how='left')
df_train = pd.merge(df_train, parch_proportion_feature, on=["Parch"], how='left')
df_train = pd.merge(df_train, embarked_proportion_feature, on=["Embarked"], how='left')
df_train = pd.merge(df_train, fare_section_proportion_feature, on=["Fare_section"], how='left')

#提取组合特征
df_train = pd.merge(df_train, pclass_survived_proportion_feature, on=["Pclass"], how='left')
df_train = pd.merge(df_train, sex_survived_proportion_feature, on=["Sex"], how='left')
df_train = pd.merge(df_train, age_section_survived_proportion_feature, on=["Age_section"], how='left')
df_train = pd.merge(df_train, sibSp_survived_proportion_feature, on=["SibSp"], how='left')
df_train = pd.merge(df_train, parch_survived_proportion_feature, on=["Parch"], how='left')
df_train = pd.merge(df_train, fare_section_survived_proportion_feature, on=["Fare_section"], how='left')
df_train = pd.merge(df_train, embarked_survived_proportion_feature, on=["Embarked"], how='left')

In [402]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Age_section', 'Fare_section', 'Pclass_proportion',
       'Sex_proportion', 'SibSp_proportion', 'Parch_proportion',
       'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion'],
      dtype='object')

In [403]:
df_train.shape

(712, 24)

One-Hot Encoded：
* 客舱等级
* 性别
* 登船港口
* 分段年龄
* 船票价格年龄

In [404]:
df_train = pd.get_dummies(df_train,columns = ['Pclass', 'Sex',
       'Embarked', 'Age_section', 'Fare_section'])
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_proportion,Sex_proportion,SibSp_proportion,Parch_proportion,...,Age_section_5,Age_section_6,Age_section_7,Age_section_8,Fare_section_1,Fare_section_2,Fare_section_3,Fare_section_4,Fare_section_5,Fare_section_8
0,1,0,22.0,1,0,7.25,0.50708,0.636283,0.161947,0.459292,...,0,0,0,0,1,0,0,0,0,0
1,2,1,38.0,1,0,71.2833,0.257522,0.363717,0.161947,0.459292,...,0,0,0,0,0,1,0,0,0,0
2,3,1,26.0,0,0,7.925,0.50708,0.363717,0.415044,0.459292,...,0,0,0,0,1,0,0,0,0,0
3,4,1,35.0,1,0,53.1,0.257522,0.363717,0.161947,0.459292,...,0,0,0,0,1,0,0,0,0,0
4,5,0,35.0,0,0,8.05,0.50708,0.636283,0.415044,0.459292,...,0,0,0,0,1,0,0,0,0,0


In [405]:
df_train.columns

Index(['PassengerId', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare',
       'Pclass_proportion', 'Sex_proportion', 'SibSp_proportion',
       'Parch_proportion', 'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Age_section_1', 'Age_section_2', 'Age_section_3', 'Age_section_4',
       'Age_section_5', 'Age_section_6', 'Age_section_7', 'Age_section_8',
       'Fare_section_1', 'Fare_section_2', 'Fare_section_3', 'Fare_section_4',
       'Fare_section_5', 'Fare_section_8'],
      dtype='object')

In [406]:
from sklearn import preprocessing

#归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
X_temp = min_max_scaler.fit_transform(df_train[['Age', 'SibSp', 'Parch', 'Fare',
       'Pclass_proportion', 'Sex_proportion', 'SibSp_proportion',
       'Parch_proportion', 'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion']])
#标准化处理
df_train[['Age', 'SibSp', 'Parch', 'Fare',
       'Pclass_proportion', 'Sex_proportion', 'SibSp_proportion',
       'Parch_proportion', 'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion']] = preprocessing.scale(X_temp)

In [407]:
df_train.drop('PassengerId',1,inplace=True)
Y = df_train.Survived
X = df_train.drop('Survived',1,inplace=False)

# 最终LR 模型训练

In [408]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X, Y )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 测试集预测

In [409]:
df_test['Age'] = df_test['Age'].astype('float')
df_test['Age_section'] = df_test['Age'].apply(age_process)
df_test['Fare'] = df_test['Fare'].astype('float')
df_test['Fare_section'] = df_test['Fare'].apply(fare_process)

In [410]:
df_test.shape

(418, 10)

In [411]:
df_test.columns

Index(['Age', 'Embarked', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex',
       'SibSp', 'Age_section', 'Fare_section'],
      dtype='object')

## 提取单一特征：
* 客舱等级比例
* 性别比例
* 同乘的兄弟姐妹/配偶数比例
* 同乘的父母/小孩数比例
* 登船港口比例
* 船票价格分段
* 船票价格分段比例

In [412]:
#提取单一特征
df_test = pd.merge(df_test, pclass_proportion_feature, on=["Pclass"], how='left')
df_test = pd.merge(df_test, sex_proportion_feature, on=["Sex"], how='left')
df_test = pd.merge(df_test, sibSp_proportion_feature, on=["SibSp"], how='left')
df_test = pd.merge(df_test, parch_proportion_feature, on=["Parch"], how='left')
df_test = pd.merge(df_test, embarked_proportion_feature, on=["Embarked"], how='left')
df_test = pd.merge(df_test, fare_section_proportion_feature, on=["Fare_section"], how='left')


In [413]:
df_test.shape

(418, 16)

In [414]:
df_test.columns

Index(['Age', 'Embarked', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex',
       'SibSp', 'Age_section', 'Fare_section', 'Pclass_proportion',
       'Sex_proportion', 'SibSp_proportion', 'Parch_proportion',
       'Embarked_proportion', 'Fare_section_proportion'],
      dtype='object')

In [415]:
df_test_feature = df_test.copy()

In [416]:
df_test.drop("PassengerId",1,inplace=True)

## 提取组合特征-幸存率：
* 客舱等级&幸存比例
* 性别&幸存比例
* 分段年龄&幸存比例
* 同乘的兄弟姐妹/配偶数&幸存率比例
* 同乘的父母/小孩数&幸存率比例
* 船票价格分段&幸存率比例
* 登船港口&幸存率比例

In [417]:
#提取组合特征
df_test = pd.merge(df_test, pclass_survived_proportion_feature, on=["Pclass"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, sex_survived_proportion_feature, on=["Sex"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, sibSp_survived_proportion_feature, on=["SibSp"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, parch_survived_proportion_feature, on=["Parch"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, embarked_survived_proportion_feature, on=["Embarked"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, age_section_survived_proportion_feature, on=["Age_section"], how='left')
print(df_test.shape)
df_test = pd.merge(df_test, fare_section_survived_proportion_feature, on=["Fare_section"], how='left')
print(df_test.shape)

(418, 16)
(418, 17)
(418, 18)
(418, 19)
(418, 20)
(418, 21)
(418, 22)


In [418]:
df_test.select_dtypes(include="float").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Age,418.0,29.788367,13.014119,0.17,22.0,27.0,36.875,76.0,0.0
Fare,418.0,35.571764,55.851997,0.0,7.8958,14.4542,31.471875,512.3292,0.0
Pclass_proportion,418.0,0.382752,0.130182,0.235398,0.257522,0.50708,0.50708,0.50708,0.0
Sex_proportion,418.0,0.537168,0.131274,0.363717,0.363717,0.636283,0.636283,0.636283,0.0
SibSp_proportion,416.0,0.326183,0.13338,0.004425,0.161947,0.415044,0.415044,0.415044,0.004785
Parch_proportion,416.0,0.374721,0.159418,0.000885,0.459292,0.459292,0.459292,0.459292,0.004785
Embarked_proportion,418.0,0.528323,0.274372,0.065487,0.20531,0.729204,0.729204,0.729204,0.0
Fare_section_proportion,418.0,0.747186,0.279915,0.00354,0.859292,0.859292,0.859292,0.859292,0.0
pclass_survived_proportion,418.0,0.380245,0.123825,0.242978,0.258427,0.498596,0.498596,0.498596,0.0
sex_survived_proportion,418.0,0.537155,0.131229,0.363764,0.363764,0.636236,0.636236,0.636236,0.0


One-Hot Encoded：
* 客舱等级
* 性别
* 登船港口
* 分段年龄
* 船票价格年龄

In [419]:
df_test = pd.get_dummies(df_test,columns = ['Pclass', 'Sex',
       'Embarked', 'Age_section', 'Fare_section'])
df_test.head()

Unnamed: 0,Age,Fare,Parch,SibSp,Pclass_proportion,Sex_proportion,SibSp_proportion,Parch_proportion,Embarked_proportion,Fare_section_proportion,...,Age_section_5,Age_section_6,Age_section_7,Age_section_8,Fare_section_1,Fare_section_2,Fare_section_3,Fare_section_4,Fare_section_5,Fare_section_8
0,26.667409,7.8958,0,0,0.50708,0.636283,0.415044,0.459292,0.729204,0.859292,...,0,0,0,0,1,0,0,0,0,0
1,40.606351,31.6833,0,0,0.257522,0.363717,0.415044,0.459292,0.729204,0.859292,...,1,0,0,0,1,0,0,0,0,0
2,19.118879,21.6792,0,2,0.50708,0.636283,0.022124,0.459292,0.20531,0.859292,...,0,0,0,0,1,0,0,0,0,0
3,21.706078,23.45,2,1,0.50708,0.363717,0.161947,0.060177,0.729204,0.859292,...,0,0,0,0,1,0,0,0,0,0
4,24.157826,8.05,0,0,0.50708,0.363717,0.415044,0.459292,0.729204,0.859292,...,0,0,0,0,1,0,0,0,0,0


In [420]:
df_test.select_dtypes(include="float").describe().T.assign(
  missing_pct=df_test.apply(lambda x : (len(x)-x.count())/float(len(x))))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_pct
Age,418.0,29.788367,13.014119,0.17,22.0,27.0,36.875,76.0,0.0
Fare,418.0,35.571764,55.851997,0.0,7.8958,14.4542,31.471875,512.3292,0.0
Pclass_proportion,418.0,0.382752,0.130182,0.235398,0.257522,0.50708,0.50708,0.50708,0.0
Sex_proportion,418.0,0.537168,0.131274,0.363717,0.363717,0.636283,0.636283,0.636283,0.0
SibSp_proportion,416.0,0.326183,0.13338,0.004425,0.161947,0.415044,0.415044,0.415044,0.004785
Parch_proportion,416.0,0.374721,0.159418,0.000885,0.459292,0.459292,0.459292,0.459292,0.004785
Embarked_proportion,418.0,0.528323,0.274372,0.065487,0.20531,0.729204,0.729204,0.729204,0.0
Fare_section_proportion,418.0,0.747186,0.279915,0.00354,0.859292,0.859292,0.859292,0.859292,0.0
pclass_survived_proportion,418.0,0.380245,0.123825,0.242978,0.258427,0.498596,0.498596,0.498596,0.0
sex_survived_proportion,418.0,0.537155,0.131229,0.363764,0.363764,0.636236,0.636236,0.636236,0.0


* 归一化+标准化

In [421]:
from sklearn import preprocessing

#归一化处理
min_max_scaler = preprocessing.MinMaxScaler()
X_temp = min_max_scaler.fit_transform(df_test[['Age', 'SibSp', 'Parch', 'Fare',
       'Pclass_proportion', 'Sex_proportion', 'SibSp_proportion',
       'Parch_proportion', 'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion']])
#标准化处理
df_test[['Age', 'SibSp', 'Parch', 'Fare',
       'Pclass_proportion', 'Sex_proportion', 'SibSp_proportion',
       'Parch_proportion', 'Embarked_proportion', 'Fare_section_proportion',
       'pclass_survived_proportion', 'sex_survived_proportion',
       'age_section_survived_proportion', 'sibSp_survived_proportion',
       'parch_survived_proportion', 'fare_section_survived_proportion',
       'embarked_survived_proportion']] = preprocessing.scale(X_temp)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
Predict = lr.predict(df_test)

In [None]:
# submission
df = pd.DataFrame({"PassengerId": df_test_feature["PassengerId"].values, "Survived": Predict})
df.to_csv("gender_submission1.csv", index=False)