In [1]:
# Import libraries
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training data. The file is parsed without a header row and the
# column labels are supplied explicitly; row 0 (presumably the file's own
# header line) is then discarded, leaving index labels that start at 1.
df_train = pd.read_csv(
    'data/train.csv',
    header=None,
    names=[
        'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
        "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
    ],
)
df_train = df_train.drop(index=0)
df_train.head()
# Column glossary:
# PassengerId: passenger number
# Survived: survival outcome (survived: 1, died: 0)
# Pclass: ticket class
# Name: passenger name
# Sex: sex
# Age: age
# SibSp: number of siblings/spouses travelling with the passenger
# Parch: number of parents/children travelling with the passenger
# Ticket: ticket number
# Fare: ticket price
# Cabin: cabin number
# Embarked: port of embarkation

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [3]:
# Print the index range, per-column non-null counts and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
PassengerId    891 non-null object
Survived       891 non-null object
Pclass         891 non-null object
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null object
SibSp          891 non-null object
Parch          891 non-null object
Ticket         891 non-null object
Fare           891 non-null object
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: object(12)
memory usage: 90.5+ KB


In [4]:
# Remove the columns that are not used as model features.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and no longer accepts in 2.0. Reassigning instead of
# mutating in place keeps the step a plain "new frame from old frame" transform.
df_train = df_train.drop(columns=["PassengerId", "Name", "Ticket"])

In [5]:
# Preview the first rows of the current training frame.
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
1,0,3,male,22,1,0,7.25,,S
2,1,1,female,38,1,0,71.2833,C85,C
3,1,3,female,26,0,0,7.925,,S
4,1,1,female,35,1,0,53.1,C123,S
5,0,3,male,35,0,0,8.05,,S


In [6]:
# Per-column summary (count / unique / top / freq) of the object-dtype columns,
# extended with the fraction of missing values in each column.
(
    df_train
    .select_dtypes(include="object")
    .describe()
    .T
    .assign(missing_pct=df_train.isnull().mean())
)

Unnamed: 0,count,unique,top,freq,missing_pct
Survived,891,2,0,549,0.0
Pclass,891,3,3,491,0.0
Sex,891,2,male,577,0.0
Age,714,88,24,30,0.198653
SibSp,891,7,0,608,0.0
Parch,891,7,0,678,0.0
Fare,891,248,8.05,43,0.0
Cabin,204,147,B96 B98,4,0.771044
Embarked,889,3,S,644,0.002245


In [7]:
# Cabin is missing for roughly 77% of the rows (see the summary above), so the
# column is dropped rather than imputed.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
df_train = df_train.drop(columns=["Cabin"])

In [8]:
# Count how many rows have a missing (True) vs. present (False) embarkation port.
df_train["Embarked"].isna().value_counts()

False    889
True       2
Name: Embarked, dtype: int64

In [9]:
# Remove the rows whose embarkation port is missing.
# The rows are selected by null-ness rather than by a sentinel value plus
# hard-coded index labels, so the cell stays correct if the input file changes
# and can be re-run without raising a KeyError on already-removed labels.
df_train = df_train.dropna(subset=["Embarked"])
df_train.shape

(889, 8)

In [10]:
# Re-check the object-dtype column summary and the per-column fraction of
# missing values after the cleaning steps.
(
    df_train
    .select_dtypes(include="object")
    .describe()
    .T
    .assign(missing_pct=df_train.isnull().mean())
)

Unnamed: 0,count,unique,top,freq,missing_pct
Survived,889,2,0,549,0.0
Pclass,889,3,3,491,0.0
Sex,889,2,male,577,0.0
Age,712,88,24,30,0.1991
SibSp,889,7,0,606,0.0
Parch,889,7,0,676,0.0
Fare,889,247,8.05,43,0.0
Embarked,889,3,S,644,0.0


In [11]:
# Convert the numeric feature columns from object dtype to float in one pass.
df_train = df_train.astype(
    {
        "Fare": "float",
        "Age": "float",
        "SibSp": "float",
        "Parch": "float",
    }
)

In [12]:
# One-hot encode the categorical columns; each becomes a set of
# `<column>_<value>` indicator columns.
df_train = pd.get_dummies(
    data=df_train,
    columns=["Sex", "Pclass", "Embarked"],
)
df_train.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
1,0,22.0,1.0,0.0,7.25,0,1,0,0,1,0,0,1
2,1,38.0,1.0,0.0,71.2833,1,0,1,0,0,1,0,0
3,1,26.0,0.0,0.0,7.925,1,0,0,0,1,0,0,1
4,1,35.0,1.0,0.0,53.1,1,0,1,0,0,0,0,1
5,0,35.0,0.0,0.0,8.05,0,1,0,0,1,0,0,1


In [13]:
# Target vector. Every column was loaded with object dtype, so the label is
# cast to int here; string class labels ('0'/'1') are not accepted by recent
# XGBoost releases, which expect integer classes starting at 0.
Y = df_train["Survived"].astype(int)

# Feature matrix: everything except the target.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
X = df_train.drop(columns="Survived")

In [14]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

# xgboost 模型训练

In [15]:
from xgboost import XGBClassifier

# Build the gradient-boosted classifier, then fit it on the training split.
gbm = XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
)
gbm.fit(x_train, y_train)

# Predict on the held-out split and report the mean accuracy on it.
predictions = gbm.predict(x_test)
print(u'xgboost模型的平均正确率为：%s' % gbm.score(x_test, y_test))

xgboost模型的平均正确率为：0.8202247191011236


# 预测

In [16]:
# Load the test data the same way as the training data: parse without a header
# row, supply the column labels explicitly (there is no Survived column here),
# and discard row 0 (presumably the file's own header line).
df_test = pd.read_csv(
    'data/test.csv',
    header=None,
    names=[
        'PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
        "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
    ],
)
df_test = df_test.drop(index=0)

In [17]:
# Keep an unmodified copy of the test frame; its PassengerId column is read
# back when the submission file is built.
df_temp = df_test.copy()

In [18]:
# Drop the same non-feature columns that were removed from the training frame.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and no longer accepts in 2.0.
df_test = df_test.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Convert the numeric feature columns from object dtype to float.
df_test = df_test.astype(
    {
        "Fare": "float",
        "Age": "float",
        "SibSp": "float",
        "Parch": "float",
    }
)

# Fill missing fares with the mean fare of the passenger's class.
# The group mean is computed on the Fare column only; transforming the whole
# frame would also try to average the string columns, which fails on current
# pandas.
df_test["Fare"] = df_test["Fare"].fillna(
    df_test.groupby("Pclass")["Fare"].transform("mean")
)

# One-hot encode the categorical columns, as done for the training frame.
df_test = pd.get_dummies(df_test, columns=["Sex", "Pclass", "Embarked"])

# Replace any remaining missing values with 0.
# NOTE(review): the training cells leave missing Age values as NaN, whereas
# here they become 0 -- confirm this difference is intended.
df_test = df_test.fillna(0)

In [19]:
from sklearn import preprocessing  # import retained in case other cells reference `preprocessing`

# The classifier `gbm` was fitted on the raw, unscaled feature values of
# `x_train`. Min-max scaling and standardising the test features here (with
# statistics fitted on the test set itself) put them on a different scale from
# anything the model saw during training, so its predictions on them were not
# meaningful. The test features are therefore left unscaled.
#
# Instead, make sure the test frame exposes exactly the training feature
# columns, in the same order; an indicator column absent from the test data is
# added as all zeros.
df_test = df_test.reindex(columns=x_train.columns, fill_value=0)

In [20]:
# Predict the survival label for every row of the prepared test frame.
Predict = gbm.predict(df_test)

In [21]:
# Show the feature columns of the test frame that was passed to the model.
df_test.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [22]:
# Build the submission table: one row per test passenger, pairing the original
# PassengerId (from the untouched copy) with the predicted label, and write it
# to disk without the index column.
df = pd.DataFrame(
    {
        "PassengerId": df_temp["PassengerId"].values,
        "Survived": Predict,
    }
)
df.to_csv("gender_submission.csv", index=False)