In [1]:
# End-to-end workflow of a Kaggle competition project:
#
#   1. Understand the problem background
#   2. Download the data
#   3. Explore the data (Exploratory Data Analysis)
#   4. Process the data and engineer features
#   5. Select a model
#   6. Submit the results

# Step 1: understand the problem background
# https://www.kaggle.com/c/titanic

# Step 2: download the data
#
# gender_submission.csv: example of the file we need to submit
# test.csv: test data set
# train.csv: training data set

# Step 3: read the data and inspect it
import pandas as pd

print("load... data")

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None None" line.
train.info()
test.info()

# The training set has 891 rows and 12 columns (the test set has 418 rows and
# 11 columns). Meaning of each column:
#
# - PassengerId: numeric id identifying each passenger.
# - Survived: whether the passenger survived -- survived (1), died (0).
#   This is the column we will predict.
# - Pclass: passenger class, coded 1, 2 or 3.
#   NOTE(review): the original note described this as the ship deck; the
#   Kaggle data dictionary describes it as the ticket class -- confirm.
# - Name: passenger name.
# - Sex: passenger sex -- male / female.
# - Age: passenger age. Partially missing (714 of the 891 training rows have
#   a value).
# - SibSp: number of siblings and spouses aboard.
# - Parch: number of parents and children aboard.
# - Ticket: ticket number.
# - Fare: how much the passenger paid for the ticket.
# - Cabin: which cabin the passenger stayed in.
# - Embarked: where the passenger boarded the Titanic.

load... data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 n

In [2]:
# Preview the first ten raw training rows before any preprocessing.
train.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Step 4: data processing and feature engineering
# Knowing the business background matters a lot in data analysis.
# As the story goes, when the Titanic was sinking the captain gave the order:
# children and women go first, men stay behind.
# With that background in mind, Sex and Age should be the key fields.

# Note: the training and test sets must receive the same processing.


def prepare_features(frame, age_fill):
    """Return a copy of `frame` with Age imputed and Sex encoded as 0/1.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing at least the 'Age' and 'Sex' columns.
    age_fill : float
        Value used to replace missing ages.

    Returns
    -------
    pandas.DataFrame
        Copy of `frame` where 'Age' has no missing values and 'Sex' is
        1 for male and 0 for anything else.
    """
    prepared = frame.copy()
    # 1. Age: fill the gaps with the supplied value.
    prepared['Age'] = prepared['Age'].fillna(age_fill)
    # 2. Sex: male -> 1, everything else -> 0. Matching the already-encoded
    # value 1 as well keeps the cell idempotent: re-running it no longer
    # turns the whole column into 0.
    prepared['Sex'] = prepared['Sex'].isin(['male', 1]).astype('int64')
    return prepared


# The fill value is learned from the training set only and reused for the
# test set, so both sets really get identical processing and no test-set
# statistic leaks into the pipeline.
age_median = train['Age'].median()
train = prepare_features(train, age_median)
test = prepare_features(test, age_median)

# 3. Feature selection
feature = ['Age', 'Sex']

train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",1,28.0,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,,C


In [4]:
# Step 5: model selection
# The task is to predict whether a passenger survives, which is a plain
# binary classification problem. Many models can handle it: perceptron,
# logistic regression, decision tree, SVM, random forest, and so on.
# There is a chart circulating online that suggests which model to pick
# for a given project.

# Here we use the decision tree model provided by scikit-learn.
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fit reproducible: without it, ties between
# equally good splits may be broken differently from run to run.
dt = DecisionTreeClassifier(random_state=0)

dt = dt.fit(train[feature], train['Survived'])


In [5]:
# Step 6: predict on the test set and write the submission file

predict_data = dt.predict(test[feature])

# Start from the id column and attach the predictions next to it.
submission = test[['PassengerId']].copy()
submission['Survived'] = predict_data

submission.to_csv('submission_decision_tree.csv', index=False)