In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Load the Titanic training and test splits from the local ./Titanic folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (r'./Titanic/train.csv', r'./Titanic/test.csv')
)

In [3]:
# Data exploration: print a quick overview of the training set.
separator = '-' * 30
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() emitted a stray "None" line.
train_data.info()
print(separator)
# Summary statistics of the numeric columns.
print(train_data.describe())
print(separator)
# describe(include=['O']) summarises the object (string, non-numeric) columns.
print(train_data.describe(include=['O']))
print(separator)
# First and last rows, to eyeball the raw values.
print(train_data.head())
print(separator)
print(train_data.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008  

In [4]:
# Data cleaning: replace missing ages with the mean age of the same dataset.
# The filled column is assigned back instead of calling
# `frame['Age'].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, which pandas reports as chained assignment and which leaves the
# DataFrame untouched once Copy-on-Write is active.
# NOTE(review): the test set is still imputed with its own mean, as before;
# consider reusing the training mean so that no test-set statistics enter
# preprocessing.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())

In [5]:
# Replace missing fares with the mean fare of the same dataset.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `frame['Fare']` operates on an intermediate Series (chained assignment) and
# does not update the DataFrame when pandas Copy-on-Write is active.
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [6]:
# Count passengers per embarkation port to find the most frequent one.
embarked_counts = train_data['Embarked'].value_counts()
print(embarked_counts)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [7]:
# Replace missing embarkation ports with the most frequent port (the mode) of
# the same dataset.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `frame['Embarked']` operates on an intermediate Series (chained assignment)
# and does not update the DataFrame when pandas Copy-on-Write is active.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])

In [8]:
# Feature selection: the columns fed to the model, and the target column.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]
train_labels = train_data['Survived']
# Take the same column subset from both splits.
train_features, test_features = (
    frame[features] for frame in (train_data, test_data)
)

In [9]:
# DictVectorizer turns a list of per-row dicts into a numeric feature matrix:
# string-valued features (Sex, Embarked) become one 0/1 indicator column per
# category, while numeric features keep a single column each.
dvec = DictVectorizer(sparse=False)
# orient='records' is the documented spelling; the short form 'record' is
# deprecated and rejected by newer pandas releases.
# The rows are read from train_data[features] rather than from train_features,
# because this cell rebinds train_features to the resulting matrix; reading
# from the DataFrame keeps the cell safe to re-run.
train_features = dvec.fit_transform(train_data[features].to_dict(orient='records'))
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


  train_features = dvec.fit_transform(train_features.to_dict(orient='record'))


In [10]:
# Build a decision tree that chooses splits by entropy (information gain).
# scikit-learn implements an optimised CART, so this is not a literal ID3 tree;
# only the split criterion is the same.
# random_state is fixed so that tie-breaking between equally good splits, and
# therefore the reported scores, are reproducible across runs.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
# Train the tree on the vectorised training features.
clf.fit(train_features, train_labels)
# Encode the test rows with the vectoriser fitted on the training data.
# orient='records' is the documented spelling ('record' is deprecated), and
# the rows come from test_data[features] so the cell can be re-run even though
# it rebinds test_features to the encoded matrix.
test_features = dvec.transform(test_data[features].to_dict(orient='records'))
# Predict survival labels for the test set with the trained tree.
pred_labels = clf.predict(test_features)

  test_features = dvec.transform(test_features.to_dict(orient='record'))


In [11]:
# Accuracy of the tree on the very rows it was fitted on (training accuracy),
# so it is an optimistic figure rather than an estimate of generalisation.
train_accuracy = clf.score(train_features, train_labels)
acc_decision_tree = round(train_accuracy, 6)
print(u'score 准确率为 %.4lf' % (acc_decision_tree,))

score 准确率为 0.9820


In [12]:
"""
K 折交叉验证的原理是这样的：
1、将数据集平均分割成 K 个等份；
2、使用 1 份数据作为测试数据，其余作为训练数据；计算测试准确率；
3、使用不同的测试集，重复 2、3 步骤。
"""
# cross_val_score 函数中的参数 cv 代表对原始数据划分成多少份

'\nK 折交叉验证的原理是这样的：\n1、将数据集平均分割成 K 个等份；\n2、使用 1 份数据作为测试数据，其余作为训练数据；计算测试准确率；\n3、使用不同的测试集，重复 2、3 步骤。\n'

In [13]:
import numpy as np
from sklearn.model_selection import cross_val_score

# Estimate the tree's accuracy with 10-fold cross-validation and report the
# mean of the per-fold scores.
fold_scores = cross_val_score(clf, train_features, train_labels, cv=10)
mean_fold_score = np.mean(fold_scores)
print(u'cross_val_score准确率为 %.4lf' % mean_fold_score)

cross_val_score准确率为 0.7812


In [15]:
from sklearn import tree
import graphviz

# Export the fitted tree as Graphviz DOT source. Passing the vectoriser's
# column names and the classifier's class labels makes every node readable
# (e.g. "Sex=male <= 0.5") instead of the anonymous X[i] placeholders.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=dvec.feature_names_,
    class_names=[str(label) for label in clf.classes_],
)
graph = graphviz.Source(dot_data)
# Write the DOT source and its rendered PDF under the base name "tree".
graph.render("tree")
# Render again under the base name "graph" and open the result in the system
# viewer; the returned path is shown as the cell output.
graph.view('graph')

'graph.pdf'