# Tensorflow 框架

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
# Load the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
features, labels = iris.data, iris.target
# Hold out 33% of the samples as the test set (seeded, so the split is repeatable)
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=0
)
# Build a CART classification tree (Gini impurity) and fit it on the training split
clf = DecisionTreeClassifier(criterion='gini').fit(train_features, train_labels)
# Predict the held-out samples with the fitted tree
test_predict = clf.predict(test_features)
# Compare the predictions with the true test labels
score = accuracy_score(test_labels, test_predict)
print("CART 分类树准确率%.4lf" % score)

CART 分类树准确率0.9600


In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.tree import DecisionTreeRegressor
# Load the Boston housing dataset
# NOTE(review): load_boston appears to be deprecated/removed in newer scikit-learn
# releases — confirm the installed version still provides it
boston = load_boston()

In [3]:
# Explore the dataset: print the names of its feature columns
print(boston.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [4]:
# Split the dataset into the feature matrix and the house-price target
features, prices = boston.data, boston.target

In [5]:
# Randomly hold out 33% of the data as the test set; the rest is the training set.
# random_state is fixed (as in the iris split) so the split, and therefore the
# reported error metrics, are reproducible across kernel restarts.
train_features, test_features, train_price, test_price = train_test_split(
    features, prices, test_size=0.33, random_state=0)

In [6]:
# Build a CART regression tree and fit it on the training split
dtr = DecisionTreeRegressor().fit(train_features, train_price)
# Predict house prices for the test split
predict_price = dtr.predict(test_features)
# Report each error metric on the test split: mean squared error, then mean absolute error
for label, metric in (('回归树二乘偏差均值;', mean_squared_error),
                      ('回归树绝对值偏差均值:', mean_absolute_error)):
    print(label, metric(test_price, predict_price))

回归树二乘偏差均值; 30.54616766467066
回归树绝对值偏差均值: 3.612574850299401


# titanic

## 数据探索

In [7]:
import pandas as pd

In [8]:
# Load the Titanic training and test CSV files.
# The data directory is defined once and defaults to the original local path; set the
# TITANIC_DATA_DIR environment variable to run the notebook against another location.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR',
                               'C:\\Users\\tanling\\deeplearning\\Titanic_Data'))
train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')

In [9]:
# Data exploration: column overview, numeric summary, summary of the object (string)
# columns, then the first and last rows, each separated by a dashed line.
separator = '-' * 30
# info() writes its report itself and returns None, which print() then shows as "None"
print(train_data.info())
for summary in (train_data.describe(),
                train_data.describe(include=['O']),
                train_data.head(),
                train_data.tail()):
    print(separator)
    print(summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
------------------------------
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.00000

## 数据清洗

In [10]:
# Impute missing values with statistics computed on the training set only, so the
# test set is filled with the same values the model is trained against.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that form is chained assignment
# and does not update the DataFrame when pandas Copy-on-Write is active.

# Fill missing ages with the mean training age
age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(age_mean)
test_data['Age'] = test_data['Age'].fillna(age_mean)
# Fill missing fares with the mean training fare
fare_mean = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(fare_mean)
test_data['Fare'] = test_data['Fare'].fillna(fare_mean)

In [11]:
# Count how many training passengers embarked at each port, to find the most common one
print(train_data['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [12]:
# Fill missing embarkation ports with the most frequent port in the training set
# ('S' for this dataset, per the value counts of train_data['Embarked']).
# The result is assigned back rather than using fillna(..., inplace=True) on a column
# selection, which does not update the DataFrame under pandas Copy-on-Write.
most_common_port = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port)
test_data['Embarked'] = test_data['Embarked'].fillna(most_common_port)

## 特征选择

In [13]:
# Feature selection: the columns used as model inputs; 'Survived' is the target label
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features, test_features = train_data[features], test_data[features]
train_labels = train_data['Survived']

In [14]:
from sklearn.feature_extraction import DictVectorizer
# Turn each training row into a dict and vectorize it into a dense numeric matrix.
# orient='records' is the full spelling; the abbreviation 'record' is rejected by
# current pandas releases.
# The rows are read from train_data[features] rather than from train_features, so
# re-running this cell does not fail after train_features has become an array.
dvec = DictVectorizer(sparse=False)
train_features = dvec.fit_transform(train_data[features].to_dict(orient='records'))

In [15]:
# Show the feature names produced by the fitted DictVectorizer
# (string-valued columns appear as separate 'column=value' features)
print(dvec.feature_names_)

['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


## 决策树模型

In [16]:
from sklearn.tree import DecisionTreeClassifier
# Build a decision tree that splits on the entropy (information gain) criterion
clf = DecisionTreeClassifier(criterion='entropy')
# Train the tree; the fitted estimator is the cell's last expression, so it is displayed
clf.fit(X=train_features, y=train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## 模型评估、预测

In [17]:
# Vectorize the test rows with the already-fitted DictVectorizer (transform only, no
# refit), so the columns line up with the training matrix.
# orient='records' is the full spelling; the abbreviation 'record' is rejected by
# current pandas releases.
# The rows are read from test_data[features] rather than from test_features, so
# re-running this cell does not fail after test_features has become an array.
test_features = dvec.transform(test_data[features].to_dict(orient='records'))
# Predict survival labels for the test set with the fitted decision tree
pred_labels = clf.predict(test_features)

In [18]:
# Accuracy of the decision tree measured on the same training data it was fitted on
train_accuracy = clf.score(train_features, train_labels)
acc_decision_tree = round(train_accuracy, 6)
print(u'score 准确率为 %.4lf' % acc_decision_tree)

score 准确率为 0.9820


In [19]:
import numpy as np
from sklearn.model_selection import cross_val_score
# Estimate the tree's accuracy with 10-fold cross-validation on the training data
# and report the mean score over the folds
fold_scores = cross_val_score(clf, train_features, train_labels, cv=10)
print(u'cross_val_score 准确率为 %.4lf' % np.mean(fold_scores))

cross_val_score 准确率为 0.7813


## 决策树可视化

In [20]:
import graphviz

In [24]:
from sklearn import tree
import graphviz
# Export the fitted tree as Graphviz DOT source. Passing the vectorizer's feature
# names labels each split with the real column (e.g. 'Sex=female') instead of X[i].
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=dvec.feature_names_)
graph = graphviz.Source(dot_data)
# Render the graph to disk under the name "tree"
graph.render("tree")
# Render again under the name 'graph' and open it in the default viewer;
# the returned output path is the cell's displayed value
graph.view('graph')

'graph.pdf'