### sklearn库之iris-------三分类问题

In [5]:
# 数据加载
from sklearn.datasets import load_iris
import seaborn as sb
iris = load_iris()
print("iris.keys(): \n{}".format(iris.keys()))
print("Shape of iris data: {}".format(iris.data.shape))
print("Shape of iris data: {}".format(iris.target.shape))
print(iris.data[0])
print(iris.target[0])
print(iris.target)

iris.keys(): 
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
Shape of iris data: (150, 4)
Shape of iris data: (150,)
[ 5.1  3.5  1.4  0.2]
0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [23]:
# 模型加载
from sklearn.neighbors import KNeighborsClassifier
# 数据预处理
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 4)
(112,)
(38, 4)
(38,)


In [24]:
knn = KNeighborsClassifier()
print(knn)
knn.fit(X_train,y_train)
# 输出训练集和测试机的预测结果
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.973214285714
0.973684210526


### 用sklearn中的神经网络框架再写

In [6]:
from sklearn.neural_network import MLPClassifier

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 4)
(112,)
(38, 4)
(38,)


In [18]:
net = MLPClassifier(hidden_layer_sizes=(100,20,90))
net.fit(X_train,y_train)
print(net.score(X_train,y_train))
print(net.score(X_test,y_test))
print(net.n_layers_)

0.991071428571
0.973684210526
5




### 利用pytorch编写神经网络解决iris数据分类问题

In [10]:
import numpy as np
from collections import Counter
from sklearn import datasets
import torch.nn.functional as Fun
from torch.autograd import Variable
import matplotlib.pyplot as plt
import torch
# 数据预处理
from sklearn.model_selection import train_test_split


#数据准备
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
X_train=torch.FloatTensor(X_train)
y_train=torch.LongTensor(y_train)

#定义BP神经网络
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = Fun.relu(self.hidden(x))      # activation function for hidden layer we choose sigmoid
        x = self.out(x)
        return x

net = Net(n_feature=4, n_hidden=60, n_output=3)
optimizer = torch.optim.SGD(net.parameters(), lr=0.02) #SGD: 随机梯度下降
loss_func = torch.nn.CrossEntropyLoss() #针对分类问题的损失函数![在这里插入图片描述](https://img-blog.csdnimg.cn/20190108120127973.png)

#训练数据
for t in range(10000):
    out = net(X_train)                 # input x and predict based on x
    loss = loss_func(out, y_train)     # 输出与label对比
    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

out = net(X_train) #out是一个计算矩阵，可以用Fun.softmax(out)转化为概率矩阵
prediction = torch.max(out, 1)[1] # 1返回index  0返回原值
pred_y = prediction.data.numpy()
target_y = y_train.data.numpy()
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print("莺尾花训练集预测准确率",accuracy)
X_test=torch.FloatTensor(X_test)
y_test=torch.LongTensor(y_test)
out = net(X_test) #out是一个计算矩阵，可以用Fun.softmax(out)转化为概率矩阵
prediction = torch.max(out, 1)[1] # 1返回index  0返回原值
pred_y = prediction.data.numpy()
target_y = y_test.data.numpy()
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print("莺尾花训练集预测准确率",accuracy)

莺尾花训练集预测准确率 0.9821428571428571
莺尾花训练集预测准确率 0.9736842105263158


### 决策树

In [35]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion="gini")
tree.fit(X_train,y_train)
print("训练集精度：",tree.score(X_train,y_train))
print("测试集精度：",tree.score(X_test,y_test))
print(tree)

训练集精度： 1.0
测试集精度： 0.973684210526
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


### 随机森林

In [34]:
from sklearn.ensemble import RandomForestClassifier
rand_tree = RandomForestClassifier()
rand_tree.fit(X_train,y_train)
print("训练集精度：",rand_tree.score(X_train,y_train))
print("测试集精度：",rand_tree.score(X_test,y_test))
print(rand_tree)

训练集精度： 1.0
测试集精度： 0.973684210526
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [32]:
import graphviz
dot_data = tree.export_graphviz(clf
                                ,out_file=None
                                ,feature_names= iris.feature_names
                                ,class_names= iris.target_names
                                ,filled=True
                                ,rounded=True
                               )
graph = graphviz.Source(dot_data)
graph

AttributeError: 'DecisionTreeClassifier' object has no attribute 'export_graphviz'

In [1]:
#导入需要的算法库和模块
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

#探索数据
wine = load_wine()  
wine.data
wine.data.shape
wine.target
wine.target.shape

#如果wine是一张表，应该长这样：
import pandas as pd
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)

wine.feature_names
wine.target_names

#切分训练集和测试集
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)

Xtrain.shape
Xtest.shape

#建立模型
clf = tree.DecisionTreeClassifier()
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) #返回预测的准确度
score

#绘制树

import graphviz
import pydotplus
dot_data = StringIO()
feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸']
dot_data = tree.export_graphviz(clf
                                ,out_file=dot_data
                                ,feature_names= feature_name
                                ,class_names=["琴酒","雪莉","贝尔摩德"]
                                ,filled=True
                                ,rounded=True
                               )
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph = graphviz.Source(dot_data)

NameError: name 'StringIO' is not defined

In [4]:
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

In [5]:
wine = load_wine()
wine.data.shape#(178,13)
wine.target
#如果wine是一张表，应该长这样：
import pandas as pd
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
wine.feature_names
wine.target_names

array(['class_0', 'class_1', 'class_2'],
      dtype='<U7')

In [6]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
Xtrain.shape
Xtest.shape

(54, 13)

In [7]:
clf = tree.DecisionTreeClassifier(criterion="entropy")#实例化，criterion不写的话默认是基尼系数
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest) #返回预测的准确度
score

0.96296296296296291

In [8]:
feature_name = ['酒精','苹果酸','灰','灰的碱性','镁','总酚','类黄酮','非黄烷类酚类','花青素','颜色强度','色调','od280/od315稀释葡萄酒','脯氨酸']
 
import graphviz
dot_data = tree.export_graphviz(clf
                               # ,out_file = None
                               ,feature_names= feature_name
                               ,class_names=["琴酒","雪莉","贝尔摩德"]
                               ,filled=True#让树的每一块有颜色，颜色越浅，表示不纯度越高
                               ,rounded=True#树的块的形状
                               )
graph = graphviz.Source(dot_data)
graph.render("Tree")
graph# graph.view()



ExecutableNotFound: failed to execute ['dot', '-Tpdf', '-O', 'Tree'], make sure the Graphviz executables are on your systems' PATH