### sklearn库之iris-------三分类问题

In [45]:
# 数据加载
from sklearn.datasets import load_iris
import seaborn as sb
iris = load_iris()
print("iris.keys(): \n{}".format(iris.keys()))
print("Shape of iris data: {}".format(iris.data.shape))
print("Shape of iris data: {}".format(iris.target.shape))
print(iris.data[0])
print(iris.target[0])
print(iris.target)

iris.keys(): 
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
Shape of iris data: (150, 4)
Shape of iris data: (150,)
[ 5.1  3.5  1.4  0.2]
0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [46]:
# 模型加载
from sklearn.neighbors import KNeighborsClassifier
# 数据预处理
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 4)
(112,)
(38, 4)
(38,)


### PCA主成分分析

In [60]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(iris.data)
print (pca.explained_variance_ratio_)
X_new = pca.transform(iris.data)
print(X_new.shape)
print(iris.target.shape)
x_train,x_test,y_train,y_test = train_test_split(X_new,iris.target,test_size=0.25,random_state=0)
print(x_train.shape,y_train.shape)
knn_0 = KNeighborsClassifier()
knn_0.fit(x_train,y_train)
print("训练集精度：",knn_0.score(x_train,y_train))
print("测试集精度：",knn_0.score(x_test,y_test))

[ 0.92461621  0.05301557]
(150, 2)
(150,)
(112, 2) (112,)
训练集精度： 0.973214285714
测试集精度： 0.973684210526


In [3]:
knn = KNeighborsClassifier()
print(knn)
knn.fit(X_train,y_train)
# 输出训练集和测试机的预测结果
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.973214285714
0.973684210526


### 用sklearn中的神经网络框架再写

In [4]:
from sklearn.neural_network import MLPClassifier

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 4)
(112,)
(38, 4)
(38,)


In [6]:
net = MLPClassifier(hidden_layer_sizes=(100,20,90))
net.fit(X_train,y_train)
print(net.score(X_train,y_train))
print(net.score(X_test,y_test))
print(net.n_layers_)

0.991071428571
0.973684210526
5




### 利用pytorch编写神经网络解决iris数据分类问题

In [7]:
import numpy as np
from collections import Counter
from sklearn import datasets
import torch.nn.functional as Fun
from torch.autograd import Variable
import matplotlib.pyplot as plt
import torch
# 数据预处理
from sklearn.model_selection import train_test_split


#数据准备
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
X_train=torch.FloatTensor(X_train)
y_train=torch.LongTensor(y_train)

#定义BP神经网络
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = Fun.relu(self.hidden(x))      # activation function for hidden layer we choose sigmoid
        x = self.out(x)
        return x

net = Net(n_feature=4, n_hidden=60, n_output=3)
optimizer = torch.optim.SGD(net.parameters(), lr=0.02) #SGD: 随机梯度下降
loss_func = torch.nn.CrossEntropyLoss() #针对分类问题的损失函数![在这里插入图片描述](https://img-blog.csdnimg.cn/20190108120127973.png)

#训练数据
for t in range(10000):
    out = net(X_train)                 # input x and predict based on x
    loss = loss_func(out, y_train)     # 输出与label对比
    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

out = net(X_train) #out是一个计算矩阵，可以用Fun.softmax(out)转化为概率矩阵
prediction = torch.max(out, 1)[1] # 1返回index  0返回原值
pred_y = prediction.data.numpy()
target_y = y_train.data.numpy()
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print("莺尾花训练集预测准确率",accuracy)
X_test=torch.FloatTensor(X_test)
y_test=torch.LongTensor(y_test)
out = net(X_test) #out是一个计算矩阵，可以用Fun.softmax(out)转化为概率矩阵
prediction = torch.max(out, 1)[1] # 1返回index  0返回原值
pred_y = prediction.data.numpy()
target_y = y_test.data.numpy()
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print("莺尾花训练集预测准确率",accuracy)

莺尾花训练集预测准确率 0.9821428571428571
莺尾花训练集预测准确率 0.9736842105263158


### 决策树

In [8]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_leaf=3)
dt.fit(X_train,y_train)
print("训练集精度：",dt.score(X_train,y_train))
print("测试集精度：",dt.score(X_test,y_test))
print(dt)

训练集精度： 0.982142857143
测试集精度： 0.973684210526
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [9]:
# 绘制决策树
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydotplus
from io import StringIO
dot_data = StringIO()

export_graphviz(dt, out_file=dot_data,  # 绘制决策树
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True, rounded=True,
    special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")

True

### 随机森林

In [10]:
from sklearn.ensemble import RandomForestClassifier
rand_tree = RandomForestClassifier(max_depth=4,min_samples_leaf=5)
rand_tree.fit(X_train,y_train)
print("训练集精度：",rand_tree.score(X_train,y_train))
print("测试集精度：",rand_tree.score(X_test,y_test))
print(rand_tree)

训练集精度： 0.982142857143
测试集精度： 0.973684210526
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### SVM

In [44]:
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=42)
svm_classifier = svm.SVC(C=10, kernel='rbf', decision_function_shape='ovr', gamma=0.1)# C是惩罚项（c越大，误差越小，但是容易发生过拟合）
svm_classifier.fit(X_train, y_train)
print("训练集精度：",svm_classifier.score(X_train,y_train))
print("测试集精度：",svm_classifier.score(X_test,y_test))

训练集精度： 0.980952380952
测试集精度： 1.0
