In [2]:
import time

from sklearn.datasets import load_iris, fetch_20newsgroups, fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [3]:
# Iris dataset: inspect the features, the targets and the sample count.

li = load_iris()

# Feature matrix: container type, first three rows, then the overall shape.
print("获取特征值")
print(type(li.data), '-' * 50, sep='\n')
print(li.data[:3], '-' * 50, sep='\n')
print(li.data.shape)  # 150 samples x 4 features; shape is usually the first thing to check

# Target vector, followed by the description text bundled with the dataset.
print("目标值")
print(li.target, '-' * 50, sep='\n')
print(li.DESCR, '-' * 50, sep='\n')

# Feature names (important) and the names of the target classes.
print(li.feature_names, '-' * 50, sep='\n')
print(li.target_names, '-' * 50, sep='\n')

获取特征值
<class 'numpy.ndarray'>
--------------------------------------------------
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]
--------------------------------------------------
(150, 4)
目标值
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
--------------------------------------------------
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Ve

In [4]:
# train_test_split returns, in this exact order: x_train, x_test, y_train, y_test
# (features before targets, train before test) -- never mix the order up.
# Rows are shuffled by default; a fixed random_state makes repeated runs use the
# same shuffling and therefore yield the same split, so it is normally passed.
x_train, x_test, y_train, y_test = train_test_split(
    li.data,
    li.target,
    random_state=1,
    test_size=0.25,
)

# Same two reports for each half of the split: the raw values, then the shape.
for caption, shape_caption, features, targets in (
    ("训练集特征值和目标值：", "训练集特征值shape", x_train, y_train),
    ("测试集特征值和目标值：", "测试集特征值shape", x_test, y_test),
):
    print(caption, features, targets)
    print(shape_caption, features.shape)

训练集特征值和目标值： [[6.5 2.8 4.6 1.5]
 [6.7 2.5 5.8 1.8]
 [6.8 3.  5.5 2.1]
 [5.1 3.5 1.4 0.3]
 [6.  2.2 5.  1.5]
 [6.3 2.9 5.6 1.8]
 [6.6 2.9 4.6 1.3]
 [7.7 2.6 6.9 2.3]
 [5.7 3.8 1.7 0.3]
 [5.  3.6 1.4 0.2]
 [4.8 3.  1.4 0.3]
 [5.2 2.7 3.9 1.4]
 [5.1 3.4 1.5 0.2]
 [5.5 3.5 1.3 0.2]
 [7.7 3.8 6.7 2.2]
 [6.9 3.1 5.4 2.1]
 [7.3 2.9 6.3 1.8]
 [6.4 2.8 5.6 2.2]
 [6.2 2.8 4.8 1.8]
 [6.  3.4 4.5 1.6]
 [7.7 2.8 6.7 2. ]
 [5.7 3.  4.2 1.2]
 [4.8 3.4 1.6 0.2]
 [5.7 2.5 5.  2. ]
 [6.3 2.7 4.9 1.8]
 [4.8 3.  1.4 0.1]
 [4.7 3.2 1.3 0.2]
 [6.5 3.  5.8 2.2]
 [4.6 3.4 1.4 0.3]
 [6.1 3.  4.9 1.8]
 [6.5 3.2 5.1 2. ]
 [6.7 3.1 4.4 1.4]
 [5.7 2.8 4.5 1.3]
 [6.7 3.3 5.7 2.5]
 [6.  3.  4.8 1.8]
 [5.1 3.8 1.6 0.2]
 [6.  2.2 4.  1. ]
 [6.4 2.9 4.3 1.3]
 [6.5 3.  5.5 1.8]
 [5.  2.3 3.3 1. ]
 [6.3 3.3 6.  2.5]
 [5.5 2.5 4.  1.3]
 [5.4 3.7 1.5 0.2]
 [4.9 3.1 1.5 0.2]
 [5.2 4.1 1.5 0.1]
 [6.7 3.3 5.7 2.1]
 [4.4 3.  1.3 0.2]
 [6.  2.7 5.1 1.6]
 [6.4 2.7 5.3 1.9]
 [5.9 3.  5.1 1.8]
 [5.2 3.5 1.5 0.2]
 [5.1 3.3 1.7 0.5]


In [5]:
# The 20-newsgroups corpus (20 news categories) is fairly large, so the
# download takes a while.
# `subset` selects which split is fetched; the default is 'train' (training split only).
news = fetch_20newsgroups(data_home='data', subset='all')
# Note from the original author: this dataset has no feature_names attribute.

# First raw document and the container type holding the documents.
print('第一个样本')
print(news.data[0])
print('特征类型')
print(type(news.data), '-' * 50, sep='\n')

# Label of the first document, then the number of documents.
print(news.target[0], '-' * 50, sep='\n')
print(len(news.data))

# All labels, followed by the smallest and largest label value.
print('新闻所有的标签')
print(news.target, '-' * 50, sep='\n')
print(min(news.target), max(news.target))

第一个样本
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


特征类型
<class 'list'>
--------------------------------------------------
10
-----------------

In [6]:
# California housing dataset, cached under ./data.
house = fetch_california_housing(data_home='data')

print("获取特征值")
print(house.data[0])  # feature values of the first sample
print('样本的形状')
print(house.data.shape, '-' * 50, sep='\n')

# Targets, the bundled description text, then the feature names.
print("目标值")
print(house.target, '-' * 50, sep='\n')
print(house.DESCR, '-' * 50, sep='\n')
print(house.feature_names, '-' * 50, sep='\n')

获取特征值
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
样本的形状
(20640, 8)
--------------------------------------------------
目标值
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
--------------------------------------------------
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: 

In [7]:
# K-nearest neighbours: predict the place a user checks in at.

# Load the raw check-in records.
# Raw string literal: the Windows path contains the sequences \w, \d and \F,
# which are invalid escape sequences in a normal string (SyntaxWarning on
# recent Python versions). The resulting path text is unchanged.
data = pd.read_csv(r"E:\wangdao training camp\datasites\data\FBlocation/train.csv")

print(data.head(10))
print(data.shape)

# Shrink the data to a small x/y window to cut computation time.
# .copy() makes the subset an independent frame, so adding columns to it in
# later cells does not operate on a slice of the full table.
data = data.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75").copy()

# Interpret the raw `time` column as seconds since the Unix epoch.
time_value = pd.to_datetime(data['time'], unit='s')

print(time_value)  # the latest timestamp falls on 10 January

   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
5       5  3.8099  1.9586        75  178065  6289802927
6       6  6.3336  4.3720        13  666829  9931249544
7       7  5.7409  6.7697        85  369002  5662813655
8       8  4.3114  6.9410         3  166384  8471780938
9       9  6.3414  0.0758        65  400060  1253803156
(29118021, 6)
600        1970-01-01 18:09:40
957        1970-01-10 02:11:10
4345       1970-01-05 15:08:02
4735       1970-01-06 23:03:03
5580       1970-01-09 11:26:50
                   ...        
29100203   1970-01-01 10:33:56
29108443   1970-01-07 23:22:04
29109993   1970-01-08 15:03:14
29111539   1970-01-04 00:53:41
29112154   1970-01-08 23:01:07
Name: time, Length: 17710, dt

In [8]:
# Wrap the timestamps in a DatetimeIndex; later cells read calendar fields
# such as .day, .hour and .weekday from it.
time_value = pd.DatetimeIndex(time_value)

print('-' * 50, time_value, sep='\n')

--------------------------------------------------
DatetimeIndex(['1970-01-01 18:09:40', '1970-01-10 02:11:10',
               '1970-01-05 15:08:02', '1970-01-06 23:03:03',
               '1970-01-09 11:26:50', '1970-01-02 16:25:07',
               '1970-01-04 15:52:57', '1970-01-01 10:13:36',
               '1970-01-09 15:26:06', '1970-01-08 23:52:02',
               ...
               '1970-01-07 10:03:36', '1970-01-09 11:44:34',
               '1970-01-04 08:07:44', '1970-01-04 15:47:47',
               '1970-01-08 01:24:11', '1970-01-01 10:33:56',
               '1970-01-07 23:22:04', '1970-01-08 15:03:14',
               '1970-01-04 00:53:41', '1970-01-08 23:01:07'],
              dtype='datetime64[ns]', name='time', length=17710, freq=None)


In [9]:
print('-' * 50)
print(type(data))

# Derive calendar features from the timestamps: day of month, hour and weekday.
# Original author's rationale: the date, weekend-or-not and the hour strongly
# influence personal behaviour.
# insert() is used instead of `data['day'] = ...`; per the original note, plain
# item assignment triggered a pandas warning here because `data` is a filtered copy.
# The guard keeps the cell re-runnable: once 'time' has been dropped the
# features already exist, and inserting an existing column raises ValueError.
if 'time' in data.columns:
    for column, values in (
        ('day', time_value.day),
        ('hour', time_value.hour),
        ('weekday', time_value.weekday),
    ):
        # data.shape[1] is the current column count, i.e. append as the last column.
        data.insert(data.shape[1], column, values)

    # The raw timestamp column is dropped once the features are extracted.
    data = data.drop(['time'], axis=1)

print('-' * 50)
data

--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
--------------------------------------------------


Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
600,600,1.2214,2.7023,17,6683426742,1,18,3
957,957,1.1832,2.6891,58,6683426742,10,2,5
4345,4345,1.1935,2.6550,11,6889790653,5,15,0
4735,4735,1.1452,2.6074,49,6822359752,6,23,1
5580,5580,1.0089,2.7287,19,1527921905,9,11,4
...,...,...,...,...,...,...,...,...
29100203,29100203,1.0129,2.6775,12,3312463746,1,10,3
29108443,29108443,1.1474,2.6840,36,3533177779,7,23,2
29109993,29109993,1.0240,2.7238,62,6424972551,8,15,3
29111539,29111539,1.2032,2.6796,87,3533177779,4,0,6


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()


Unnamed: 0,row_id,x,y,accuracy,place_id,day,hour,weekday
count,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0,17710.0
mean,14505690.0,1.122538,2.632309,82.482101,5129895000.0,5.101863,11.485545,3.092377
std,8353805.0,0.077086,0.070144,113.613227,2357399000.0,2.709287,6.932195,1.680218
min,600.0,1.0001,2.5001,1.0,1012024000.0,1.0,0.0,0.0
25%,7327816.0,1.0492,2.5738,25.0,3312464000.0,3.0,6.0,2.0
50%,14430710.0,1.1233,2.6423,62.0,5261906000.0,5.0,12.0,3.0
75%,21634630.0,1.1905,2.6878,75.0,6766325000.0,7.0,17.0,4.0
max,29112150.0,1.2499,2.7499,1004.0,9980711000.0,10.0,23.0,6.0


In [11]:
# Count the check-ins recorded for each place. place_id is the label (target);
# places with fewer than n check-ins are meant to be removed.
place_count = data.groupby(by='place_id').count()
place_count

Unnamed: 0_level_0,row_id,x,y,accuracy,day,hour,weekday
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1012023972,1,1,1,1,1,1,1
1057182134,1,1,1,1,1,1,1
1059958036,3,3,3,3,3,3,3
1085266789,1,1,1,1,1,1,1
1097200869,1044,1044,1044,1044,1044,1044,1044
...,...,...,...,...,...,...,...
9904182060,1,1,1,1,1,1,1
9915093501,1,1,1,1,1,1,1
9946198589,1,1,1,1,1,1,1
9950190890,1,1,1,1,1,1,1


In [12]:
# Distribution of the per-place check-in counts, read from the 'x' count column.
place_count['x'].describe()

count     805.000000
mean       22.000000
std        88.955632
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max      1044.000000
Name: x, dtype: float64

In [13]:
# Keep only the places with more than 3 check-ins; places visited 1-3 times
# are treated as noise and are not worth recommending to other users.
# reset_index() moves the place_id index back into a regular column and
# renumbers the rows 0, 1, 2, ...
tf = place_count.loc[place_count['row_id'] > 3].reset_index()
tf  # the remaining check-in places

Unnamed: 0,place_id,row_id,x,y,accuracy,day,hour,weekday
0,1097200869,1044,1044,1044,1044,1044,1044,1044
1,1228935308,120,120,120,120,120,120,120
2,1267801529,58,58,58,58,58,58,58
3,1278040507,15,15,15,15,15,15,15
4,1285051622,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...
234,9741307878,5,5,5,5,5,5,5
235,9753855529,21,21,21,21,21,21,21
236,9806043737,6,6,6,6,6,6,6
237,9809476069,23,23,23,23,23,23,23


In [14]:
# Keep only the samples whose place_id belongs to the retained places.
# isin() tests every value of a column for membership in a set of values.
data = data.loc[data['place_id'].isin(tf['place_id'])]
data.shape

(16918, 8)

In [15]:
# Separate the target (place_id) from the feature columns.
y = data['place_id']
# Features: everything except the target and row_id, which the original
# author treats as a useless feature.
x = data.drop(columns=['place_id', 'row_id'])
print(x.shape, x.columns, sep='\n')

(16918, 6)
Index(['x', 'y', 'accuracy', 'day', 'hour', 'weekday'], dtype='object')


In [16]:
# Split the samples into a training set and a test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=1, test_size=0.25
)

# Feature engineering: standardisation, done for the benefit of the KNN fit.
# Teaching note from the original: first run without standardisation to compare
# the results, and consider whether the target needs standardising at all.
std = StandardScaler()

# Fit on the training features; this computes the per-feature mean and variance.
x_train = std.fit_transform(x_train)
print(std.mean_, std.var_, sep='\n')

# transform() does not recompute the mean/variance; it standardises with the
# statistics learned above. It returns a copy rather than modifying its input.
x_test = std.transform(x_test)
print('-' * 50)
# Printed again to show the fitted statistics are unchanged by transform().
print(std.mean_, std.var_, sep='\n')

[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]
--------------------------------------------------
[ 1.12295735  2.63237278 81.34938525  5.10064628 11.44293821  3.10135561]
[5.98489138e-03 4.86857391e-03 1.19597480e+04 7.32837915e+00
 4.83742660e+01 2.81838404e+00]


In [17]:
# Build and train the classifier. n_neighbors is a hyper-parameter; changing
# it (e.g. n_neighbors=5) changes how good the result is.
knn = KNeighborsClassifier(n_neighbors=6).fit(x_train, y_train)

# Predict the check-in place for every test sample.
y_predict = knn.predict(x_test)
print("预测的目标签到位置为：", y_predict)

# Accuracy on the test set.
print("预测的准确率:", knn.score(x_test, y_test))


预测的目标签到位置为： [5689129232 1097200869 2355236719 ... 4932578245 6424972551 5095999304]
预测的准确率: 0.484160756501182


In [18]:
# Scratch arithmetic: sum of 1, 11, 111, 1111 and 11111.
1+11+111+1111+11111

12345

In [19]:
# Scratch arithmetic: sum of 2, 22, 222, 2222 and 22222.
2+22+222+2222+22222

24690

In [20]:
# Scratch arithmetic: remainder of 20 divided by 2.
20%2

0

In [21]:
# Grid search (explained in the grid-search part of the course).
# Candidate hyper-parameter values to search over.
param = {
    "n_neighbors": [3, 5, 10, 12, 15],
    'weights': ['uniform', 'distance'],
}

# cv=3 means 3-fold cross-validation: two folds train, one fold validates.
gc = GridSearchCV(estimator=knn, param_grid=param, cv=3)

# The x_train passed in is split again internally into training and validation parts.
gc.fit(x_train, y_train)

# Test-set accuracy, shown for illustration.
print("在测试集上准确率：", gc.score(x_test, y_test))
print("在交叉验证当中最好的结果：", gc.best_score_)
print("选择最好的模型是：", gc.best_estimator_)
print("每个超参数每次交叉验证的结果：", gc.cv_results_)
gc.cv_results_



在测试集上准确率： 0.49763593380614657
在交叉验证当中最好的结果： 0.4816362349278435
选择最好的模型是： KNeighborsClassifier(n_neighbors=12, weights='distance')
每个超参数每次交叉验证的结果： {'mean_fit_time': array([0.01595179, 0.02326202, 0.0219396 , 0.01561419, 0.01595553,
       0.02359192, 0.01593741, 0.02925976, 0.02061264, 0.01628947]), 'std_fit_time': array([0.00281311, 0.00554443, 0.00373047, 0.00308823, 0.00215358,
       0.00972584, 0.00162434, 0.00448077, 0.00047251, 0.00261661]), 'mean_score_time': array([0.3530577 , 0.10073423, 0.30618683, 0.11172144, 0.33277933,
       0.15959978, 0.33444269, 0.16555214, 0.40823952, 0.26097822]), 'std_score_time': array([0.05894163, 0.00081702, 0.01589575, 0.00292262, 0.03692464,
       0.005719  , 0.01298632, 0.00406642, 0.02496825, 0.07871672]), 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10, 12, 12, 15, 15],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'pa

{'mean_fit_time': array([0.01595179, 0.02326202, 0.0219396 , 0.01561419, 0.01595553,
        0.02359192, 0.01593741, 0.02925976, 0.02061264, 0.01628947]),
 'std_fit_time': array([0.00281311, 0.00554443, 0.00373047, 0.00308823, 0.00215358,
        0.00972584, 0.00162434, 0.00448077, 0.00047251, 0.00261661]),
 'mean_score_time': array([0.3530577 , 0.10073423, 0.30618683, 0.11172144, 0.33277933,
        0.15959978, 0.33444269, 0.16555214, 0.40823952, 0.26097822]),
 'std_score_time': array([0.05894163, 0.00081702, 0.01589575, 0.00292262, 0.03692464,
        0.005719  , 0.01298632, 0.00406642, 0.02496825, 0.07871672]),
 'param_n_neighbors': masked_array(data=[3, 3, 5, 5, 10, 10, 12, 12, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                    'uniform', 'distance', 'uniform', 

In [22]:
"""
朴素贝叶斯进行文本分类
:return: None
"""
news = fetch_20newsgroups(data_home='data', subset='all')

print(len(news.data), '-' * 50, sep='\n')  # number of samples
print(news.data[0], '-' * 50, sep='\n')  # the first sample (raw text used as features)
print(news.target)  # labels
print(np.unique(news.target))
print(news.target_names)

18846
--------------------------------------------------
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


----------------------------------------

In [23]:
print('-'*50)
# Split the corpus into training and test documents.
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=1)

# Feature extraction: TF-IDF weighting.
tf = TfidfVectorizer()

# Learn the word list from the training documents and compute how important
# each word is in each article.
x_train = tf.fit_transform(x_train)

# Fetch the vocabulary once and reuse it for every look-up below, instead of
# calling get_feature_names_out() again for each print.
tf_feature_names = tf.get_feature_names_out()

print(len(tf_feature_names))
print(tf_feature_names[70000])
print(tf_feature_names[0])

--------------------------------------------------
153196
glen
00


In [24]:
# Peek at two slices of the learned vocabulary: the first ten terms and the
# terms at positions 70000-79999.
vocabulary = tf.get_feature_names_out()
print(vocabulary[:10], vocabulary[70000:80000], sep='\n')


['00' '000' '0000' '00000' '0000000004' '0000000005' '0000000667'
 '0000001200' '000003' '000005102000']
['glen' 'glen_fullmer' 'glen_murray' ... 'intermetrics' 'intermidiate'
 'interminable']


In [None]:
# `time` is already imported in the notebook's first cell.
start = time.time()

print(len(tf.get_feature_names_out()))
# Multinomial naive Bayes. alpha is the Laplace smoothing coefficient: a
# constant is added to numerator and denominator (the denominator gets
# alpha * number of feature words).
mlt = MultinomialNB(alpha=1.0)

# Show only the first few rows in dense form. Densifying the whole TF-IDF
# matrix (about 14,000 documents x 153,196 terms, on the order of 17 GB as
# float64) would exhaust memory, so only a small slice is converted.
print(x_train[:5].toarray())
# Train the model.
mlt.fit(x_train, y_train)
end = time.time()
end - start

153196


In [None]:
from sklearn.datasets import load_iris

# Load the iris dataset. The returned object is a Bunch, a dict subclass;
# its entries are read below both by key and as attributes.
iris = load_iris()

for caption, value in (
    ("鸢尾花数据集的返回值：\n", iris),
    ("鸢尾花的特征值:\n", iris["data"]),
    ("鸢尾花的目标值：\n", iris.target),
    ("鸢尾花特征的名字：\n", iris.feature_names),
    ("鸢尾花目标值的名字：\n", iris.target_names),
    ("鸢尾花的描述：\n", iris.DESCR),
):
    print(caption, value)

In [None]:
%matplotlib inline
# 内嵌绘图
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Convert the feature matrix into a DataFrame and attach the class label
# as a 'Species' column.
iris_d = pd.DataFrame(
    data=iris['data'],
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
).assign(Species=iris.target)

def plot_iris(iris, col1, col2):
    """Scatter-plot two columns of the iris frame, coloured by species.

    Args:
        iris: Table passed to seaborn as ``data``; it must contain a
            "Species" column in addition to the two plotted columns.
        col1: Name of the column drawn on the x axis.
        col2: Name of the column drawn on the y axis.
    """
    # fit_reg=False: draw the points only, without a fitted regression line.
    sns.lmplot(data=iris, x=col1, y=col2, hue="Species", fit_reg=False)
    plt.title('鸢尾花种类分布图')
    plt.ylabel(col2)
    plt.xlabel(col1)
    plt.show()


plot_iris(iris_d, 'Petal_Width', 'Sepal_Length')

In [None]:
# Scratch arithmetic cell.
1+1
