In [None]:
# Bunch对象 的简要描述
"""
在真正进入数据集学习之前需要先认识一个特别的数据结构: Bunch对象

Bunch 是 sklearn.utils 模块中定义的一个类，继承自 Python 的 dict
可以将 Bunch对象 理解为做了功能扩展的字典
"""

# Bunch对象 的设计目的，及其优势
"""
Bunch 对象是 scikit-learn 中专门设计的数据容器


支持点号属性访问：
    from sklearn.datasets import load_iris
    iris = load_iris()
    iris.data = value
    value = iris.data


数据科学优化：预定义标准键名，形成约定俗成的数据结构
    {
        'data':         ndarray,        # 特征矩阵 (n_samples, n_features)
        'target':       ndarray,        # 标签数组 (n_samples,)
        'feature_names':list,           # 特征名称列表
        'target_names': list,           # 类别名称列表
        'DESCR':        str,            # 数据集描述文本
        'filename':	    str,            # 数据集文件名 (scikit-learn < 1.2时返回完整文件路径)
        'frame':        pd.DataFrame,	# 仅当 as_frame=True 时为包含特征与标签的 Pandas DataFrame，否则为 None（0.23 版本新增）
        'data_module':	str	            # 数据集来源模块
    }
"""


# 在 scikit-learn 中 Bunch对象 的使用
"""
通过 load_*（玩具数据集）和 fetch_*（真实世界数据集）加载数据时，默认返回 Bunch 对象（若传入 return_X_y=True，则改为返回 (data, target) 元组）

    玩具数据集（load_* 函数），例如: load_iris(), load_digits(), load_wine() 等
        iris = load_iris()
        digits = load_digits()
        wine = load_wine()
        # 上面这三行代码都返回 Bunch 对象
    
    下载型数据集（fetch_* 函数），例如：fetch_california_housing(), fetch_20newsgroups() 等
        california_housing = fetch_california_housing()
        newsgroups = fetch_20newsgroups()
        # 上面两行代码都返回 Bunch 对象
"""

In [None]:
# 加载本地玩具数据集
from sklearn.datasets import load_iris

# Load the bundled iris dataset. The result is a Bunch: a dict subclass whose
# entries can be read either as keys (iris["data"]) or attributes (iris.data).
iris = load_iris()
print(iris.keys())  # names of every field stored in the Bunch


# Feature matrix: a 2-D numpy.ndarray with one row per sample and one column
# per feature. Only the first five samples are shown.
data = iris["data"]
print(data[:5])


# Label array: integers only; the i-th entry is the class of the i-th row of
# `data`, and each integer indexes into target_names.
print(iris["target"])


# Feature names (sepal length, sepal width, petal length, petal width) followed
# by the class names: setosa, versicolor, virginica (Virginia iris).
print(iris["feature_names"], iris["target_names"], sep="\n")


# File name of the underlying data file, then the full dataset description.
print(iris["filename"], iris["DESCR"], sep="\n")

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
iris.csv
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

        

In [None]:
# 加载联网下载的数据集
from sklearn.datasets import fetch_20newsgroups
from sklearn import datasets
# Tip: datasets.get_data_home() reports the default download/cache directory
# (normally a "scikit_learn_data" folder inside the user's home directory).
# Here the cache location is overridden with data_home so the files are kept
# under ./src instead.

# Load the training split of the 20 newsgroups corpus (fetched over the
# network when it is not already cached in data_home).
news = fetch_20newsgroups(subset="train", data_home="./src")

# Printed in order, one item per line: number of documents, the first four raw
# posts, the integer label of every post, and the newsgroup names.
print(
    len(news.data),
    news.data[:4],
    news.target,
    news.target_names,
    sep="\n",
)

11314
["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLi

In [4]:
# 加载自己的数据
import pandas as pd
# Read the CSV file and convert it straight to a NumPy array; to_numpy() keeps
# only the cell values (column labels and the row index are dropped).
data = pd.read_csv("./src/ss.csv").to_numpy()
print(data)

# Read the Excel workbook; here `data` is rebound to a DataFrame, so the
# printout includes the column headers and the row index.
data = pd.read_excel("./src/data.xlsx")
print(data)

[['小王' 23 '女']
 ['小李' 34 '  男']
 ['小陶' 40 ' 32']]
    语文  数学  英语   综合
0  123  34  44  123
1   33  44  33  234
2   33  55  66  190
