# sklearn examples of some algorithm

#### 数据标准化（Standardization）

常用的 __数据预处理__ 方法, 它会将每个特征（每一列）的数据转换为 均值为 0、方差为 1 的分布 （也就是标准正态分布）。<br>
这在很多机器学习算法中是很有用的预处理步骤，尤其是那些依赖于距离计算 的模型（如 __KNN、SVM、PCA、线性回归__ 等）。


In [16]:
from sklearn.preprocessing import StandardScaler

# Three samples, two features per sample (both columns hold the same values).
X = [[0, 0], [1, 1], [2, 2]]

# Learn the per-column statistics from X, then rescale X with them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Each column of the printed result has mean 0 and variance 1.
print(X_scaled)

[[-1.22474487 -1.22474487]
 [ 0.          0.        ]
 [ 1.22474487  1.22474487]]


在机器学习流程中的使用位置,通常放在训练模型之前：
```python
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Split the already-loaded features X and labels y (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training set only, so that no statistics from the
# test set take part in the preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the same learned transform to both sets (never re-fit on test data)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model on the standardized training data
model = KNeighborsClassifier().fit(X_train_scaled, y_train)
```

#### Iris 鸢尾花数据集机器学习练习 KNN 算法

In [28]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the built-in Iris dataset and print a short summary of it
iris = datasets.load_iris()
print("datasets.iris have data size:", iris.data.shape)
print("datasets.iris have target size:", iris.target.shape)

print("Feature Names:")
print(iris.feature_names)
print("Target Names:")
print(iris.target_names)
print("Features:\n", iris.data[:2])     # features of the first 2 samples
print("Labels:\n", iris.target[:2])     # labels of the first 2 samples

# Train on every feature column. To train on only the first two features,
# use iris.data[:, :2] instead.
X, y = iris.data, iris.target

# Stratified 80/20 split with a fixed seed so the run is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33, stratify=y
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# k-nearest-neighbours classifier with k = 5
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Compute the score once and reuse it; a bare accuracy_score(...) expression
# in the middle of a cell is evaluated and then silently discarded.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

datasets.iris have data size: (150, 4)
datasets.iris have target size: (150,)
Feature Names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target Names:
['setosa' 'versicolor' 'virginica']
Features:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]]
Labels:
 [0 0]
Accuracy: 0.9333333333333333


In [16]:
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the diabetes regression dataset and report its dimensions
diabetes = datasets.load_diabetes()
features, targets = diabetes.data, diabetes.target
print("diabetes have data scale", features.shape)
print("diabetes have targets scale", targets.shape)

print("Feature name:", diabetes.feature_names)
# This regression dataset has no target_names attribute, so the line below
# would raise an error if it were uncommented
# print("Targets name:", diabetes.target_names)



diabetes have data scale (442, 10)
diabetes have targets scale (442,)
Feature name: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


#### 测试多种模型在sklearn.datasets.load_diabetes()数据集上的训练效果

In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# 导入各种模型
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Load the diabetes regression dataset
data = load_diabetes()
X, y = data.data, data.target

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardized copies of the features; the scaler statistics are learned
# from the training split only and then reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display name -> estimator to benchmark
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(random_state=42),
    "Lasso Regression": Lasso(random_state=42),
    "ElasticNet": ElasticNet(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regression": SVR(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "MLP Regressor": MLPRegressor(max_iter=30000, random_state=42),
    "XGBoost Regressor": XGBRegressor(random_state=42),
}

# Estimator types that are trained on the standardized features; every
# other model receives the features exactly as returned by load_diabetes().
# NOTE(review): Ridge, Lasso and ElasticNet are penalized models and are
# therefore also sensitive to feature scale, yet they are fitted on the
# unstandardized features here. Consider adding them to this tuple; it is
# left unchanged so the scores stay comparable with earlier runs.
SCALED_MODEL_TYPES = (SVR, KNeighborsRegressor, MLPRegressor)

results = []

# Train and evaluate every model on the same train/test split
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Pick the standardized or the original feature matrices for this model
    if isinstance(model, SCALED_MODEL_TYPES):
        X_train_fit, X_test_fit = X_train_scaled, X_test_scaled
    else:
        X_train_fit, X_test_fit = X_train, X_test

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() is a wall clock whose
    # resolution can be too coarse for fits that take under a millisecond.
    start_time = time.perf_counter()
    model.fit(X_train_fit, y_train)
    end_time = time.perf_counter()

    # Only fitting is timed; prediction and scoring happen afterwards
    y_pred = model.predict(X_test_fit)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    duration = end_time - start_time

    results.append({
        "Model": model_name,
        "MSE": round(mse, 4),
        "R² Score": round(r2, 4),
        "Time (s)": round(duration, 4)
    })

# Print the results as a table, best R² first
result_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)
print("\nModel Performance:")
print(result_df.to_string(index=False))

Training Linear Regression...
Training Ridge Regression...
Training Lasso Regression...
Training ElasticNet...
Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training Support Vector Regression...
Training KNeighbors Regressor...
Training MLP Regressor...
Training XGBoost Regressor...

Model Performance:
                    Model       MSE  R² Score  Time (s)
        Gradient Boosting 2898.4367    0.4529    0.0803
        Linear Regression 2900.1936    0.4526    0.0042
            Random Forest 2952.0106    0.4428    0.1546
     KNeighbors Regressor 3047.4499    0.4248    0.0010
         Ridge Regression 3077.4159    0.4192    0.0018
        XGBoost Regressor 3351.0016    0.3675    0.3598
         Lasso Regression 3403.5757    0.3576    0.0009
Support Vector Regression 4332.7385    0.1822    0.0043
            Decision Tree 4976.7978    0.0607    0.0024
               ElasticNet 5311.2128   -0.0025    0.0006
            MLP Regressor 6104.6929   -0.152