In [3]:
import sys

assert sys.version_info >= (3, 7)   # Require Python 3.7 or newer

from packaging import version
import sklearn

assert version.parse(sklearn.__version__) >= version.parse("1.0.1")  # Require scikit-learn 1.0.1 or newer

In [13]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data(data_dir=Path("ml-teach-main/datasets")):
    """Load the housing dataset into a pandas DataFrame.

    The tarball, the extracted files and the CSV that is read all live under
    the same ``data_dir``. Previously the archive was looked up in one
    directory, extracted into another and the CSV read from the first, so a
    fresh download could never be found.

    Args:
        data_dir: Directory that holds (or will hold) ``housing.tgz`` and the
            extracted ``housing/housing.csv``. Defaults to the directory the
            notebook has always read from.

    Returns:
        pandas.DataFrame: The parsed contents of ``housing/housing.csv``.

    Raises:
        urllib.error.URLError: If the archive has to be downloaded and the
            download fails.
        tarfile.TarError: If the archive cannot be read.
    """
    data_dir = Path(data_dir)
    tarball_path = data_dir / "housing.tgz"
    csv_path = data_dir / "housing" / "housing.csv"

    # Only touch the network / archive when the CSV is not already available.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            # Create the directory the tarball is actually written to.
            data_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # Extract next to the tarball so csv_path points at the result.
            housing_tarball.extractall(path=data_dir)
    return pd.read_csv(csv_path)

# Load the dataset into the module-level `housing` frame and display its type.
housing = load_housing_data()
type(housing)

pandas.core.frame.DataFrame

In [5]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler



class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that scores each sample's similarity to KMeans cluster centers.

    ``fit`` clusters the input with KMeans; ``transform`` returns the RBF
    kernel between each sample and every fitted cluster center, i.e. one
    output column per cluster.

    Parameters:
        n_clusters: Number of KMeans clusters (and output columns).
        gamma: ``gamma`` passed to ``rbf_kernel``.
        random_state: Seed forwarded to KMeans.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit a KMeans model on ``X`` and store it as ``self.kmeans_``.

        ``y`` is accepted for pipeline compatibility and is not used.
        ``sample_weight`` is forwarded to ``KMeans.fit`` to weight samples.
        Returns ``self``.
        """
        clusterer = KMeans(n_clusters=self.n_clusters, n_init=10,
                           random_state=self.random_state)
        self.kmeans_ = clusterer
        # The fitted estimator exposes the cluster centers used by transform().
        clusterer.fit(X, sample_weight=sample_weight)
        return self  # always return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        # rbf_kernel computes exp(-gamma * ||x - center||^2) for every pair.
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster; ``names`` is ignored."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names

In [6]:
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    Both columns are selected with a list index, so for a 2-D NumPy array the
    result keeps two dimensions (one output column).
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Feature-name callback: the single output feature is always named "ratio".

    Both arguments are ignored.
    """
    output_names = ["ratio"]  # feature names out
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, then standard scaling.

    A fresh pipeline is created on every call.
    """
    imputer = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)

# Log pipeline: median-impute, apply np.log, then standardize.
# feature_names_out="one-to-one" maps each input feature name to itself.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Cluster-similarity transformer: 10 clusters, gamma 1.0, fixed seed.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

# Default numeric pipeline: median-impute, then standardize.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Column-wise preprocessing; each entry is (name, transformer, columns).
preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population",
             "households", "median_income"],
        ),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    # remainder: transformer applied to every column not listed above. The
    # original note expects only housing_median_age to be left at this point.
    remainder=default_num_pipeline,
)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# 1. Try a support vector machine regressor (sklearn.svm.SVR) for this regression task.
#      Experiment with its hyperparameters, e.g. kernel="linear" or kernel="rbf"; each kernel comes with its own hyperparameters. Use both GridSearchCV and RandomizedSearchCV to find the hyperparameters that perform best (best predictions under cross-validation).
#
#      Note that SVMs do not scale to large datasets, so train only on the first 5000 instances of the training set and use only 3-fold cross-validation; otherwise the run can take hours.
#      Don't worry about what the SVM hyperparameters mean for now; they are explained in the SVM lesson.

# SVR uses kernel="rbf" by default
from sklearn.svm import SVR

In [20]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

# Number of leading training rows used for the fits below (keeps them fast).
TRAIN_SUBSET_SIZE = 3000

# Income buckets, used only to stratify the train/test split.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Stratify on income_cat so both sets keep the same income distribution.
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

housing_labels = strat_train_set["median_house_value"].copy()

# Features must not contain the label or the helper column: any column not
# named in `preprocessing` goes through its remainder pipeline and would
# otherwise be handed to the model (label leakage).
housing_features = strat_train_set.drop(
    columns=["median_house_value", "income_cat"])

train_features = housing_features[:TRAIN_SUBSET_SIZE]
train_labels = housing_labels[:TRAIN_SUBSET_SIZE]

# Baseline: SVR with default hyperparameters on the training subset.
svr = make_pipeline(preprocessing, SVR())
svr.fit(train_features, train_labels)

# NOTE(review): despite its name, this pipeline wraps a DecisionTreeRegressor,
# not an SVR. The name and model are kept as-is; confirm which one is intended.
svr_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    # Seeded so the max_features sub-sampling is reproducible across runs.
    ("model", DecisionTreeRegressor(random_state=42)),
])

param_gris = {
    "preprocessing__geo__n_clusters": [3, 4],
    "model__max_features": [4, 6],
}

grid_search = GridSearchCV(svr_pipeline, param_gris, cv=3,
                           scoring="neg_mean_squared_error")
grid_search.fit(train_features, train_labels)