In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# pd.show_versions()

In [None]:
import os


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``images/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        Used both as the file suffix and as the ``format`` passed to
        ``plt.savefig``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    path = os.path.join("images", fig_id + "." + fig_extension)
    # savefig does not create missing directories; without this a kernel
    # started in a folder that has no images/ directory fails on the first save.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
# Read the CSV with header=None, so the first row is treated as data and the
# column names are supplied here instead.
huamu = pd.read_csv(
    "data/ke/xiaoqu/sh/20201020/pudong_huamu.csv",
    header=None,
    names=["date", "district", "area", "neighbour", "price", "sale_num"],
)
# By default Jupyter renders only the final expression of a cell; a bare
# `huamu.head()` in the middle is evaluated and thrown away, so show it
# explicitly (`display` is provided by the IPython kernel).
display(huamu.head())
huamu.info()
huamu.describe()

In [None]:
# The cell ends with save_fig(), which returns None, so the counts were never
# rendered as a bare expression; display them explicitly.
display(huamu["neighbour"].value_counts())
# One histogram per numeric column, saved through save_fig as images/huamu.png.
huamu.hist(bins=50, figsize=(20,15))
save_fig("huamu")
# plt.show()

In [None]:
# Double brackets keep the selection a one-column DataFrame rather than a
# Series; the encoders in the following cells are fitted on this frame.
huamu_nei = huamu[["neighbour"]]
huamu_nei.head(n=10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Learn the set of neighbourhood names first, then map every row to its
# ordinal code with the fitted encoder.
ord_encoder = OrdinalEncoder().fit(huamu_nei)
huamu_nei_encoded = ord_encoder.transform(huamu_nei)
# The learned categories, one array per input column.
ord_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()
huamu_nei_1hot = one_hot_encoder.fit_transform(huamu_nei)
# A bare expression that is not the last line of the cell is not rendered by
# default, so show the encoder's raw result explicitly before the dense view
# (`display` is provided by the IPython kernel).
display(huamu_nei_1hot)
huamu_nei_1hot.toarray()

In [None]:
import numpy as np
from pandas import DataFrame
from sklearn.base import TransformerMixin
from pandas.core.generic import NDFrame


class PriceExtractor(TransformerMixin):
    """Stateless transformer that turns price text into a numeric column.

    Every cell in the first column of the input is expected to be a string
    whose number comes before the first '元' character.  Cells that are not
    strings (e.g. the float NaN that ``pandas.read_csv`` produces for empty
    fields), that contain no '元', or that have nothing in front of it are
    treated as missing and come out as NaN, so that a downstream imputer can
    fill them in.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def _extract(self, ele):
        """Parse a single cell.

        Returns the text before the first '元' converted with ``np.int32``, or
        ``None`` when the cell holds no price.  A non-empty prefix that
        ``np.int32`` cannot convert still raises, so malformed numbers are not
        silently hidden.
        """
        # Missing values are floats, not strings, and have no .split();
        # treat every non-string cell as "no price" instead of crashing.
        if not isinstance(ele, str):
            return None
        try:
            price, _ = ele.split(u'元', 1)
        except ValueError:
            # No '元' in the text: split() returned one piece and the
            # two-name unpacking failed.
            return None
        if not price:
            return None
        return np.int32(price)

    def transform(self, X: DataFrame) -> np.ndarray:
        """Return the parsed prices as a float64 array of shape ``(n_rows, 1)``.

        Only the first column of ``X`` is read.  Missing prices are NaN.  The
        result is always float64, so its dtype does not depend on whether the
        data happens to contain missing values (an all-missing column would
        otherwise yield an object array).
        """
        res = X.iloc[:, 0].apply(self._extract)
        return res.astype(np.float64).to_numpy().reshape(-1, 1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Price preprocessing chain: extract the number from the price column, fill
# the gaps with the column median, then standardise the result.
price_pipeline = Pipeline(steps=[
    ("price_extractor", PriceExtractor()),
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Fit every step on the price column and show the transformed values.
price_pipeline.fit_transform(huamu[["price"]])