## 机器学习数据预处理

In [None]:
import pandas as pd
import sklearn
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and for the tick labels on both axes
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12

# Output directory for generated figures; created up front if it is missing
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure as <fig_id>.<fig_extension> in IMAGES_PATH.

    fig_id: base file name; also echoed in the progress message.
    tight_layout: when true, plt.tight_layout() is applied before saving.
    fig_extension: file suffix, also handed to savefig as the format.
    resolution: handed to savefig as the dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(
        os.path.join(IMAGES_PATH, file_name),
        format=fig_extension,
        dpi=resolution,
    )

# Ignore useless warnings (see SciPy issue #5998)
# `message` is a regular expression matched against the start of the warning
# text, so only warnings beginning with "internal gelsd" are silenced.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

### 注意读取数据时的区别

- index_col: int, str, sequence of int / str, or False, default None. Column(s) to use as the row labels of the DataFrame, either given as string name or column index. If a sequence of int / str is given, a MultiIndex is used.

In [None]:
# Load the training set with default options, so pandas builds a fresh
# integer index for the rows.
# NOTE(review): the CSV presumably stores its saved index as the first column,
# which would then show up as an ordinary data column here — confirm.
strat_train_set = pd.read_csv('./strat_train_set.csv')
strat_train_set.head()

In [None]:
# Reload, this time using the first CSV column (position 0) as the row labels
# instead of letting pandas generate a new index.
strat_train_set = pd.read_csv('./strat_train_set.csv', index_col=0)
strat_train_set.head()

## 训练标签与数据集的分离

In [None]:
# axis=1 (or "columns") drops a column; the default axis=0 (or "index") would
# drop rows by index label instead. drop() is called without inplace, so
# strat_train_set keeps the label column.
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
# Separate copy of the target column to use as training labels
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
housing.head()

### 查看数据的缺失值

In [None]:
# isnull().any(axis=1) flags every row that has at least one missing value;
# keep those rows and take the first five as a small working sample.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

### 解决思路
    1. 放弃这些相应的地区
    2. 放弃这个属性
    3. 将缺失的值设置为某个值(0、平均数或者中位数等都可以)

- 方案一：
    删除缺失数据（行）

In [None]:
# Drop the rows whose total_bedrooms is missing. The result is only displayed,
# not assigned, so sample_incomplete_rows itself is unchanged.
sample_incomplete_rows.dropna(subset=["total_bedrooms"])    # option 1

- 方案二：删除列数据

In [None]:
# Drop the whole total_bedrooms column. The result is only displayed, not
# assigned, so sample_incomplete_rows itself is unchanged.
sample_incomplete_rows.drop("total_bedrooms", axis=1)       # option 2

注意对比：此处没写inplace=True（同方案一），因此原来的DataFrame并不会改变。

In [None]:
sample_incomplete_rows

- 方案三：使用中位数填充

In [None]:
# Option 3: fill the missing total_bedrooms values with the column median,
# computed on the full training set rather than on the five-row sample.
median = housing["total_bedrooms"].median()
# Rebind the result instead of calling fillna(inplace=True) on a selected
# column: that chained form operates on a temporary Series, triggers pandas
# warnings, and does not update the frame under copy-on-write.
# Passing a dict restricts the fill to the named column.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})  # option 3

In [None]:
median

In [None]:
# Show the sample again to see how total_bedrooms changed
sample_incomplete_rows

## sklearn 缺失值处理（imputer）

strategy: string, default="mean"
The imputation strategy.

If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.

If “median”, then replace missing values using the median along each column. Can only be used with numeric data.

If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data.

If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.

New in version 0.20: strategy="constant" for fixed value imputation.

In [None]:
from sklearn.impute import SimpleImputer
# Imputer that replaces missing values with the median of each column
imputer = SimpleImputer(strategy="median")

- 由于中位数只能在数值属性上计算，所以我们需要创建一个不含文本属性ocean_proximity的数据副本

In [None]:
# The median strategy only works on numeric data, so build a copy of the
# features without the text column ocean_proximity.
housing_num = housing.drop("ocean_proximity", axis=1)
# alternatively: housing_num = housing.select_dtypes(include=[np.number])

In [None]:
housing_num.head()

In [None]:
imputer.fit(housing_num)

- 这里imputer仅仅只是计算了每个属性的中位数值，并将结果存储在其实例变量statistics_中。

In [None]:
# Medians learned by the fitted imputer
imputer.statistics_

- 利用pandas作对比

In [None]:
housing_num.median().values

- 使用imputer填充housing_num中的缺失值，得到NumPy数组

In [None]:
# Apply the fitted imputer: missing values are replaced by the learned medians.
X = imputer.transform(housing_num)

- 将填充后的数据转换为pandas格式数据

In [None]:
# Wrap the imputed values in a DataFrame again, restoring the column names of
# housing_num and the row labels of housing (housing_num was derived from
# housing by dropping one column, so the rows line up).
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

In [None]:
housing_tr.head()

In [None]:
sample_incomplete_rows.index

In [None]:
sample_incomplete_rows.index.values

In [None]:
# Look up the previously incomplete sample rows, by label, in the imputed frame
housing_tr.loc[sample_incomplete_rows.index.values]

- 将文本类数据转换成标签（处理ocean_proximity）

In [None]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

- 处理文本与分类属性

1. 将分类特征编码为整数数组。该转换器的输入应为整数或字符串之类的数组，表示分类（离散）特征所采用的值。特征将被转换为序数整数，即每个特征对应一列整数（0到n_categories-1）。

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder() # create the encoder instance
# Learn the categories and encode the column as ordinal numbers in one step
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

2. 将分类特征编码为one-hot数组。该转换器的输入应为整数或字符串之类的数组，表示分类（离散）特征所采用的值。这些特征使用独热（又称“one-of-K”或“dummy”）编码方案进行编码。这将为每个类别创建一个二进制列，并返回一个稀疏矩阵或密集数组（取决于sparse_output参数，scikit-learn 1.2之前名为sparse）。

In [None]:
from sklearn.preprocessing import OneHotEncoder

# With default settings the encoder returns a sparse matrix
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# By default OneHotEncoder returns a sparse matrix; toarray() converts it to a
# dense array when one is needed.
housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

或者在OneHotEncoder生成实例的时候，设置sparse_output=False（scikit-learn 1.2之前的版本中该参数名为sparse）

In [None]:
# Ask the encoder for a dense array directly. The keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was removed
# in 1.4, so try the new spelling first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn: the constructor does not know `sparse_output`
    cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
cat_encoder.categories_