### 机器学习分类
#### 监督 vs 无监督 vs 半监督 vs 强化学习
#### 批学习 vs 在线学习：是否能进行增量学习
#### 基于样本 vs 基于模型

In [7]:
import quantecon as qe
import numpy as np
import os 
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [8]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
fetch_housing_data()
housing=load_housing_data()

#descreption
housing.head()
housing.info()
housing.describe()
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50,figsize=(20,15))
plt.show()

In [9]:
import numpy as np
def split_train_test(data,test_ratio):
    np.random.seed(42)
    shuffled_indices=np.random.permutation(len(data))
    test_set_size=int(len(data)*test_ratio)
    test_indices=shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
#iloc return the whole data with train_indices
train_set,test_set=split_train_test(housing,0.2)



#both these solutions will break next time you fetch an updataed dataset
#compute a hash of each instance's indentifier to decide whether
#or not it shold go in the test set.
#keep only the last byte of the hash,and put the instance in the test
#set will remain consistent across multiple runs.
#even if you refresh the dataset the new test set will contains 20%
#of the new instances
import hashlib
def test_set_check(identifier,test_ratio,hash):
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio
def split_train_test_by_id(data,test_ratio,id_column,hash=hashlib.md5):
    ids=data[id_column]
    in_test_set=ids.apply(lambda id_:test_set_check(id_,test_ratio,hash))
    return data.loc[~in_test_set],data.loc[in_test_set]
#reset index with number
housing_with_id=housing.reset_index()
train_set,test_set=split_train_test_by_id(housing_with_id,0.2,'index')

#reset index with unique feature
housing_with_id['id']=housing['longitude']*1000+housing['latitude']
train_set,test_set=split_train_test_by_id(housing_with_id,0.2,'id')

#split the date with sklearn
from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(housing,test_size=0.2,random_state=42)




#split dataset by median income

housing['income_cat']=np.ceil(housing['median_income']/1.5)
housing['income_cat'].where(housing['income_cat']<5,5,inplace=True)
#keep elements meet condition(arg1),others are replaced by arg2

from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing['income_cat']):
    strat_train_set=housing.loc[train_index]
    strat_test_set=housing.loc[test_index]
    #print("TRAIN:", train_index, "TEST:", test_index)
#StratifiedShuffleSplit.split 返回分割结果的索引（数字）
#loc 按数字索引查询全部数据
housing['income_cat'].value_counts()/len(housing)
for set in (strat_train_set,strat_test_set):
    set.drop(['income_cat'],axis=1,inplace=True)
    
housing=strat_train_set.copy()
#深复制
housing.plot(kind='scatter',x='longitude',y='latitude',c='blue',alpha=0.2)

housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,
            s=housing['population']/100,label='population',
            c='median_house_value',cmap=plt.get_cmap('jet'),colorbar=True)
plt.legend()
plt.show()    
    
corr_matrix=housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
from pandas.tools.plotting import scatter_matrix
attributes=['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))
plt.show()






In [15]:
housing=strat_train_set.drop('median_house_value',axis=1)
housing_labels=strat_train_set['median_house_value'].copy()
housing.dropna(subset=['total_bedrooms'])#drop value
housing.drop('total_bedrooms',axis=1)#drop feature
median=housing['total_bedrooms'].median()
housing['total_bedrooms'].fillna(median)#fill nan with median


from sklearn.preprocessing import Imputer
imputer=Imputer(strategy='median')
housing_num=housing.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)#拟合而不应用
imputer.statistics_#all features' median
housing_num.median().values#all features' median
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)

#fit/Trasform/fit-transform()/predict/Inspection(strategy,statistics_) 


#handing text and categorical attributes
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()#以单一数字编码
housing_cat=housing['ocean_proximity']
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_cat_encoded
encoder.classes_

#以onehotEncoder将单一数字编码变为one-hot键编码,
#形式为scipy sparse matrix
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

#直接将文本变为one-not编码,默认为dense Numpy array,
#取参数sparse_output=True 变为sparse matrix
from sklearn.preprocessing import LabelBinarizer
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(housing_cat)
housing_cat_1hot


In [17]:
#自定义类，以测试增加特征甚至改变超参数的效果
from sklearn.base import BaseEstimator,TransformerMixin
rooms_ix,bedrooms_ix,population_ix,household_ix=3,4,5,6
class CombineAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self,add_bedrooms_per_room=True):#no *args or **kargs
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        return self#nothing else to do
    def transform(self,X,y=None):
        rooms_per_household=X[:,rooms_ix]/X[:,household_ix]
        population_per_household=X[:,population_ix]/X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
        else:
            return np.c_[X,rooms_per_household,population_per_household]
attr_adder=CombineAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs=attr_adder.transform(housing.values)
        
    
    

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this is parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return ou

In [21]:
#scaling:min-max(normalization) and standardization
#MinMaxScaler,feature_range控制是否在0-1之间
#StandardScaler进行标准化，对异常值不敏感

#管道流对数据同时进行多种操作 name/estimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline=Pipeline([('imputer',Imputer(strategy='median')),
                      ('attribs_adder',CombineAttributesAdder()),
                      ('std_scaler',StandardScaler())])
#除最后一个外，其他必须为transformer,即必须有fit_transform方法
housing_num_tr=num_pipeline.fit_transform(housing_num)
#pipeline.fit从第一个开始执行fit_transform方法，至最后一个estimator只调用fit方法
#而pipeline.fit_transform每一步都执行fit_transform方法

#FeatureUnion类 将所有变形整合到一个管道流
from sklearn.pipeline import FeatureUnion
from sklearn_features.transformers import DataFrameSelector
#from sklearn import CategoricalEncoder
num_attribs=list(housing_num)
cat_attribs=['ocean_proximity']
num_pipeline=Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
    ('imputer',Imputer(strategy='median')),
    ('attribs_adder',CombineAttributesAdder()),
    ('std_scaler',StandardScaler())
])
cat_pipeline=Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('label_binarizer',CategoricalEncoder(encoding="onehot-dense"))
])

full_pipeline=FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline)
])
housing_prepared=full_pipeline.fit_transform(housing)
housing_prepared.shape

(16512, 16)

from sklearn.base import BaseEstimator,TransformerMixin
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names=attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values
    
from sklearn.pipeline import FeatureUnion

num_attribs=list(housing_num)
cat_attribs=['ocean_proximity']
num_pipeline=Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
    ('imputer',Imputer(strategy='median')),
    ('attribs_adder',CombineAttributesAdder()),
    ('std_scaler',StandardScaler())
])
cat_pipeline=Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('label_binarizer',LabelBinarizer())
])

full_pipeline=FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline)
])
housing_prepared=full_pipeline.fit_transform(housing)
housing_prepared.shape

In [36]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

some_data=housing.iloc[:5]
some_labels=housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)
lin_reg.predict(some_data_prepared)

from sklearn.metrics import mean_squared_error as mse
housing_predictions=lin_reg.predict(housing_prepared)
lin_mse=mse(housing_labels,housing_predictions)

lin_rmse=np.sqrt(lin_mse)

from sklearn.tree import DecisionTreeRegressor
tree_reg=DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
housing_predictions=tree_reg.predict(housing_prepared)
tree_mse=mse(housing_labels,housing_predictions)

tree_rmse=np.sqrt(tree_mse)

from sklearn.model_selection import cross_val_score as cvs
#将测试集分成10折，每次用9折训练1折学习，共10次，取10次结果的平均
scores=cvs(tree_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error',cv=10)
rmse_scores=np.sqrt(-scores)

lin_scores=cvs(lin_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error',cv=10)
lin_scores_rmse=np.sqrt(-lin_scores)
print(lin_scores_rmse.mean())
print(rmse_scores.mean())

from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
forest_reg_predictions=forest_reg.predict(housing_prepared)
forest_mse=mse(forest_reg_predictions,housing_labels)
forest_rmse=np.sqrt(forest_mse)


forest_scores=cvs(forest_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error',cv=10)
forest_socres_rmse=np.sqrt(-forest_scores)
print(forest_socres_rmse.mean())

print(lin_rmse,tree_rmse,forest_rmse)

69052.46136345083
70800.290109054
52747.97760573926
68628.19819848923 0.0 22249.136793360492


In [1]:
#调参
from sklearn.model_selection import GridSearchCV
param_grid=[{
    'n_estimators':[3,10,30],'max_features':[2,4,6,8]
},{'bootstrap':[False],'n_estimator':[3,10],'max_features':[2,3,4]},]

from skearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
grid_search=GridSearchCV(forest_reg,param_grid,cv=5,scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared,housing_labels)

grid_search.best_params_
grid_search.best_estimator_

cvres=grid_search.cv_results_
for mean_score,params in zip(cvres['mean_test_score'],cvres['params']):
    print(np.sqrt(-mean_score),params)
    
feature_importances=grid_search.best_estimator_.feature_importances_
extra_attribs=['rooms_per_hhold','pop_per_hhold','bedrooms_per_room']
cat_one_hot_attribs=list(encoder.classes_)
attributes=num_attribs+extra_attribs+cat_one_hot_attribs
sorted(zip(feature_importances,attributes),reverse=True)
#如果参数空间过大，则需使用RandomizedSearchCV

from sklearn.externals import joblib
joblib.dump(grid_search,'my_model.txt')
my_model_loaded=joblib.load('my_model.txt')

NameError: name 'RandomForestRegressor' is not defined

In [None]:
#测试数据
final_model=gird.search.best_estimator_
X_test=strat_test_set.drop('meadian_housing_value',axis=1)
y_test=strat_test_set['meadian_housing_value'].copy()

X_test_prepared=full_pipeline.transform(X_test)
final_predictions=final_model.predict(X_test_prepared)
final_mse=mse(y_test,final_predictions)
final_rmse=np.sqrt(final_mse)