In [62]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [63]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [64]:
badcols = train.columns[train.isnull().sum(axis=0) > 800]

In [65]:
train.isnull().sum(axis=1)

0       5
1       4
2       4
3       4
4       4
       ..
1455    4
1456    3
1457    2
1458    5
1459    5
Length: 1460, dtype: int64

In [66]:
data = train.drop(badcols, axis=1)
test_dropna = test.drop(badcols, axis=1)
object_cols = [col for col in data.columns if data[col].dtype == 'object']
good_label_cols = [col for col in object_cols if
                  set(data[col])==set(test_dropna[col])]
good_label_cols
bad_label_cols = list(set(object_cols)-set(good_label_cols))
bad_label_cols
data = train.drop(bad_label_cols, axis=1)
test_dropna = test.drop(bad_label_cols, axis=1)

object_cols = [col for col in data.columns if data[col].dtype == 'object']
data[object_cols] = data[object_cols].fillna('NULL')
test_dropna[object_cols] = test_dropna[object_cols].fillna('NULL')
data.head()
# print(test_dropna)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleCondition,SalePrice
0,1,60,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,0,,,,0,2,2008,Normal,208500
1,2,20,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,0,,,,0,5,2007,Normal,181500
2,3,60,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,0,,,,0,9,2008,Normal,223500
3,4,70,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,0,,,,0,2,2006,Abnorml,140000
4,5,60,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,0,,,,0,12,2008,Normal,250000


In [67]:
y = data['SalePrice']
X = data.drop(['SalePrice'], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 67 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  OverallQual    1460 non-null   int64  
 14  OverallCond    1460 non-null   int64  
 15  YearBuilt      1460 non-null   int64  
 16  YearRemodAdd   1460 non-null   int64  
 17  RoofStyle      1460 non-null   object 
 18  MasVnrTy

In [68]:
test_dropna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 67 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  BldgType       1459 non-null   object 
 13  OverallQual    1459 non-null   int64  
 14  OverallCond    1459 non-null   int64  
 15  YearBuilt      1459 non-null   int64  
 16  YearRemodAdd   1459 non-null   int64  
 17  RoofStyle      1459 non-null   object 
 18  MasVnrTy

In [69]:
def num_cat_splitor(X):
    s = (X.dtypes == 'object')
    object_cols = list(s[s].index)
    # object_cols # ['package', 'division', 'salary']
    num_cols = list(set(X.columns) - set(object_cols))
    # num_cols
    # ['Work_accident', 'time_spend_company', 'promotion_last_5years', 'id',
    #  'average_monthly_hours',  'last_evaluation',  'number_project']
    return num_cols, object_cols
num_cols, object_cols = num_cat_splitor(X)
print(num_cols)
print(object_cols)
# X[object_cols].values

['LotFrontage', 'EnclosedPorch', 'Fireplaces', 'OverallCond', '3SsnPorch', '1stFlrSF', 'HalfBath', 'MasVnrArea', 'LotArea', 'GarageArea', 'BsmtFinSF2', 'OpenPorchSF', 'BsmtHalfBath', 'MSSubClass', 'LowQualFinSF', 'BedroomAbvGr', 'FullBath', '2ndFlrSF', 'ScreenPorch', 'KitchenAbvGr', 'BsmtFinSF1', 'GarageYrBlt', 'GarageCars', 'BsmtUnfSF', 'BsmtFullBath', 'YearBuilt', 'TotRmsAbvGrd', 'MiscVal', 'YrSold', 'TotalBsmtSF', 'OverallQual', 'WoodDeckSF', 'Id', 'YearRemodAdd', 'PoolArea', 'GrLivArea', 'MoSold']
['Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'RoofStyle', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition']


In [79]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_cols)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(object_cols)),
#         ('imputer', SimpleImputer(strategy="most_frequent")), # 不能对文字特征插值
        ('cat_encoder', OneHotEncoder(sparse=False,handle_unknown='ignore')),
    ])
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
X_prepared = full_pipeline.fit_transform(X)
temp = pd.DataFrame(X_prepared)
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 205 entries, 0 to 204
dtypes: float64(205)
memory usage: 2.3 MB


In [71]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg,X_prepared,y,
                               scoring='neg_mean_squared_error',cv=3)
forest_rmse_scores = np.sqrt(-forest_scores)
print(forest_rmse_scores)
print(forest_rmse_scores.mean())
print(forest_rmse_scores.std())

[26640.90615842 32046.00790783 33825.59136566]
30837.501810638107
3055.082172598613


In [80]:
param_grid = [
    {'n_estimators' : [100,200,300],'max_features':[15,25,35,45,55,65,70]},
#     {'bootstrap':[False], 'n_estimators' : [3,10],'max_features':[2,3,4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')
grid_search.fit(X_prepared,y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [15, 25, 35, 45, 55, 65, 70],
                          'n_estimators': [100, 200, 300]}],
             scoring='neg_mean_squared_error')

In [81]:
grid_search.best_params_

{'max_features': 65, 'n_estimators': 200}

In [82]:
cv_result = grid_search.cv_results_
for mean_score, params in zip(cv_result['mean_test_score'], cv_result['params']):
    print(np.sqrt(-mean_score), params)

30798.545983289492 {'max_features': 15, 'n_estimators': 100}
30385.031662423495 {'max_features': 15, 'n_estimators': 200}
30668.28563369206 {'max_features': 15, 'n_estimators': 300}
29166.741213382484 {'max_features': 25, 'n_estimators': 100}
29606.84546585714 {'max_features': 25, 'n_estimators': 200}
29482.286968313958 {'max_features': 25, 'n_estimators': 300}
29395.350546464622 {'max_features': 35, 'n_estimators': 100}
29438.909488648074 {'max_features': 35, 'n_estimators': 200}
29047.08785504181 {'max_features': 35, 'n_estimators': 300}
29242.989598525455 {'max_features': 45, 'n_estimators': 100}
29258.06669950805 {'max_features': 45, 'n_estimators': 200}
29164.00345891022 {'max_features': 45, 'n_estimators': 300}
28875.03933949422 {'max_features': 55, 'n_estimators': 100}
29332.880921858192 {'max_features': 55, 'n_estimators': 200}
28938.873958197775 {'max_features': 55, 'n_estimators': 300}
29198.82958675946 {'max_features': 65, 'n_estimators': 100}
28705.12169185359 {'max_feature

In [83]:
feature_importances = grid_search.best_estimator_.feature_importances_


In [84]:
k = 3
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]


In [85]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('forst_reg', RandomForestRegressor(random_state=0))
])
param_grid = [{
    'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
#     'feature_selection__k': list(range(20, len(feature_importances) + 1)),
    'forst_reg__n_estimators' : [200,250,300,330,350],
#     'forst_reg__max_features':[45, 55, 65, ]
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=2,
                                scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)


In [86]:
grid_search_prep.fit(X,y)
grid_search_prep.best_params_
final_model = grid_search_prep.best_estimator_

Fitting 2 folds for each of 15 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.4s finished


In [87]:
y_pred_test = final_model.predict(test_dropna)
result = pd.DataFrame()
result['Id'] = test['Id']
result['SalePrice'] = y_pred_test
result.to_csv('housing_price_rf_baseline.csv',index=False)
