# 数据导入与预处理

## 模块导入

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import gc
gc.enable()
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
pd.set_option('max_colwidth', 200)

## 数据导入

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head(5)

Unnamed: 0,数据ID,容纳人数,便利设施,洗手间数量,床的数量,床的类型,卧室数量,取消条款,所在城市,清洁费,首次评论日期,房主是否有个人资料图片,房主身份是否验证,房主回复率,何时成为房主,是否支持随即预订,最近评论日期,维度,经度,民宿周边,评论个数,房产类型,民宿评分,房型,邮编,价格
0,train_0,4,"{TV,""Cable TV"",Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Pets live on this property"",Dog(s),""Hot tub"",Heating,Washer,Dryer,""Smoke detector"",""Fire extinguisher"",Essentials,""translati...",1.5,3.0,4,2.0,0,3,0,2015-05-07,t,t,,2015-02-25,0,2016-06-26,34.109039,-118.27339,Los Feliz,12,17,97.0,0,90027,64.918531
1,train_1,2,"{TV,""Wireless Internet"",Kitchen,""Free parking on premises"",""Pets allowed"",Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Fire extinguisher""...",1.0,1.0,4,1.0,2,4,1,2016-07-02,t,t,,2009-10-27,1,2016-07-31,40.812897,-73.919163,Mott Haven,6,0,87.0,0,10454,54.918531
2,train_2,4,"{TV,""Air conditioning"",Kitchen,Heating,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Safety card"",Essentials,Shampoo,""Hair dryer""}",1.0,2.0,4,0.0,2,4,1,2017-07-01,t,f,100%,2017-06-29,1,2017-07-31,40.737643,-73.953309,Greenpoint,4,0,80.0,0,11222,73.219281
3,train_3,2,{},1.0,1.0,4,1.0,0,5,1,,t,t,,2013-03-19,0,,37.759935,-122.420558,Mission District,0,0,,1,94110,64.093909
4,train_4,3,"{Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Free parking on premises"",""Pets live on this property"",Cat(s),""Buzzer/wireless intercom"",Heating,Essentials,Shampoo,""translation missing: ...",1.0,1.0,4,1.0,1,4,1,2014-04-30,t,t,100%,2011-07-30,0,2016-05-22,40.683363,-73.94949,Bedford-Stuyvesant,16,0,99.0,0,11216,68.454901


In [5]:
test.head(5)

Unnamed: 0,数据ID,容纳人数,便利设施,洗手间数量,床的数量,床的类型,卧室数量,取消条款,所在城市,清洁费,首次评论日期,房主是否有个人资料图片,房主身份是否验证,房主回复率,何时成为房主,是否支持随即预订,最近评论日期,维度,经度,民宿周边,评论个数,房产类型,民宿评分,房型,邮编
0,test_0,2,"{TV,Internet,""Wireless Internet"",""Air conditioning"",Kitchen,Gym,Breakfast,""Indoor fireplace"",Heating,""Family/kid friendly"",""Suitable for events"",Washer,Dryer,""Smoke detector"",""Carbon monoxide dete...",1.5,1.0,4,1.0,2,1,1,2015-05-25,t,t,100%,2015-05-20,1,2017-01-01,41.849684,-87.67627,Pilsen,17,17,97.0,1,60608
1,test_1,2,"{TV,Internet,""Wireless Internet"",""Air conditioning"",Kitchen,Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",Essentials}",2.0,1.0,4,1.0,2,3,1,2015-11-09,t,t,100%,2015-09-08,0,2015-11-15,34.068613,-118.246455,Echo Park,2,0,100.0,0,90012
2,test_2,5,"{TV,""Cable TV"",""Wireless Internet"",""Air conditioning"",Kitchen,""Pets allowed"",Heating,""Family/kid friendly"",""Suitable for events"",""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",Essenti...",1.0,3.0,4,2.0,1,4,1,2017-05-15,t,t,100%,2017-05-06,1,2017-09-25,40.701958,-73.917352,Bushwick,25,0,88.0,0,11237
3,test_3,6,"{""Cable TV"",Internet,""Wireless Internet"",""Air conditioning"",Kitchen,Elevator,""Buzzer/wireless intercom"",Heating,""Family/kid friendly""}",1.0,3.0,4,1.0,2,4,1,2012-11-12,t,t,70%,2009-02-06,0,2017-07-29,40.742959,-73.99082,Flatiron District,12,0,82.0,0,10010
4,test_4,2,"{Internet,""Wireless Internet"",""Air conditioning"",""Free parking on premises"",""Hot tub"",Heating,""Family/kid friendly"",""Smoke detector"",""Carbon monoxide detector"",""Fire extinguisher"",Essentials,Shamp...",1.0,1.0,4,1.0,0,3,1,2017-02-17,t,t,100%,2015-10-20,0,2017-03-25,34.046473,-117.734095,,2,17,100.0,1,91766


## Id特征处理

In [6]:
# 处理前数据形状
print(train.shape)
print(test.shape)

(59288, 26)
(14823, 25)


In [6]:
train_Id = train['数据ID']
test_Id = test['数据ID']
train.drop('数据ID', axis=1, inplace=True)
test.drop('数据ID', axis=1, inplace=True)

In [7]:
# 处理后数据形状
print(train.shape)
print(test.shape)

(59288, 25)
(14823, 24)


## 异常值处理

In [8]:
# 处理前数据分布
# fig, ax = plt.subplots()
# ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
# plt.xlabel('GrLivArea', fontsize=13)
# plt.ylabel('SalePrice', fontsize=13)
# plt.show()

In [9]:
# 删除右下角两个异常值
# train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

In [10]:
# 处理后数据分布
# fig, ax = plt.subplots()
# ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
# plt.xlabel('GrLivArea', fontsize=13)
# plt.ylabel('SalePrice', fontsize=13)
# plt.show()

## SalePrice特征处理

In [11]:
# 处理前数据分布
sns.distplot(train['价格'], fit=norm)

(mu, sigma) = norm.fit(train['价格'])
print('\n mu={:.2f} and sigma={:.2f} \n'.format(mu, sigma))

plt.legend(['Normal dist.($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('价格 distribution')

fig = plt.figure()
stats.probplot(train['价格'], plot=plt)
plt.show()


 mu=68.97 and sigma=10.35 



In [12]:
# 数据偏度大，用log1p函数转化，使其更加服从高斯分布。
# 最后需要将预测出的平滑数据进行还原，而还原过程就是log1p的逆运算expm1
train['价格'] = np.log1p(train['价格'])

In [13]:
# 处理后数据分布
sns.distplot(train['价格'], fit=norm)

(mu, sigma) = norm.fit(train['价格'])
print('\n mu={:.2f} and sigma={:.2f} \n'.format(mu, sigma))

plt.legend(['Normal dist.($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('价格 distribution')

fig = plt.figure()
stats.probplot(train['价格'], plot=plt)
plt.show()


 mu=4.24 and sigma=0.15 



# 特征工程

## 数据集连接

In [14]:
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.价格.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['价格'], axis=1, inplace=True)
print('all_data size is {}'.format(all_data.shape))

all_data size is (74111, 24)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


## 缺失数据分析

In [15]:
all_data.isnull().head()

Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,房产类型,房型,所在城市,是否支持随即预订,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期
0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [16]:
all_data.isnull().sum().head(10)

何时成为房主           188
便利设施               0
卧室数量              91
取消条款               0
容纳人数               0
床的数量             131
床的类型               0
房主回复率          18299
房主是否有个人资料图片      188
房主身份是否验证         188
dtype: int64

In [17]:
all_data_na = (all_data.isnull().sum()/len(all_data))*100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' : all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio
房主回复率,24.691341
民宿评分,22.563452
首次评论日期,21.405729
最近评论日期,21.355804
民宿周边,9.272578
邮编,1.30345
洗手间数量,0.269865
房主身份是否验证,0.253674
房主是否有个人资料图片,0.253674
何时成为房主,0.253674


In [18]:
# 含有缺失值的特征的缺失率
f, axis = plt.subplots(figsize=(15,12))
plt.xticks(rotation='90')
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

Text(0.5, 1.0, 'Percent missing data by feature')

## 数据相关性

In [19]:
# 全部数值特征
corrmat = train.corr()
plt.subplots(figsize=(15,12))
sns.heatmap(corrmat, vmax=0.9, square=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7f371661e240>

In [20]:
# 相关性最大的10个特征
corrmat = train.corr()
plt.subplots(figsize=(10,8))
k = 10
cols = corrmat.nlargest(k, '价格')['价格'].index
cm = np.corrcoef(train[cols].values.T)
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

<matplotlib.axes._subplots.AxesSubplot at 0x7f3710e9b240>

## 缺失值填充

In [21]:
all_data.head()

Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,房产类型,房型,所在城市,是否支持随即预订,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期
0,2015-02-25,"{TV,""Cable TV"",Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Pets live on this property"",Dog(s),""Hot tub"",Heating,Washer,Dryer,""Smoke detector"",""Fire extinguisher"",Essentials,""translati...",2.0,0,4,3.0,4,,t,t,17,0,3,0,2016-06-26,Los Feliz,97.0,1.5,0,-118.27339,34.109039,12,90027,2015-05-07
1,2009-10-27,"{TV,""Wireless Internet"",Kitchen,""Free parking on premises"",""Pets allowed"",Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Fire extinguisher""...",1.0,2,2,1.0,4,,t,t,0,0,4,1,2016-07-31,Mott Haven,87.0,1.0,1,-73.919163,40.812897,6,10454,2016-07-02
2,2017-06-29,"{TV,""Air conditioning"",Kitchen,Heating,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Safety card"",Essentials,Shampoo,""Hair dryer""}",0.0,2,4,2.0,4,100%,t,f,0,0,4,1,2017-07-31,Greenpoint,80.0,1.0,1,-73.953309,40.737643,4,11222,2017-07-01
3,2013-03-19,{},1.0,0,2,1.0,4,,t,t,0,1,5,0,,Mission District,,1.0,1,-122.420558,37.759935,0,94110,
4,2011-07-30,"{Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Free parking on premises"",""Pets live on this property"",Cat(s),""Buzzer/wireless intercom"",Heating,Essentials,Shampoo,""translation missing: ...",1.0,1,3,1.0,4,100%,t,t,0,0,4,0,2016-05-22,Bedford-Stuyvesant,99.0,1.0,1,-73.94949,40.683363,16,11216,2014-04-30


In [22]:
# 根据各特征的现实含义，填充合适的值
feature1 = ['床的类型','邮编','房主回复率','首次评论日期','最近评论日期','何时成为房主','民宿周边','房主身份是否验证','房主是否有个人资料图片']
for i in feature1:
    all_data[i] = all_data[i].fillna('None')
    
feature2 = ['评论个数','洗手间数量']
for i in feature2:
    all_data[i] = all_data[i].fillna(0)

## 数值型
feature3 = ['民宿评分', '卧室数量', '取消条款', '床的数量','经度' ,'维度']
for i in feature3:
    all_data[i] = all_data[i].fillna(all_data[i].mode()[0])


    
# all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# all_data = all_data.drop(['Utilities'], axis=1)
# all_data['Functional'] = all_data['Functional'].fillna('Typ')

In [23]:
# 填充完成后，查看是否还有缺失值
all_data_na = (all_data.isnull().sum()/len(all_data))*100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' : all_data_na})
missing_data.head()

Unnamed: 0,Missing Ratio


In [24]:
all_data.head()

Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,房产类型,房型,所在城市,是否支持随即预订,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期
0,2015-02-25,"{TV,""Cable TV"",Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Pets live on this property"",Dog(s),""Hot tub"",Heating,Washer,Dryer,""Smoke detector"",""Fire extinguisher"",Essentials,""translati...",2.0,0,4,3.0,4,,t,t,17,0,3,0,2016-06-26,Los Feliz,97.0,1.5,0,-118.27339,34.109039,12,90027,2015-05-07
1,2009-10-27,"{TV,""Wireless Internet"",Kitchen,""Free parking on premises"",""Pets allowed"",Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Fire extinguisher""...",1.0,2,2,1.0,4,,t,t,0,0,4,1,2016-07-31,Mott Haven,87.0,1.0,1,-73.919163,40.812897,6,10454,2016-07-02
2,2017-06-29,"{TV,""Air conditioning"",Kitchen,Heating,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Safety card"",Essentials,Shampoo,""Hair dryer""}",0.0,2,4,2.0,4,100%,t,f,0,0,4,1,2017-07-31,Greenpoint,80.0,1.0,1,-73.953309,40.737643,4,11222,2017-07-01
3,2013-03-19,{},1.0,0,2,1.0,4,,t,t,0,1,5,0,,Mission District,100.0,1.0,1,-122.420558,37.759935,0,94110,
4,2011-07-30,"{Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Free parking on premises"",""Pets live on this property"",Cat(s),""Buzzer/wireless intercom"",Heating,Essentials,Shampoo,""translation missing: ...",1.0,1,3,1.0,4,100%,t,t,0,0,4,0,2016-05-22,Bedford-Stuyvesant,99.0,1.0,1,-73.94949,40.683363,16,11216,2014-04-30


## 标签编码

In [25]:
# 查看待编码特征情况
cols = ['便利设施','床的类型','邮编','房主回复率','首次评论日期','最近评论日期','何时成为房主','民宿周边','房主身份是否验证','房主是否有个人资料图片']

all_data[cols].head()

Unnamed: 0,便利设施,床的类型,邮编,房主回复率,首次评论日期,最近评论日期,何时成为房主,民宿周边,房主身份是否验证,房主是否有个人资料图片
0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Pets live on this property"",Dog(s),""Hot tub"",Heating,Washer,Dryer,""Smoke detector"",""Fire extinguisher"",Essentials,""translati...",4,90027,,2015-05-07,2016-06-26,2015-02-25,Los Feliz,t,t
1,"{TV,""Wireless Internet"",Kitchen,""Free parking on premises"",""Pets allowed"",Heating,""Family/kid friendly"",Washer,Dryer,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Fire extinguisher""...",4,10454,,2016-07-02,2016-07-31,2009-10-27,Mott Haven,t,t
2,"{TV,""Air conditioning"",Kitchen,Heating,""Smoke detector"",""Carbon monoxide detector"",""First aid kit"",""Safety card"",Essentials,Shampoo,""Hair dryer""}",4,11222,100%,2017-07-01,2017-07-31,2017-06-29,Greenpoint,f,t
3,{},4,94110,,,,2013-03-19,Mission District,t,t
4,"{Internet,""Wireless Internet"",""Air conditioning"",Kitchen,""Free parking on premises"",""Pets live on this property"",Cat(s),""Buzzer/wireless intercom"",Heating,Essentials,Shampoo,""translation missing: ...",4,11216,100%,2014-04-30,2016-05-22,2011-07-30,Bedford-Stuyvesant,t,t


In [26]:
# 将数值类型转化为字符串类型
feature = ['床的类型', '房产类型', '房型', '所在城市','是否支持随即预订']
for i in feature:
    all_data[i] = all_data[i].astype(str)

In [27]:
# 标签编码对不连续的数字或者文本进行编号,转换成连续的数值型变量
for c in cols:
    le = LabelEncoder()
    le.fit(list(all_data[c].values))
    all_data[c] = le.transform(list(all_data[c].values))

In [28]:
# 编码后特征情况
all_data[cols].head()



Unnamed: 0,便利设施,床的类型,邮编,房主回复率,首次评论日期,最近评论日期,何时成为房主,民宿周边,房主身份是否验证,房主是否有个人资料图片
0,32053,4,454,80,1671,904,2135,323,2,2
1,50507,4,150,80,2093,939,222,371,2,2
2,19694,4,231,2,2457,1304,2990,238,1,2
3,67121,4,739,80,2554,1371,1427,356,2,2
4,12765,4,220,2,1300,869,829,44,2,2


In [29]:
pd.set_option('max_columns', 80)
all_data.head()

Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,房产类型,房型,所在城市,是否支持随即预订,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期
0,2135,32053,2.0,0,4,3.0,4,80,2,2,17,0,3,0,904,323,97.0,1.5,0,-118.27339,34.109039,12,454,1671
1,222,50507,1.0,2,2,1.0,4,80,2,2,0,0,4,1,939,371,87.0,1.0,1,-73.919163,40.812897,6,150,2093
2,2990,19694,0.0,2,4,2.0,4,2,2,1,0,0,4,1,1304,238,80.0,1.0,1,-73.953309,40.737643,4,231,2457
3,1427,67121,1.0,0,2,1.0,4,80,2,2,0,1,5,0,1371,356,100.0,1.0,1,-122.420558,37.759935,0,739,2554
4,829,12765,1.0,1,3,1.0,4,2,2,2,0,0,4,0,869,44,99.0,1.0,1,-73.94949,40.683363,16,220,1300


## 增加特征

In [30]:
# all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

## 倾斜特征

In [31]:
all_data.dtypes.head(10)

何时成为房主           int64
便利设施             int64
卧室数量           float64
取消条款             int64
容纳人数             int64
床的数量           float64
床的类型             int64
房主回复率            int64
房主是否有个人资料图片      int64
房主身份是否验证         int64
dtype: object

In [32]:
all_data.dtypes[all_data.dtypes != 'object'].index

Index(['何时成为房主', '便利设施', '卧室数量', '取消条款', '容纳人数', '床的数量', '床的类型', '房主回复率',
       '房主是否有个人资料图片', '房主身份是否验证', '最近评论日期', '民宿周边', '民宿评分', '洗手间数量', '清洁费',
       '经度', '维度', '评论个数', '邮编', '首次评论日期'],
      dtype='object')

In [33]:
# 计算特征偏度
numeric_feats = all_data.dtypes[all_data.dtypes != 'object'].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew':skewed_feats})
skewness

Unnamed: 0,Skew
评论个数,3.70276
洗手间数量,3.621649
床的数量,3.360728
容纳人数,2.231515
卧室数量,1.991787
房主回复率,0.513749
邮编,0.292904
便利设施,0.003907
民宿周边,-0.050597
取消条款,-0.240908


In [34]:
all_data.isnull().sum()

何时成为房主         0
便利设施           0
卧室数量           0
取消条款           0
容纳人数           0
床的数量           0
床的类型           0
房主回复率          0
房主是否有个人资料图片    0
房主身份是否验证       0
房产类型           0
房型             0
所在城市           0
是否支持随即预订       0
最近评论日期         0
民宿周边           0
民宿评分           0
洗手间数量          0
清洁费            0
经度             0
维度             0
评论个数           0
邮编             0
首次评论日期         0
dtype: int64

In [35]:
# 对偏态分布的数据进行标准化处理，使其更加服从正态分布
skewness = skewness[abs(skewness.Skew) > 0.75]
print('There are {} skewed numerical features to Box Cox transform'.format(skewness.shape[0]))
# print(skewed_features)
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    print(feat)
    all_data[feat] = boxcox1p(all_data[feat], lam)
    
# all_data[skewed_features] = np.log1p(all_data[skewed_features])

There are 12 skewed numerical features to Box Cox transform
评论个数
洗手间数量
床的数量
容纳人数
卧室数量
房主身份是否验证
清洁费
首次评论日期
最近评论日期
民宿评分
床的类型
房主是否有个人资料图片


In [36]:
all_data.isnull().sum()

何时成为房主         0
便利设施           0
卧室数量           0
取消条款           0
容纳人数           0
床的数量           0
床的类型           0
房主回复率          0
房主是否有个人资料图片    0
房主身份是否验证       0
房产类型           0
房型             0
所在城市           0
是否支持随即预订       0
最近评论日期         0
民宿周边           0
民宿评分           0
洗手间数量          0
清洁费            0
经度             0
维度             0
评论个数           0
邮编             0
首次评论日期         0
dtype: int64

In [37]:
all_data.head()


Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,房产类型,房型,所在城市,是否支持随即预订,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期
0,2135,32053,1.194318,0,1.820334,1.540963,1.820334,80,1.194318,1.194318,17,0,3,0,11.843317,323,6.594833,0.982247,0.0,-118.27339,34.109039,3.128239,454,13.628573
1,222,50507,0.730463,2,1.194318,0.730463,1.820334,80,1.194318,1.194318,0,0,4,1,11.948971,371,6.382451,0.730463,0.730463,-73.919163,40.812897,2.259674,150,14.325402
2,2990,19694,0.0,2,1.820334,1.194318,1.820334,2,1.194318,0.730463,0,0,4,1,12.887996,238,6.221214,0.730463,0.730463,-73.953309,40.737643,1.820334,231,14.836183
3,1427,67121,0.730463,0,1.194318,0.730463,1.820334,80,1.194318,1.194318,0,1,5,0,13.035403,356,6.65495,0.730463,0.730463,-122.420558,37.759935,0.0,739,14.961384
4,829,12765,0.730463,1,1.540963,0.730463,1.820334,2,1.194318,1.194318,0,0,4,0,11.73413,44,6.635082,0.730463,0.730463,-73.94949,40.683363,3.530419,220,12.878993


## 独热编码

In [38]:
# 独热编码用来解决类别型数据的离散值问题
all_data = pd.get_dummies(all_data)
all_data.head()

Unnamed: 0,何时成为房主,便利设施,卧室数量,取消条款,容纳人数,床的数量,床的类型,房主回复率,房主是否有个人资料图片,房主身份是否验证,最近评论日期,民宿周边,民宿评分,洗手间数量,清洁费,经度,维度,评论个数,邮编,首次评论日期,房产类型_0,房产类型_1,房产类型_10,房产类型_11,房产类型_12,房产类型_13,房产类型_14,房产类型_15,房产类型_16,房产类型_17,房产类型_18,房产类型_19,房产类型_2,房产类型_20,房产类型_21,房产类型_22,房产类型_23,房产类型_24,房产类型_25,房产类型_26,房产类型_27,房产类型_28,房产类型_29,房产类型_3,房产类型_30,房产类型_31,房产类型_32,房产类型_33,房产类型_34,房产类型_4,房产类型_5,房产类型_6,房产类型_7,房产类型_8,房产类型_9,房型_0,房型_1,房型_2,所在城市_0,所在城市_1,所在城市_2,所在城市_3,所在城市_4,所在城市_5,是否支持随即预订_0,是否支持随即预订_1
0,2135,32053,1.194318,0,1.820334,1.540963,1.820334,80,1.194318,1.194318,11.843317,323,6.594833,0.982247,0.0,-118.27339,34.109039,3.128239,454,13.628573,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
1,222,50507,0.730463,2,1.194318,0.730463,1.820334,80,1.194318,1.194318,11.948971,371,6.382451,0.730463,0.730463,-73.919163,40.812897,2.259674,150,14.325402,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
2,2990,19694,0.0,2,1.820334,1.194318,1.820334,2,1.194318,0.730463,12.887996,238,6.221214,0.730463,0.730463,-73.953309,40.737643,1.820334,231,14.836183,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
3,1427,67121,0.730463,0,1.194318,0.730463,1.820334,80,1.194318,1.194318,13.035403,356,6.65495,0.730463,0.730463,-122.420558,37.759935,0.0,739,14.961384,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,829,12765,0.730463,1,1.540963,0.730463,1.820334,2,1.194318,1.194318,11.73413,44,6.635082,0.730463,0.730463,-73.94949,40.683363,3.530419,220,12.878993,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0


In [39]:
all_data.isnull().sum()

何时成为房主         0
便利设施           0
卧室数量           0
取消条款           0
容纳人数           0
床的数量           0
床的类型           0
房主回复率          0
房主是否有个人资料图片    0
房主身份是否验证       0
最近评论日期         0
民宿周边           0
民宿评分           0
洗手间数量          0
清洁费            0
经度             0
维度             0
评论个数           0
邮编             0
首次评论日期         0
房产类型_0         0
房产类型_1         0
房产类型_10        0
房产类型_11        0
房产类型_12        0
房产类型_13        0
房产类型_14        0
房产类型_15        0
房产类型_16        0
房产类型_17        0
房产类型_18        0
房产类型_19        0
房产类型_2         0
房产类型_20        0
房产类型_21        0
房产类型_22        0
房产类型_23        0
房产类型_24        0
房产类型_25        0
房产类型_26        0
房产类型_27        0
房产类型_28        0
房产类型_29        0
房产类型_3         0
房产类型_30        0
房产类型_31        0
房产类型_32        0
房产类型_33        0
房产类型_34        0
房产类型_4         0
房产类型_5         0
房产类型_6         0
房产类型_7         0
房产类型_8         0
房产类型_9         0
房型_0           0
房型_1           0
房型_2           0
所在城市_0        

## 重新划分数据集

In [40]:
# 将处理好的数据重新划分成训练集和测试集，为模型训练与测试做准备
train = all_data[:ntrain]
test = all_data[ntrain:]

## 特征重要性检测

In [41]:
lasso=Lasso(alpha=0.001)
lasso.fit(train,y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [42]:
FI_lasso = pd.DataFrame({"Feature Importance":lasso.coef_}, index=train.columns)
FI_lasso.sort_values("Feature Importance",ascending=False)

Unnamed: 0,Feature Importance
房型_0,0.1162055
所在城市_5,0.1126906
容纳人数,0.07368532
洗手间数量,0.04033682
卧室数量,0.03816878
民宿评分,0.02973406
所在城市_2,0.02263767
是否支持随即预订_0,0.003115568
最近评论日期,0.002318739
房主回复率,8.136902e-05


In [43]:
FI_lasso[FI_lasso["Feature Importance"]!=0].sort_values("Feature Importance").plot(kind="barh",figsize=(15,25))
plt.xticks(rotation=90)
plt.show()

# 基础模型

## 定义交叉验证策略

In [44]:
# cross_val_score默认使用K折交叉验证策略。此处先使用KFold的shuffle参数混洗数据
# neg_mean_squared_error：负均方误差，是损失函数，优化目标是使其最小化
n_splits = 5
def nmse_cv(model):
    kf = KFold(n_splits, shuffle=True, random_state=42).get_n_splits(train.values)
    nmse = np.sqrt(-cross_val_score(model, train.values, y_train, scoring='neg_mean_squared_error', cv=kf,verbose=50))
    return(nmse)

## 建立基础模型

In [45]:
# lasso/ElasticNet模型对异常值敏感，使用RobustScaler缩放有离群值的数据
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=0.25)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.2,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.2, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.2, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

## 基础模型分数

In [46]:
models = [lasso, ENet, GBoost, model_xgb, model_lgb]
names = ['Lasso', 'ELasticNet',  'GradientBoosting', 'Xgboost', 'LGBM']
for model, name in zip(models, names):
    score = nmse_cv(model)
    # 验证结果返回5个分数，求均值和标准差
    print('{} score:{:.4f} ({:.4f}) \n'.format(name, score.mean(), score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  ................................................................
[CV] ................................... , score=-0.011, total=   0.8s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[CV]  ................................................................
[CV] ................................... , score=-0.009, total=   1.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s
[CV]  ................................................................
[CV] ................................... , score=-0.009, total=   0.8s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.6s remaining:    0.0s
[CV]  ................................................................
[CV] ................................... , score=-0.009, total=   0.9s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.5s remaining:    0.0s
[CV]  .........................

# 模型融合

## 方法一：模型平均

### 模型平均类

In [47]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.clone_models = [clone(x) for x in self.models]
        for model in self.clone_models:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.clone_models])
        return np.mean(predictions, axis=1)

### 模型平均分数

In [50]:
averaged_models = AveragingModels(models = [ENet, GBoost, KRR, lasso])
score = nmse_cv(averaged_models)
print('Averaged base models score: {:.4f} ({:.4f}) \n'.format(score.mean(), score.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  ................................................................


In [51]:
averaged_models = AveragingModels(models = [lasso, ENet, KRR, GBoost, model_xgb, model_lgb])
score = nmse_cv(averaged_models)
print('Averaged base models score: {:.4f} ({:.4f}) \n'.format(score.mean(), score.std()))

Averaged base models score: 0.0918 (0.0149) 



## 方法二：模型叠加

### 模型叠加类

In [50]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    # 将原来的模型clone出来，并且实现fit功能    
    def fit(self, X, y):
        self.clone_base_models = [list() for x in self.base_models]
        self.clone_meta_model = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        # 使用K-fold的方法来进行交叉验证，将每次验证的结果作为新的特征来进行处理
        for i, model in enumerate(self.base_models):
            for train_index, test_index in kfold.split(X, y):
                instance = clone(model)
                self.clone_base_models[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[test_index])
                out_of_fold_predictions[test_index, i] = y_pred
                
        # 将交叉验证预测出的结果(标签)和训练集中的标签值用元模型进行训练
        self.clone_meta_model.fit(out_of_fold_predictions, y)
        return self
        
    def predict(self, X):
        # 得到各模型预测结果平均值的二维数组
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.clone_base_models
        ])
        return self.clone_meta_model.predict(meta_features)

In [51]:
# 额外补充：举例理解column_stack

# a,b代表各个模型预测结果
a = np.array([1,2,3])
b = np.array([4,5,6])
# 将列排成二维数组，即将各个模型对每一个样本的预测结果分别组成一个数组，3行2列
np.column_stack((a,b))

array([[1, 4],
       [2, 5],
       [3, 6]])

In [52]:
# 按列取平均值，即各样本预测结果的平均值，得到一维数组
np.column_stack([a,b]).mean(axis=1)

array([2.5, 3.5, 4.5])

In [53]:
# 将一维数组转化为二维数组，3行1列
np.column_stack([np.column_stack([a,b]).mean(axis=1)])

array([[2.5],
       [3.5],
       [4.5]])

### 模型叠加分数

In [54]:
stacked_averaged_models = StackingAveragedModels(base_models=(ENet, GBoost, KRR), meta_model=lasso)
score = nmse_cv(stacked_averaged_models)
print('Stacking Averaged models score: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
# 结果模型叠加分数比模型平均分数更低，模型效果更好

# 模型训练与预测

## 定义评估函数

In [58]:
# 均方差：MSE的值越小，预测模型具有更好的精确度
def mse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

## 模型训练、预测、评估

In [59]:
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
# 前面用log1p函数转化使标签更加服从高斯分布，现用expm1将预测出的平滑数据进行还原
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(mse(y_train, stacked_train_pred))

In [60]:
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(mse(y_train, xgb_train_pred))

In [61]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test))
print(mse(y_train, lgb_train_pred))

In [62]:
# 基于交叉验证分数给出权重
# Xgboost score:0.1161 (0.0079) 
# LGBM score:0.1167 (0.0072) 
# Stacking Averaged models score: 0.1084 (0.0073)
print('MSE score on train data:')
print(mse(y_train, stacked_train_pred*0.70 + xgb_train_pred*0.15 + lgb_train_pred*0.15))

## 集成预测

In [63]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15
ensemble

## 生成结果文件

In [64]:
sub = pd.DataFrame()
sub['数据ID'] = test_Id
sub['价格'] = ensemble
sub.to_csv('submit.csv', index=False)