**<font size=4>数据预处理</font>**

In [1]:
#import packages
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import lars_path
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.metrics import r2_score
import scipy.stats as stats

1. 加载数据集

In [2]:
data = pd.read_csv("Airbnbdataset.csv", header=0, encoding="utf-8")

In [3]:
data.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


2. 利用pd.describe()查看数据的相关统计信息（注意：只有数值型数据的相关统计信息）

In [4]:
data.describe()

Unnamed: 0,id,log_price,accommodates,bathrooms,latitude,longitude,number_of_reviews,review_scores_rating,bedrooms,beds
count,74111.0,74111.0,74111.0,73911.0,74111.0,74111.0,74111.0,57389.0,74020.0,73980.0
mean,11266620.0,4.782069,3.155146,1.235263,38.445958,-92.397525,20.900568,94.067365,1.265793,1.710868
std,6081735.0,0.717394,2.153589,0.582044,3.080167,21.705322,37.828641,7.836556,0.852143,1.254142
min,344.0,0.0,1.0,0.0,33.338905,-122.5115,0.0,20.0,0.0,0.0
25%,6261964.0,4.317488,2.0,1.0,34.127908,-118.342374,1.0,92.0,1.0,1.0
50%,12254150.0,4.70953,2.0,1.0,40.662138,-76.996965,6.0,96.0,1.0,1.0
75%,16402260.0,5.220356,4.0,1.0,40.746096,-73.95466,23.0,100.0,1.0,2.0
max,21230900.0,7.600402,16.0,8.0,42.390437,-70.985047,605.0,100.0,10.0,18.0


3. 利用pd.info()查看是否有缺失值, 以及数据的类型

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
id                        74111 non-null int64
log_price                 74111 non-null float64
property_type             74111 non-null object
room_type                 74111 non-null object
amenities                 74111 non-null object
accommodates              74111 non-null int64
bathrooms                 73911 non-null float64
bed_type                  74111 non-null object
cancellation_policy       74111 non-null object
cleaning_fee              74111 non-null bool
city                      74111 non-null object
description               74111 non-null object
first_review              58247 non-null object
host_has_profile_pic      73923 non-null object
host_identity_verified    73923 non-null object
host_response_rate        55812 non-null object
host_since                73923 non-null object
instant_bookable          74111 non-null object
last_review               582

可以发现这里有缺失值

注意：我一般使用dtypes属性查看各个字段的类型，使用pd.count()查看各个字段的数据量

In [6]:
data.dtypes

id                          int64
log_price                 float64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
review_scores_rating      float64
thumbnail_url              object
zipcode                    object
bedrooms                  float64
beds                      float64
dtype: object

In [7]:
data.count()

id                        74111
log_price                 74111
property_type             74111
room_type                 74111
amenities                 74111
accommodates              74111
bathrooms                 73911
bed_type                  74111
cancellation_policy       74111
cleaning_fee              74111
city                      74111
description               74111
first_review              58247
host_has_profile_pic      73923
host_identity_verified    73923
host_response_rate        55812
host_since                73923
instant_bookable          74111
last_review               58284
latitude                  74111
longitude                 74111
name                      74111
neighbourhood             67239
number_of_reviews         74111
review_scores_rating      57389
thumbnail_url             65895
zipcode                   73145
bedrooms                  74020
beds                      73980
dtype: int64

4. 查看特征名

In [8]:
data.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

5. 使用pd.value_counts()查看分类型数据的类别及其对应的数据量

In [9]:
data["property_type"].value_counts()

Apartment             49003
House                 16511
Condominium            2658
Townhouse              1692
Loft                   1244
Other                   607
Guesthouse              498
Bed & Breakfast         462
Bungalow                366
Villa                   179
Dorm                    142
Guest suite             123
Camper/RV                94
Timeshare                77
Cabin                    72
In-law                   71
Hostel                   70
Boutique hotel           69
Boat                     65
Serviced apartment       21
Tent                     18
Castle                   13
Vacation home            11
Yurt                      9
Hut                       8
Treehouse                 7
Chalet                    6
Earth House               4
Tipi                      3
Cave                      2
Train                     2
Island                    1
Casa particular           1
Parking Space             1
Lighthouse                1
Name: property_type,

In [10]:
len(data["property_type"].value_counts())

35

6. 删除一些无用的字段

In [11]:
data["host_has_profile_pic"].value_counts()

t    73697
f      226
Name: host_has_profile_pic, dtype: int64

In [12]:
data = data.drop(["host_has_profile_pic"], axis=1)

7. 缺失值填充

In [13]:
median = data["review_scores_rating"].median()

In [14]:
data["review_scores_rating"].fillna(median, inplace=True)

In [15]:
bathroom_median = data["bathrooms"].median()
bedroom_median = data["bedrooms"].median()
beds_median = data["beds"].median()

In [16]:
data["bathrooms"].fillna(bathroom_median, inplace=True)
data["bedrooms"].fillna(bedroom_median, inplace=True)
data["beds"].fillna(beds_median, inplace=True)

In [17]:
data.count()

id                        74111
log_price                 74111
property_type             74111
room_type                 74111
amenities                 74111
accommodates              74111
bathrooms                 74111
bed_type                  74111
cancellation_policy       74111
cleaning_fee              74111
city                      74111
description               74111
first_review              58247
host_identity_verified    73923
host_response_rate        55812
host_since                73923
instant_bookable          74111
last_review               58284
latitude                  74111
longitude                 74111
name                      74111
neighbourhood             67239
number_of_reviews         74111
review_scores_rating      74111
thumbnail_url             65895
zipcode                   73145
bedrooms                  74111
beds                      74111
dtype: int64

8. 处理类别型数据

In [18]:
data["cancellation_policy"].value_counts()

strict             32374
flexible           22545
moderate           19063
super_strict_30      112
super_strict_60       17
Name: cancellation_policy, dtype: int64

In [19]:
data["room_type"].value_counts()

Entire home/apt    41310
Private room       30638
Shared room         2163
Name: room_type, dtype: int64

In [20]:
data["bed_type"].value_counts()

Real Bed         72028
Futon              753
Pull-out Sofa      585
Airbed             477
Couch              268
Name: bed_type, dtype: int64

In [21]:
data["property_type"].value_counts()

Apartment             49003
House                 16511
Condominium            2658
Townhouse              1692
Loft                   1244
Other                   607
Guesthouse              498
Bed & Breakfast         462
Bungalow                366
Villa                   179
Dorm                    142
Guest suite             123
Camper/RV                94
Timeshare                77
Cabin                    72
In-law                   71
Hostel                   70
Boutique hotel           69
Boat                     65
Serviced apartment       21
Tent                     18
Castle                   13
Vacation home            11
Yurt                      9
Hut                       8
Treehouse                 7
Chalet                    6
Earth House               4
Tipi                      3
Cave                      2
Train                     2
Island                    1
Casa particular           1
Parking Space             1
Lighthouse                1
Name: property_type,

In [22]:
propertylist = ["Apartment", "House", "Condominium", "Townhouse", "Loft"]

In [23]:
m, n = data.shape

In [24]:
data["property_type"].iloc[0] in propertylist == False

False

In [25]:
for i in range(m):
    if data["property_type"].iloc[i] in propertylist:
        continue
    else:
        data["property_type"].iloc[i] = "others"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [26]:
data["property_type"].value_counts()

Apartment      49003
House          16511
others          3003
Condominium     2658
Townhouse       1692
Loft            1244
Name: property_type, dtype: int64

In [27]:
data.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_identity_verified', 'host_response_rate', 'host_since',
       'instant_bookable', 'last_review', 'latitude', 'longitude', 'name',
       'neighbourhood', 'number_of_reviews', 'review_scores_rating',
       'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

In [28]:
data = pd.get_dummies(data, columns=["cancellation_policy", "property_type", "room_type" , "bed_type"])

In [29]:
data.head()

Unnamed: 0,id,log_price,amenities,accommodates,bathrooms,cleaning_fee,city,description,first_review,host_identity_verified,...,property_type_Townhouse,property_type_others,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,6901257,5.010635,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,True,NYC,"Beautiful, sunlit brownstone 1-bedroom in the ...",2016-06-18,t,...,0,0,1,0,0,0,0,0,0,1
1,6304928,5.129899,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,True,NYC,Enjoy travelling during your stay in Manhattan...,2017-08-05,f,...,0,0,1,0,0,0,0,0,0,1
2,7919400,4.976734,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,True,NYC,The Oasis comes complete with a full backyard ...,2017-04-30,t,...,0,0,1,0,0,0,0,0,0,1
3,13418779,6.620073,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,True,SF,This light-filled home-away-from-home is super...,,t,...,0,0,1,0,0,0,0,0,0,1
4,3808709,4.744932,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,True,DC,"Cool, cozy, and comfortable studio located in ...",2015-05-12,t,...,0,0,1,0,0,0,0,0,0,1


In [30]:
data.columns

Index(['id', 'log_price', 'amenities', 'accommodates', 'bathrooms',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_identity_verified', 'host_response_rate', 'host_since',
       'instant_bookable', 'last_review', 'latitude', 'longitude', 'name',
       'neighbourhood', 'number_of_reviews', 'review_scores_rating',
       'thumbnail_url', 'zipcode', 'bedrooms', 'beds',
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict', 'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60', 'property_type_Apartment',
       'property_type_Condominium', 'property_type_House',
       'property_type_Loft', 'property_type_Townhouse', 'property_type_others',
       'room_type_Entire home/apt', 'room_type_Private room',
       'room_type_Shared room', 'bed_type_Airbed', 'bed_type_Couch',
       'bed_type_Futon', 'bed_type_Pull-out Sofa', 'bed_type_Real Bed'],
      dtype='object')

In [31]:
data.rename(columns={'room_type_Entire home/apt':'room_type_Entire_home/apt', 
                            'room_type_Private room':'room_type_Private_room',
                            'room_type_Shared room':'room_type_Shared_room'}, inplace = True)

In [32]:
data.rename(columns={'bed_type_Pull-out Sofa':'bed_type_Pull-out_Sofa', 
                            'bed_type_Real Bed':'bed_type_Real_Bed'}, inplace = True)

9. 删除其他字段

In [33]:
data = data.drop(["amenities","city","description","first_review","host_identity_verified","host_response_rate",
                  "host_since","instant_bookable","last_review","name", "neighbourhood","thumbnail_url","zipcode"], axis=1)

In [34]:
data.count()

id                                     74111
log_price                              74111
accommodates                           74111
bathrooms                              74111
cleaning_fee                           74111
latitude                               74111
longitude                              74111
number_of_reviews                      74111
review_scores_rating                   74111
bedrooms                               74111
beds                                   74111
cancellation_policy_flexible           74111
cancellation_policy_moderate           74111
cancellation_policy_strict             74111
cancellation_policy_super_strict_30    74111
cancellation_policy_super_strict_60    74111
property_type_Apartment                74111
property_type_Condominium              74111
property_type_House                    74111
property_type_Loft                     74111
property_type_Townhouse                74111
property_type_others                   74111
room_type_

发现数据中已经没有缺失值了

11. 划分特征和目标

In [35]:
target = data["log_price"]
features = data.drop(["log_price","id"], axis=1)

In [36]:
target_array = target.values
features_array = features.values
X_train, X_test, y_train, y_test = train_test_split(features_array, target_array, test_size=0.2, random_state=42)

**<font size=4>1. 尝试使用具有各种超参数的Support Vector Machine回归器（sklearn.svm.SVR），例如kernel =“ linear”（具有C超参数的各种值）或kernel =“ rbf”（具有C和gamma超参数的各种值） 。 现在不用担心这些超参数的含义。 最佳SVR预测器表现如何？</font>**

In [37]:
from sklearn.model_selection import GridSearchCV

#考虑到线性核只有一个超参数C，rbf核有两个超参数
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100.]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300.],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0]},
    ]

In [38]:
from sklearn.svm import SVR

In [39]:
svr_reg = SVR()

In [40]:
#grid_search = GridSearchCV(svr_reg, param_grid, cv=5, scoring="neg_mean_squared_error", verbose=2)

In [41]:
#grid_search.fit(X_train, y_train)

5折交叉验证找到的最佳score是：

In [42]:
#negative_mse = grid_search.best_score_
#rmse = np.sqrt(-negative_mse)
#rmse

结果比RandomForestRegressor差很多. Let's check the best hyperparameters found:

In [43]:
#grid_search.best_params_

线性核似乎比RBF核更好。请注意，C的值是最大测试值。当发生这种情况时，您肯定想再次使用C的较高值（删除最小值）再次启动网格搜索，因为C的较高值可能会更好。

**<font size=4>2. 使用RandomizedSearchCV替换GridSearchCV</font>**

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

In [46]:
param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

In [47]:
svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, random_state=42)

In [None]:
#rnd_search.fit(housing_prepared, housing_labels)

In [None]:
#negative_mse = rnd_search.best_score_
#rmse = np.sqrt(-negative_mse)
#rmse

In [None]:
#rnd_search.best_params_

这次搜索为RBF内核找到了一套很好的超参数。 在相同的时间内，随机搜索往往会比网格搜索找到更好的超参数。

**<font size=4>3. 尝试在准备管道中添加一个转换器，以仅选择最重要的属性。</font>**

In [49]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

注意，这个类中一定要有fit和transfrom方法，不然会报错

In [48]:
k = 5

In [50]:
#top_k_feature_indices = indices_of_top_k(feature_importances, k)
#top_k_feature_indices

NameError: name 'feature_importances' is not defined

In [51]:
#np.array(attributes)[top_k_feature_indices]

In [52]:
#sorted(zip(feature_importances, attributes), reverse=True)[:k]

In [53]:
#preparation_and_feature_selection_pipeline = Pipeline([
#    ('preparation', full_pipeline),
#    ('feature_selection', TopFeatureSelector(feature_importances, k))
#])

In [None]:
#housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

**<font size=4>4. 尝试创建执行完整数据准备和最终预测的单个管道。</font>**

In [None]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

In [None]:
prepare_select_and_predict_pipeline.fit(housing, housing_labels)

In [None]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

**<font size=4>5. 使用GridSearchCV自动探索一些准备选项</font>**

In [None]:
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)

In [None]:
grid_search_prep.best_params_