In [1]:
import pandas
import numpy

features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights',
            'number_of_reviews']
dc_listings = pandas.read_csv("E:\\Workspace\\jupyter_notebook\\notebook_idata_lesson01\\KNN\\listings.csv")
dc_listings = dc_listings[features]
print(dc_listings.shape)

dc_listings[:5]


(3723, 8)


Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,4,1.0,1.0,2.0,$160.00,1,1125,0
1,6,3.0,3.0,3.0,$350.00,2,30,65
2,1,1.0,2.0,1.0,$50.00,2,1125,1
3,2,1.0,1.0,1.0,$95.00,1,1125,0
4,4,1.0,1.0,1.0,$50.00,7,1125,0


In [9]:
# 假设我们的房子有3个房间
our_acc_value = 3
dc_listings["distance"] = numpy.abs(dc_listings.accommodates - our_acc_value)
# 查看一共有多少种距离
dc_listings.distance.value_counts().sort_index()


0      461
1     2294
2      503
3      279
4       35
5       73
6       17
7       22
8        7
9       12
10       2
11       4
12       6
13       8
Name: distance, dtype: int64

In [10]:
# 打乱顺序
dc_listings = dc_listings.sample(frac=1, random_state=0)
dc_listings = dc_listings.sort_values("distance")
dc_listings.price.head()

740     $165.00
453     $207.00
602     $130.00
2348    $138.00
3119    $125.00
Name: price, dtype: object

In [20]:
# 转换price里面的数据格式，去掉$符转换成数值型
dc_listings["price"] = dc_listings["price"].replace("\$|,", "").astype(float)

# 计算前5个的平均值
mean_price = dc_listings["price"].iloc[:5].mean()
print(mean_price)


153.0


In [21]:
# 制定训练集和测试集
dc_listings.drop("distance", axis=1)

# 75%是训练集，25%是测试集
train_df = dc_listings.copy().iloc[:2792]
test_df = dc_listings.copy().iloc[2792:]


In [31]:
# 基于单变量预测价格
def predict_price(new_listing_value, feature_column):
    temp_df = train_df
    temp_df["distance"] = numpy.abs(dc_listings[feature_column] - new_listing_value)
    temp_df = temp_df.sort_values("distance")
    knn_5 = temp_df.price.iloc[:5]
    predicated_price = knn_5.mean()
    return predicated_price


# 调用predict_price预测
test_df["predicted_price"] = test_df.accommodates.apply(predict_price, feature_column="accommodates")


In [32]:
# 计算RMSE
test_df["squared_error"] = (test_df["predicted_price"] - test_df["price"]) ** 2
mse = test_df["squared_error"].mean()
rmse = mse ** (1 / 2)
print(rmse)


229.03265260947379


In [34]:
# 不同变量各自计算RMSE
for feature in ["accommodates", "bedrooms", "bathrooms", "number_of_reviews"]:
    test_df["predicted_price"] = test_df[feature].apply(predict_price, feature_column=feature)
    test_df["squared_error"] = (test_df["predicted_price"] - test_df["price"]) ** 2
    mse = test_df["squared_error"].mean()
    rmse = mse ** (1 / 2)
    print("RMSE for the {} column: {}".format(feature, rmse))


RMSE for the accommodates column: 229.03265260947379


RMSE for the bedrooms column: 214.33995002869366


RMSE for the bathrooms column: 235.21466649435237


RMSE for the number_of_reviews column: 249.9682819836384


In [4]:
# 0-1标准化处理
from sklearn import preprocessing

features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights',
            'number_of_reviews']

dc_listings = pandas.read_csv("E:\\Workspace\\jupyter_notebook\\notebook_idata_lesson01\\KNN\\listings.csv")
dc_listings = dc_listings[features]
dc_listings['price'] = dc_listings.price.str.replace("\$|,", '').astype(float)
dc_listings = dc_listings.dropna()

# 0-1标准化处理
dc_listings[features] = preprocessing.StandardScaler().fit_transform(dc_listings[features])
normalized_listings = dc_listings
normalized_listings.head()


Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.40142,-0.249501,-0.439211,0.297386,0.081119,-0.341421,-0.016575,-0.516779
1,1.399466,2.129508,2.969551,1.141704,1.462622,-0.065047,-0.016606,1.706767
2,-1.095648,-0.249501,1.26517,-0.546933,-0.718699,-0.065047,-0.016575,-0.482571
3,-0.596625,-0.249501,-0.439211,-0.546933,-0.391501,-0.341421,-0.016575,-0.516779
4,0.40142,-0.249501,-0.439211,-0.546933,-0.718699,1.316824,-0.016575,-0.516779


In [5]:
# 分出训练集和测试集
norm_train_df = normalized_listings.copy().iloc[:2792]
norm_test_df = normalized_listings.copy().iloc[2792:]


In [6]:
# 使用scipy中的工具计算距离
from scipy.spatial import distance

listing1 = normalized_listings.iloc[0][["accommodates", "bathrooms"]]
listing2 = normalized_listings.iloc[20][["accommodates", "bathrooms"]]
# 计算距离，欧式距离
dis = distance.euclidean(listing1, listing2)
print(dis)


3.723019604017032


In [7]:
# 多变量KNN模型
def predict_price_multivariate(new_listing_value, feature_columns):
    temp_df = norm_train_df
    # 计算距离，使用训练集
    temp_df["distance"] = distance.cdist(temp_df[feature_columns], [new_listing_value[feature_columns]])
    # 按照距离排列
    temp_df = temp_df.sort_values("distance")
    # 选择最近的5个
    knn_5 = temp_df.price.iloc[:5]
    # 计算平均价格
    predicted_price = knn_5.mean()
    return predicted_price


cols = ["accommodates", "bathrooms"]
norm_test_df["predicted_price"] = norm_test_df[cols].apply(predict_price_multivariate, feature_columns=cols, axis=1)
norm_test_df["squared_error"] = (norm_test_df["predicted_price"] - norm_test_df["price"]) ** 2
mse = norm_test_df["squared_error"].mean()
rmse = mse ** (1 / 2)
print(rmse)


0.7894063922577531


In [9]:
# 使用sklearn来完成KNN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# 只计算两个特征值的时候
cols = ["accommodates", "bedrooms"]
knn = KNeighborsRegressor()
# 第一个参数就是候选特征，第二个就是标签纸，也就是要计算的值
knn.fit(norm_train_df[cols], norm_train_df["price"])
# 预测
two_features_predictions = knn.predict(norm_test_df[cols])

# 计算误差，也使用sklearn来计算

two_features_mse = mean_squared_error(norm_test_df["price"], two_features_predictions)
two_features_rmse = two_features_mse ** (1 / 2)
print(two_features_rmse)


0.8426824704818202


In [11]:
# 计算更多的特征值的时候
knn = KNeighborsRegressor()
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'minimum_nights', 'maximum_nights', 'number_of_reviews']

# 第一个参数是特征值，第二个参数是标签值，也就是要计算的值
knn.fit(norm_train_df[cols], norm_train_df["price"])
# 预测
multi_features_predictions = knn.predict(norm_test_df[cols])
# 计算误差
multi_features_mse = mean_squared_error(norm_test_df["price"], multi_features_predictions)
multi_features_rmse = multi_features_mse ** (1 / 2)
print(multi_features_rmse)


0.8243838530880285


In [14]:
# 还可以指定KNN的K值

# 和计算两个特征值的时候做对比
cols = ["accommodates", "bedrooms"]
# k取10，也就是取10个最近的值做计算，默认是5
knn = KNeighborsRegressor(n_neighbors=10)
# 第一个参数就是候选特征，第二个就是标签纸，也就是要计算的值
knn.fit(norm_train_df[cols], norm_train_df["price"])
# 预测
two_features_predictions = knn.predict(norm_test_df[cols])

# 计算误差，也使用sklearn来计算

two_features_mse = mean_squared_error(norm_test_df["price"], two_features_predictions)
two_features_rmse = two_features_mse ** (1 / 2)
print(two_features_rmse)


0.8219515227259299
