# 数据预处理

In [5]:
import datetime
import re
from urllib.request import quote

import folium
import imageio
import jieba
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tensorflow.keras as keras
from folium import plugins
from folium.plugins import HeatMap
from PIL import Image
from scipy import stats
from scipy.stats import norm
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from wordcloud import ImageColorGenerator, WordCloud

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
mpl.rcParams['figure.figsize'] = [20, 10]
mpl.rcParams['font.size'] = 14

In [None]:
data = pd.read_csv('lianjia/lianjia.csv', encoding='gbk')

In [None]:
len(data)

重命名

In [None]:
rename = {
    'area': '面积',
    'description': '房源描述',
    'title': '标题',
    'location': '地点',
    'house_type': '房屋类型',
    'house_code': '房源编号',
    'price': '价格',
    'tags': '房源标签',
    'lease': '租赁方式',
    'orientation': '朝向',
    'floor': '楼层',
    'elevator': '电梯',
    'stall': '车位',
    'water': '用水',
    'electricity': '用电',
    'fuel_gas': '燃气',
    'heating': '采暖',
    'facility': '配套设施',
}
data.rename(columns=rename, inplace=True)

去重

In [None]:
data.drop(index=data[data['房屋类型'] == 'house_type'].index, inplace=True)

In [None]:
data = data.drop_duplicates(subset=['房源编号'])

In [None]:
len(data)

提取面积、楼层、价格中的数值

In [None]:
def get_num(row):
    """Replace the area, floor and price fields of *row* with their first integer.

    For each of the keys '面积' (area), '楼层' (floor) and '价格' (price), a
    string value is replaced by the first run of digits it contains, parsed
    as ``int``; a string without digits becomes ``np.nan``.  Non-string
    values (for example the NaN of a missing cell, or an already numeric
    value) are left untouched instead of raising ``TypeError``.

    Args:
        row: A mapping-like record, e.g. the ``pandas.Series`` handed over by
            ``DataFrame.apply(..., axis=1)``, holding the three keys above.

    Returns:
        The same *row* object, modified in place.
    """
    for key in ('面积', '楼层', '价格'):
        value = row[key]
        # Regex functions only accept strings; skip missing/numeric cells.
        if not isinstance(value, str):
            continue
        match = re.search(r'\d+', value)
        row[key] = int(match.group()) if match else np.nan
    return row

In [None]:
data = data.apply(get_num, axis=1)

提取房型，即几室几厅几卫

In [None]:
def get_house_type(row):
    """Add room, hall and bathroom counts parsed from the house-type string.

    Reads ``row['房屋类型']`` (for example ``'3室1厅2卫'``) and stores the
    number found directly in front of '室', '厅' and '卫' under those three
    keys.  A unit that does not appear with a leading number, or a house
    type that is not a string at all, yields ``np.nan`` for that key rather
    than an ``IndexError``/``TypeError``.

    Args:
        row: A mapping-like record (e.g. a ``pandas.Series``) with the key
            '房屋类型'.

    Returns:
        The same *row* object with the keys '室', '厅' and '卫' set.
    """
    layout = row['房屋类型']
    for unit in ('室', '厅', '卫'):
        # Only strings can be searched; anything else counts as "not found".
        match = re.search(r'(\d+)' + unit, layout) if isinstance(layout, str) else None
        row[unit] = int(match.group(1)) if match else np.nan
    return row

In [None]:
data = data.apply(get_house_type, axis=1)

提取所属区

In [None]:
# The district is the part of the location before the first '-'.  The .str
# accessor propagates missing locations as NaN instead of raising.
data['区'] = data['地点'].str.split('-').str[0]

提取配套设施

In [None]:
def get_facility(row):
    """Set the indicator field of every facility listed in the row to 1.

    ``row['配套设施']`` is expected to be a comma-separated string of
    facility names; each listed name is used as a key of *row* and set to 1.
    Surrounding whitespace and empty names are ignored.  When the field is
    not a string (a missing cell), the row is returned unchanged.

    Args:
        row: A ``pandas.Series`` record that already contains one entry per
            facility name.

    Returns:
        The same *row* object, modified in place.
    """
    listed = row['配套设施']
    # Test the type rather than ``is not np.nan``: a NaN value is not
    # guaranteed to be the very same object as np.nan.
    if not isinstance(listed, str):
        return row
    facility = [name.strip() for name in listed.split(',') if name.strip()]
    if facility:
        row[facility] = 1
    return row

In [None]:
data[['洗衣机', '空调', '衣柜', '电视', '冰箱', '热水器', '床', '暖气', '宽带', '天然气']] = 0

In [None]:
data = data.apply(get_facility, axis=1)

In [None]:
data.head()

# 数据分析

## 总览

In [None]:
data.describe()

In [None]:
# The frame still holds text columns; take the median of numeric ones only.
data.median(numeric_only=True)

## 计数

所属区

In [None]:
data['区'].value_counts()

户型

In [None]:
data['房屋类型'].value_counts().head()

电梯

In [None]:
data['电梯'].value_counts()

朝向

In [None]:
data['朝向'].value_counts().head()

## 分组

统计每个区的价格均值

In [None]:
data.groupby('区')['价格'].mean().sort_values(ascending=False)

每种房间数的平均价格

In [None]:
data.groupby('室')['价格'].mean().sort_values(ascending=False)

通过透视表查看

In [None]:
grouped = data.groupby(['区', '室'], as_index= False)['价格'].mean()
grouped_pivot = grouped.pivot(index='区', columns='室', values='价格')
grouped_pivot

热力图

In [None]:
heatmap_plot=sns.heatmap(grouped_pivot)

考察各变量与价格的相关性

In [None]:
# Correlations are only defined for numeric columns, so select them first.
data.select_dtypes(include='number').corr()["价格"].sort_values(ascending=False)

查看面积与价格的相关系数

In [None]:
# True for listings where both the area and the price are present.
mask = data['面积'].notna() & data['价格'].notna()
complete = data.loc[mask, ['面积', '价格']]
pearson_coef, p_value = stats.pearsonr(complete['面积'], complete['价格'])
print("The Pearson Correlation Coefficient is", pearson_coef, "\nwith a P-value of P =", p_value)

# 数据可视化

## 简单图形

面积

In [None]:
data['面积'].hist(bins=1000)

价格

In [None]:
data['价格'].hist(bins=1000)

楼层

In [None]:
data['楼层'].hist(bins=100)

电梯

In [None]:
elevator_count = data['电梯'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

朝向

In [None]:
elevator_count = data['朝向'].value_counts().sort_values(ascending=False)[:5]
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

车位

In [None]:
elevator_count = data['车位'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

用水

In [None]:
elevator_count = data['用水'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

用电

In [None]:
elevator_count = data['用电'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

燃气

In [None]:
elevator_count = data['燃气'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

采暖

In [None]:
elevator_count = data['采暖'].value_counts()
plt.pie(elevator_count, labels=elevator_count.index, autopct='%1.1f%%', startangle=90)

定义一个画条形图的帮助函数

In [None]:
def autolabel(rects):
    """Annotate every vertical bar in *rects* with its height.

    The height is formatted without decimals and placed centred above the
    bar.  Drawing happens on the module-level ``ax`` axes.
    """
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            format(bar_height, '.0f'),
            xy=(x_center, bar_height),
            xytext=(0, 3),  # shift the text 3 points upwards
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

In [None]:
def autolabelh(rects):
    """Annotate every horizontal bar in *rects* with its width.

    The width is truncated to an integer and written at the vertical centre
    of the bar, offset 20 points along x from the bar's end.  Drawing
    happens on the module-level ``ax`` axes.
    """
    for rect in rects:
        width = int(rect.get_width())
        yloc = rect.get_y() + rect.get_height() / 2
        ax.annotate('{:.0f}'.format(width),
                    xy=(width, yloc), xytext=(20, 0),
                    textcoords="offset points",
                    horizontalalignment='center', verticalalignment='center',
                    clip_on=True)

房屋类型

In [None]:
fig, ax = plt.subplots(figsize=(20, 50))
house_type = data['房屋类型'].value_counts().sort_values()
labels = house_type.index
y = np.arange(len(labels))
region_bar = ax.barh(y, house_type)
_ = ax.set_yticks(y)
_ = ax.set_yticklabels(labels)
_ = ax.set_ylabel('房型')
_ = ax.set_title('各房型数量')
_ = autolabelh(region_bar)
plt.show()

配套设施

In [None]:
fig, ax = plt.subplots()
facility = data[['洗衣机', '空调', '衣柜', '电视', '冰箱', '热水器', '床', '暖气', '宽带', '天然气']].sum(axis=0).sort_values(ascending=False)
labels = facility.index
x = np.arange(len(labels))
region_bar = ax.bar(x, facility)
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(labels)
_ = ax.set_ylabel('房源数')
_ = ax.set_title('各配套设施数量')
_ = autolabel(region_bar)
plt.show()

各区房源数

In [None]:
fig, ax = plt.subplots()
region = data['区'].value_counts().sort_values(ascending=False)
labels = region.index
x = np.arange(len(labels))
region_bar = ax.bar(x, region)
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(labels)
_ = ax.set_ylabel('房源数')
_ = ax.set_title('各区房源数')
_ = autolabel(region_bar)
plt.show()

通过箱线图观察各区租房价格的分布

In [None]:
sns.boxplot(x="区", y="价格", data=data)

考察各区租房平均价格

In [None]:
fig, ax = plt.subplots()
region_price = data.groupby('区')['价格'].mean().sort_values(ascending=False)
# Derive the labels from this cell's own series *before* sizing the x
# positions, so the cell does not depend on a `labels` left over from an
# earlier cell.
labels = region_price.index
x = np.arange(len(labels))
region_bar = ax.bar(x, region_price)
_ = ax.set_xticks(x)
_ = ax.set_xticklabels(labels)
_ = ax.set_ylabel('租房平均价格')
_ = ax.set_title('各区租房平均价格')
_ = autolabel(region_bar)
plt.show()

进一步，考察电梯对租房价格的影响

In [None]:
sns.boxplot(x="区", y="价格", hue="电梯", data=data)

价格与面积的关系

In [None]:
sns.regplot(x='面积', y='价格', data=data)

价格与楼层的关系

先求各楼层的平均租房价格

In [None]:
floor_price = data[['楼层', '价格']].groupby('楼层', as_index=False).mean()

In [None]:
sns.regplot(x='楼层', y='价格', data=floor_price)

## 高级图形

### 词云

房源标签

In [None]:
# Drop missing tag cells with dropna() rather than an identity test against
# np.nan, then split the joined text back into one entry per tag.
tags = ','.join(data['房源标签'].dropna())
tags = pd.Series(tags.split(','))

In [None]:
tags_counts = tags.value_counts()

In [None]:
# Render a word cloud of the listing tags, sized by tag frequency.
wc = WordCloud(
    font_path='lianjia/simsun.ttf',
    background_color="white",  # background colour
    # mask=bg,   # background shape
    max_words=2000,  # maximum number of words shown in the cloud
    max_font_size=1000,  # largest font size
    random_state=42,
    width=1000, height=860, margin=2,# default image size; when a background image is used the saved image takes that image's size instead; margin is the spacing around words
)
wc.generate_from_frequencies(tags_counts)
plt.figure()
# The code below displays the image.
plt.imshow(wc)
plt.axis("off")
plt.show()

房源描述

导入停用词列表

In [None]:
# Load the general stop-word list; the context manager closes the file
# handle as soon as the lines have been read.
with open('lianjia/all_stopwords.txt', 'r', encoding='gbk') as stopword_file:
    stopwords1 = [line.strip() for line in stopword_file]
# Extra stop words added on top of the general list for the listing descriptions.
stopwords2 = [
    '房源', '亮点', '描述', '此房', '介绍', '原因', '房东', '号线', '就是', '东门', '北门', '旁边', '业主',
    '房子', '出租', '周边', '配套', '装修', '交通', '出行', '上海', '可以', '即可', '到达', '数据', '来源于',
    '一兆', '韦德', '租客', '到期', '来自', '百度', '所以', '位于', '公里', '内部', '希望', '城市', '有限公司',
    '适合', '出来', '地下', '便是', '直接', '小时', '两个', '充足', '小区', '出门', '距离', '户型', '两房',
    '一房', '比较', '高德', '地图', '百度', '中间', '提供', '直达', '同意', '里面', '十分', '门口', '欢迎',
    '二房', '正气', '一个', '满足', '各种', '多条', '一条街', '自己', '米左右', '三房', '之前', '目前', '以及',
    '左右', '本房', '不错', '等等', '需求', '现在', '上海市', '自带', '路上', '选择', '入住', '属于', '号口'
]
stopwords = stopwords1 + stopwords2

In [None]:
description = '。'.join(data['房源描述'][data['房源描述'].astype(str) != 'nan'].to_list())

In [None]:
seg_list = jieba.cut(description, cut_all=False)
seg_list = ' '.join(seg_list)

In [None]:
# Render a word cloud of the segmented listing descriptions, shaped by a
# background image that is loaded as an array and passed as the mask.
bg = np.array(Image.open("lianjia/linajia.jpg"))
wc = WordCloud(
    font_path='lianjia/simsun.ttf',
    background_color="white",  # background colour
    mask=bg,   # background shape
    max_words=2000,  # maximum number of words shown in the cloud
    max_font_size=1000,  # largest font size
    stopwords=stopwords,
    scale=14,   # scale factor; larger values give a sharper word cloud
    random_state=42,
    width=1000, height=860, margin=2,# default image size; when a background image is used the saved image takes that image's size instead; margin is the spacing around words
)
wc.generate(seg_list)
plt.figure()
# The code below displays the image.
# Build a colour generator based on the colours of the background image.
# NOTE(review): image_colors is only referenced by the commented-out recolor call below.
image_colors=ImageColorGenerator(bg)
# Draw the figure.
#plt.imshow(wc.recolor(color_func=image_colors))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
wc.to_file('lianjia/房源描述.png')

### 地图

通过百度api获取地址经纬度

调用api函数

In [None]:
def address2latlng(address, city='上海', ak='RheAGXP4aLazqrGt2AKHPwkUMCrSnkdo', timeout=10):
    """Geocode *address* through the Baidu Maps geocoding v3 web API.

    Args:
        address: Free-text address to look up.
        city: Value sent as the ``city`` query parameter.
        ak: Baidu API access key sent as the ``ak`` query parameter.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        A ``(lat, lng)`` tuple taken from ``result.location`` of the JSON reply.

    Raises:
        requests.RequestException: On connection failure, timeout or an HTTP
            error status.
        KeyError: If the JSON reply carries no ``result.location``; the
            message includes the address and whatever ``status``/``message``
            fields the reply contains.
    """
    # NOTE(review): the access key is hard-coded in source and the endpoint
    # is plain http -- consider loading the key from configuration and
    # confirming whether the https endpoint can be used.
    url = 'http://api.map.baidu.com/geocoding/v3/'
    params = {'address': address, 'output': 'json', 'ak': ak, 'city': city}
    # Let requests build and percent-encode the query string, so non-ASCII
    # addresses are encoded in exactly one place.
    req = requests.get(url, params=params, timeout=timeout)
    req.raise_for_status()
    temp = req.json()
    try:
        location = temp['result']['location']
    except KeyError:
        raise KeyError(
            'Baidu geocoding returned no location for {!r}: status={!r}, message={!r}'.format(
                address, temp.get('status'), temp.get('message')))
    return location['lat'], location['lng']

In [None]:
address2latlng('杨浦-崇业小区')

获取经纬度

In [None]:
def get_lat_lng(row):
    """Store the coordinates of the row's location on the row itself.

    Passes ``row['地点']`` to ``address2latlng`` and writes the returned
    pair to the keys '纬度' (latitude) and '经度' (longitude).

    Returns:
        The same *row* object, modified in place.
    """
    latitude, longitude = address2latlng(row['地点'])
    row['纬度'] = latitude
    row['经度'] = longitude
    return row

In [None]:
data = data.apply(get_lat_lng, axis=1)

显示房屋地址

In [None]:
sh_map = folium.Map(location=[31.24257, 121.486801])
# Instantiate a feature group for the house in the dataframe
house = folium.map.FeatureGroup()
for lat, lng, in zip(data['纬度'], data['经度']):
    _ = house.add_child(
        folium.Circle(
            [lat, lng],
            radius=2, # define how big you want the circle markers to be
            color='red',
            opacity=0.4
        )
    )
sh_map.add_child(house)

统计各区房屋总数

In [None]:
pd.Categorical(data['区'])

In [None]:
# let's start again with a clean copy of the map of Shanghai
sh_map = folium.Map(location=[31.24257, 121.486801])
# instantiate a mark cluster object for the house in the dataframe
house = plugins.MarkerCluster().add_to(sh_map)
# loop through the dataframe and add each data point to the mark cluster
for lat, lng, label, in zip(data['纬度'], data['经度'], pd.Categorical(data['区'])):
    _ = folium.Marker(
        location=[lat, lng],
        icon=None,
        popup=label,
    ).add_to(house)
# add incidents to map
sh_map.add_child(house)

以热力图的方式呈现

In [None]:
sh_map = folium.Map(location=[31.24257, 121.486801])
# Convert data format
heatdata = data[['纬度','经度']].values.tolist()
# add incidents to map
HeatMap(heatdata, radius=15).add_to(sh_map)
sh_map

# 基于机器学习的数据分析——房租预测

## 数据预处理

在开始之前，先看看房租，面积与各数值特征间的线性相关系数。

In [None]:
# Correlations are only defined for numeric columns, so select them first.
data.select_dtypes(include='number').corr()["价格"].sort_values(ascending=False)

In [None]:
# Correlations are only defined for numeric columns, so select them first.
data.select_dtypes(include='number').corr()["面积"].sort_values(ascending=False)

价格与面积线性相关系数较高，而面积则与房间数息息相关

### 提取特定价格区间的数据

In [None]:
model_data = data[(data['价格'] >= 3000) & (data['价格'] <= 12000)].copy()

### 特征提取

训练集拆分为数值型，分类型，文本型三类特征

In [None]:
num = model_data[['面积', '室', '厅', '卫', '经度', '纬度', '楼层']]
onehot = model_data[['区', '朝向', '租赁方式', '燃气', '采暖', '用水', '用电', '电梯', '车位',
                   '洗衣机', '空调', '衣柜', '电视', '冰箱', '热水器', '床', '暖气', '宽带', '天然气']]
text = model_data['房源标签']

数值变量标准化

In [None]:
scaler = StandardScaler()
data_num = scaler.fit_transform(num)
data_num.shape

In [None]:
y_data = model_data['价格'].to_numpy() / 10000
# y_data = (y_data - y_data.min()) / (y_data.max() - y_data.min())  # normalized

分类变量OneHotEncoder编码

In [None]:
# The constructor keyword that requests dense output was renamed between
# scikit-learn releases; keep the default (sparse) output and densify it
# explicitly so the cell does not depend on either spelling.
enc = OneHotEncoder()
data_onehot = enc.fit_transform(onehot).toarray()
data_onehot.shape

文本特征抽取

In [None]:
vectorizer = TfidfVectorizer()
# Turn each comma-separated tag string into a space-separated document.
# Missing cells become empty documents via fillna(), which does not rely on
# an identity comparison with np.nan.
data_text = vectorizer.fit_transform(text.fillna('').str.replace(',', ' ', regex=False))

In [None]:
# get_feature_names() is no longer available in newer scikit-learn; list the
# fitted vocabulary terms in column-index order instead.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

In [None]:
X_data = np.concatenate((data_num, data_onehot, data_text.toarray()), axis=1)

In [None]:
X_data = np.nan_to_num(X_data)

In [None]:
X_data.shape

### 拆分数据集

In [None]:
model_data['面积'].hist(bins=1000)
plt.xlim(0, 200)

为了减少抽样的不均匀，采用分层随机抽样。由于房租与面积最相关，按照面积将数据集拆分为4层。

In [None]:
model_data['area_cut'] = pd.cut(model_data['面积'], [0, 50, 100, 150, np.inf], labels=False)
model_data['area_cut'].hist()

抽样拆分数据集

In [None]:
sss = StratifiedShuffleSplit(n_splits=1, random_state=27)
# With n_splits=1 the splitter yields a single (train, test) pair of
# positional indices, stratified on the area bucket.
train_index, test_index = next(sss.split(model_data, model_data['area_cut']))
X_train, y_train = X_data[train_index], y_data[train_index]
X_test, y_test = X_data[test_index], y_data[test_index]

In [None]:
X_train.shape

In [None]:
y_train

## 模型训练

构建神经网络

In [None]:
model = Sequential()
model.add(Dense(120, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))

model.add(Dense(120, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))

model.add(Dense(1))

model.summary()

In [None]:
batch_size = 64
epochs = 50

# Configure the RMSprop optimizer (learning rate and decay).
opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)

# Compile with the optimizer instance built above.  Passing the string
# 'RMSprop' would create a fresh optimizer with default settings and
# silently ignore `opt`.
model.compile(loss='mae', optimizer=opt, metrics=['mae'])
# NOTE(review): the test split doubles as validation data here -- confirm
# that no model selection is based on these validation scores.
history = model.fit(X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_test, y_test),
            shuffle=True)

# model.save("my_model")

绘制训练 & 验证的平均绝对误差（MAE）值

In [None]:
# Plot the training and validation mean absolute error per epoch.
plt.plot(history.history['mae'])
plt.plot(history.history['val_mae'])
plt.title('Model MAE')
# The plotted series are the 'mae' metric, so the axis is labelled MAE.
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'])
plt.savefig('lianjia/平均绝对误差.jpg')
plt.show()

预测并评估结果

In [None]:
pred = model.predict(X_test)
plt.plot(y_test*10000,label='True')
plt.plot(pred*10000, label='Predict')
plt.legend()
plt.savefig('lianjia/预测效果.jpg')
plt.show() 

In [None]:
y_test[:5] * 10000
pred[:5] * 10000

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Pass the data by keyword (newer seaborn no longer accepts x/y
# positionally) and flatten the (n, 1) prediction array to one dimension.
# Both series are scaled back by 10000 to undo the target scaling.
sns.regplot(x=y_test*10000, y=pred.ravel()*10000, ax=ax)
ax.set_xlabel('真实值')
ax.set_ylabel('预测值')
fig.savefig('lianjia/预测值散点图.jpg')