# 波士顿房价预测案例——线性回归分析

比较对 y 和 log(y) 分别进行线性拟合的效果。

## 1、导入必要的工具包

In [1]:
import numpy as np  # 矩阵操作
import pandas as pd # SQL数据处理

from sklearn.metrics import r2_score  #评价回归预测模型的性能

## 2. 读取数据
这里读取的是已完成特征工程的数据。请先运行 2_FE_BostonHousePrice.ipynb，生成文件 FE_boston_housing.csv。

In [2]:
# Load the feature-engineered Boston housing table. The path is relative,
# so the CSV must sit in the current working directory.
data = pd.read_csv("FE_boston_housing.csv")

# Preview the first five rows to get a feel for every column (feature).
data.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,log_MEDV
0,-0.417401,0.302696,-1.291856,-0.250812,-0.139895,0.50504,-0.109432,0.121208,-0.980635,-0.667101,-1.415179,0.443535,-1.122679,0.300878,0.446452
1,-0.414992,-0.486479,-0.593329,-0.250812,-0.731821,0.269017,0.377488,0.53733,-0.865459,-0.988734,-0.516361,0.443535,-0.534772,-0.004571,0.166718
2,-0.414995,-0.486479,-0.593329,-0.250812,-0.731821,1.439934,-0.255152,0.53733,-0.865459,-0.988734,-0.516361,0.399553,-1.256937,1.662674,1.433933
3,-0.414412,-0.486479,-1.310933,-0.250812,-0.826186,1.153335,-0.798939,1.056878,-0.750284,-1.107857,-0.066953,0.419005,-1.41098,1.497222,1.33112
4,-0.410202,-0.486479,-1.310933,-0.250812,-0.826186,1.381694,-0.50039,1.056878,-0.750284,-1.107857,-0.066953,0.443535,-1.073216,1.85358,1.54801


###  数据基本信息
样本数目、特征维数；
各特征的非空样本数目和数据类型。

In [4]:
# Print a summary of the table: number of rows, per-column non-null counts
# and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490 entries, 0 to 489
Data columns (total 15 columns):
CRIM        490 non-null float64
ZN          490 non-null float64
INDUS       490 non-null float64
CHAS        490 non-null float64
NOX         490 non-null float64
RM          490 non-null float64
AGE         490 non-null float64
DIS         490 non-null float64
RAD         490 non-null float64
TAX         490 non-null float64
PTRATIO     490 non-null float64
B           490 non-null float64
LSTAT       490 non-null float64
MEDV        490 non-null float64
log_MEDV    490 non-null float64
dtypes: float64(15)
memory usage: 57.5 KB


### 数据准备

In [5]:
# Separate the input features X from the regression targets y.
# Two targets are kept side by side: the original MEDV and its
# log-transformed counterpart log_MEDV (the original note calls it log1p;
# the transform itself is not performed in this notebook).
col_y = ["MEDV", "log_MEDV"]

# Plain column selection raises KeyError when a target column is missing,
# instead of silently producing an all-NaN column.
y = data[col_y]

# Every non-target column is a feature. Reusing col_y keeps the target list
# defined in exactly one place.
X = data.drop(col_y, axis=1)

# Feature names, used later to label the fitted weight coefficients.
feat_names = X.columns

当数据量比较大时，可用train_test_split从训练集中分出一部分做校验集；
样本数目较少时，建议用交叉验证。
在线性回归中，留一交叉验证有简便计算方式。

下面将训练数据分割成训练集和测试集，只是让大家对模型的训练误差、校验集上的测试误差估计、和测试集上的测试误差做个比较。

In [6]:
# Split the data into a training part and a held-out test part.
from sklearn.model_selection import train_test_split

# test_size=0.2 puts 20% of the rows in the test split and the rest in the
# training split; random_state is fixed at 33.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33
)

# Show the dimensions of the training features.
X_train.shape

(392, 13)

## 3、确定模型类型

### 3.1 尝试缺省参数的线性回归

In [8]:
# Ordinary least-squares linear regression with scikit-learn's default
# constructor arguments.
from sklearn.linear_model import LinearRegression

# 1-2. Create the estimator and fit it on the training split. y_train has
#      two columns, so both targets are fitted in a single call.
lr = LinearRegression().fit(X_train, y_train)

# 3. Predict on both splits so training and test scores can be compared.
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)

# Tabulate the weight of every feature for each target: row 0 of coef_ is
# labelled as the original MEDV, row 1 as log MEDV. The magnitude of a
# coefficient is read here as a rough indicator of feature importance.
# Note that the table is sorted by the signed coef_org value, not by magnitude.
fs = pd.DataFrame({
    "columns": list(feat_names),
    "coef_org": list(lr.coef_[0]),
    "coef_log": list(lr.coef_[1]),
})
fs.sort_values(by="coef_org", ascending=False)

Unnamed: 0,coef_log,coef_org,columns
5,0.167208,0.326014,RM
8,0.295809,0.248638,RAD
1,0.083136,0.122286,ZN
11,0.032289,0.053644,B
3,0.026345,0.020622,CHAS
2,0.025009,-0.026802,INDUS
6,-0.01373,-0.059063,AGE
0,-0.28216,-0.100635,CRIM
4,-0.175507,-0.160062,NOX
10,-0.190168,-0.212699,PTRATIO


#### 3.1.1 模型评价

In [9]:
# Score the fitted model with r2_score on the test split and on the training
# split, once per target. Column 0 of y / of the predictions is the original
# MEDV and column 1 is log_MEDV.
#
# Each message is formatted into a single string before printing, so the cell
# runs unchanged on Python 2 and Python 3. The former `print '...', value`
# statements and the `01` index literal are syntax errors on Python 3.

# Original MEDV -- test split
print('The r2 score of LinearRegression on test with original MEDV is %s'
      % r2_score(y_test.iloc[:, 0], y_test_pred_lr[:, 0]))
# Original MEDV -- training split
print('The r2 score of LinearRegression on train with original MEDV is %s'
      % r2_score(y_train.iloc[:, 0], y_train_pred_lr[:, 0]))

# log MEDV -- test split
print('The r2 score of LinearRegression on test with log MEDV is %s'
      % r2_score(y_test.iloc[:, 1], y_test_pred_lr[:, 1]))
# log MEDV -- training split
print('The r2 score of LinearRegression on train  with log MEDV is %s'
      % r2_score(y_train.iloc[:, 1], y_train_pred_lr[:, 1]))

The r2 score of LinearRegression on test with original MEDV is 0.667072270284
The r2 score of LinearRegression on train with original MEDV is 0.790408908811
The r2 score of LinearRegression on test with log MEDV is 0.693781960956
The r2 score of LinearRegression on train  with log MEDV is 0.80898064651


对 y（价格）取 log 后，R² 分数略有提高（测试集 0.667 → 0.694，训练集 0.790 → 0.809）。需要注意：两组 R² 分别是在 MEDV 和 log_MEDV 各自的尺度上计算的，并不是在同一目标尺度上的严格对比。