## 1.了解数据

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
 
# 配置画图参数
plt.rcParams["font.family"] = "SimHei"
plt.rcParams['axes.unicode_minus']=False

In [5]:
df = pd.read_csv("D:\jupyter file\date\housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


## 2.数据预处理

In [None]:
df.total_bedrooms.isna().sum()
 # 这里简单使用均值对缺失值进行填充
df.total_bedrooms.fillna(df.total_bedrooms.mean(), inplace=True)

In [8]:
# 查看ocean_proximity的数据概况
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder() 
df.ocean_proximity = lb_encoder.fit_transform(df.ocean_proximity)

In [10]:
target = df.median_house_value
data = df.drop('median_house_value', axis=1)
 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1)

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# 对X_train, X_test进行MinMax变换
mm_scaler = MinMaxScaler()
X_train_mm = mm_scaler.fit_transform(X_train)
X_test_mm = mm_scaler.fit_transform(X_test)

In [12]:
# 对X_train, X_test进行Standard变换
ss_scaler = StandardScaler()
X_train_ss = ss_scaler.fit_transform(X_train) 
X_test_ss = ss_scaler.fit_transform(X_test)

## 3.模型拟合及预测

In [13]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# 使用未标准化过的数据进行模型训练
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
 
# 使用RMSE评估模型表现
from sklearn.metrics import mean_squared_error
lr_mse = mean_squared_error(y_test, lr_pred)
print(f"Linear Model RMSE: {np.sqrt(lr_mse):.3f}")

Linear Model RMSE: 70488.860


In [14]:
from sklearn.model_selection import cross_val_score
lr_scaler = LinearRegression()
# 对MinMaxScaler的效果进行交叉检验
scores = cross_val_score(lr_scaler,
                          X_train_mm,
                          y_train,
                          scoring='neg_root_mean_squared_error',
                          cv=5,
                        ) 
print(f"Linear Model(MinMax) RMSE: {scores.mean():.3f}")

Linear Model(MinMax) RMSE: -69653.004


In [15]:
# 对StandardScaler的效果进行交叉检验
scores = cross_val_score(lr_scaler,
                          X_train_ss,
                          y_train,
                          scoring='neg_root_mean_squared_error',
                          cv=5,
                        )
print(f"Linear Model(Standard Scaler) RMSE: {scores.mean():.3f}")

Linear Model(Standard Scaler) RMSE: -69653.004


In [21]:
lr_ss = LinearRegression()
# 使用未标准化过的数据进行模型训练
lr_ss.fit(X_train_ss, y_train)
lr_ss_pred = lr_ss.predict(X_test_ss)
 
lr_ss_mse = mean_squared_error(y_test, lr_ss_pred)
print(f"Linear Model(StandardScaler) RMSE: {np.sqrt(lr_ss_mse):.3f}")

Linear Model(StandardScaler) RMSE: 70497.341


## 3.3更多模型-岭回归

In [22]:
from sklearn.linear_model import Ridge, Lasso
# 岭回归和Lasso回归都要确定alpha参数，导入网格参数搜索方法，确定最优alpha参数
from sklearn.model_selection import GridSearchCV
alpha = [0.01, 0.1, 0.3, 0.5, 0.7, 1.0]

In [23]:
ridge = Ridge()
rid_model = GridSearchCV(ridge, param_grid={'alpha': alpha})
rid_model.fit(X_train, y_train)
 
rid_pred = rid_model.predict(X_test)
rid_mse = mean_squared_error(y_test, rid_pred)
print(f"Ridge Model RMSE: {np.sqrt(rid_mse):.3f}")

Ridge Model RMSE: 70488.688


## 3.3.2 Lasso回归

In [24]:
lasso = Lasso()
lasso_model = GridSearchCV(lasso, param_grid={'alpha': alpha})
lasso_model.fit(X_train, y_train)
 
lasso_pred = lasso_model.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred) 
print(f"Lasso Model RMSE: {np.sqrt(lasso_mse):.3f}")

Lasso Model RMSE: 70488.774
