In [11]:
import pandas as pd
import numpy as np

# Load the raw bike-rental records from bike.csv (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv("bike.csv")

In [2]:
# Drop the id column.
# errors='ignore' keeps this cell idempotent: running it a second time (when
# 'id' has already been removed from `data`) is a no-op instead of a KeyError.
data = data.drop(columns='id', errors='ignore')
print(data.head())

   city  hour  is_workday  weather  temp_air  temp_body  wind   y
0     0    22           1        2       3.0        0.7     0  15
1     0    10           1        1      21.0       24.9     3  48
2     0     0           1        1      25.3       27.4     0  21
3     0     7           0        1      15.7       16.2     0  11
4     1    10           1        1      21.1       25.0     2  39


In [3]:
# Keep only the Shanghai rows (city == 1), then drop the city column, which
# is constant after the filter.
data_shanghai = data.loc[data['city'] == 1].drop(columns='city')
print(data_shanghai.head())

    hour  is_workday  weather  temp_air  temp_body  wind   y
4     10           1        1      21.1       25.0     2  39
5      0           1        1      20.4       18.2     0  12
9      4           1        3      17.4       18.0     3   2
10     0           1        1      14.9       15.3     2   6
11     8           0        1      25.0       28.1     0  25


In [4]:
# Binarise the hour column: 06:00-18:00 (inclusive) becomes 1, and
# 19:00 through 05:00 the next day becomes 0.
#
# The flag is computed from the raw hour values still held in `data`
# (aligned on the row index) rather than from data_shanghai['hour'] itself.
# Overwriting the column from its own previous value is not idempotent:
# a second run would see only 0/1 values and silently set every row to 0.
# Series.between is inclusive on both ends by default, matching 6 <= x <= 18.
data_shanghai['hour'] = (
    data.loc[data_shanghai.index, 'hour'].between(6, 18).astype('int64')
)
print(data_shanghai.head())

    hour  is_workday  weather  temp_air  temp_body  wind   y
4      1           1        1      21.1       25.0     2  39
5      0           1        1      20.4       18.2     0  12
9      0           1        3      17.4       18.0     3   2
10     0           1        1      14.9       15.3     2   6
11     1           0        1      25.0       28.1     0  25


In [5]:
# The y column is the number of bikes rented, i.e. the prediction target
# (label). Extract it as a NumPy column vector and remove it from the
# feature frame.

# Selecting with a one-element list keeps the data two-dimensional, so
# to_numpy() yields an (n_rows, 1) column vector without a separate reshape.
y = data_shanghai[['y']].to_numpy()
data_shanghai = data_shanghai.drop(columns='y')
print(y)
print(data_shanghai.head())


[[39]
 [12]
 [ 2]
 ...
 [ 1]
 [ 1]
 [11]]
    hour  is_workday  weather  temp_air  temp_body  wind
4      1           1        1      21.1       25.0     2
5      0           1        1      20.4       18.2     0
9      0           1        3      17.4       18.0     3
10     0           1        1      14.9       15.3     2
11     1           0        1      25.0       28.1     0


In [6]:
# Convert the feature DataFrame into a NumPy array.
# From here on `data_shanghai` is an ndarray, no longer a DataFrame.
data_shanghai = data_shanghai.to_numpy()
print(data_shanghai)

[[ 1.   1.   1.  21.1 25.   2. ]
 [ 0.   1.   1.  20.4 18.2  0. ]
 [ 0.   1.   3.  17.4 18.   3. ]
 ...
 [ 0.   1.   3.  13.7 14.1  2. ]
 [ 0.   0.   1.  22.3 22.2  0. ]
 [ 0.   0.   1.   9.6  9.7  0. ]]


In [8]:
# Split the dataset into training and test sets at an 8:2 ratio.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data_shanghai,
    y,
    test_size=0.2,      # 20% of the rows are held out for testing
    random_state=42,    # fixed seed so the split is reproducible
)

In [9]:
# Standardise (zero mean, unit variance) the training features, training
# labels, test features and test labels.
#
# Features and labels each get their own scaler. Each scaler is fitted on the
# training split only and then applied to the matching test split, so no
# test-set statistics leak into the preprocessing. Keeping the two scalers
# separate preserves the feature statistics (needed to transform new inputs)
# and the label statistics (needed to map predictions back to bike counts via
# y_scaler.inverse_transform); reusing one object for both discards the
# feature statistics as soon as it is refitted on the labels.
from sklearn.preprocessing import StandardScaler

x_scaler = StandardScaler()
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)
y_test = y_scaler.transform(y_test)

# Backward-compatible alias: `scaler` previously ended up fitted on the labels.
scaler = y_scaler


In [15]:
# Build a linear regression model (a first-degree function of several
# variables) and train it on the training set.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [16]:
# Evaluate the trained model on the test set.
# Hint: predict(data_array) takes the test features and returns the model's
# predicted values.

y_pred = model.predict(X=x_test)

In [17]:
# Use the root mean squared error (RMSE) as the evaluation metric and print it.
# NOTE: y_test and y_pred are both in standardised label units at this point,
# so the printed RMSE is expressed in those units, not in raw bike counts.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(rmse)


0.7868598401729346
