In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 读取CSV文件
data = pd.read_csv('Train_Data.csv')

# 数据预处理
# 将性别、吸烟者和地区转换为数值型变量
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
data['smoker'] = data['smoker'].map({'no': 0, 'yes': 1})
data['region'] = data['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

# 拆分特征和目标变量
X = data.drop('charges', axis=1)
y = data['charges']

# 将数据集拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 构建线性回归模型
model = LinearRegression()

# 在训练集上训练模型
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)

# 模型评估
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('均方误差（MSE）：', mse)
print('确定系数（R^2）：', r2)


均方误差（MSE）： 32137046.701800626
确定系数（R^2）： 0.7210655737644529


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 读取CSV文件
data = pd.read_csv('train_data.csv')

# 数据预处理
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
data['smoker'] = data['smoker'].map({'no': 0, 'yes': 1})
data['region'] = data['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

# 拆分特征和目标变量
X = data.drop('charges', axis=1)
y = data['charges']

# 将数据集拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 构建SVR模型
svr_model = SVR(kernel='rbf')  # 使用高斯核函数

# 在训练集上训练SVR模型
svr_model.fit(X_train, y_train)

# 在测试集上进行预测
svr_y_pred = svr_model.predict(X_test)

# SVR模型评估
svr_mse = mean_squared_error(y_test, svr_y_pred)
svr_r2 = r2_score(y_test, svr_y_pred)

print('SVR均方误差（MSE）：', svr_mse)
print('SVR确定系数（R^2）：', svr_r2)

# 构建随机森林回归模型
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 在训练集上训练随机森林回归模型
rf_model.fit(X_train, y_train)

# 在测试集上进行预测
rf_y_pred = rf_model.predict(X_test)

# 随机森林回归模型评估
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print('随机森林均方误差（MSE）：', rf_mse)
print('随机森林确定系数（R^2）：', rf_r2)


  from numpy.core.umath_tests import inner1d


SVR均方误差（MSE）： 127140905.94532232
SVR确定系数（R^2）： -0.10352317000364253
随机森林均方误差（MSE）： 10922238.853975298
随机森林确定系数（R^2）： 0.905200112001253


In [3]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# 读取CSV文件
test_data = pd.read_csv('test_data.csv')

# 数据预处理
test_data['sex'] = test_data['sex'].map({'male': 0, 'female': 1})
test_data['smoker'] = test_data['smoker'].map({'no': 0, 'yes': 1})
test_data['region'] = test_data['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3})

# 提取特征
X_test = test_data
# 加载训练好的模型

# 在测试集上进行预测
svr_y_pred = svr_model.predict(X_test)


# 在测试集上进行预测
rf_y_pred = rf_model.predict(X_test)


In [4]:
svr_y_pred

array([9341.02860925, 9365.30806141, 9388.06223238, 9342.64558506,
       9379.08776164, 9364.16702238, 9398.8276976 , 9361.17983994,
       9354.63111257, 9390.23629724, 9358.34656117, 9348.30051736,
       9329.40094827, 9324.42429269, 9357.93090828, 9362.51173196,
       9368.21320539, 9385.40820493, 9375.29402942, 9358.62394982,
       9384.54861057, 9382.81510145, 9356.01026785, 9401.49209864,
       9417.56856146, 9421.47548257, 9371.09499496, 9381.66975006,
       9360.68898874, 9376.95243613, 9358.80048862, 9359.95898017,
       9341.00751939, 9379.67929476, 9363.49370994, 9351.41793772,
       9357.4119493 , 9344.67369034, 9343.21509538, 9336.89762912,
       9382.70514272, 9344.07686088, 9380.90933878, 9394.90342993,
       9341.26350543, 9340.54968316, 9406.47780824, 9358.63848045,
       9387.85146029, 9365.37302552, 9358.40722111, 9353.93284668,
       9388.75725224, 9359.88211898, 9390.99648789, 9355.33239951,
       9391.55910761, 9377.71146535, 9330.09074694, 9380.52897