# simplt linear regression

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
dataset = pd.read_csv('./data/salary_experience_year.csv')
dataset.head()

Unnamed: 0,Experience Years,Salary
0,1.1,39343
1,1.2,42774
2,1.3,46205
3,1.5,37731
4,2.0,43525


In [6]:
data = dataset.copy()
y = data['Salary']
X = data[['Experience Years']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=2529)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((28, 1), (12, 1), (28,), (12,))

In [10]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [11]:
model.fit(X_train,y_train)

In [12]:
model.intercept_

26596.961311068255

In [13]:
model.coef_

array([9405.61663234])

In [15]:
y_pred = model.predict(X_test)
y_pred

array([ 90555.15441095,  59516.61952424, 106544.70268592,  64219.42784041,
        68922.23615658, 123474.81262412,  84911.78443155,  63278.86617718,
        65159.98950364,  61397.74285071,  37883.70126987,  50111.00289191])

# multiple linear regression

In [16]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [17]:
dataset = pd.read_csv('./data/insurance.csv', sep=',', encoding='UTF-8')
df = dataset.copy()
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [19]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [20]:
from sklearn.preprocessing import label_binarize
# 性别
df['sex'] = label_binarize(df['sex'],classes=['female', 'male'])
# 是否吸烟
df['smoker'] = label_binarize(df['smoker'], classes=['yes', 'no'])
# 4种居住地区
residence = df['region'].unique()
residence

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [21]:
# print(residence)
# 当字符串非常相似时，可能会出现重复的编码，导致编码丢失。为了避免这种情况，可以将居住地的代码分别映射到独特的整数
df['region'] = df['region'].map({'southwest':0, 'southeast':1, 'northwest':2, 'northeast':3})
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,0,0,16884.924
1,18,1,33.77,1,1,1,1725.5523
2,28,1,33.0,3,1,1,4449.462
3,33,1,22.705,0,1,2,21984.47061
4,32,1,28.88,0,1,2,3866.8552


In [22]:
correlation = df.corr()
correlation

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,-0.020856,0.109272,0.042469,0.025019,-0.002127,0.299008
sex,-0.020856,1.0,0.046371,0.017163,-0.076185,-0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,-0.00375,-0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,-0.007673,-0.016569,0.067998
smoker,0.025019,-0.076185,-0.00375,-0.007673,1.0,-0.002181,-0.787251
region,-0.002127,-0.004588,-0.157566,-0.016569,-0.002181,1.0,0.006208
charges,0.299008,0.057292,0.198341,0.067998,-0.787251,0.006208,1.0


In [23]:
data = df[['smoker','age','bmi','charges']]
data

Unnamed: 0,smoker,age,bmi,charges
0,0,19,27.900,16884.92400
1,1,18,33.770,1725.55230
2,1,28,33.000,4449.46200
3,1,33,22.705,21984.47061
4,1,32,28.880,3866.85520
...,...,...,...,...
1333,1,50,30.970,10600.54830
1334,1,18,31.920,2205.98080
1335,1,18,36.850,1629.83350
1336,1,21,25.800,2007.94500


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
#作用：去均值和方差归一化。可保存训练集中的均值、方差参数，然后直接用于转换测试集数据。
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.fit_transform(X_test)

In [28]:
from sklearn.decomposition import PCA

In [29]:
pac = PCA(n_components=0.9)
x_train = pac.fit_transform(x_train)
x_test = pac.fit_transform(x_test)

In [30]:
# knn算法
from sklearn.neighbors import KNeighborsRegressor
# 逻辑回归
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# 决策树 极端随机树
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
# 支持向量机
from sklearn.svm import SVR
# 随机森林 Adaboost回归  梯度提升回归  Bagging回归
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
# 模型损失评估
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
# 交叉验证评分,效验曲线
from sklearn.model_selection import cross_val_score, validation_curve

In [34]:
model_list = [KNeighborsRegressor(), LinearRegression(), Lasso(), Ridge(), DecisionTreeRegressor(),ExtraTreeRegressor(),
              SVR(), RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(), BaggingRegressor()]
model_name = ['knn', 'line', 'lasso', 'ridge', 'tree', 'extratree', 'svm', 'random', 'adaboost', 'gbr', 'bagging']
mean_error = []
score = []
for name, model in zip(model_name, model_list, ):
    model = model
    # 训练
    model.fit(x_train, y_train)
    # 预测
    predict_y = model.predict(x_test)
    # r2评分
    score.append(r2_score(y_test,predict_y))
    # 均方误差
    mean_error.append(mean_squared_error(y_test,predict_y))

# models = pd.Series(mean_error,index=model_name,name='均方误差').round(0)
# models
result = pd.DataFrame({'r2评分': score,'均方误差': mean_error},index=model_name)
result

Unnamed: 0,r2评分,均方误差
knn,0.782276,32501990.0
line,0.673874,48684320.0
lasso,0.673845,48688690.0
ridge,0.67393,48675930.0
tree,0.495042,75380480.0
extratree,0.579551,62764840.0
svm,-0.105424,165018400.0
random,0.729324,40406700.0
adaboost,0.559478,65761460.0
gbr,0.736815,39288360.0


In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
knn = KNeighborsRegressor()
knn.fit(x_train, y_train)
y_predict = knn.predict(x_test)
r2_score(y_test,y_predict)

0.7822759315577155

In [37]:
params = {'n_neighbors': list(range(3,100))}
grid = GridSearchCV(estimator=knn,param_grid=params,cv=10)
grid.fit(x_train, y_train)

In [38]:
# 最优参数
grid.best_params_

{'n_neighbors': 18}

In [39]:
# 最优成绩  等于model.score
grid.best_score_

0.8376764061824467