# 1. 선형 회귀

In [3]:
import pandas as pd

df = pd.read_csv("../data/rider_data.csv")
df.head()

Unnamed: 0,Distance_km,Riders_Available,Weather_Index,Rush_Hour,Time_min
0,2.934511,19,6,0,24.01786
1,6.679643,16,2,0,28.422919
2,5.257961,20,7,0,31.826773
3,4.39128,28,5,0,20.906637
4,1.514121,23,4,0,6.290465


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(df.drop("Time_min", axis=1), df["Time_min"], random_state=3333)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(750, 4)
(250, 4)
(750,)
(250,)


In [5]:
model = LinearRegression()

model.fit(X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [6]:
pred = model.predict(X_test)
pred

array([25.01843854, 10.48012383, 15.1157771 , 46.08180671, 48.60436677,
       24.1896645 ,  9.76832107, 34.74839388, 17.67706247, 53.7882989 ,
       19.4248271 , 27.64204823, 30.37279141, 26.33086772, 36.819484  ,
       31.81592172, 46.64018979, 19.3022175 , 26.36891267,  7.0525249 ,
       27.16597526, 26.2623304 , 15.36456386,  4.90563613, 25.30667892,
       33.04654936, 38.10330387, 19.92710632, 34.65607035, 42.44707552,
       38.70086148, 35.17255163, 10.24253864, 36.16048366,  5.10110111,
       27.20816263, 23.02543621, 21.30621436, 19.21278098, 24.6104503 ,
       27.87415925, 10.1608238 , 33.53006714, 18.78120065,  4.97800397,
       19.87089177, 26.93716342, 24.08262315, 29.69251372, 27.79158747,
       27.53879572, 42.00725618, 27.21316041, 28.18635775, 29.85168555,
       46.08998092, 12.41696325, 26.70518731, 12.06346009, 27.07558395,
       43.97280784, 40.57279131, 45.28475342, 26.116674  , 34.19259589,
       41.94837004, 38.27490476, 30.4284245 , 38.58996818, 29.14

In [7]:
mae = mean_absolute_error(y_test, pred)
mae

2.3817835704731167

In [8]:
model.coef_

coef_df = pd.DataFrame({
    'feature': ["Distance_km", "Riders_Available", "Weather_Index",	"Rush_Hour"],
    'coef': model.coef_
})
print(coef_df)


            feature       coef
0       Distance_km   3.967227
1  Riders_Available  -0.508047
2     Weather_Index   1.477702
3         Rush_Hour  10.224068


In [9]:
# 절편
print("절편: ", model.intercept_)

절편:  10.24753491967267


In [10]:
# 평가
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print(f"MAE : {mae}\nMSE: {mse}\nRMSE : {rmse}\nR2: {r2}")


MAE : 2.3817835704731167
MSE: 8.828727788282874
RMSE : 2.9713175172443074
R2: 0.9281633439010146


In [17]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV # 실전 : 옴튜나

r_model = Ridge() # L2 규제 : 2는 제곱
l_model = Lasso() # L1 규제 : 절대값

r_param = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}
r_grid = GridSearchCV(r_model, param_grid=r_param, cv=5)
l_grid = GridSearchCV(l_model, param_grid=r_param, cv=5)


r_grid.fit(X_train, y_train)

l_pred = r_grid.best_estimator_.predict(X_test)
l_mae = mean_absolute_error(y_test, l_pred)
l_r2 = r2_score(y_test, l_pred)

print("Ridge param(alpha)", r_grid.best_params_)
print("Ridge score(alpha)", r_grid.best_score_)
print("Ridge mae", l_mae)
print("Ridge r2", l_r2)

print("="*50)

l_grid.fit(X_train, y_train)
l_pred = l_grid.best_estimator_.predict(X_test)
l_mae = mean_absolute_error(y_test, l_pred)
l_r2 = r2_score(y_test, l_pred)

print("Ridge param(alpha)", l_grid.best_params_)
print("Ridge score(alpha)", l_grid.best_score_)
print("Ridge mae", l_mae)
print("Ridge r2", l_r2)


Ridge param(alpha) {'alpha': 0.1}
Ridge score(alpha) 0.9247421914054955
Ridge mae 2.3817692225974714
Ridge r2 0.9281689900962815
Ridge param(alpha) {'alpha': 0.001}
Ridge score(alpha) 0.9247418619524866
Ridge mae 2.3817914293491893
Ridge r2 0.928166456143585


# 2. 다항 회귀

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly_model = PolynomialFeatures(degree=2, include_bias=False)

print(X_train.shape)

X_train_poly = poly_model.fit_transform(X_train)
X_test_poly = poly_model.transform(X_test)

model = LinearRegression()

model.fit(X_train_poly, y_train)

pred = model.predict(X_test_poly)

r2 = r2_score(y_test, pred)


(750, 4)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,5.426000,24.0,6.0,0.0,29.441473,130.223993,32.555998,0.0,576.0,144.0,0.0,36.0,0.0,0.0
1,6.510509,9.0,3.0,0.0,42.386721,58.594577,19.531526,0.0,81.0,27.0,0.0,9.0,0.0,0.0
2,5.813383,9.0,0.0,0.0,33.795424,52.320448,0.000000,0.0,81.0,0.0,0.0,0.0,0.0,0.0
3,5.066147,21.0,3.0,0.0,25.665841,106.389077,15.198440,0.0,441.0,63.0,0.0,9.0,0.0,0.0
4,0.535894,17.0,8.0,0.0,0.287182,9.110194,4.287150,0.0,289.0,136.0,0.0,64.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,0.874132,19.0,2.0,0.0,0.764107,16.608507,1.748264,0.0,361.0,38.0,0.0,4.0,0.0,0.0
746,4.027525,8.0,10.0,0.0,16.220954,32.220196,40.275245,0.0,64.0,80.0,0.0,100.0,0.0,0.0
747,4.806395,14.0,10.0,0.0,23.101431,67.289528,48.063948,0.0,196.0,140.0,0.0,100.0,0.0,0.0
748,2.140840,19.0,10.0,0.0,4.583196,40.675963,21.408401,0.0,361.0,190.0,0.0,100.0,0.0,0.0


In [None]:
# a, b, c, d, a2, b2, c2, d2, ab, ac, ad, bc, bd, cd