In [1]:
# Simple linear regression with SciPy: diabetes progression regressed on BMI.

import pandas as pd
from scipy.stats import linregress
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

# Wrap the feature matrix in a DataFrame so columns can be selected by name.
data = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
target = diabetes["target"]

# Least-squares fit of target on the single predictor "bmi".
model = linregress(x=data["bmi"], y=target)
print(model)

# Hypotheses behind the reported p-value:
#   H0 (null):        there is no linear relationship between BMI and
#                     diabetes progression.
#   H1 (alternative): there is a linear relationship between BMI and
#                     diabetes progression.


LinregressResult(slope=np.float64(949.4352603840384), intercept=np.float64(152.13348416289617), rvalue=np.float64(0.5864501344746884), pvalue=np.float64(3.466006445167547e-42), stderr=np.float64(62.515122002852664), intercept_stderr=np.float64(2.973541118790735))


In [None]:
# Estimated regression coefficient of the independent variable (the slope).
# `model` is expected to be the linregress result fitted on "bmi".
print(model.slope)

In [None]:
# Estimated regression coefficient of the constant term (the intercept).
# `model` is expected to be the linregress result fitted on "bmi".
print(model.intercept)

In [None]:
# p-value (statistical significance of the fitted slope)
print(model.pvalue)

# Coefficient of determination R^2 (explanatory power of the model).
# `rvalue` is the Pearson correlation coefficient r, not R^2, so it has to
# be squared to obtain the coefficient of determination.
print(model.rvalue ** 2)

In [11]:
# Multiple linear regression with statsmodels: tip explained by
# total_bill and size.

import pandas as pd
import statsmodels.api as sm

tips = pd.read_csv("./예제/tips.csv")

# Predictors (design matrix) and response.
X = tips[["total_bill", "size"]]
y = tips["tip"]

# sm.OLS does not add an intercept by itself, so add the constant column
# explicitly; it shows up as `const` in X and in the summary table.
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.468
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     105.9
Date:                Wed, 02 Oct 2024   Prob (F-statistic):           9.67e-34
Time:                        18:25:08   Log-Likelihood:                -347.99
No. Observations:                 244   AIC:                             702.0
Df Residuals:                     241   BIC:                             712.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6689      0.194      3.455      0.0

In [7]:
# Multiple linear regression through the statsmodels formula interface.

import pandas as pd
from statsmodels.formula.api import ols

tips = pd.read_csv("./예제/tips.csv")

# Response on the left of `~`, predictors on the right. No constant column
# has to be built by hand here: the summary reports an `Intercept` term.
formula = "tip ~ total_bill + size"

model = ols(formula, data=tips).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.468
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     105.9
Date:                Wed, 02 Oct 2024   Prob (F-statistic):           9.67e-34
Time:                        18:21:52   Log-Likelihood:                -347.99
No. Observations:                 244   AIC:                             702.0
Df Residuals:                     241   BIC:                             712.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6689      0.194      3.455      0.0

In [8]:
# Preview the first rows of the design matrix X, including its `const` column.
X.head()

Unnamed: 0,const,total_bill,size
0,1.0,16.99,2
1,1.0,10.34,3
2,1.0,21.01,3
3,1.0,23.68,2
4,1.0,24.59,4


In [9]:
# Pick the row at position 4 of X; a single-row selection comes back as a Series.
X.iloc[4]

const          1.00
total_bill    24.59
size           4.00
Name: 4, dtype: float64

In [12]:
# Predict the tip for the observation at position 4 of X.
# The row is selected with a list (`[[4]]`) so that a one-row DataFrame is
# passed: the prediction then keeps row label 4, whereas a 1-D Series input
# yields a result whose only index label is `None`.
model.predict(X.iloc[[4]])

None    3.719157
dtype: float64

In [18]:
import pandas as pd

# A single new observation, written as one record. The `const` entry plays
# the role of the intercept column of the design matrix.
new_data = pd.DataFrame(
    [{"const": 1, "total_bill": 24.59, "size": 4}]
)

# summary_frame() tabulates the predicted mean, its standard error, the
# confidence interval of the mean and the interval for a new observation.
result = model.get_prediction(new_data)
print(result.summary_frame())


       mean  mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  obs_ci_upper
0  3.719157  0.12093       3.480943       3.957371      1.708534      5.729779
