In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes

In [8]:
diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=['target'])

### 1. Sklearn 라이브러리 이용

In [13]:
from sklearn.linear_model import LinearRegression

x= x[['age', 'sex', 'bmi']]

In [14]:
# y = b0 + b1*x1 + b2*x2 + b3*x3
model = LinearRegression()
model.fit(x, y)

In [21]:
model.score(x, y)

print(np.round(model.coef_[0][0], 2))
print(model.intercept_)

138.9
[152.13348416]


### 2. stastmodels.api 라이브러리 이용

In [33]:
diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=['target'])

In [34]:
import statsmodels.api as sm

x= x[['age', 'sex', 'bmi']]
y = y['target']

In [35]:
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.351
Model:,OLS,Adj. R-squared:,0.346
Method:,Least Squares,F-statistic:,78.94
Date:,"Thu, 30 Nov 2023",Prob (F-statistic):,7.77e-41
Time:,09:24:37,Log-Likelihood:,-2451.6
No. Observations:,442,AIC:,4911.0
Df Residuals:,438,BIC:,4928.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.964,51.321,0.000,146.307,157.960
age,138.9039,64.254,2.162,0.031,12.618,265.189
sex,-36.1353,63.391,-0.570,0.569,-160.724,88.453
bmi,926.9120,63.525,14.591,0.000,802.061,1051.763

0,1,2,3
Omnibus:,14.687,Durbin-Watson:,1.851
Prob(Omnibus):,0.001,Jarque-Bera (JB):,8.29
Skew:,0.15,Prob(JB):,0.0158
Kurtosis:,2.4,Cond. No.,23.7


### 로지스틱 회귀
회귀식 : P = 1 / (1+ exp(-f(x)))
- f(x) = b0 + b1*x1 + b2*x2 ...
- ln(P/1-P) = b0 + b1*x1 + b2*x2 ...
- P/1-P : odds
- ln(odds) => logis

In [45]:
import pandas as pd
import numpy as np
import seaborn as sns

df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [56]:
indepen = ['survived', 'sex', 'sibsp', 'fare']
x = df[indepen]
# x.head(3)

x['sex'] = x['sex'].map({
    'female':1,
    'male':0
})
# x.head(3)

y = x['survived']
x = x.drop('survived', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['sex'] = x['sex'].map({


In [60]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty = None)
model.fit(x, y)

In [62]:
print(model.coef_)
print(model.intercept_)

[[ 2.56677766 -0.40170461  0.01375665]]
[-1.69644546]


In [71]:
odds = model.intercept_[0] + model.coef_[0][0] * x['sex'] + model.coef_[0][1] * x['sibsp'] + model.coef_[0][2] * x['fare']
logist = np.log(odds)
# print(logist)

"""
sibsp 변수가 한 단위 증가할 때 생존할 오즈가 몇 배 증가?
"""
# 양변에 exp을 곱해주면 된다
# odds = exp(b0) + exp(b1*x1) + exp(b2*x2) + exp(b3*x3)
# 여기서 exp(b2)를 구해주면 된다.

odds_ratio = np.exp(model.coef_[0][1])
print(odds_ratio) #-- 33퍼 감소

0.6691783830064514


  result = getattr(ufunc, method)(*inputs, **kwargs)


### 상관분석

In [40]:
from scipy.stats import pearsonr

diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=['target'])

x = x['bmi']
y = y['target']

r, p_v = pearsonr(x, y)

print(round(r, 2))
print(round(p_v, 2))

#-- 검정 통계량 구하는 공식
n = len(x)
r2 = np.power(r, 2)

s_v = r * np.sqrt(n-2) / np.sqrt(1-r2)

0.59
0.0
15.18728957036531
