- Python Version : 3.11.9

# 상관관계

- 상관관계 분석의 가정
1. 선형성 : X와 Y의 관계가 직선적이어야 한다.
2. 등분산성 : X의 값에 관계없이 Y의 분산이 일정해야 한다.
3. 정규성 : 각 변인은 모두 정규분포를 따라야한다.
4. 독립성 : 각 샘플들은 모두 독립적이어야한다.

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as plt

from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

random_state = 42

# 데이터 셋 준비

In [2]:
df = load_diabetes(scaled=False)

print(df["DESCR"])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

# 데이터 셋 전처리

In [3]:
data = df['data']
data = pd.DataFrame(data=data, columns=df['feature_names'])

data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,59.0,2.0,32.1,101.0,157.0,93.2,38.0,4.0,4.8598,87.0
1,48.0,1.0,21.6,87.0,183.0,103.2,70.0,3.0,3.8918,69.0
2,72.0,2.0,30.5,93.0,156.0,93.6,41.0,4.0,4.6728,85.0
3,24.0,1.0,25.3,84.0,198.0,131.4,40.0,5.0,4.8903,89.0
4,50.0,1.0,23.0,101.0,192.0,125.4,52.0,4.0,4.2905,80.0


## 표준화

In [4]:
x_mean = data.mean(axis=0)
x_std = data.std(axis=0)
data = (data - x_mean) / x_std

data.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,8.037814e-18,2.17021e-16,-4.694083e-15,-1.016783e-15,-3.134747e-16,8.278948e-16,-1.28605e-16,-1.527185e-16,1.941132e-15,2.210399e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.251738,-0.9374744,-1.895781,-2.360375,-2.662394,-2.427874,-2.148448,-1.604285,-2.64804,-2.893112
25%,-0.7832846,-0.9374744,-0.7188104,-0.7697777,-0.7192046,-0.6375263,-0.7374604,-0.829361,-0.6981574,-0.6967595
50%,0.1130443,-0.9374744,-0.1529591,-0.1190789,-0.09073818,-0.08020037,-0.1382738,-0.0544375,-0.04089059,-0.02263165
75%,0.799594,1.064282,0.6562083,0.7485196,0.5955183,0.6267323,0.6155415,0.720486,0.6810788,0.5862581
max,2.32526,1.064282,3.58166,2.772916,3.232188,4.174548,3.80476,3.889923,2.805543,2.847848


In [5]:
target = df['target']

y_mean = target.mean()
y_std = target.std()

target = (target - y_mean) / y_std

## 데이터셋 분할

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=random_state)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(309, 10) (309,)
(133, 10) (133,)


# 선형회귀 모델 학습

In [10]:
lr = LinearRegression()

lr.fit(x_train, y_train)

In [11]:
print(lr.intercept_) # 상수항
print(lr.coef_) # 회귀식

-0.014612821882645191
[ 0.01809019 -0.16183484  0.33782249  0.24017895 -0.55775657  0.3133738
  0.0749198   0.17811613  0.40768074  0.02558665]


## 결과 확인

In [12]:
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

train_mse = mean_squared_error(y_train_pred, y_train)
test_mse = mean_squared_error(y_test_pred, y_test)

In [13]:
print(train_mse, test_mse)

0.4931033957499142 0.4758525721926767


In [14]:
plt.figure(figsize = (4,4))

plt.xlabel("target")
plt.ylabel("prediction")
plt.plot(y_test, y_test_pred, '.')

plt.show()

TypeError: 'module' object is not callable