## 기초통계를 이용한 상관계수 도출

ch11

### p. 376

In [2]:
train = [[25,100],[52,256],[38,152],[32,140],[25,150]]

x = [i[0] for i in train]
y = [j[1] for j in train]

In [3]:
def mean(x):
    return sum(x) / len(x)

mean(x), mean(y)

(34.4, 159.6)

In [4]:
## 원래 값 - 평균 값
def d_mean(x):
    x_mean = mean(x)
    return [i - x_mean for i in x]

d_mean(x), d_mean(y)

([-9.399999999999999,
  17.6,
  3.6000000000000014,
  -2.3999999999999986,
  -9.399999999999999],
 [-59.599999999999994,
  96.4,
  -7.599999999999994,
  -19.599999999999994,
  -9.599999999999994])

In [5]:
## zip 함수 짚고 넘어가기
zip(x,y) # 데이터 묶기

<zip at 0x26ec0df46c0>

In [6]:
## x * y 의 총합계
def dot(x,y):
    return sum([x * y for x, y in zip(x,y)])

dot(x,y)

29818

In [7]:
## 제곱의 합
def sum_of_squares(v):
    return dot(v,v)
sum_of_squares(x), sum_of_squares(y)

(6422, 140740)

In [8]:
## 분산
def variance(x):
    n = len(x)
    d = d_mean(x)
    return sum_of_squares(d) / (n-1)
variance(x)

126.3

In [9]:
## 표준편차
def standard_deviation(x):
    return variance(x) ** 0.5
standard_deviation(x)

11.23832727766904

In [10]:
## 공분산
def convariance(x,y):
    n = len(x)
    return dot(d_mean(x), d_mean(y)) / (n-1)
convariance(x,y)

591.7

In [11]:
## 상관계수
def correlation(x,y):
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)

    if stdev_x > 0 and stdev_y > 0:
        return convariance(x,y) / (stdev_x * stdev_y)
    else:
        return 0
correlation(x,y)

0.910363457817553

In [12]:
## 넘파이 함수로 기초 통계 구하기
import numpy as np
x1 = np.array(x)
x1.mean(), x1.std(), x1.var()

(np.float64(34.4),
 np.float64(10.05186549850325),
 np.float64(101.03999999999999))

In [13]:
## 넘파이 공분산, 피어슨상관관계
np.cov(x1, y), np.corrcoef(x1,y)

(array([[ 126.3,  591.7],
        [ 591.7, 3344.8]]),
 array([[1.        , 0.91036346],
        [0.91036346, 1.        ]]))

## 회귀 계수
: 공분산과 독립변수의 분산 사이의 비율
- 회귀 분석

In [14]:
## 회귀 계수 구하기
def OLS(x,y):
    beta = convariance(x,y) / variance(x)
    alpha = mean(y) - beta * mean(x)
    return [alpha, beta]

OLS(x,y)

[-1.5597783056215633, 4.684877276326208]

In [15]:
def OLS_fit(x,y):
    beta = (correlation(x,y) * standard_deviation(y)) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return [alpha, beta]
OLS_fit(x,y)

[-1.5597783056215349, 4.684877276326207]

In [16]:
## 예측값 구하기
def predict(alpha, beta, train, test):
    predictions = list()
    x = [i[0] for i in train]
    y = [j[1] for j in train]
    alpha, beta = OLS_fit(x,y)
    for i in test:
        yhat = alpha + beta * i[0]
        predictions.append(yhat)
    return predictions

train = [[25,100],[52,256],[38,152],[32,140],[25,150]]
alpha, beta = OLS_fit(x,y)

pr = predict(alpha, beta, train, train)
print(pr)

[115.56215360253366, 242.05384006334126, 176.46555819477436, 148.3562945368171, 115.56215360253366]


In [17]:
## SSE : 실제값과 예측값의 차를 나타내는 에러값의 제곱의 합
## Error Sum of Squares

def SSE(alpha, beta, train, test):
    sse = 0
    for i in test:
        error = (i[1] - (alpha + beta * i[0])) **2
        sse += error
    return sse
SSE(alpha, beta, train, train)

2291.0324623911324

In [18]:
## SST : 총 변동분
## Total Sum of Squares

def SST(alpha, beta, train, test):
    sst = 0
    x = [i[0] for i in train]
    y = [j[1] for j in train]

    for i in test:
        sum_ds = (i[1] - mean(y)) **2
        sst = sum_ds + sst
    return sst

SST(alpha, beta, train, train)

13379.2

- ssr -> sst(개념 공부하고) 이것들로 결정계수 구함

In [19]:
## 결정계수 구하기
def R_squared(alpha, beta, train, test):
    return 1.0-(SSE(alpha, beta, train, test) / SST(alpha, beta, train, test))

R_squared(alpha,beta, train, train)

0.8287616253295315

In [20]:
## statsmodels library
train = [[25,100],[52,256],[38,152],[32,140],[25,150]]
x = [i[0] for i in train]
y = [j[1] for j in train]

import statsmodels.api as sms
_X = sms.add_constant(x)
model = sms.OLS(y, _X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     14.52
Date:                Thu, 10 Apr 2025   Prob (F-statistic):             0.0318
Time:                        09:30:20   Log-Likelihood:                -22.413
No. Observations:                   5   AIC:                             48.83
Df Residuals:                       3   BIC:                             48.04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5598     44.063     -0.035      0.9

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [22]:
test = [[45,183],[40,175],[55,203],[28,152],[42,198]]
def predict(alpha, beta, train, test):
    predictions = list()
    x = [i[0] for i in train]
    y = [j[1] for j in train]

    alpha, beta = OLS_fit(x,y)

    for i in test:
        yhat = alpha + beta * i[0]
        predictions.append(yhat)
    return predictions
predict(alpha, beta, train, test)

[209.2596991290578,
 185.83531274742677,
 256.10847189231987,
 129.61678543151228,
 195.20506730007918]

In [23]:
## 예측 결과 평가하기
# 비교를 위해 실제 값을 불러옴
actual = [i[1] for i in test]

predicted  = predict(alpha, beta, train, test)
actual, predicted


([183, 175, 203, 152, 198],
 [209.2596991290578,
  185.83531274742677,
  256.10847189231987,
  129.61678543151228,
  195.20506730007918])

In [None]:
## 예측값과 실제값이 얼마나 가까운지 나타냄

from math import sqrt
def RMSE(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i] # 차이
        sum_error += (prediction_error**2) # 예측 에러값 제곱 누적
        mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)
RMSE(actual, predicted)

28.76214710565456

## 경사하강법을 이용한 회귀계수 

In [25]:
## 회귀계수 구하기
dataset = [[25,100],[52,256],[38,152],[32,140],[25,150],[45,183],[40,175],[55,203],[28,152],[42,198]]
train = dataset[:5]
test = dataset[5:]
print('학습데이터(train): {}, 테스트데이터(test): {}'.format(train, test))

학습데이터(train): [[25, 100], [52, 256], [38, 152], [32, 140], [25, 150]], 테스트데이터(test): [[45, 183], [40, 175], [55, 203], [28, 152], [42, 198]]


In [None]:
# 0으로 초기화
coef = [0.0 for i in range(len(train[0]))]
coef

[0.0, 0.0]

In [None]:
def predict(row, coef):
    yhat = coef[0] # 절편
    for i in range(len(row) - 1):
        yhat += coef[i + 1] * row[i]
    return yhat

In [28]:
## sse를 구하고 초기화하는 방법으로 회귀계수 업데이트
def confficients_sgd(trian, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))] 
    for epoch in range(n_epoch):
        sse = 0
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            sse += error **2
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row) - 1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
            return coef, sse

In [29]:
## 경사하강법으로 구한 회귀계수 값을 최소자승법으로 구한 회귀계수 값과 비교한 후 sse 값을 구함
import math
def confficients_sgd(train, l_rate, n_epoch): # 경사하강법
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, coef)
            error = row[-1] - yhat
            sum_error += (error **2)
            coef[0] = coef[0] + l_rate * error
            for i in range(len(row) - 1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
    return coef, math.sqrt(sum_error / len(train))


In [30]:
l_rate = 0.0001
n_epoch = 10
coef = confficients_sgd(train, l_rate, n_epoch)
coef

([47.6680517194948, -1725.9200733210791], 43941.02750898198)