## 기초통계를 이용한 상관계수 도출

ch11

### p. 376

In [1]:
# Training data: each row is an [x, y] pair.
train = [[25, 100], [52, 256], [38, 152], [32, 140], [25, 150]]

# Transpose the rows so that x holds the first column and y the second.
x, y = (list(column) for column in zip(*train))

In [2]:
def mean(x):
    """Return the arithmetic mean of the values in ``x``.

    Raises:
        ZeroDivisionError: if ``x`` is empty.
    """
    total = sum(x)
    count = len(x)
    return total / count

# Display the means of the x and y samples as a tuple.
mean(x), mean(y)

(34.4, 159.6)

In [3]:
## Deviation from the mean: original value - mean value
def d_mean(x):
    """Return a list with each element of ``x`` minus the mean of ``x``.

    The result keeps the order of ``x``.
    """
    center = mean(x)
    deviations = []
    for value in x:
        deviations.append(value - center)
    return deviations

# Display the per-element deviations from the mean for x and for y.
d_mean(x), d_mean(y)

([-9.399999999999999,
  17.6,
  3.6000000000000014,
  -2.3999999999999986,
  -9.399999999999999],
 [-59.599999999999994,
  96.4,
  -7.599999999999994,
  -19.599999999999994,
  -9.599999999999994])

In [4]:
## A quick look at the zip function before moving on
zip(x,y) # pairs up the elements of x and y; the cell shows a lazy zip object

<zip at 0x1ade26fc480>

In [5]:
## Sum of the element-wise products x * y
def dot(x,y):
    """Return the sum of ``a * b`` over the paired elements of ``x`` and ``y``.

    Pairing is done with ``zip``, so extra elements of the longer input
    are ignored.
    """
    products = [a * b for a, b in zip(x, y)]
    return sum(products)

# Display the sum of products of the x and y samples.
dot(x,y)

29818

In [6]:
## Sum of squares
def sum_of_squares(v):
    """Return the sum of the squared elements of ``v``.

    Computed as the dot product of ``v`` with itself.
    """
    squared_total = dot(v, v)
    return squared_total
# Display the sum of squares of x and of y.
sum_of_squares(x), sum_of_squares(y)

(6422, 140740)

In [7]:
## Variance
def variance(x):
    """Return the variance of ``x`` using an ``n - 1`` denominator.

    The squared deviations from the mean are summed and divided by
    ``len(x) - 1``.

    Raises:
        ZeroDivisionError: if ``x`` is empty or has exactly one element.
    """
    degrees_of_freedom = len(x) - 1
    squared_deviation_total = sum_of_squares(d_mean(x))
    return squared_deviation_total / degrees_of_freedom
# Display the (n - 1)-normalized variance of x.
variance(x)

126.3

In [8]:
## Standard deviation
def standard_deviation(x):
    """Return the standard deviation of ``x``: the square root of ``variance(x)``."""
    spread_squared = variance(x)
    return spread_squared ** 0.5
# Display the standard deviation of x.
standard_deviation(x)

11.23832727766904

In [9]:
## Covariance
def convariance(x,y):
    """Return the covariance of ``x`` and ``y`` using an ``n - 1`` denominator.

    The deviations of ``x`` and ``y`` from their means are multiplied
    pairwise, summed, and divided by ``len(x) - 1``.

    The name is a misspelling of "covariance"; it is kept because other
    functions in this file call it by this name.

    Raises:
        ZeroDivisionError: if ``x`` is empty or has exactly one element.
    """
    degrees_of_freedom = len(x) - 1
    x_deviations = d_mean(x)
    y_deviations = d_mean(y)
    return dot(x_deviations, y_deviations) / degrees_of_freedom
# Display the covariance of the x and y samples.
convariance(x,y)

591.7

In [10]:
## Correlation coefficient
def correlation(x,y):
    """Return the correlation coefficient of ``x`` and ``y``.

    The covariance is divided by the product of the two standard
    deviations. When either standard deviation is not greater than zero
    the function returns ``0`` instead of dividing.
    """
    spread_x = standard_deviation(x)
    spread_y = standard_deviation(y)

    # Guard first: without positive spread on both sides there is nothing
    # to divide by.
    if not (spread_x > 0 and spread_y > 0):
        return 0
    return convariance(x, y) / (spread_x * spread_y)
# Display the correlation coefficient of the x and y samples.
correlation(x,y)

0.910363457817553

In [11]:
## Basic statistics with NumPy functions
import numpy as np
x1 = np.array(x)
# NOTE: ndarray.std() and ndarray.var() default to ddof=0 (divide by n), so
# the values shown here (var 101.04) differ from the n - 1 based variance()
# defined earlier in this notebook (126.3).
x1.mean(), x1.std(), x1.var()

(np.float64(34.4),
 np.float64(10.05186549850325),
 np.float64(101.03999999999999))

In [12]:
## NumPy covariance matrix and Pearson correlation matrix
# np.cov normalizes by n - 1 by default, so the matrix entries (126.3, 591.7)
# match the hand-written variance() and convariance() results.
np.cov(x1, y), np.corrcoef(x1,y)

(array([[ 126.3,  591.7],
        [ 591.7, 3344.8]]),
 array([[1.        , 0.91036346],
        [0.91036346, 1.        ]]))

## 회귀 계수
: 회귀 계수(기울기 β)는 x와 y의 공분산을 독립변수 x의 분산으로 나눈 비율이다 (β = cov(x, y) / var(x))
- 회귀 분석

In [13]:
## Compute the regression coefficients
def OLS(x,y):
    """Return ``[alpha, beta]`` for the line ``y = alpha + beta * x``.

    ``beta`` is the covariance of ``x`` and ``y`` divided by the variance
    of ``x``; ``alpha`` is ``mean(y) - beta * mean(x)``.

    Raises:
        ZeroDivisionError: if the variance of ``x`` is zero.
    """
    slope = convariance(x, y) / variance(x)
    intercept = mean(y) - slope * mean(x)
    return [intercept, slope]

# Display [alpha, beta] fitted from the x and y samples.
OLS(x,y)

[-1.5597783056215633, 4.684877276326208]

In [14]:
def OLS_fit(x,y):
    """Return ``[alpha, beta]`` for the line ``y = alpha + beta * x``.

    ``beta`` is computed as ``correlation(x, y) * stdev(y) / stdev(x)``;
    ``alpha`` is ``mean(y) - beta * mean(x)``.

    Raises:
        ZeroDivisionError: if the standard deviation of ``x`` is zero.
    """
    r = correlation(x, y)
    spread_y = standard_deviation(y)
    spread_x = standard_deviation(x)
    slope = (r * spread_y) / spread_x
    intercept = mean(y) - slope * mean(x)
    return [intercept, slope]
# Display [alpha, beta] obtained through the correlation-based formula.
OLS_fit(x,y)

[-1.5597783056215349, 4.684877276326207]

In [15]:
## Compute the predicted values
def predict(alpha, beta, train, test):
    """Return the regression-line prediction for every row of ``test``.

    Args:
        alpha: Intercept of the line. If ``None``, the line is fitted
            from ``train``.
        beta: Slope of the line. If ``None``, the line is fitted from
            ``train``.
        train: Rows of ``[x, y]`` pairs. Only read when ``alpha`` or
            ``beta`` is ``None``.
        test: Rows whose first element is the x value to predict for.

    Returns:
        A list of ``alpha + beta * row[0]``, one entry per row of
        ``test`` and in the same order.
    """
    # Use the coefficients the caller passed in. Refit only when they are
    # missing, so that the arguments are no longer silently overwritten.
    if alpha is None or beta is None:
        x = [row[0] for row in train]
        y = [row[1] for row in train]
        alpha, beta = OLS_fit(x, y)
    return [alpha + beta * row[0] for row in test]

# Same training pairs as the literal defined in the first cell.
train = [[25,100],[52,256],[38,152],[32,140],[25,150]]
# x and y are the column lists built in the first cell; they are not
# re-derived from the train list assigned just above.
alpha, beta = OLS_fit(x,y)

# Predict on the training rows themselves and print the result.
pr = predict(alpha, beta, train, train)
print(pr)

[25.000000000000004, 52.0, 38.0, 32.0, 25.000000000000004]


In [17]:
## SSE: sum of the squared errors between the actual and predicted values
## Error Sum of Squares

def SSE(alpha, beta, train, test):
    """Return the sum of squared residuals of ``test`` against the line.

    Args:
        alpha: Intercept of the line.
        beta: Slope of the line.
        train: Not read by this function.
        test: Rows where ``row[0]`` is the x value and ``row[1]`` the
            actual y value.

    Returns:
        The sum over ``test`` of ``(row[1] - (alpha + beta * row[0])) ** 2``;
        ``0`` when ``test`` has no rows.
    """
    total = 0
    for row in test:
        predicted = alpha + beta * row[0]
        residual = row[1] - predicted
        total += residual ** 2
    return total
# Display the SSE of the fitted line on the training rows.
SSE(alpha, beta, train, train)

2291.0324623911324

In [18]:
## SST: total variation
## Total Sum of Squares

def SST(alpha, beta, train, test):
    """Return the total sum of squares of ``test`` around the ``train`` mean.

    Args:
        alpha: Not read by this function.
        beta: Not read by this function.
        train: Rows whose second element is a y value; their mean is the
            reference point.
        test: Rows whose second element is compared against that mean.

    Returns:
        The sum over ``test`` of ``(row[1] - mean of train y values) ** 2``;
        ``0`` when ``test`` has no rows.

    Raises:
        ZeroDivisionError: if ``test`` has rows but ``train`` has none.
    """
    train_targets = [row[1] for row in train]
    rows = list(test)
    if not rows:
        # Nothing to accumulate; also avoids taking the mean of an empty
        # train when there is no row that would need it.
        return 0

    # The mean is the same for every row, so compute it once up front.
    target_mean = mean(train_targets)
    sst = 0
    for row in rows:
        sst += (row[1] - target_mean) ** 2
    return sst

# Display the SST of the training rows around their own y mean.
SST(alpha, beta, train, train)

13379.2

- SSR, SST의 개념을 공부한 뒤, 이것들로 결정계수를 구함: R² = SSR / SST = 1 - SSE / SST