## 상관

### 데이터 열기

In [3]:
import pandas

In [4]:
# Load the cars data set from cars.csv.
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns saved into the file - confirm whether they are needed.
cars = pandas.read_csv('cars.csv')

In [5]:
# Preview the first rows to check the columns (speed, dist).
cars.head()

Unnamed: 0.1,Unnamed: 0,speed,dist
0,1,4,2
1,2,4,10
2,3,7,4
3,4,7,22
4,5,8,16


### 상관계수 확인

In [7]:
from scipy.stats import pearsonr

`pearsonr`의 결과는 (상관계수, p값) 순서이다.
p값 < .05 : 95% 신뢰구간에 0이나 반대 부호의 값이 포함되지 않는다
p값 < .01 : 99% 신뢰구간에 0이나 반대 부호의 값이 포함되지 않는다
p값 < .001 : 99.9% 신뢰구간에 0이나 반대 부호의 값이 포함되지 않는다

In [8]:
# Pearson correlation between speed and dist; the result is (coefficient, p-value).
pearsonr(cars['speed'], cars['dist']) 

(0.8068949006892105, 1.4898364962950702e-12)

In [36]:
# Coefficient of determination: the squared Pearson r between speed and dist.
# Computed from the data instead of a literal pasted from the previous output,
# so it cannot go stale if cars.csv changes. float() keeps the displayed value
# a plain Python float, as with the original literal.
float(pearsonr(cars['speed'], cars['dist'])[0]) ** 2

0.651079380758251

### 부트스트래핑을 이용한 신뢰구간 추정

In [11]:
from sklearn.utils import resample

In [21]:
# Bootstrap distribution of the Pearson correlation: resample `cars` 10,000
# times and keep the coefficient from each resample (index 0 of pearsonr's
# result; index 1 is the p-value).
cors = [
    pearsonr(sample['speed'], sample['dist'])[0]
    for sample in (resample(cars) for _ in range(10000))
]

In [18]:
import numpy

In [22]:
numpy.quantile(cors, [.025, .975])  # 95% confidence interval of the correlation coefficient (2.5% and 97.5% quantiles of cors)

array([0.69742228, 0.88492261])

In [23]:
numpy.quantile(cors, [.005, .995])  # 99% confidence interval of the correlation coefficient (0.5% and 99.5% quantiles of cors)

array([0.65068697, 0.90088391])

## 서열 상관계수

In [24]:
# Load the liar data set from liar.csv.
liar = pandas.read_csv('liar.csv')

In [25]:
# Preview the first rows to check the columns (Creativity, Position, Novice).
liar.head()

Unnamed: 0,Creativity,Position,Novice
0,53,1,0
1,36,3,1
2,31,4,0
3,43,2,0
4,30,4,1


In [26]:
from scipy.stats import spearmanr, kendalltau

창의성과 거짓말 등수 사이에 역상관(음의 상관) --> 창의성이 높을수록 등수의 숫자가 작다(순위가 높다), 즉 거짓말을 잘한다

In [27]:
# Spearman rank correlation between Creativity and Position.
spearmanr(liar['Creativity'], liar['Position'])

SpearmanrResult(correlation=-0.37321838128767815, pvalue=0.0017204168895658578)

In [28]:
# Kendall's tau between Creativity and Position.
kendalltau(liar['Creativity'], liar['Position'])

KendalltauResult(correlation=-0.3002413080651747, pvalue=0.001258802279346817)

In [29]:
# Pearson correlation on the same pair of columns; result is (coefficient, p-value).
# Presumably shown for comparison with the rank-based coefficients above.
pearsonr(liar['Creativity'], liar['Position']) 

(-0.3060314348357021, 0.01114802877289373)

## statsmodels를 이용한 선형 모형

### 설치

In [None]:
!conda install statsmodels

### 자동차 데이터로 회귀분석

In [33]:
from statsmodels.formula.api import ols

In [34]:
res = ols('dist ~ speed', data=cars).fit()  # dist (y, dependent) ~ speed (x, independent)

In [35]:
# Display the summary table of the fitted model.
res.summary()

0,1,2,3
Dep. Variable:,dist,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,89.57
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.49e-12
Time:,14:01:53,Log-Likelihood:,-206.58
No. Observations:,50,AIC:,417.2
Df Residuals:,48,BIC:,421.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.5791,6.758,-2.601,0.012,-31.168,-3.990
speed,3.9324,0.416,9.464,0.000,3.097,4.768

0,1,2,3
Omnibus:,8.975,Durbin-Watson:,1.676
Prob(Omnibus):,0.011,Jarque-Bera (JB):,8.189
Skew:,0.885,Prob(JB):,0.0167
Kurtosis:,3.893,Cond. No.,50.7


## 아동 공격성

In [37]:
# Load the child aggression data set from child.csv.
child = pandas.read_csv('child.csv')

In [38]:
# Preview the first rows to check the available columns.
child.head()

Unnamed: 0,Aggression,Television,Computer_Games,Sibling_Aggression,Diet,Parenting_Style
0,0.37416,0.172671,0.141907,-0.328216,-0.110303,-0.279034
1,0.771153,-0.032872,0.709918,0.576837,-0.02299,-1.248167
2,-0.097728,-0.07446,-0.390141,-0.217184,0.280301,-0.328063
3,0.015935,-0.004427,-0.40808,0.046223,-0.263479,-1.005119
4,-0.275385,-0.675239,-0.277778,-0.891045,0.226581,0.489478


In [41]:
# Simple regression: Aggression (dependent) on Television (independent).
res = ols(
    formula='Aggression ~ Television',
    data=child,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,17.11
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,3.98e-05
Time:,15:03:58,Log-Likelihood:,-175.93
No. Observations:,666,AIC:,355.9
Df Residuals:,664,BIC:,364.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0005,0.012,-0.041,0.967,-0.025,0.024
Television,0.1634,0.040,4.137,0.000,0.086,0.241

0,1,2,3
Omnibus:,24.471,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.038
Skew:,0.108,Prob(JB):,2.5e-13
Kurtosis:,4.43,Cond. No.,3.23


### 여러 가지 변수로 회귀분석

In [68]:
# Multiple regression: Aggression (dependent) on four predictors at once.
res = ols(
    formula='Aggression ~ Computer_Games + Sibling_Aggression + Diet + Parenting_Style ',
    data=child,
).fit()
res.summary()

0,1,2,3
Dep. Variable:,Aggression,R-squared:,0.082
Model:,OLS,Adj. R-squared:,0.076
Method:,Least Squares,F-statistic:,14.74
Date:,"Mon, 16 Sep 2019",Prob (F-statistic):,1.54e-11
Time:,16:34:15,Log-Likelihood:,-155.96
No. Observations:,666,AIC:,321.9
Df Residuals:,661,BIC:,344.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0059,0.012,-0.497,0.619,-0.029,0.017
Computer_Games,0.1434,0.037,3.891,0.000,0.071,0.216
Sibling_Aggression,0.0863,0.038,2.258,0.024,0.011,0.161
Diet,-0.1116,0.038,-2.947,0.003,-0.186,-0.037
Parenting_Style,0.0619,0.013,4.925,0.000,0.037,0.087

0,1,2,3
Omnibus:,25.206,Durbin-Watson:,1.911
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.229
Skew:,0.051,Prob(JB):,1.13e-14
Kurtosis:,4.518,Cond. No.,3.48
