# 산포통계

In [2]:
import numpy as np
from scipy import stats
import pandas as pd

### 분산 계산

In [3]:
# Variance of the same five values, computed three different ways.
x = [1, 2, 3, 4, 5]

# Sample variance: ddof=1 makes the divisor n - 1 (5 - 1 here).
sample_variance = np.var(x, ddof=1)
print(sample_variance)

# ndarray.var() is called without ddof, so the divisor is n (population variance).
population_variance = np.array(x).var()
print(population_variance)

# pandas Series.var() with ddof=0 also divides by n (population variance).
print(pd.Series(x).var(ddof=0))

2.5
2.0
2.0


### 표준편차 계산

In [4]:
# Standard deviation of the same five values, computed three different ways.
x = [1, 2, 3, 4, 5]

sample_std = np.std(x, ddof=1)  # ddof=1: divisor n - 1 (sample)
print(sample_std)

population_std = np.array(x).std(ddof=0)  # ddof=0: divisor n (population)
print(population_std)

# pandas Series.std() with ddof=1 divides by n - 1, matching the first result.
print(pd.Series(x).std(ddof=1))

1.5811388300841898
1.4142135623730951
1.5811388300841898


### 변동계수의 필요성

- 분산과 표준편차는 모두 값의 스케일에 크게 영향을 받으므로 상대적인 산포를 보여주는 데 부적합함.
- 변동계수 = 표준편차 / 평균 (예: 아래 x1과 x2는 표준편차가 10배 차이 나지만, 변동계수는 약 0.527로 같음)

In [5]:
# x2 holds the same values as x1 on a scale that is 10 times larger.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

# The sample standard deviation (ddof=1) scales with the data, so the two
# printed values differ by the same factor of 10.
for values in (x1, x2):
    print(np.std(values, ddof=1))

1.5811388300841898
15.811388300841896


### 스케일링 
- 둘 이상의 변수의 값을 상대적으로 비교할 때 사용

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Base values 1..5 and the same values on a 10x larger scale.
x1 = np.arange(1, 6)
x2 = x1 * 10

In [8]:
# Echo x1 so the notebook displays its contents.
x1

array([1, 2, 3, 4, 5])

In [9]:
# Echo x2 so the notebook displays its contents.
x2

array([10, 20, 30, 40, 50])

In [10]:
# Standard scaling (z-score): subtract the mean, then divide by the
# standard deviation (NumPy's default divisor n is used here).
z1 = (x1 - np.mean(x1)) / np.std(x1)
z2 = (x2 - np.mean(x2)) / np.std(x2)

# Both arrays print the same values once the scale is removed.
for scaled in (z1, z2):
    print(scaled)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [11]:
# Min-max scaling: subtract the minimum and divide by the range (max - min),
# so the smallest value maps to 0 and the largest to 1.
z1 = (x1 - np.min(x1)) / np.ptp(x1)
z2 = (x2 - np.min(x2)) / np.ptp(x2)

for scaled in (z1, z2):
    print(scaled)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


In [12]:
# Prepare the data for scaling with scikit-learn: two columns holding the
# same values, with X2 on a 10x larger scale than X1.
base = [1, 2, 3, 4, 5]
X = pd.DataFrame({
    "X1": base,
    "X2": [10 * value for value in base],
})

X

Unnamed: 0,X1,X2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


- 라이브러리 설치: `pip install scikit-learn`

In [13]:
# Apply scikit-learn's MinMaxScaler, which rescales each column to the
# 0-1 range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()  # create the scaler instance
scaler.fit(X)  # learn each column's minimum and maximum
Z = scaler.transform(X)  # the result is an ndarray, not a DataFrame
pd.DataFrame(data=Z, columns=['X1', 'X2'])

Unnamed: 0,X1,X2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


In [14]:
# StandardScaler standardizes each feature to mean 0 and standard deviation 1
# (z-scores), i.e. the same location and scale as the standard normal
# distribution. It rescales the data; it does not change the distribution's shape.
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardize every column of X; fitting and transforming are chained.
ss_scaler = StandardScaler()  # create the scaler instance
Z = ss_scaler.fit(X).transform(X)  # the result is an ndarray
pd.DataFrame(data=Z, columns=['X1', 'X2'])

Unnamed: 0,X1,X2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


In [16]:
# Standard scaling: each column is centered on its mean and divided by its
# standard deviation.
from sklearn.preprocessing import StandardScaler

ss_scaler = StandardScaler()  # create the scaler instance
ss_scaler.fit(X)  # learn each column's mean and standard deviation
S = ss_scaler.transform(X)  # the result is an ndarray
pd.DataFrame(data=S, columns=['X1', 'X2'])

Unnamed: 0,X1,X2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


### 범위와 사분위 범위 계산
- 정규분포(`np.random.normal`)로부터 무작위 샘플 만들기
- `np.random.normal(0, 1, 1000)` # [정규분포] 평균 0, 표준편차 1, 개수 1000개

In [17]:
# np.random.normal(mean, standard deviation, size=count):
# draw 1000 random samples with mean (loc) 100 and standard deviation (scale) 20.
x = np.random.normal(loc=100, scale=20, size=1000)
x

array([105.22395189, 114.43635407, 128.45046027, 142.73875595,
       108.35525814, 110.32410094, 146.2771634 , 124.58292807,
        81.3275295 , 114.96269106, 117.51996444,  88.48166085,
       111.23326811, 120.92381329,  75.11181598,  73.76885752,
        96.14128863,  90.87732666, 104.09948488, 102.49106264,
       121.46810328, 101.52673459,  98.02009849,  84.20840583,
       102.01929449, 117.74417134, 112.98948139, 140.050206  ,
       106.58200327,  73.95431472,  68.20999834,  96.18945665,
        81.55701937,  94.5996288 ,  98.2394296 ,  89.23458235,
       131.17532735, 101.84777232, 106.94143436, 116.01698783,
       139.00182651, 120.13601205, 102.2164391 , 131.94414569,
        86.63188826, 110.12445005, 117.07469616,  97.79177735,
        90.5505472 ,  98.75950066,  95.75550157,  88.70294376,
        97.90968076, 108.9251604 , 111.71452818,  90.35298812,
       126.34583534,  81.68857422,  91.1985481 , 116.13915434,
        82.16099629,  93.63759714, 122.46258138, 103.82

In [18]:
# Range ("peak to peak") of the sample, computed two equivalent ways.
value_range = np.ptp(x)
print(value_range)

# The same quantity written out explicitly as maximum minus minimum.
highest, lowest = np.max(x), np.min(x)
print(highest - lowest)

141.01802195289284
141.01802195289284


In [19]:
# Interquartile range: the third quartile (75%) minus the first quartile (25%).
upper_quartile = np.quantile(x, 0.75)
lower_quartile = np.quantile(x, 0.25)
print(upper_quartile - lower_quartile)

# scipy.stats.iqr computes the same quantity in a single call.
interquartile_range = stats.iqr(x)
print(interquartile_range)

25.930327911493038
25.930327911493038
