# 多変量データ


In [44]:
import warnings

import numpy as np
import pandas as pd
import scipy as sp

# 表示桁数
%precision 3
warnings.filterwarnings('ignore')


In [45]:
# Load the multi-species fish sample into a DataFrame; the bare name on the
# last line makes the notebook render the frame as the cell output.
fish_data = pd.read_csv('../data/3-2-1-fish_multi.csv')
fish_data

Unnamed: 0,species,length
0,A,2
1,A,3
2,A,4
3,B,6
4,B,8
5,B,10


In [46]:
# Split the fish records by species; `group` is reused by the describe() cell.
group = fish_data.groupby('species')

# Emit each heading followed by its per-species table, one print call apiece.
print("## mean ##", group.mean(), sep="\n")
print("## max ##", group.max(), sep="\n")

## mean ##
         length
species        
A           3.0
B           8.0
## max ##
         length
species        
A             4
B            10


In [47]:
# Per-species summary statistics (count, mean, std, min, quartiles, max).
group.describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,3.0,3.0,1.0,2.0,2.5,3.0,3.5,4.0
B,3.0,8.0,2.0,6.0,7.0,8.0,9.0,10.0


In [48]:
# Load the shoe sales sample (one row per store/color pair) and display it.
shoes = pd.read_csv('../data/3-2-2-shoes.csv')
shoes

Unnamed: 0,store,color,sales
0,tokyo,blue,10
1,tokyo,red,15
2,osaka,blue,13
3,osaka,red,9


In [49]:
# Cross-tabulate total sales: one row per store, one column per color.
cross = shoes.pivot_table(
    index='store',
    columns='color',
    values='sales',
    aggfunc='sum',
)
cross

color,blue,red
store,Unnamed: 1_level_1,Unnamed: 2_level_1
osaka,13,9
tokyo,10,15


## 共分散

- 共分散の特徴
    - 共分散が0: (線形の)相関関係がない
    - 共分散が正 (0より大きい): 正の相関
    - 共分散が負 (0より小さい): 負の相関

$$
  \mathrm{Cov}(x, y) = \frac{1}{N} \sum_{i=1}^N (x_i - \mu_x)(y_i - \mu_y)
$$


In [50]:
# Compute the covariance of x and y by hand, with both the N and N-1 divisors.
cov_data = pd.read_csv('../data/3-2-3-cov.csv')
print(cov_data)

x, y = cov_data['x'], cov_data['y']
N = len(cov_data)
mu_x, mu_y = x.mean(), y.mean()

# Sum of the products of deviations from the means; shared by both estimates.
cross_deviation_total = sum((x - mu_x)*(y - mu_y))

# Population form: divide by N.
cov = cross_deviation_total/N
print(f'covariance(N) {cov}')

# Sample (unbiased) form: divide by N-1.
# `cov` is intentionally left holding this value; the correlation cell reads it.
cov = cross_deviation_total/(N-1)
print(f'covariance(N-1) {cov}')

      x   y
0  18.5  34
1  18.7  39
2  19.1  41
3  19.7  38
4  21.5  45
5  21.7  41
6  21.8  52
7  22.0  44
8  23.4  44
9  23.8  49
covariance(N) 6.9060000000000015
covariance(N-1) 7.673333333333336


In [51]:
# Variance-covariance matrix of x and y.
# Uses np.cov directly: the top-level alias sp.cov was only a deprecated
# re-export of the NumPy function and is not available in current SciPy.
# ddof=0 divides by N, ddof=1 divides by N-1.
print("######### N ##########")
print(np.cov(x, y, ddof=0))
print('######## N -1 ########')
print(np.cov(x, y, ddof=1))


######### N ##########
[[ 3.282  6.906]
 [ 6.906 25.21 ]]
######## N -1 ########
[[ 3.646  7.673]
 [ 7.673 28.011]]


## ピアソンの積率相関係数 (相関係数)

共分散を、最大値=1、最小値=-1に標準化したもの

$$
\rho_{xy} =  \frac{\mathrm{Cov}(x, y)}{\sqrt{\sigma_x^2\sigma_y^2}}
$$


- Min-Max normalization: 最小値0～最大値1にスケーリングする
- Z-score normalization: 平均0、分散1にスケーリングする
  - 標準化（Standardization）とも呼ばれる


In [52]:
# Sample variances (ddof=1, i.e. divided by N-1).
# NumPy is called directly because the sp.var / sp.sqrt / sp.corrcoef aliases
# in the top-level scipy namespace are deprecated and gone from current SciPy.
sigma_2_x = np.var(x, ddof=1)
sigma_2_y = np.var(y, ddof=1)
# Pearson correlation coefficient: covariance over the product of the
# standard deviations.
# NOTE: `cov` is the N-1 covariance left by the manual covariance cell; its
# divisor must match the ddof=1 used for the variances above.
coefficient = cov/np.sqrt(sigma_2_x * sigma_2_y)
print(coefficient)
# Cross-check against NumPy's correlation matrix (off-diagonal entries).
print(np.corrcoef(x, y))

0.7592719041137088
[[1.    0.759]
 [0.759 1.   ]]
