In [150]:
from sklearn.datasets import load_iris

import missingno as msno
import pandas as pd
import numpy as np
import random
import seaborn as sns

# 한글 깨짐 해결
import matplotlib.pyplot as plt
import matplotlib as mpl

# 기본 세팅
mpl.rc("font", family='NanumGothic')
mpl.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.figsize"] = (12,10)
%matplotlib inline
sns.set()

In [151]:
from statsmodels.multivariate.manova import MANOVA

In [164]:
# Load the iris dataset object (features, integer targets, target names).
iris = load_iris()

# Feature table. Each feature name loses its last five characters and has its
# spaces replaced by underscores (the rendered frame shows names such as
# "sepal_length", so the removed tail is presumably a unit suffix).
feature_cols = [name[:-5].replace(' ', '_') for name in iris.feature_names]
df = pd.DataFrame(iris.data, columns=feature_cols)
df['species'] = iris.target  # integer class code per row

# Lookup table: the row index is the class code, the single column its label.
target_name = pd.DataFrame(iris.target_names, columns=['target'])

# Attach the label by matching each row's class code to the lookup index,
# then replace the numeric code column with the label column.
merged = df.merge(target_name, left_on='species', right_index=True)
dropped = merged.drop(columns='species')
iris_df = dropped.rename(columns={'target': 'species'})
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


# ANOVA
- 독립변수 1개
- 종속변수 1개 (연속형)

# MANOVA

- 여러 변인들의 효과를 동시에 분석하기에 종속변인에 대한 효과가 여러 변인들간의 선형조합으로 해석된다.
- 조건
    - 종속변수 2개 이상, 연속형
    - 독립변수 1개 이상, 범주형 (독립변수가 2개 이상이면 다원 MANOVA)

- 귀무가설: 모든 집단의 평균(벡터)에 차이가 없다.
- 대립가설: 적어도 한 집단의 평균(벡터)에 차이가 있다. (p-value < 0.05 이면 귀무가설을 기각)

In [189]:
# Read the heartopt CSV from the notebook-relative statistics folder and
# display the resulting frame.
heartopt_path = './statistics/heartopt.csv'
heartopt = pd.read_csv(heartopt_path)
heartopt

Unnamed: 0,age,gender,clotsolv,proc,los,cost,lnlos,lncost
0,63,1,2,1,4,27.94,1.386294,3.330059
1,67,0,2,1,4,30.93,1.386294,3.431727
2,74,0,1,2,7,44.38,1.945910,3.792789
3,69,0,3,1,4,36.31,1.386294,3.592093
4,54,1,3,2,5,40.48,1.609438,3.700808
...,...,...,...,...,...,...,...,...
1476,63,0,2,2,6,42.16,1.791759,3.741472
1477,68,1,3,2,6,40.33,1.791759,3.697096
1478,76,1,3,1,5,41.65,1.609438,3.729301
1479,65,1,3,1,4,27.84,1.386294,3.326474


In [190]:
# Print the column names, non-null counts and dtypes of the loaded frame.
heartopt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1481 entries, 0 to 1480
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1481 non-null   int64  
 1   gender    1481 non-null   int64  
 2   clotsolv  1481 non-null   int64  
 3   proc      1481 non-null   int64  
 4   los       1481 non-null   int64  
 5   cost      1481 non-null   float64
 6   lnlos     1481 non-null   float64
 7   lncost    1481 non-null   float64
dtypes: float64(3), int64(5)
memory usage: 92.7 KB


In [191]:
# Cast the two coded columns from integers to strings
# (presumably so the model formula treats them as categorical factors -- TODO confirm).
#   clotsolv: type of clot (per the original note)
#   proc:     surgical procedure (per the original note)
for factor_col in ('clotsolv', 'proc'):
    heartopt[factor_col] = heartopt[factor_col].astype('str')

# Re-check the dtypes after the conversion.
heartopt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1481 entries, 0 to 1480
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1481 non-null   int64  
 1   gender    1481 non-null   int64  
 2   clotsolv  1481 non-null   object 
 3   proc      1481 non-null   object 
 4   los       1481 non-null   int64  
 5   cost      1481 non-null   float64
 6   lnlos     1481 non-null   float64
 7   lncost    1481 non-null   float64
dtypes: float64(3), int64(3), object(2)
memory usage: 92.7+ KB


In [198]:
# Formula layout: dependent variables ~ independent variables
#   dependent variables:   los + cost
#   independent variables: clotsolv + proc
manova_formula = 'los + cost ~ clotsolv + proc'
maov = MANOVA.from_formula(manova_formula, data=heartopt)

# Run the multivariate tests and print the summary tables.
mv_result = maov.mv_test()
print(mv_result)

                   Multivariate linear model
                                                                
----------------------------------------------------------------
       Intercept        Value  Num DF   Den DF   F Value  Pr > F
----------------------------------------------------------------
          Wilks' lambda 0.2165 2.0000 1476.0000 2670.4628 0.0000
         Pillai's trace 0.7835 2.0000 1476.0000 2670.4628 0.0000
 Hotelling-Lawley trace 3.6185 2.0000 1476.0000 2670.4628 0.0000
    Roy's greatest root 3.6185 2.0000 1476.0000 2670.4628 0.0000
----------------------------------------------------------------
                                                                
----------------------------------------------------------------
          clotsolv        Value  Num DF   Den DF  F Value Pr > F
----------------------------------------------------------------
            Wilks' lambda 0.9745 4.0000 2952.0000  9.6110 0.0000
           Pillai's trace 0.0255 4.0000 2954.

MANOVA 해석
- 모든 독립변수(clotsolv, proc)의 p-value < 0.05 이므로 귀무가설을 기각한다. 즉, 각 독립변수의 수준에 따라 종속변수(los, cost)의 평균에 유의한 차이가 있다.
- 