# ◈ 데이터 분석 준비

## 1) Colab 환경 설정

In [None]:
# 시각화 폰트 설치
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# 트리맵
!pip install squarify

# 요인분석
!pip install pingouin factor_analyzer

# 연관분석
!pip install -U apyori mlxtend

## 2) 필수 라이브러리 로드

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from scipy import stats
from patsy import demo_data, dmatrix, dmatrices

## 3) 글로벌 환경 설정

In [2]:
#%precision 3 # np 정확도를 소숫점 3자리로 고정
%matplotlib inline
sns.set_theme()

# plt.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = (10, 5)

sns_color = sns.color_palette('pastel')
plt_line = ['-', '--', ':', '-.']

## 4) 분석 데이터 로드

In [3]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
           ['Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Ice cream', 'Eggs']]

# ◈ Association
- 계산방법 
  - 지지도 : $P(A∩B)$
  - 신뢰도 : $P(B | A)$
  - 지지도 : $P(B | A) / P(B)$, 1이면 독립, 1보다 크면 양의 상관, 적으면 음의 상관

- 알고리즘 : Apriori, FP-Growth, DHP

## 1) mlxtend 이용
- (장점) 분석 결과가 데이터프레임으로 깔끔하게 정리되어 표시된다. FP-Growth도 지원한다.
- (단점) 데이터를 아이템을 컬럼으로 하는 더미 데이터프레임으로 만들어 줘야 한다.

> 데이터 변환

In [4]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth

te = TransactionEncoder()
df = pd.DataFrame(data=te.fit_transform(dataset), columns=te.columns_).astype(int)
df

Unnamed: 0,Apple,Corn,Eggs,Ice cream,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,0,0,1,0,1,1,1,0,1
1,0,0,1,0,0,1,1,0,1
2,1,0,1,0,1,0,0,0,0
3,0,1,0,0,1,0,0,1,1
4,0,1,1,1,0,0,1,0,0


> Apriori 연관분석

In [5]:
freq = apriori(
    df, 
    min_support=0.5, 
    use_colnames=True,
    max_len = 6
    )

rules = association_rules(
    freq, 
    metric='lift', 
    min_threshold=1)

rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].\
    sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,support,confidence,lift
1,(Onion),(Eggs),0.6,1.0,1.25
0,(Eggs),(Onion),0.6,0.75,1.25


> FP Growth 연관분석

In [6]:
# FP Grwoth 적용
freq = fpgrowth(
    df,
    min_support=0.5,
    use_colnames=True,
    max_len = 6
)

rules = association_rules(
    freq, 
    metric='lift', 
    min_threshold=1)

rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].\
    sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,support,confidence,lift
1,(Onion),(Eggs),0.6,1.0,1.25
0,(Eggs),(Onion),0.6,0.75,1.25


## 2) apyori 이용
- (장점) 데이터를 변환 할 필요 없이 바로 사용할 수 있다.
- (단점) 데이터 분석 결과가 텍스트형식으로 보기 불편하다.

In [7]:
from apyori import apriori

rules = list(
    apriori(
        dataset,
        min_support = 0.5,
        min_confidence = 0.5,
        min_lift = 1, 
        min_length = 2
    )
)

for r in rules:
    print(r)

RelationRecord(items=frozenset({'Eggs'}), support=0.8, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Eggs'}), confidence=0.8, lift=1.0)])
RelationRecord(items=frozenset({'Milk'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Milk'}), confidence=0.6, lift=1.0)])
RelationRecord(items=frozenset({'Onion'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Onion'}), confidence=0.6, lift=1.0)])
RelationRecord(items=frozenset({'Yogurt'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Yogurt'}), confidence=0.6, lift=1.0)])
RelationRecord(items=frozenset({'Eggs', 'Onion'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Eggs', 'Onion'}), confidence=0.6, lift=1.0), OrderedStatistic(items_base=frozenset({'Eggs'}), items_add=frozenset({'Onion'}), confidence=0.74999999

# ◈ Clustering