## 연관규칙

#### mlxtend 설치

In [4]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 19.9 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.22.0


#### 패키지 불러오기

In [5]:
# 데이터 구성: Series, DataFrame
import pandas as pd
# 행렬 연산
import numpy as np
# 데이터 전처리 : 항목 값에 대한 index 생성
from mlxtend.preprocessing import TransactionEncoder
# 지지도 계산
from mlxtend.frequent_patterns import apriori
# 연관 규칙
from mlxtend.frequent_patterns import association_rules

#### 한글 폰트 및 옵션

In [6]:
# Apply a Korean-capable font (NanumGothic) to matplotlib graphics.
import matplotlib
matplotlib.rcParams["font.family"] = "NanumGothic"

### 데이터 구성하기

In [8]:
# Move into the data folder so that relative file paths resolve against it.
import os
data_dir = "/home/piai/문서/2. Big Data 분석"
os.chdir(data_dir)

In [9]:
# Load the purchase records (EUC-KR encoded CSV) and preview the first 10 rows.
df_raw = pd.read_csv(filepath_or_buffer="./상품구매.csv", encoding="euc-kr")
df_raw.iloc[:10]

Unnamed: 0,ID,PRODUCT
0,C-11,BAGUETTE
1,C-11,HERRING
2,C-11,AVOCADO
3,C-11,ARTICHOKE
4,C-11,HEINEKEN
5,C-11,APPLES
6,C-11,CORNED BEEF
7,C-12,HERRING
8,C-12,CORNED BEEF
9,C-12,APPLES


#### 데이터 전처리

In [10]:
# Build one alphabetically sorted product list ("basket") per customer ID.
# A single groupby pass replaces re-filtering the whole frame once per ID;
# sort=True yields the groups in ascending ID order, so ID and
# list_association stay aligned position by position.
ID = []
list_association = []
for customer_id, products in df_raw.groupby("ID", sort=True)["PRODUCT"]:
    ID.append(customer_id)
    list_association.append(sorted(products))

# Print each customer's basket on its own line
for basket in list_association:
    print(basket)

['APPLES', 'ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['APPLES', 'CORNED BEEF', 'HEINEKEN', 'HERRING', 'OLIVES', 'SARDINES', 'STEAK']
['APPLES', 'AVOCADO', 'BAGUETTE', 'ICE CREAM', 'PEPPERS', 'SARDINES', 'STEAK']
['APPLES', 'COKE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'BOURBON', 'COKE', 'HAM', 'ICE CREAM', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'COKE', 'HEINEKEN', 'HERRING', 'TURKEY']
['APPLES', 'CHICKEN', 'COKE', 'CORNED BEEF', 'HEINEKEN', 'ICE CREAM', 'SARDINES']
['BAGUETTE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'PEPPERS', 'SODA']
['BOURBON', 'CRACKERS', 'HEINEKEN', 'HERRING', 'OLIVES', 'SODA', 'STEAK']
['APPLES', 'BAGUETTE', 'CORNED BEEF', 'HAM', 'HERRING', 'OLIVES', 'TURKEY']
['ARTICHOKE', 'AVOCADO', 'BAGUETTE', 'BOURBON', 'CORNED BEEF', 'HEINEKEN', 'HERRING']
['ARTICHOKE', 'BOURBON', 'CRACKERS', 'HEINEKEN', 'OLIVES', 'SODA', 'STEAK']
['BOURBON', 'CORNED BEEF', 'CRACKERS', 'HEINEKEN', 'HERRING', 

#### 데이터 전처리(encoding)

In [11]:
# Encode the baskets item by item: every distinct item becomes a column that
# is True when the basket contains the item and False otherwise.
enc = TransactionEncoder()
df_raw_enc = enc.fit(list_association).transform(list_association)

# The encoder returns the data in matrix form; wrap it in a pandas DataFrame
# whose columns are labelled with the encoder's item names.
df_asso = pd.DataFrame(data=df_raw_enc, columns=enc.columns_)
df_asso.head(5)

Unnamed: 0,APPLES,ARTICHOKE,AVOCADO,BAGUETTE,BOURBON,CHICKEN,COKE,CORNED BEEF,CRACKERS,HAM,HEINEKEN,HERRING,ICE CREAM,OLIVES,PEPPERS,SARDINES,SODA,STEAK,TURKEY
0,True,True,True,True,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,True,False,False,True,True,False,True,False,True,False,True,False
2,True,False,True,True,False,False,False,False,False,False,False,False,True,False,True,True,False,True,False
3,True,False,False,False,False,False,True,True,False,True,False,True,False,True,False,False,False,False,True
4,False,True,False,False,True,False,True,False,False,True,False,False,True,True,False,False,False,False,True


### 연관규칙 분석

#### 규칙 선택 임계값 입력 (지지도 40% 이상)

In [12]:
# Minimum support threshold used for rule selection
v_min_support = 0.4

# Run apriori on the encoded baskets with the support threshold applied;
# use_colnames=True makes the itemsets show item names.
df_freq = apriori(df_asso, use_colnames=True, min_support=v_min_support)
# Inspect a partial selection of rows to see itemsets of different sizes
df_freq.iloc[9:16].round(3)

Unnamed: 0,support,itemsets
9,0.4,(SODA)
10,0.4,(TURKEY)
11,0.5,"(BOURBON, OLIVES)"
12,0.4,"(BOURBON, SODA)"
13,0.4,"(ICE CREAM, COKE)"
14,0.4,"(OLIVES, SODA)"
15,0.4,"(BOURBON, OLIVES, SODA)"


In [13]:
# Select and interpret association rules derived from the frequent itemsets.

# Confidence criterion (threshold 0.7).
# Kept under its own name and printed explicitly: a bare expression in the
# middle of a cell is not displayed, and reassigning df_asso_rule right away
# would discard this result.
df_asso_rule_conf = association_rules(df_freq, metric="confidence", min_threshold=0.7)
print(df_asso_rule_conf.round(3))

# Lift criterion (threshold 1.3); this remains the final value of df_asso_rule
# and, as the last expression, is what the cell displays.
df_asso_rule = association_rules(df_freq, metric="lift", min_threshold=1.3)
df_asso_rule.round(3)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BOURBON),(OLIVES),0.55,0.65,0.5,0.909,1.399,0.142,3.85,0.633
1,(OLIVES),(BOURBON),0.65,0.55,0.5,0.769,1.399,0.142,1.95,0.814
2,(BOURBON),(SODA),0.55,0.4,0.4,0.727,1.818,0.18,2.2,1.0
3,(SODA),(BOURBON),0.4,0.55,0.4,1.0,1.818,0.18,inf,0.75
4,(ICE CREAM),(COKE),0.45,0.5,0.4,0.889,1.778,0.175,4.5,0.795
5,(COKE),(ICE CREAM),0.5,0.45,0.4,0.8,1.778,0.175,2.75,0.875
6,(OLIVES),(SODA),0.65,0.4,0.4,0.615,1.538,0.14,1.56,1.0
7,(SODA),(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf,0.583
8,"(BOURBON, OLIVES)",(SODA),0.5,0.4,0.4,0.8,2.0,0.2,3.0,1.0
9,"(BOURBON, SODA)",(OLIVES),0.4,0.65,0.4,1.0,1.538,0.14,inf,0.583
