# 연관분석(장바구니분석)

## 빈발항목집합을 추출하는 Apriori algorithm 의 원리

### 최소지지도 이상을 갖는 항목집합을 빈발항목집합(frequent item set)이라고 한다.
### 모든 항목집합에 대한 지지도를 계산하는 대신에 최소 지지도 이상의 빈발항목집합만을 찾아내서 연관규칙을 계산하는 것이 
### Apriori algorithm의 주요 내용이다.

### [ 빈발항목집합 추출의 Apriori Principle ]

#### (1) 한 항목집합이 빈발(frequent)하다면 이 항목집합의 모든 부분집합은 역시 빈발항목집합이다.
####    (frequent item sets -> next step) 
#### (2) 한 항목집합이 비비발(infrequent)하다면 이 항목집합을 포함하는 모든 집합은 비빈발항목집합이다. 
####    (superset -> pruning) 

## 지지도(support) s(X→Y) 

### X와 Y를 모두 포함하는 거래 수 / 전체 거래 수  

## 신뢰도(Confidence) c(X→Y) 

### X와 Y를 모두 포함하는 거래 수 / X가 포함된 거래 수

## 향상도(Lift)

### 연관규칙의 신뢰도/지지도 


![edu1](images/edu1.png)

In [8]:
import pandas as pd

In [9]:
from apyori import apriori

baskets = [
    ['Milk', 'Tea', 'Cake'],
    ['Eggs', 'Tea', 'Cold Drink'],
    ['Milk', 'Eggs', 'Tea', 'Cold Drink'],
    ['Eggs', 'Cold Drink'],
    ['Juice']
]
association_result = list(apriori(baskets, min_support=0.4))

In [3]:
columns = ['source', 'target', 'support']
network_df = pd.DataFrame(columns=columns)
for result in  association_result:
    if len(result.items) == 2:
        items = [x for x in result.items]
        row = [items[0], items[1], result.support]
        series = pd.Series(row, index=network_df.columns)
        network_df = network_df.append(series, ignore_index=True)
network_df.head(10)

Unnamed: 0,source,target,support
0,Eggs,Cold Drink,0.6
1,Tea,Cold Drink,0.4
2,Tea,Eggs,0.4
3,Tea,Milk,0.4


![edu1](images/edu2.png)

In [1]:
from apyori import apriori

baskets = [
    ['Milk', 'Tea', 'Cake'],
    ['Eggs', 'Tea', 'Cold Drink'],
    ['Milk', 'Eggs', 'Tea', 'Cold Drink'],
    ['Eggs', 'Cold Drink'],
    ['Juice']
]
association_result = list(apriori(baskets, min_support=0.6))

In [13]:
association_result

[RelationRecord(items=frozenset({'Cold Drink'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Cold Drink'}), confidence=0.6, lift=1.0)]),
 RelationRecord(items=frozenset({'Eggs'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Eggs'}), confidence=0.6, lift=1.0)]),
 RelationRecord(items=frozenset({'Milk'}), support=0.4, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Milk'}), confidence=0.4, lift=1.0)]),
 RelationRecord(items=frozenset({'Tea'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Tea'}), confidence=0.6, lift=1.0)]),
 RelationRecord(items=frozenset({'Cold Drink', 'Eggs'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Cold Drink', 'Eggs'}), confidence=0.6, lift=1.0), OrderedStatistic(items_base=frozenset({'Cold Drink'}), items_add=frozenset({'Eg

In [12]:
columns = ['source', 'target', 'support', 'confidence']
network_df = pd.DataFrame(columns=columns)
for result in  association_result:
    if len(result.items) == 2:
        items = [x for x in result.items]
        row = [items[0], items[1], result.support, result.ordered_statistics[0].confidence]
        series = pd.Series(row, index=network_df.columns)
        network_df = network_df.append(series, ignore_index=True)
network_df.head(10)

Unnamed: 0,source,target,support,confidence
0,Cold Drink,Eggs,0.6,0.6
1,Cold Drink,Tea,0.4,0.4
2,Tea,Eggs,0.4,0.4
3,Milk,Tea,0.4,0.4


In [11]:
from apyori import apriori

baskets = [
    ['계란', '우유'],
    ['계란', '기저귀', '맥주', '사과'],
    ['우유', '기저귀', '맥주', '콜라'],
    ['계란', '우유', '맥주', '기저귀'],
    ['계란', '우유', '멕주', '콜라']
]
association_result = list(apriori(baskets, min_support=0.4))

In [7]:
association_result

[RelationRecord(items=frozenset({'계란'}), support=0.8, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'계란'}), confidence=0.8, lift=1.0)]),
 RelationRecord(items=frozenset({'기저귀'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'기저귀'}), confidence=0.6, lift=1.0)]),
 RelationRecord(items=frozenset({'맥주'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'맥주'}), confidence=0.6, lift=1.0)]),
 RelationRecord(items=frozenset({'우유'}), support=0.8, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'우유'}), confidence=0.8, lift=1.0)]),
 RelationRecord(items=frozenset({'콜라'}), support=0.4, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'콜라'}), confidence=0.4, lift=1.0)]),
 RelationRecord(items=frozenset({'계란', '기저귀'}), support=0.4, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=f

In [12]:
columns = ['source', 'target', 'support']
network_df = pd.DataFrame(columns=columns)
for result in  association_result:
    if len(result.items) == 2:
        items = [x for x in result.items]
        row = [items[0], items[1], result.support]
        series = pd.Series(row, index=network_df.columns)
        network_df = network_df.append(series, ignore_index=True)
network_df.head(10)

Unnamed: 0,source,target,support
0,계란,기저귀,0.4
1,계란,맥주,0.4
2,계란,우유,0.6
3,맥주,기저귀,0.6
4,기저귀,우유,0.4
5,맥주,우유,0.4
6,콜라,우유,0.4


In [13]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [15]:
dataset= [['양말','팬티','신발'],
         ['신발','바지','팬티','셔츠'],
         ['모자','양말','신발'],
         ['신발','바지','팬티','장갑']]

In [16]:
t = TransactionEncoder()
t_a = t.fit(dataset).transform(dataset)
df = pd.DataFrame(t_a, columns = t.columns_)
df

Unnamed: 0,모자,바지,셔츠,신발,양말,장갑,팬티
0,False,False,False,True,True,False,True
1,False,True,True,True,False,False,True
2,True,False,False,True,True,False,False
3,False,True,False,True,False,True,True


In [17]:
frequent = apriori(df, min_support=0.5, use_colnames=True)
frequent

Unnamed: 0,support,itemsets
0,0.5,(바지)
1,1.0,(신발)
2,0.5,(양말)
3,0.75,(팬티)
4,0.5,"(바지, 신발)"
5,0.5,"(바지, 팬티)"
6,0.5,"(신발, 양말)"
7,0.75,"(신발, 팬티)"
8,0.5,"(바지, 신발, 팬티)"
