# 연관규칙분석 (association rule)
Market Basket Analysis(장바구니 분석)라고도 하며, Apriori 알고리즘이 가장 많이 사용된다.

In [1]:
#실습
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
 
    
# Five toy shopping baskets; each inner list is one transaction.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
    ['Onion', 'Nutmeg', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Ice cream', 'Eggs'],
]

# One-hot encode the baskets: one row per transaction, one boolean column per item.
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(data=te_ary, columns=te.columns_)



In [3]:
df

Unnamed: 0,Apple,Corn,Eggs,Ice cream,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,True,False,True,True,True,False,True
1,False,False,True,False,False,True,True,False,True
2,True,False,True,False,True,False,False,False,False
3,False,True,False,False,True,False,False,True,True
4,False,True,True,True,False,False,True,False,False


In [2]:
# Find the item combinations whose support is at least 0.5 (min_support);
# use_colnames=True labels each itemset with the item names from df's columns.
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)

In [4]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,0.6,(Milk)
2,0.6,(Onion)
3,0.6,(Yogurt)
4,0.6,"(Eggs, Onion)"


5개의 장바구니 중 Eggs는 4개에 들어 있으므로 support는 4/5 = 0.8이다. 즉 itemsets는 물건의 조합이고, support는 전체 장바구니 중 그 조합이 포함된 장바구니의 비율이다.

In [5]:
# Derive association rules from the frequent itemsets, keeping the rules
# whose lift is at least 1 (metric="lift", min_threshold=1).
association_rules(frequent_itemsets, metric="lift", min_threshold=1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
1,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf


support가 0.5 이상인 빈발 항목집합 중에서 향상도(lift)가 1 이상인, 즉 함께 등장하는 경향이 있는 규칙만 남긴 결과이다. 여기서는 Eggs와 Onion의 조합이 해당한다.

------------

## Data load

In [1]:
import pandas as pd
import pandas as pd
import json
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Load the raw JSON files (not CSV) and turn each one into a DataFrame.
# The data directory is named once so that it only has to be changed in one place.
DATA_DIR = "C:/Users/user/Kakao Arena/data"


def load_json(name):
    """Return the parsed content of the UTF-8 encoded JSON file `name` in DATA_DIR."""
    with open(DATA_DIR + "/" + name, encoding='utf-8') as json_file:
        return json.load(json_file)


genre = load_json("genre_gn_all.json")
song_meta = load_json("song_meta.json")
train = load_json("train.json")
valid = load_json("val.json")

# json to dataframe
# The genre data becomes a two-column frame: the former index as 'gen_id', the values as 'genre'.
df_genre = pd.Series(genre).to_frame('genre').reset_index().rename({'index' : 'gen_id'}, axis = 1)
df_song_meta = pd.DataFrame(song_meta)
df_train = pd.DataFrame(train)
df_valid = pd.DataFrame(valid)


In [3]:
# Keep only the detailed-genre column (plus the song id) from the song metadata
gnr_meta = df_song_meta[['song_gn_dtl_gnr_basket', 'id']]

# Song lists, one entry per training playlist
playlist_songs = df_train['songs']

# Only the first MAX_PLAYLISTS playlists are turned into transactions
MAX_PLAYLISTS = 30

# Genre column hoisted out of the loops.
# NOTE(review): the lookup below goes through the row label, not the 'id' column,
# so it assumes row label == song id -- confirm against song_meta.json.
genre_by_song = gnr_meta['song_gn_dtl_gnr_basket']

# Build one transaction per playlist: the genres of every song it contains
dataset_gnr = []
for p in playlist_songs.iloc[:MAX_PLAYLISTS]:
    gnr_list = []
    for song in p:
        g = genre_by_song.loc[song]
        try:
            # extend() covers both the single-genre and the multi-genre case
            gnr_list.extend(g)
        except TypeError:
            # a missing genre basket (e.g. NaN) is not iterable: skip this song
            continue
    dataset_gnr.append(gnr_list)

In [4]:
#print (dataset_gnr)

# One-hot encode the per-playlist genre lists for the association-rule analysis
te = TransactionEncoder()
te_ary = te.fit_transform(dataset_gnr)
result = pd.DataFrame(data=te_ary, columns=te.columns_)

In [5]:
result.head()

Unnamed: 0,GN0101,GN0103,GN0104,GN0105,GN0201,GN0203,GN0204,GN0205,GN0301,GN0302,...,GN2504,GN2505,GN2506,GN2601,GN2602,GN2603,GN2703,GN2704,GN2901,GN2902
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,True,True,True,False,False,False,False,False,False,...,False,False,False,True,True,True,False,False,False,False
2,True,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,False,False,False
3,True,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,True,False,False,True,True,True,False,True,True,...,True,True,True,False,False,False,False,False,False,False


## Apriori 알고리즘 적용

In [12]:
# Apply Apriori: keep only the genre itemsets whose support across the encoded
# playlists is at least min_support (0.4 here).
frequent_itemsets = apriori(result, min_support=0.4, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.6,(GN0101)
1,0.466667,(GN0105)
2,0.433333,(GN0201)
3,0.5,(GN0301)
4,0.466667,(GN0303)
5,0.4,(GN0401)
6,0.466667,(GN0501)
7,0.466667,(GN0509)
8,0.533333,(GN0601)
9,0.433333,(GN0606)


In [13]:
# Derive association rules from the frequent itemsets found above,
# keeping the rules whose lift is at least 1.
rules=association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(GN0101),(GN0105),0.600000,0.466667,0.466667,0.777778,1.666667,0.186667,2.4
1,(GN0105),(GN0101),0.466667,0.600000,0.466667,1.000000,1.666667,0.186667,inf
2,(GN0101),(GN0301),0.600000,0.500000,0.400000,0.666667,1.333333,0.100000,1.5
3,(GN0301),(GN0101),0.500000,0.600000,0.400000,0.800000,1.333333,0.100000,2.0
4,(GN0101),(GN0303),0.600000,0.466667,0.400000,0.666667,1.428571,0.120000,1.6
...,...,...,...,...,...,...,...,...,...
101,"(GN0801, GN0509)","(GN0805, GN0501)",0.400000,0.400000,0.400000,1.000000,2.500000,0.240000,inf
102,(GN0805),"(GN0801, GN0501, GN0509)",0.400000,0.400000,0.400000,1.000000,2.500000,0.240000,inf
103,(GN0501),"(GN0805, GN0509, GN0801)",0.466667,0.400000,0.400000,0.857143,2.142857,0.213333,4.2
104,(GN0509),"(GN0805, GN0501, GN0801)",0.466667,0.400000,0.400000,0.857143,2.142857,0.213333,4.2


#### rules 분석

1. 어떤 플레이리스트에 GN0805,GN0101,GN0301이 있다고 할 때

- antecedents와 consequents 열의 각 값은 frozenset이라는 변경할 수 없는 집합 자료형이다. list처럼 인덱스로 특정 요소를 꺼낼 수는 없지만, 특정 요소가 집합 안에 포함되어 있는지는 in, not in 구문으로 확인할 수 있다(True/False 반환).

In [101]:
# Genre codes assumed to be present in the playlist under study
playlist=['GN0805','GN0101','GN0301']
# Earlier attempts, kept for reference.
# Rules whose antecedent is exactly one single genre of the playlist:
#rules[(rules['antecedents']=={playlist[0]}) | (rules['antecedents']=={playlist[1]}) | (rules['antecedents']=={playlist[2]})]
# Rules whose antecedent equals the full set of playlist genres:
#rules[rules['antecedents']=={p for p in playlist}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(GN0101),(GN0105),0.6,0.466667,0.466667,0.777778,1.666667,0.186667,2.4
2,(GN0101),(GN0301),0.6,0.5,0.4,0.666667,1.333333,0.1,1.5
3,(GN0301),(GN0101),0.5,0.6,0.4,0.8,1.333333,0.1,2.0
4,(GN0101),(GN0303),0.6,0.466667,0.4,0.666667,1.428571,0.12,1.6
6,(GN0101),(GN0601),0.6,0.533333,0.466667,0.777778,1.458333,0.146667,2.1
8,(GN0101),(GN1501),0.6,0.6,0.433333,0.722222,1.203704,0.073333,1.44
10,(GN0101),(GN1504),0.6,0.4,0.4,0.666667,1.666667,0.16,1.8
12,(GN0101),(GN2501),0.6,0.466667,0.4,0.666667,1.428571,0.12,1.6
18,(GN0301),(GN0303),0.5,0.466667,0.466667,0.933333,2.0,0.233333,8.0
20,(GN0301),(GN2501),0.5,0.466667,0.4,0.8,1.714286,0.166667,2.666667


In [116]:
# Collect, in playlist order, the rules whose antecedent is exactly one playlist genre.
# One concat over all matches replaces the repeated pairwise concat; an empty
# playlist yields an empty frame instead of failing on playlist[0].
per_genre_rules = [rules[rules.antecedents == {genre}] for genre in playlist]
if per_genre_rules:
    A = pd.concat(per_genre_rules, axis=0, ignore_index=True)
else:
    A = rules.iloc[0:0]

# Keep the columns of interest, sorted by lift in descending order
selected = A[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by=['lift'], axis=0, ascending=False)
selected[selected['lift'] > 1.5]  # show only the rules with lift strictly above 1.5

Unnamed: 0,antecedents,consequents,support,confidence,lift
2,(GN0805),(GN0801),0.4,1.0,2.5
4,(GN0805),"(GN0801, GN0501)",0.4,1.0,2.5
5,(GN0805),"(GN0801, GN0509)",0.4,1.0,2.5
6,(GN0805),"(GN0801, GN0501, GN0509)",0.4,1.0,2.5
0,(GN0805),(GN0501),0.4,1.0,2.142857
1,(GN0805),(GN0509),0.4,1.0,2.142857
3,(GN0805),"(GN0501, GN0509)",0.4,1.0,2.142857
19,(GN0301),"(GN0101, GN0303)",0.4,0.8,2.0
17,(GN0301),(GN0303),0.466667,0.933333,2.0
20,(GN0301),"(GN2501, GN0303)",0.4,0.8,2.0


1. GN0805 장르가 있으면 GN0801, GN0501, GN0509가 포함될 확률이 매우 크다.

2. GN0301 장르가 있으면 GN0101, GN0303, GN2501이 포함될 확률이 크다.

3. GN0101 장르가 있으면 GN1504, GN1501, GN0105가 포함될 확률이 조금 크다.

향상도와 신뢰도를 바탕으로 장바구니에 담기 

set은 중복을 허용하지 않는 대신 순서가 없음..

In [138]:
import itertools

# Rules with lift above 1.5, re-indexed from 0
S = selected[selected['lift'] > 1.5].reset_index(drop=True)

# Flatten every consequent itemset into one list of genre codes (duplicates included)
genre_list = list(itertools.chain.from_iterable(S.consequents))
#genre_list

# De-duplicate while keeping first-seen order (dict keys preserve insertion order)
Genre = list(dict.fromkeys(genre_list))

Genre


['GN0801',
 'GN0501',
 'GN0509',
 'GN0101',
 'GN0303',
 'GN2501',
 'GN1504',
 'GN1501',
 'GN0105']

### 분석 결론

GN0805,GN0101,GN0301 > 포크/블루스 2010년이후발매, 발라드 전체, 랩 힙합 전체 장르와 가장 연관깊은 장르는

포크/블루스 세부장르 전체, 인디음악 세부장르전체, 인디음악 2010년이후발매, 발라드 전체, 랩/힙합 보컬스타일, 아이돌 세부장르전체, 국내드라마 ost, 국내드라마 ost전체, 10년도 이후 발매된 발라드 순으로 나타남