## `markov` 함수 내부

### flow
1. 전이행렬을 계산
2. sim 변수 값에 따라 기여도 값을 부여
2-1. sim이 아니라면 prob_convert()로 계산한 전환 확률을 이용해 기여도를 부여  
2-2. sim이라면 simulate_path()의 시뮬레이션 결과를 이용해 기여도를 부여  
3. normalize할거면 해서 attribution이라는 클래스 변수(딕셔너리)에 저장  

### 변수
- sim: 시뮬레이션을 돌릴거냐 안돌릴거냐

### 함수
- self.trans_matrix()
    - self.count_pairs()
        - self.pairs()
- self.prob_convert(trans_mat, drop)
    - self.pairs()
- self.simulate_path(trans_mat, drop_channel)

### issues
- trans_matrix() 의 속도
- simulate_path() 의 속도

## 클래스 해체하기

### 0. `__init__` 파트(해체 X)

In [7]:
import pandas as pd
from itertools import chain, tee, combinations
from functools import reduce, wraps
from operator import mul
from collections import defaultdict, Counter
import random
import time
import numpy as np
import copy
import json
import os
import sys

In [4]:
from mta import MTA
mta = MTA(data='data.csv', allow_loops=False)

runnning {func.__name__}..
elapsed time: {s:.3f} sec


In [6]:
mta.data.head()

Unnamed: 0,path,total_conversions,total_conversion_value,total_null,exposure_times
0,[alpha],5742,16461.754307,20583,[2019-11-11 08:41:50]
1,"[alpha, beta]",11,89.985,28,"[2019-11-11 08:41:50, 2019-11-11 08:41:51]"
2,"[alpha, beta, alpha]",11,43.5025,36,"[2019-11-11 08:41:50, 2019-11-11 08:41:51, 201..."
3,"[alpha, beta, alpha, beta, alpha, beta, alpha,...",1,2.28,3,"[2019-11-11 08:41:50, 2019-11-11 08:41:51, 201..."
4,"[alpha, beta, alpha, beta, alpha, epsilon, the...",1,1.14,3,"[2019-11-11 08:41:50, 2019-11-11 08:41:51, 201..."


In [8]:
# Expose the class attribute as a global variable so the class internals can be picked apart
data = mta.data

In [15]:
# Copy the MTA instance's special state labels into globals; the later cells use them
# as the start / null / conversion markers when building transition pairs.
START = mta.START
NULL = mta.NULL
CONV = mta.CONV
print(START, NULL, CONV)

(start) (null) (conversion)


### 1. trans_matrix(): 전이행렬 도출하기
> `itertuples` vs `iterrows`  
> https://cmdlinetips.com/2018/12/how-to-loop-through-pandas-rows-or-how-to-iterate-over-pandas-rows/  
> 둘 다 판다스에서 제공하는 메서드로, 데이터프레임의 row를 하나씩 순회하기 위한 방법이다. itertuples가 대체로 더 빠르며, 칼럼 값을 속성(attribute)으로 접근할 수 있다는 장점이 있다. 즉, iterrows()에서는 순회 중인 row의 특정 칼럼 값에 접근하려면 `row['columnname']`이라고 써야 하지만, itertuples()에서는 `row.columnname`으로 접근할 수 있다.  

```python
def count_pairs(self):
    """
    Tally channel-to-channel transitions over every recorded customer journey.

    Each row of ``self.data`` adds ``total_conversions + total_null`` to every
    consecutive pair of ``[self.START] + row.path``. The last channel of the
    path is additionally linked to ``self.NULL`` (weighted by ``total_null``)
    and to ``self.CONV`` (weighted by ``total_conversions``).

    Returns a ``defaultdict(int)`` keyed by ``(from_state, to_state)`` tuples.
    """
    tally = defaultdict(int)

    for journey in self.data.itertuples():
        # Everyone on this path, converted or not, walked each of its steps.
        visits = journey.total_conversions + journey.total_null
        for transition in self.pairs([self.START] + journey.path):
            tally[transition] += visits

        # The path ends either without a conversion or with one.
        last_channel = journey.path[-1]
        tally[(last_channel, self.NULL)] += journey.total_null
        tally[(last_channel, self.CONV)] += journey.total_conversions

    return tally
```

In [9]:
# Helper used inside count_pairs: walks a sequence as consecutive (current, next) pairs.
def pairs(lst):
    """Return a zip over consecutive elements of *lst*: (x0, x1), (x1, x2), ...

    ``itertools.tee`` returns n independent iterators from a single iterable,
    so this also works when *lst* is a one-shot iterator.
    """
    leading, trailing = tee(lst, 2)

    # Move the second iterator one step ahead; the None default keeps an
    # empty input from raising StopIteration.
    next(trailing, None)

    return zip(leading, trailing)

In [13]:
# How does zip behave when its inputs have different lengths?
# It stops at the shorter one, so the extra element of the longer list is never paired.
longer, shorter = [1, 2, 3, 4, 5], [2, 3, 4, 5]
for left, right in zip(longer, shorter):
    print((left, right))

(1, 2)
(2, 3)
(3, 4)
(4, 5)


In [11]:
# So what comes out when we iterate over the result of pairs()?
sample = [1, 2, 3, 4, 5]
for current, following in pairs(sample):
    print((current, following))

(1, 2)
(2, 3)
(3, 4)
(4, 5)


In [18]:
# Top-level version of count_pairs, the function trans_matrix relies on
c = defaultdict(int)

for row in data.itertuples():

    # Count every transition that can be formed from start + path: prepend the start
    # state to the current path and walk it as consecutive pairs. Each pair is
    # weighted by everyone who took this path, converted or not.
    journey = [START] + row.path
    weight = row.total_conversions + row.total_null
    for step in pairs(journey):
        c[step] += weight

    # End of the path: count the moves from the last touchpoint to '(null)'
    # and to '(conversion)'
    last_channel = row.path[-1]
    c[(last_channel, NULL)] += row.total_null
    c[(last_channel, CONV)] += row.total_conversions

print(c)

defaultdict(<class 'int'>, {('(start)', 'alpha'): 28846, ('alpha', '(null)'): 29945, ('alpha', '(conversion)'): 8447, ('alpha', 'beta'): 1965, ('beta', '(null)'): 3392, ('beta', '(conversion)'): 989, ('beta', 'alpha'): 3414, ('alpha', 'gamma'): 197, ('gamma', 'beta'): 132, ('beta', 'eta'): 10583, ('eta', 'beta'): 4513, ('eta', '(null)'): 14205, ('eta', '(conversion)'): 4167, ('alpha', 'epsilon'): 1138, ('epsilon', 'theta'): 322, ('theta', 'beta'): 1702, ('beta', 'theta'): 1899, ('theta', '(null)'): 2072, ('theta', '(conversion)'): 653, ('alpha', 'eta'): 2341, ('eta', 'alpha'): 4608, ('beta', 'iota'): 4873, ('iota', 'alpha'): 7889, ('alpha', 'kappa'): 277, ('kappa', '(null)'): 716, ('kappa', '(conversion)'): 230, ('theta', 'iota'): 4670, ('iota', 'theta'): 4054, ('eta', 'theta'): 1602, ('epsilon', '(null)'): 1692, ('epsilon', '(conversion)'): 531, ('epsilon', 'alpha'): 1064, ('alpha', 'iota'): 6202, ('iota', '(null)'): 11963, ('iota', '(conversion)'): 3355, ('alpha', 'lambda'): 2128, ('

In [27]:
# Top-level version of trans_matrix(), the function markov uses.
# Inside the class the pair counts come from self.count_pairs(); here we reuse c.
pair_counts = c

# outs[x]: total count of all transitions that leave state x
outs = defaultdict(int)
for transition, count in pair_counts.items():
    outs[transition[0]] += count

# tr[(x, y)]: share of x's outgoing transitions that lead to y
tr = defaultdict(float)
for transition, count in pair_counts.items():
    tr[transition] = count / outs[transition[0]]

print(outs)

defaultdict(<class 'int'>, {'(start)': 88387, 'alpha': 56634, 'beta': 26987, 'gamma': 1602, 'eta': 34872, 'epsilon': 5582, 'theta': 20366, 'iota': 43188, 'kappa': 2491, 'lambda': 12317, 'zeta': 3621, 'delta': 42, 'mi': 15})


In [26]:
# Transition probabilities in tr for the pairs that start at 'alpha'
[(pair, prob) for pair, prob in tr.items() if pair[0] == 'alpha']

[(('alpha', '(null)'), 0.5287459829784229),
 (('alpha', '(conversion)'), 0.1491506868665466),
 (('alpha', 'beta'), 0.03469647208390719),
 (('alpha', 'gamma'), 0.0034784758272415864),
 (('alpha', 'epsilon'), 0.020093936504573223),
 (('alpha', 'eta'), 0.04133559345975916),
 (('alpha', 'kappa'), 0.004891054843380302),
 (('alpha', 'iota'), 0.1095101882261539),
 (('alpha', 'lambda'), 0.03757460182928982),
 (('alpha', 'theta'), 0.06151781615284105),
 (('alpha', 'zeta'), 0.008952219514779107),
 (('alpha', 'delta'), 5.297171310520182e-05)]

### 2. prob_convert(): 단순 곱셈으로 전환 path가 발생할 확률 계산하기
dictionary의 `get(key, default)` method  
> 해당하는 key의 value를 반환받으며, 해당 key가 dictionary에 존재하지 않을 때 반환할 default 값을 설정할 수 있다.(https://www.tutorialspoint.com/python/dictionary_get.htm)
  
`reduce`  
> 리스트의 원소들에 함수를 누적해서 적용해 하나의 값으로 줄이고자 할 때 사용한다. (예 - 리스트의 원소를 모두 곱하라)  
> https://www.geeksforgeeks.org/reduce-in-python/

In [32]:
# Top-level version of prob_convert(), used by the `if not sim` branch of markov
# drop: name of the touchpoint to drop; None when nothing is dropped

## parameters
drop = None
trans_mat = tr

# Keep only the rows with at least one conversion whose path avoids the dropped
# touchpoint. The conversion filter matters because data also records rows that
# have nulls but no conversions; with drop=None the path filter keeps every row.
skips_drop = data['path'].apply(lambda x: drop not in x)
has_conversions = data['total_conversions'] > 0
_d = data[skips_drop & has_conversions]

p = 0

for row in _d.itertuples():

    # Wrap the path in the '(start)' and '(conversion)' states, split it into
    # consecutive pairs and look up each pair's transition probability in order.
    # A pair missing from trans_mat means its probability is 0.
    bounded_path = [START] + row.path + [CONV]
    pr_this_path = [trans_mat.get(step, 0) for step in pairs(bounded_path)]

    # Probability of this exact path = product of its pair probabilities
    path_probability = reduce(mul, pr_this_path)

    print(pr_this_path)
    print(path_probability)

    # Running sum of the path probabilities over all converting paths
    p += path_probability

print(p)

[0.32636021134329707, 0.1491506868665466]
0.04867684968776407
[0.32636021134329707, 0.03469647208390719, 0.036647274613702895]
0.00041497717177110756
[0.32636021134329707, 0.03469647208390719, 0.1265053544299107, 0.1491506868665466]
0.0002136567851517514
[0.32636021134329707, 0.03469647208390719, 0.1265053544299107, 0.03469647208390719, 0.1265053544299107, 0.03469647208390719, 0.1265053544299107, 0.0034784758272415864, 0.08239700374531835, 0.1265053544299107, 0.03469647208390719, 0.3921517767814133, 0.12941615049323238, 0.3921517767814133, 0.11949415003441156]
8.256918821275024e-17
[0.32636021134329707, 0.03469647208390719, 0.1265053544299107, 0.03469647208390719, 0.1265053544299107, 0.020093936504573223, 0.05768541741311358, 0.08357065697731514, 0.07036721384370252, 0.03206324265933418]
1.3741949132050471e-12
[0.32636021134329707, 0.03469647208390719, 0.1265053544299107, 0.03469647208390719, 0.07036721384370252, 0.229303741530001, 0.09386866722237659, 0.08357065697731514, 0.0703672138

[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.027114013151801425, 0.09512719455392332]
1.241615464318141e-05
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.11949415003441156]
5.8497122703148294e-05
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.1321403991741225, 0.1491506868665466]
9.648254098955308e-06
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.1321403991741225, 0.06151781615284105, 0.03206324265933418]
1.2759446246589317e-07
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.12941615049323238, 0.3921517767814133, 0.11949415003441156]
2.968774216527303e-06
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.00022941041523285156, 0.11904761904761904]
1.3369701327470553e-08
[0.137112923846267, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.030712319339298005, 0.05768541741311358, 0.

[0.137112923846267, 0.02923629895875792, 0.04790127466103759, 0.3921517767814133, 0.12941615049323238, 0.036647274613702895]
3.5713482683621677e-07
[0.137112923846267, 0.02923629895875792, 0.04790127466103759, 0.02923629895875792, 0.04790127466103759, 0.036647274613702895]
9.85505388616949e-09
[0.137112923846267, 0.02923629895875792, 0.04790127466103759, 0.02923629895875792, 0.04790127466103759, 0.07036721384370252, 0.03206324265933418]
6.067295095497004e-10
[0.137112923846267, 0.02923629895875792, 0.04790127466103759, 0.07036721384370252, 0.03206324265933418]
4.332371137688561e-07
[0.137112923846267, 0.02923629895875792, 0.04603393683526833, 0.1321403991741225, 0.1491506868665466]
3.636970477312399e-06
[0.137112923846267, 0.02923629895875792, 0.04603393683526833, 0.12941615049323238, 0.036647274613702895]
8.752035360879429e-07
[0.137112923846267, 0.02923629895875792, 0.04603393683526833, 0.023141775636613904, 0.04790127466103759, 0.3921517767814133, 0.11949415003441156]
9.585692300266

8.757914917528757e-13
[0.16080419066152263, 0.12941615049323238, 0.18056842183273428, 0.12077428915439474, 0.3921517767814133, 0.08952741454462032, 0.12077428915439474, 0.18056842183273428, 0.12077428915439474, 0.3921517767814133, 0.12941615049323238, 0.18056842183273428, 0.06344354913401871, 0.09799464155232605]
2.39098807613129e-12
[0.16080419066152263, 0.12941615049323238, 0.18056842183273428, 0.12077428915439474, 0.18056842183273428, 0.12077428915439474, 0.1265053544299107, 0.03469647208390719, 0.02923629895875792, 0.09799464155232605]
1.2446217141003759e-10
[0.16080419066152263, 0.12941615049323238, 0.18056842183273428, 0.1016949152542373, 0.11949415003441156]
4.566395522391946e-05
[0.16080419066152263, 0.12941615049323238, 0.18056842183273428, 0.1016949152542373, 0.12941615049323238, 0.3921517767814133, 0.12941615049323238, 0.3921517767814133, 0.1321403991741225, 0.1491506868665466]
1.939872035366307e-08
[0.16080419066152263, 0.12941615049323238, 0.18056842183273428, 0.1016949152

2.7308928310757327e-08
[0.007874461176417347, 0.08239700374531835, 0.18056842183273428, 0.07768361581920905]
9.10130146196845e-06
[0.007874461176417347, 0.08239700374531835, 0.02923629895875792, 0.09799464155232605]
1.85890411334904e-06
[0.007874461176417347, 0.01435705368289638, 0.09512719455392332]
1.0754515735166344e-05
[0.007874461176417347, 0.01435705368289638, 0.08706556789681118, 0.08952741454462032, 0.09386866722237659, 0.229303741530001, 0.09386866722237659, 0.031817735441421975, 0.09512719455392332]
5.389085203853226e-12
[0.007874461176417347, 0.08801498127340825, 0.11949415003441156]
8.28178766422908e-05
[0.007874461176417347, 0.08801498127340825, 0.1321403991741225, 0.1491506868665466]
1.365961060744194e-05
[0.007874461176417347, 0.08801498127340825, 0.08952741454462032, 0.07768361581920905]
4.820176283608414e-06
[0.007874461176417347, 0.08801498127340825, 0.023141775636613904, 0.09799464155232605]
1.571724613750573e-06
[0.007874461176417347, 0.1679151061173533, 0.077683615

[0.23110864719924876, 0.12077428915439474, 0.1265053544299107, 0.06151781615284105, 0.07944613571639006, 0.1321403991741225, 0.03469647208390719, 0.18056842183273428, 0.027114013151801425, 0.19061268362594053, 0.1491506868665466]
1.1013031254493926e-11
[0.23110864719924876, 0.12077428915439474, 0.014562567161966872, 0.09512719455392332]
3.86663622795315e-05
[0.23110864719924876, 0.12077428915439474, 0.014562567161966872, 0.19061268362594053, 0.1491506868665466]
1.1555950697082324e-05
[0.23110864719924876, 0.12077428915439474, 0.014562567161966872, 0.10766750268720889, 0.07768361581920905]
3.399716463845835e-06
[0.23110864719924876, 0.12077428915439474, 0.014562567161966872, 0.10766750268720889, 0.09386866722237659, 0.229303741530001, 0.07768361581920905]
7.317698150621078e-08
[0.23110864719924876, 0.12077428915439474, 0.3921517767814133, 0.11949415003441156]
0.0013079511286598385
[0.23110864719924876, 0.12077428915439474, 0.3921517767814133, 0.1321403991741225, 0.1491506868665466]
0.00

[0.23110864719924876, 0.1016949152542373, 0.12941615049323238, 0.18056842183273428, 0.12077428915439474, 0.3921517767814133, 0.11949415003441156]
3.1082864524560573e-06
[0.23110864719924876, 0.1016949152542373, 0.12941615049323238, 0.18056842183273428, 0.12077428915439474, 0.18056842183273428, 0.12077428915439474, 0.1265053544299107, 0.1095101882261539, 0.12077428915439474, 0.1265053544299107, 0.04133559345975916, 0.08952741454462032, 0.12077428915439474, 0.3921517767814133, 0.08952741454462032, 0.1016949152542373, 0.1321403991741225, 0.04133559345975916, 0.11949415003441156]
3.1889957602409314e-19
[0.23110864719924876, 0.1016949152542373, 0.12941615049323238, 0.18056842183273428, 0.12077428915439474, 0.18056842183273428, 0.12077428915439474, 0.18056842183273428, 0.18266648143002687, 0.1095101882261539, 0.07768361581920905]
4.05901932776543e-10
[0.23110864719924876, 0.1016949152542373, 0.12941615049323238, 0.18056842183273428, 0.1016949152542373, 0.08952741454462032, 0.0776836158192090

2.932095464933504e-23
[0.23110864719924876, 0.06344354913401871, 0.018835755459933424, 0.19304059652029826, 0.020093936504573223, 0.08706556789681118, 0.08952741454462032, 0.18266648143002687, 0.1491506868665466]
2.275029258749093e-10
[0.23110864719924876, 0.06344354913401871, 0.018835755459933424, 0.2913559790113228, 0.07768361581920905]
6.250864367903986e-06
[0.23110864719924876, 0.06344354913401871, 0.018835755459933424, 0.08395470864402099, 0.09799464155232605]
2.2721348160846363e-06
[0.23110864719924876, 0.06344354913401871, 0.018835755459933424, 0.08395470864402099, 0.13168791101729319, 0.036260072242289526, 0.029549848108257386]
3.271610505861091e-09
[0.23110864719924876, 0.09386866722237659, 0.03206324265933418]
0.0006955755197187678
[0.23110864719924876, 0.09386866722237659, 0.36099381321810864, 0.1491506868665466]
0.0011680511564396568
[0.23110864719924876, 0.09386866722237659, 0.36099381321810864, 0.03469647208390719, 0.18056842183273428, 0.09386866722237659, 0.0320632426593

1.8929661614816288e-10
[0.23110864719924876, 0.036260072242289526, 0.06848936757801712, 0.04821761759795738, 0.08508565397418202, 0.04821761759795738, 0.13168791101729319, 0.06344354913401871, 0.08508565397418202, 0.04821761759795738, 0.09799464155232605]
3.8135869362031506e-13
[0.0034507337051828887, 0.0923323966278603]
0.00031861451312407247
[0.0034507337051828887, 0.11079887595343235, 0.1491506868665466]
5.7025888173726864e-05
[0.0034507337051828887, 0.11079887595343235, 0.03469647208390719, 0.1265053544299107, 0.1491506868665466]
2.503031321002775e-07
[0.0034507337051828887, 0.11079887595343235, 0.03469647208390719, 0.18056842183273428, 0.12077428915439474, 0.004965353688813132, 0.04937775993576877, 0.004965353688813132, 0.0923323966278603]
3.2518782852679703e-14
[0.0034507337051828887, 0.11079887595343235, 0.04133559345975916, 0.1321403991741225, 0.020093936504573223, 0.19061268362594053, 0.1491506868665466]
1.193022600899413e-09
[0.0034507337051828887, 0.11079887595343235, 0.1095

[0.0434113613993008, 0.08508565397418202, 0.031817735441421975, 0.06359727696166248, 0.17228221157749452, 0.03757460182928982, 0.17228221157749452, 0.1491506868665466]
1.2432774463892796e-09
[0.0434113613993008, 0.08508565397418202, 0.07944613571639006, 0.023141775636613904, 0.04790127466103759, 0.3921517767814133, 0.023141775636613904, 0.04603393683526833, 0.04593943565037853, 0.07944613571639006, 0.11949415003441156]
5.926664388880557e-14
[0.0434113613993008, 0.08508565397418202, 0.229303741530001, 0.07768361581920905]
6.579612543512104e-05
[0.0434113613993008, 0.08508565397418202, 0.229303741530001, 0.18266648143002687, 0.06151781615284105, 0.229303741530001, 0.18266648143002687, 0.1095101882261539, 0.09386866722237659, 0.36099381321810864, 0.1095101882261539, 0.09386866722237659, 0.36099381321810864, 0.1095101882261539, 0.18266648143002687, 0.1095101882261539, 0.18266648143002687, 0.1095101882261539, 0.09386866722237659, 0.229303741530001, 0.09386866722237659, 0.229303741530001, 0.

### 3. simulate_path(): 시뮬레이션으로 전환 확률 추정하기
generate n random user journeys and see where these users end up - converted or not; drop_channel is a channel to exclude from journeys if specified