# MLS Bucket 비율 조정 자동화
---
- <span style="color:red"> 2022.04.25 초안 작성 </span>
---

### 000_Import Library

In [None]:
####################################################
# dataframe 및 수치연산을 위한 library load
import pandas as pd
import numpy as np
import os
from datetime import datetime
####################################################
# 그래프를 그리기 위한 library 로드 (matplotlib / seaborn)
from matplotlib import pyplot as plt
import seaborn as sns # matplotlib 기반의 visualization library 
plt.style.use(['ggplot'])

import matplotlib as mpl
# Presumably keeps the minus sign renderable with the Korean font
# (uses a plain hyphen instead of the Unicode minus) — confirm against matplotlib docs.
mpl.rcParams['axes.unicode_minus']=False
# Original note: "update the cache so that the NanumGothic font shows up".
# No cache refresh is performed in this cell; only the default font family is set.
plt.rcParams["font.family"] = 'NanumGothic'
# NOTE(review): this call passes 'NanumGothicCoding' while the line above sets
# 'NanumGothic' — presumably the later call wins; confirm which font is intended.
sns.set_style("darkgrid", {"font.family":['NanumGothicCoding']}) # background style and Korean font
sns.set_palette("deep")
%matplotlib inline

####################################################
from datetime import date
import datetime
from dateutil.relativedelta import relativedelta
from datetime import timedelta
pd.set_option('display.max_columns', 100)

from skt.gcp import bq_to_pandas, df_to_bq_table, bq_insert_overwrite 
from skt.gcp import get_bigquery_client, load_bigquery_ipython_magic
from skt.ye import slack_send

load_bigquery_ipython_magic()

In [None]:
# skt mls api 
import time
from skt.mls import get_mls_config, get_mls_component_client, get_mls_experiment_client
from skt.mls import get_mls_model_registry, get_mls_dimension_client, get_mls_ml_model_client, get_mls_profile_api_client

from sktmls import MLSENV, MLSRuntimeENV, MLSClient, ModelRegistry
from sktmls.filters.filter import FilterClient
from sktmls.datasets import DatasetClient, ProblemType, FeatureStoreConf, LabelDataConf
from sktmls.models import MLModelClient
from sktmls.ml_features import MLFeatureClient
from sktmls.experiments.experiment import Bucket, Experiment, ExperimentClient

<br><br>

### 001_기본 설정 

In [None]:
from sktmls import MLSENV, MLSRuntimeENV

from skt.mls import get_mls_config
config = get_mls_config(env='stg', user='scm')
config

In [None]:
experiment_client = ExperimentClient(**config)

<br><br>

### 002_내 실험 찾기, 그냥 입력해도 되고

In [None]:
experiments = experiment_client.list_experiments()

In [None]:
for e in experiments:
    print(e.name)

In [None]:
target_experiment = experiments[-1].name
target_experiment

In [None]:
target_experiment = 'battleground_jhjh_test'
target_experiment

<br><br>

### 002_실험 불러오기

In [None]:
my_experiment = experiment_client.get_experiment(name=target_experiment)

In [None]:
# List every attribute name exposed by the experiment object, one per line,
# followed by a blank line.
for attr_name in dir(my_experiment):
    print(attr_name)
    # The original had a bare `print` (Python 2 leftover), which is a no-op
    # expression in Python 3; call it so the blank line is actually emitted.
    print()

<br><br>

### 003_버킷 리스트 불러오기

In [None]:
my_bucket = experiment_client.list_buckets(
    experiment=my_experiment
)

In [None]:
# Print a short summary card for every bucket of the experiment.
# f-strings format every field uniformly, so a non-string attribute no longer
# raises TypeError the way bare `+` concatenation did for name / bucket_range.
separator = '--------------------------------------------------------'
print(separator)
for bucket in my_bucket:
    print(f'- 버킷명 : {bucket.name}')
    print(f'- ID : {bucket.id}')
    # bucket_range is the field to watch: per the printed label it is the
    # bucket's share of the total 100 %.
    print(f'- 버킷 비율 : {bucket.bucket_range} <--- 중요, 전체 100% 버킷중 비율 의미')
    print(f'- 세부내용 : {bucket.description}')
    print(f'- 실험명 : {bucket.experiment.name}')
    print(separator)


<br><br>

### <span style="color:red"> 004_버킷 비율 변경</span>
---
- 버킷 개수를 고려해서 코딩을 하는것까지는 불필요할 듯
- 전체 100% 비율을 버킷별로 나누어 입력해야 하는데, 합계가 100을 넘으면 어떻게 되는지 직접 확인해 보니
  오류 없이 그대로 반영됨 → 입력 전에 합계가 100인지 직접 확인할 것
---

In [None]:
# Target traffic split (in percent) keyed by bucket object: 70 / 20 / 10.
# NOTE(review): buckets are addressed by list position — presumably the order
# returned by list_buckets() is stable; confirm before relying on it.
my_bucket_ratio = {my_bucket[0]: 70, my_bucket[1]: 20, my_bucket[2]: 10}

# The notes in this section record that a total above 100 was accepted when
# tried, so validate the total here before it is sent.
total_ratio = sum(my_bucket_ratio.values())
if total_ratio != 100:
    raise ValueError(f'bucket ratios must sum to 100, got {total_ratio}')
my_bucket_ratio

In [None]:
experiment_client.update_bucket_ratio(my_experiment, my_bucket_ratio)

#### <span style='color:blue'>변경 후 비율 다시 확인</span>

In [None]:
my_bucket = experiment_client.list_buckets(
    experiment=my_experiment
)

In [None]:
# Print the bucket summary again to verify the ratios after the update.
# f-strings format every field uniformly, so a non-string attribute no longer
# raises TypeError the way bare `+` concatenation did for name / bucket_range.
separator = '--------------------------------------------------------'
print(separator)
for bucket in my_bucket:
    print(f'- 버킷명 : {bucket.name}')
    print(f'- ID : {bucket.id}')
    # bucket_range is the field to watch: per the printed label it is the
    # bucket's share of the total 100 %.
    print(f'- 버킷 비율 : {bucket.bucket_range} <--- 중요, 전체 100% 버킷중 비율 의미')
    print(f'- 세부내용 : {bucket.description}')
    print(f'- 실험명 : {bucket.experiment.name}')
    print(separator)


<br>

---
#### <span style='color:red'>끝. 다양한 기능이 있으나 이것만으로도 가능할 듯</span>

<br><br>

### 999_Thompson Sampling 추가
---
- Thompson sampling을 적용하더라도, 각 bucket별 상한, 하한은 존재해야 할것이며,
  세부 항목은 우리가 정하겠지?

- 그 안에서 자동으로 Bucket Ratio를 조절하고, 그로 인한 실적 향상을 측정해보는 것에 의의가 있음
---
> **[ 예시 ]** <br><br>&nbsp;&nbsp;&nbsp;&nbsp;**<span style='color:red'>A (BTS v1)</span>**&nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;&nbsp;**<span style='color:blue'>B (BTS v2)</span>**&nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;&nbsp;**<span style='color:black'>Z (Random)</span>**<br><br>위와 같이 3개의 bucket이 있을때,<br>
  각각 70%, 20%, 10% 고정이 아닌 50%, 10%, 10%는 고정해둔 상태에서 <br>
  A와 B에 대하여 나머지 30%의 비율을 적절하게 배분하여 추천하는 방식을 채택한다면?


<br>

#### Beta 분포 업데이트를 위하여 일자별·버킷별 CTR 데이터 불러오기 ( w/ 실험 플랫폼 )

In [None]:
import os

# Create the working directories. exist_ok=True mirrors `mkdir -p` (no error
# when the directory already exists) without spawning a shell, and a failure
# now raises OSError instead of being lost in os.system()'s ignored return code.
for work_dir in ('battleground', 'temp'):
    os.makedirs(work_dir, exist_ok=True)

In [None]:
from skt.vault_utils import get_secrets
from skt.github_utils import GithubUtil
import io

# Download the experiment-platform helper module(s) from GitHub into the local
# `battleground/` directory so they can be imported further down.
path = "https://github.com/sktaiflow/dag-advanced_analytics/blob/develop/experiment_platform/battleground/experiment"
files = [
    'monitoring_report.py',
]

# The GitHub token and the proxy address both come from the same secret entry.
secrets = get_secrets('github/sktaiflow')
token = secrets['token']

proxies = {
    'http': secrets['proxy'],
    'https': secrets['proxy'],
}

g = GithubUtil(token, proxies=proxies)
for file in files:
    code = g.download_from_git(f"{path}/{file}").decode('utf-8')
    # Write with an explicit UTF-8 encoding: the payload was decoded as UTF-8,
    # and the platform default encoding may not be able to represent it.
    # Plain "w" is enough because the file is never read back through this handle.
    with open(f"battleground/{file}", "w", encoding="utf-8") as f:
        f.write(code)



In [None]:
from skt.gcp import bq_to_pandas
from skt.ye import slack_send
from datetime import datetime, date
from battleground.monitoring_report import desc_experiment, get_channels_from_experiment, get_daily_ctr, plot_performance, send_slack, update_experiment_end_dt, create_report

#### 우리가 테스트하고자 하는 실험의 어떤 버킷이든 상관없을 듯.
---
- 나는 우선 BTS V2를 예시로 선택
---

In [None]:
experiment_nm = '[exp001]galileo_single_reco_model'

In [None]:
experiment_info = desc_experiment(experiment_nm)

In [None]:
experiment_info

In [None]:
df = get_daily_ctr(experiment_info)

In [None]:
# from datetime import datetime
# df.dt.strftime("%V")

In [None]:
# Preview the rows dated on or before 2022-04-01. The filtered frame is only
# displayed, not assigned, so `df` itself stays unfiltered.
on_or_before_cutoff = df['dt'] <= '2022-04-01'
df[on_or_before_cutoff]

In [None]:
def _total_per_bucket(column):
    """Return a frame with one row per bucket_name holding the sum of *column*."""
    return df.groupby('bucket_name')[column].sum().reset_index()


# Per-bucket totals over all rows of df (these two columns are read as
# impressions and clicks further down).
sample_size1 = _total_per_bucket('denominator')
sample_size2 = _total_per_bucket('numerator')


In [None]:
# Join the two per-bucket totals into one table keyed by bucket_name.
sample_size12 = sample_size1.merge(sample_size2, how="inner", on="bucket_name")

In [None]:
sample_size12

In [None]:
def _bucket_totals(bucket_name):
    """Return (impressions, clicks) for *bucket_name* from sample_size12.

    Raises:
        KeyError: if the bucket is absent from the aggregated table.
        ValueError: if the bucket has no impressions (its CTR is undefined).
    """
    rows = sample_size12.loc[sample_size12.bucket_name == bucket_name]
    if rows.empty:
        # Previously this surfaced as an opaque IndexError from `.values[0]`.
        raise KeyError(f"bucket {bucket_name!r} not found in sample_size12")
    impressions = rows.denominator.values[0]
    clicks = rows.numerator.values[0]
    if impressions <= 0:
        raise ValueError(f"bucket {bucket_name!r} has no impressions")
    return impressions, clicks


# NOTE(review): "v1" is read from bucket 'B' and "v2" from bucket 'GA' —
# confirm this mapping against the experiment definition.
bts_v1_imp, bts_v1_cli = _bucket_totals('B')
bts_v2_imp, bts_v2_cli = _bucket_totals('GA')

# Observed click-through rate = clicks / impressions.
bts_v1_ctr = bts_v1_cli / bts_v1_imp
bts_v2_ctr = bts_v2_cli / bts_v2_imp

In [None]:
# Show both CTRs. A notebook cell only echoes its last expression, so the two
# values are displayed together as a tuple instead of on separate lines.
bts_v1_ctr, bts_v2_ctr

In [None]:
# Scratch note kept from the original cell, which held these bare numbers and
# raised a SyntaxError when executed:
#   1000 100 900
# NOTE(review): presumably total / clicks / non-clicks of a toy example — confirm.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import beta, bernoulli
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
import random
import math

In [None]:
# Arm labels, paired in order with the observed CTRs computed earlier
# ('A' -> bts_v1_ctr, 'B' -> bts_v2_ctr).
ads = ['A', 'B']
ACTUAL_CTR = dict(zip(ads, (bts_v1_ctr, bts_v2_ctr)))

In [None]:
# NOTE: everything in this cell is commented out (disabled draft). As written it
# relies on names not defined in the visible notebook (priors, clicks,
# impressions, ctr, chosen_ads, data1..data4) and calls `fig.updatedate_layout`,
# which looks like a typo for `update_layout`.
# #@title functions for manual Thompson sampling

# def calculate_beta_dist(win_ad):
#     impressions[win_ad] += 1
#     did_click = bernoulli.rvs(ACTUAL_CTR[win_ad])
#     if did_click:
#         clicks[win_ad] += did_click

#     # update ctr values according to beta destribution expected values
#     ctr_0 = random.betavariate(priors['A']+clicks['A'], priors['A'] + impressions['A'] - clicks['A'])
#     ctr_1 = random.betavariate(priors['B']+clicks['B'], priors['B'] + impressions['B'] - clicks['B'])
#     highest_ad = np.argmax([ctr_0, ctr_1])
#     chosen_ads.append(highest_ad)

#     ctr['A'].append(ctr_0)
#     ctr['B'].append(ctr_1)
#     return highest_ad


# def plot_beta_dist():
#     x = np.arange(0, 1, 0.01)
#     y = beta.pdf(x, priors['A']+clicks['A'], priors['B'] + impressions['A'] - clicks['A'])
#     y /= y.max() ## normalize

#     trace0 = go.Scatter(x=x,
#                     y=y,
#                     name='Beta Distribution (Ad A)',
#                     marker = dict(color=('rgba(10, 108, 94, 1)')),
#                     fill='tozeroy',
#                     fillcolor = 'rgba(10, 108, 94, .7)')

#     trace1 = go.Scatter(x = [ACTUAL_CTR[0]] * 2,
#                     y = [0, 1],
#                     name = 'Actual CTR A Value',
#                     mode='lines',
#                     line = dict(
#                         color = ('rgb(205, 12, 24)'),
#                         width = 2,
#                         dash = 'dash'))

#     y = beta.pdf(x, priors['A']+clicks['B'], priors['B'] + impressions['B'] - clicks['B'])
#     y /= y.max()

#     trace2 = go.Scatter(x=x,
#                     y=y,
#                     name='Beta Distribution (Ad B)',
#                     marker = dict(color=('rgba(187, 121, 24, 1)')),
#                     fill='tozeroy',
#                     fillcolor = 'rgba(187, 121, 24, .7)')

#     trace3 = go.Scatter(x = [ACTUAL_CTR[1]] * 2,
#                     y = [0, 1],
#                     name = 'Actual CTR B Value',
#                     mode='lines',
#                     line = dict(
#                         color = ('rgb(205, 12, 24)'),
#                         width = 2,
#                         dash = 'dash'))

#     fig = go.Figure([data1, data2, data3, data4])
#     fig.updatedate_layout(
#         title='Beta Distributions for both Ads',
#         xaxis={'title': 'Possible CTR values'},
#         yaxis={'title': 'Probability Density'})

#     fig.show()

In [None]:
## plot the Beta distributions
# CTR grid for the x-axis (0.01 .. 0.04); the original cell assigned it twice.
x = np.linspace(0.01, 0.04, 1000)

# (label, clicks, impressions, fill colour as "R, G, B") per arm, in plot order.
arms = [
    ('A', bts_v1_cli, bts_v1_imp, '10, 108, 94'),
    ('B', bts_v2_cli, bts_v2_imp, '187, 121, 24'),
]

traces = []
for label, clicks, impressions, rgb in arms:
    # Beta(1 + clicks, 1 + impressions - clicks) evaluated on the grid.
    y = beta.pdf(x, 1 + clicks, 1 + impressions - clicks)
    # Normalize so the curve peaks at 1. Skip when the density is 0 on the
    # whole grid, which would otherwise turn the curve into NaN.
    peak = y.max()
    if peak > 0:
        y /= peak

    traces.append(go.Scatter(x=x,
                             y=y,
                             name=f'Beta Distribution (Ad {label})',
                             marker=dict(color=f'rgba({rgb}, 1)'),
                             fill='tozeroy',
                             fillcolor=f'rgba({rgb}, .7)'))

    # Vertical dashed line at the observed CTR of this arm.
    traces.append(go.Scatter(x=[ACTUAL_CTR[label]] * 2,
                             y=[0, 1],
                             name=f'Actual CTR {label} Value',
                             mode='lines',
                             line=dict(
                                 color='rgb(205, 12, 24)',
                                 width=2,
                                 dash='dash')))

# Keep the original per-trace names available for any other cell that uses them.
trace0, trace1, trace2, trace3 = traces

fig = go.Figure(traces)
fig.update_layout(
    title='Beta Distributions for both Ads',
    xaxis={'title': 'Possible CTR values'},
    yaxis={'title': 'Probability Density'})


fig.show()

In [None]:
R = [0, 0]

In [None]:
# Monte-Carlo estimate of how often each arm produces the larger Beta draw.
N_TRIALS = 10000

# One generator for the whole simulation; the original built a fresh
# RandomState on every one of the 10000 iterations.
rnd = np.random.RandomState()  # for machine payouts and Beta

# Draw all samples at once from Beta(1 + clicks, 1 + impressions - clicks).
draws_v1 = rnd.beta(1 + bts_v1_cli, 1 + bts_v1_imp - bts_v1_cli, size=N_TRIALS)
draws_v2 = rnd.beta(1 + bts_v2_cli, 1 + bts_v2_imp - bts_v2_cli, size=N_TRIALS)

# np.argmax picks index 0 on a tie, so arm 1 only wins on a strictly larger draw.
wins_v2 = int(np.count_nonzero(draws_v2 > draws_v1))

# Rebuild R from scratch so that re-running this cell does not accumulate onto
# the counts left by a previous run.
R = [N_TRIALS - wins_v2, wins_v2]

print("\nFinal Success vector: ", end="")
R

In [None]:
control_ratio = 30

In [None]:
# Split the adjustable share (control_ratio points) between the two arms in
# proportion to how often the first arm won the sampling simulation.
share_of_a = R[0] / sum(R)
A_control = round(share_of_a * control_ratio)
# The remainder goes to the second arm, so both always add up to control_ratio.
B_control = control_ratio - A_control

In [None]:
my_bucket = experiment_client.list_buckets(
    experiment=my_experiment
)

In [None]:
# Print a short summary card for every bucket before applying the new split.
# f-strings format every field uniformly, so a non-string attribute no longer
# raises TypeError the way bare `+` concatenation did for name / bucket_range.
separator = '--------------------------------------------------------'
print(separator)
for bucket in my_bucket:
    print(f'- 버킷명 : {bucket.name}')
    print(f'- ID : {bucket.id}')
    # bucket_range is the field to watch: per the printed label it is the
    # bucket's share of the total 100 %.
    print(f'- 버킷 비율 : {bucket.bucket_range} <--- 중요, 전체 100% 버킷중 비율 의미')
    print(f'- 세부내용 : {bucket.description}')
    print(f'- 실험명 : {bucket.experiment.name}')
    print(separator)


### 버킷 비율 변경
---
- 버킷 개수를 고려해서 코딩을 하는것까지는 불필요할 듯
- 전체 100% 비율을 버킷별로 나누어 입력해야 하는데, 합계가 100을 넘으면 어떻게 되는지 직접 확인해 보니
  오류 없이 그대로 반영됨 → 입력 전에 합계가 100인지 직접 확인할 것
---

In [None]:
# New traffic split (in percent): fixed floors of 50 / 10 / 10 plus the
# sampled share A_control / B_control added to the first two buckets.
# NOTE(review): buckets are addressed by list position — presumably the order
# returned by list_buckets() is stable; confirm before relying on it.
my_bucket_ratio = {my_bucket[0]: 50 + A_control, my_bucket[1]: 10 + B_control, my_bucket[2]: 10}

# The notes in this section record that a total above 100 was accepted when
# tried, so validate the total here before it is sent.
total_ratio = sum(my_bucket_ratio.values())
if total_ratio != 100:
    raise ValueError(f'bucket ratios must sum to 100, got {total_ratio}')
my_bucket_ratio

In [None]:
experiment_client.update_bucket_ratio(my_experiment, my_bucket_ratio)

#### 변경 후 비율 다시 확인

In [None]:
my_bucket = experiment_client.list_buckets(
    experiment=my_experiment
)

In [None]:
# Print the bucket summary again to verify the ratios after the update.
# f-strings format every field uniformly, so a non-string attribute no longer
# raises TypeError the way bare `+` concatenation did for name / bucket_range.
separator = '--------------------------------------------------------'
print(separator)
for bucket in my_bucket:
    print(f'- 버킷명 : {bucket.name}')
    print(f'- ID : {bucket.id}')
    # bucket_range is the field to watch: per the printed label it is the
    # bucket's share of the total 100 %.
    print(f'- 버킷 비율 : {bucket.bucket_range} <--- 중요, 전체 100% 버킷중 비율 의미')
    print(f'- 세부내용 : {bucket.description}')
    print(f'- 실험명 : {bucket.experiment.name}')
    print(separator)
