### 1. 데이터 불러오기 및 전처리

In [None]:
import numpy as np
import joblib
import pandas as pd
import pymysql
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, KFold, train_test_split

# 그래프 관련 패키지
import seaborn as sns
import matplotlib.pyplot as plt

# Set the default matplotlib font family (presumably so Korean labels render -- confirm
# the font is installed) and echo the active setting.
plt.rc('font', family = 'NANUMBARUNGOTHIC')
print(plt.rcParams['font.family'])

%matplotlib inline

In [None]:
def summary_func(group):
    """Return the count, mean, std, min and max of ``group`` as a dict.

    ``group`` must expose the ``count``/``mean``/``std``/``min``/``max``
    methods (for example a pandas Series or a groupby selection).
    """
    stat_names = ('count', 'mean', 'std', 'min', 'max')
    return {stat: getattr(group, stat)() for stat in stat_names}

In [None]:
# Open the database connection (connection parameters intentionally omitted)
conn = pymysql.connect()

In [None]:
# Load the raw tables from the database and time the whole load.
import time

start = time.time()

# Daily search-ad stats aggregated per date/media/customer/campaign/ad group/keyword/device/area.
# adCost is divided by 1.1 when the media's vatYN flag is 'Y'.
# The date range is hard-coded inside the SQL text and must be edited there for each run.
df_query = """SELECT a.statDate, a.media_code, a.customerId, b.site_id, a.campaignId, a.adgroupName, a.keywordName, a.deviceType, a.displayArea, 
                    AVG(a.avgImpressionRank) AS avgImpressionRank, 
                    SUM(a.adCost / (CASE WHEN c.vatYN = 'Y' THEN 1.1 ELSE 1 END)) AS adCost, 
                    SUM(a.impression) AS impression, SUM(a.click) AS click
            FROM data9_search_ad_stat_daily a, data9_search_campaign b, data9_media_info c
            WHERE a.media_code = b.media_code AND b.media_code = c.media_code AND a.customerId = b.customerId AND a.campaignId = b.campaignId
                    AND statDate BETWEEN '20200101' AND '20210430' ### 가져오는 날짜 변경
            GROUP BY statDate, media_code, customerId, campaignId, adgroupName, keywordName, deviceType, displayArea"""

# Lookup tables: customer -> advertiser, advertiser names, and non-deleted sites.
customer_query = "SELECT media_code, customerId, customerName, advertiser_id FROM data9_customer_info"

advertiser_query = "SELECT advertiser_id, advertiser_name FROM data9_advertiser_info"

site_query = "SELECT site_id, site_name FROM data9_site_info WHERE deleteFlag = 'N'"

# GA conversion counts/values aggregated on the same keys and the same hard-coded date range as df_query.
ga_query = """SELECT statDate, media_code, customerId, campaignId, adgroupName, keywordName, deviceType, displayArea,
                    SUM(directBuyConvCnt) AS directBuyConvCnt, SUM(directBuyCostConvCnt) AS directBuyCostConvCnt, 
                    SUM(indirectBuyConvCnt) AS indirectBuyConvCnt, SUM(indirectBuyCostConvCnt) AS indirectBuyCostConvCnt
            FROM data9_search_ga_ad_stat_daily
            WHERE statDate BETWEEN '20200101' AND '20210430' ### 가져오는 날짜 변경
            GROUP BY statDate, media_code, customerId, campaignId, adgroupName,keywordName, deviceType, displayArea"""

df = pd.read_sql(df_query, conn)
customer_df = pd.read_sql(customer_query, conn)
advertiser_df = pd.read_sql(advertiser_query, conn)

site_df = pd.read_sql(site_query, conn)
ga_df = pd.read_sql(ga_query, conn)

# Trailing comments record the shapes seen in an earlier run.
print(df.shape) # 15,400,802*13
print(customer_df.shape) # 90*4
print(advertiser_df.shape) # 31*2

print(site_df.shape) # 96*2
print(ga_df.shape) # 42,269*16

end = time.time()
print(end - start) # 2,369 sec(40 min)

In [None]:
# Enrich the daily stats with customer, advertiser, site and GA conversion data.
# Every join is a left join, so each row of `df` is kept even without a match.
ga_keys = ['statDate', 'media_code', 'customerId', 'campaignId', 'adgroupName',
           'keywordName', 'deviceType', 'displayArea']

df2 = (
    df.merge(customer_df, how = 'left', on = ['media_code', 'customerId'])
      .merge(advertiser_df, how = 'left', on = 'advertiser_id')
      .merge(site_df, how = 'left', on = 'site_id')
      .merge(ga_df, how = 'left', on = ga_keys)
)

# Fix the column order of the working table.
output_columns = [
    'statDate', 'media_code', 'customerId', 'customerName', 'advertiser_id', 'advertiser_name',
    'site_id', 'site_name', 'campaignId', 'adgroupName', 'keywordName', 'deviceType', 'displayArea',
    'avgImpressionRank', 'adCost', 'impression', 'click', 'directBuyConvCnt', 'directBuyCostConvCnt',
    'indirectBuyConvCnt', 'indirectBuyCostConvCnt',
]
df2 = df2[output_columns]

print(df2.shape) # recorded run: 15,400,802*21
df2.head()

In [None]:
# Keep only rows with at least one impression.
# `.copy()` makes the filtered frame independent of the original, so the column
# assignments below do not write into a slice (avoids SettingWithCopyWarning).
df2 = df2[df2.impression > 0].copy()

# Click-through rate; safe because impression > 0 for every remaining row.
df2['ctr'] = df2.click / df2.impression
# Cost per click, defined as 0 when there were no clicks.
df2['cpc'] = np.where(df2.click == 0, 0, df2.adCost / df2.click)

In [None]:
# Count missing values per column (an earlier run recorded 111,993)
df2.isna().sum()

In [None]:
# Normalise a few column types and derive calendar features.
# Rows without a matching advertiser get the placeholder id 0 so the column can be cast to int.
df2['advertiser_id'] = df2['advertiser_id'].fillna(0).astype(int)

# Parse the YYYYMMDD value once. pd.to_datetime already returns a datetime column,
# so the former strftime -> astype('datetime64[ns]') round trip (a per-row string
# conversion over the whole table) is unnecessary.
df2['statDate'] = pd.to_datetime(df2['statDate'], format='%Y%m%d')
df2['statYear'] = df2['statDate'].dt.year
df2['statMonth'] = df2['statDate'].dt.month
df2['statWeekday'] = df2['statDate'].dt.day_name()

In [None]:
# Ad cost by day of the week (all media/devices combined)
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Sum ad cost per weekday, then put the rows in calendar order
# (label-based selection raises KeyError if a weekday is absent, as before).
summary_weekday = df2.groupby(['statWeekday']).agg({'adCost': 'sum'}).loc[weekday_order]

summary_weekday.plot(kind = 'bar', stacked = False, rot = 0, figsize = (20, 5))
plt.rc('font', size = 12)
plt.title('Ad Cost by Days of the Week')
plt.xticks(rotation = 0)
plt.xlabel('')
plt.ylabel('Ad Cost')
plt.show()

In [None]:
# Ad cost by day of the week per media/device combination.
# Analyst notes: PC shows a larger weekday/weekend gap than mobile. PC spend drops on
# weekends, while mobile spend rises or stays close to weekday levels. Naver mobile is
# the exception and resembles PC, presumably because a budget cap limited its spend.
df2['mediaDevice'] = df2['media_code'] + "_" + df2['deviceType']

mediaDevice_list = ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
color_list = ['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen']
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# One single-column frame per media/device, collected for the share table built later.
weekday_dt = []
for media_device, bar_color in zip(mediaDevice_list, color_list):
    summary_weekday_tmp = (
        df2[df2.mediaDevice == media_device]
        .groupby(['statWeekday'])
        .agg({'adCost': 'sum'})
        .loc[weekday_order]  # calendar order; KeyError if a weekday is missing
        .rename(columns = {'adCost': media_device})
    )
    weekday_dt.append(summary_weekday_tmp)

    summary_weekday_tmp.plot(kind = 'bar', stacked = False, rot = 0, figsize = (20, 5), color = bar_color)
    plt.rc('font', size = 12)
    plt.title('Ad Cost by Days of the Week - {}'.format(media_device))
    plt.xticks(rotation = 0)
    plt.xlabel('')
    plt.ylabel('Ad Cost')
    plt.show()

In [None]:
# Combine the per-media/device weekday totals (rows: weekday, columns: media/device).
summary_weekday = pd.concat(weekday_dt, axis = 1)

# Total spend per media/device as a column vector so it broadcasts across the weekday columns.
total_weekday = summary_weekday.sum(axis = 0).to_numpy().reshape(-1, 1)

# Rows become media/device; each row is divided by its own total to give weekday shares.
summary_weekday = summary_weekday.T / total_weekday

# Persist the share table. The execution year-month in the file name must be updated per run.
summary_weekday.to_csv('/home/anaconda3/da/data/media mix_data9/data9_mediamix_weekday_2104.csv')

In [None]:
# Collection period check (data starts at 2020-01): row count per year/month
statDate_count = (
    df2.groupby(['statYear', 'statMonth'])
       .agg({'media_code': 'count'})
       .reset_index()
       .rename(columns = {'media_code': 'cnt'})
)
statDate_count.sort_values(by = ['statYear', 'statMonth'], ascending = True)

In [None]:
# Campaign-level summary: one row per advertiser/site/media/device/year/month
campaign_keys = ['advertiser_id', 'advertiser_name', 'site_id', 'site_name',
                 'media_code', 'deviceType', 'statYear', 'statMonth']

# The impression rank is averaged; every other metric is summed.
sum_metrics = ['adCost', 'impression', 'click', 'directBuyConvCnt', 'directBuyCostConvCnt',
               'indirectBuyConvCnt', 'indirectBuyCostConvCnt']
metric_aggs = {'avgImpressionRank': 'mean', **dict.fromkeys(sum_metrics, 'sum')}

summary_df = df2.groupby(campaign_keys).agg(metric_aggs).reset_index()

# Match the precision used by the stored table
summary_df['avgImpressionRank'] = summary_df['avgImpressionRank'].round(2)

print(summary_df.shape)
summary_df.head() # recorded run: 2,536*16

---

### 1. 요약 테이블 불러오기

In [None]:
import numpy as np
import joblib
import pandas as pd
import pymysql
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate, KFold, train_test_split

# 그래프 관련 패키지
import seaborn as sns
import matplotlib.pyplot as plt

# Set the default matplotlib font family (presumably so Korean labels render -- confirm
# the font is installed) and echo the active setting.
plt.rc('font', family = 'NANUMBARUNGOTHIC')
print(plt.rcParams['font.family'])

%matplotlib inline

['NANUMBARUNGOTHIC']


In [None]:
# Open the database connection (connection parameters intentionally omitted)
conn = pymysql.connect()

In [None]:
# Load the stored campaign summary table in full
summary_query = "SELECT * FROM data9_search_summary"
summary_df = pd.read_sql(sql = summary_query, con = conn)

print(summary_df.shape) # recorded run: 2,536*16

(2536, 16)


In [None]:
# Build the campaign key (advertiser_site_year_month) and the media/device key
summary_df = summary_df.astype({'statYear': str, 'statMonth': str})
summary_df['campaign'] = (
    summary_df['advertiser_name'] + "_" + summary_df['site_name']
    + "_" + summary_df['statYear'] + "_" + summary_df['statMonth']
)
summary_df['mediaDevice'] = summary_df['media_code'] + "_" + summary_df['deviceType']

# Ad cost per campaign (rows) and media/device (columns); missing combinations count as 0
campaign_slot = (
    summary_df.groupby(['campaign', 'mediaDevice'])['adCost']
              .sum()
              .unstack('mediaDevice')
              .fillna(0)
)

# Row totals as a column vector so the division broadcasts across the media/device columns
total_adCost = campaign_slot.sum(axis = 1).to_numpy().reshape(-1, 1)

campaign_slot_ratio = (campaign_slot / total_adCost).reset_index()
# Campaigns whose total ad cost is 0 produce NaN shares (0/0) and are dropped here
campaign_slot_ratio = campaign_slot_ratio[campaign_slot_ratio.daum_M.notnull()]

# Match the precision used by the stored table
for share_col in ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']:
    campaign_slot_ratio[share_col] = campaign_slot_ratio[share_col].round(4)

print(campaign_slot_ratio.shape) # recorded run: 1,028*7
campaign_slot_ratio.head()

(1028, 7)


mediaDevice,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,MBC아카데미_가맹_광주_2020_10,0.0,0.0,0.0,0.0,0.7717,0.2283
1,MBC아카데미_가맹_광주_2020_11,0.0,0.0,0.0,0.0,0.8816,0.1184
2,MBC아카데미_가맹_광주_2020_12,0.0,0.0,0.0,0.0,0.9016,0.0984
3,MBC아카데미_가맹_광주_2020_2,0.0,0.0,0.0,0.0,0.8581,0.1419
4,MBC아카데미_가맹_광주_2020_3,0.0,0.0,0.0,0.0,0.6399,0.3601


In [None]:
# Campaign summary rolled up to site/month (media and device are aggregated away)
site_month_keys = ['advertiser_id', 'advertiser_name', 'site_id', 'site_name', 'statYear', 'statMonth']
sum_metrics = ['adCost', 'impression', 'click', 'directBuyConvCnt', 'directBuyCostConvCnt',
               'indirectBuyConvCnt', 'indirectBuyCostConvCnt']

summary_df_fin = (
    summary_df.groupby(site_month_keys)
              .agg({'avgImpressionRank': 'mean', **dict.fromkeys(sum_metrics, 'sum')})
              .reset_index()
)

# Total conversions / conversion value = direct + indirect
summary_df_fin['buyConvCnt'] = summary_df_fin['directBuyConvCnt'].add(summary_df_fin['indirectBuyConvCnt'])
summary_df_fin['buyCostConvCnt'] = summary_df_fin['directBuyCostConvCnt'].add(summary_df_fin['indirectBuyCostConvCnt'])

# KPIs; each one is defined as 0 when its denominator is 0
ad_cost = summary_df_fin['adCost']
clicks = summary_df_fin['click']
conversions = summary_df_fin['buyConvCnt']
conversion_value = summary_df_fin['buyCostConvCnt']

summary_df_fin['cpc'] = np.where(clicks == 0, 0, ad_cost / clicks)
summary_df_fin['cpa'] = np.where(conversions == 0, 0, ad_cost / conversions)
summary_df_fin['roas'] = np.where(ad_cost == 0, 0, conversion_value / ad_cost)

print(summary_df_fin.shape)
summary_df_fin.head() # recorded run: 1,061*19

(1061, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,avgImpressionRank,adCost,impression,click,directBuyConvCnt,directBuyCostConvCnt,indirectBuyConvCnt,indirectBuyCostConvCnt,buyConvCnt,buyCostConvCnt,cpc,cpa,roas
0,13,에스티유니타스,103,중고등_스카이에듀ON,2020,11,0.0,1346117,105327,2263,71,4032027,0,0,71,4032027,594.837384,18959.39,2.995302
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,1.3675,14296901,340148,1696,2,107004,0,0,2,107004,8429.776533,7148450.0,0.007484419
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,1.6225,53954015,579732,3016,0,16,0,0,0,16,17889.262268,0.0,2.965488e-07
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,1.335,9272654,484502,2318,2,0,0,0,2,0,4000.28214,4636327.0,0.0
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,6,1.27,283826,23339,65,0,0,0,0,0,0,4366.553846,0.0,0.0


#### 1-1. 클릭수 최대화(CPC)

In [None]:
# Keep only sites with enough history: at least 5 distinct non-zero CPC values
summary_df_fin_cpc = summary_df_fin[summary_df_fin.cpc != 0] # drop rows where the KPI is 0
summary_df_fin_cpc_tmp = summary_df_fin_cpc[['site_id', 'cpc']].drop_duplicates()

# Number of distinct CPC values per site
summary_df_fin_cpc_cnt = (
    summary_df_fin_cpc_tmp.groupby(['site_id'])
                          .agg({'cpc': 'count'})
                          .reset_index()
                          .rename(columns = {'cpc': 'cnt'})
)
summary_df_fin_cpc_cnt = summary_df_fin_cpc_cnt.loc[summary_df_fin_cpc_cnt['cnt'] >= 5]

# Left join then drop unmatched rows, i.e. keep only rows of qualifying sites
summary_df_fin_cpc2 = (
    summary_df_fin_cpc.merge(summary_df_fin_cpc_cnt, how = 'left', on = 'site_id')
                      .dropna(subset = ['cnt'])
)

In [None]:
# Assign a KPI group (tercile) per site: the lowest-CPC third gets label 3, the highest gets 1.
# NOTE(review): sites were counted by site_id above but are grouped by site_name here --
# confirm the two are one-to-one.
# NOTE(review): with duplicates='drop', pd.qcut may end up with fewer than three bins, in
# which case the three fixed labels would no longer match -- confirm this cannot occur.
group_cpc = []

for site in summary_df_fin_cpc2.site_name.unique():
    # Copy only this site's rows (previously the whole frame was copied on every iteration).
    summary_df_fin_cpc2_tmp = summary_df_fin_cpc2[summary_df_fin_cpc2.site_name == site].copy()
    summary_df_fin_cpc2_tmp['cpc_cut'] = pd.qcut(summary_df_fin_cpc2_tmp.cpc, q = 3, labels = [3, 2, 1], duplicates = 'drop')
    group_cpc.append(summary_df_fin_cpc2_tmp)

group_cpc = pd.concat(group_cpc)
print(group_cpc.shape) # recorded run: 995*21
group_cpc.head()

(995, 21)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,avgImpressionRank,adCost,impression,click,...,directBuyCostConvCnt,indirectBuyConvCnt,indirectBuyCostConvCnt,buyConvCnt,buyCostConvCnt,cpc,cpa,roas,cnt,cpc_cut
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,1.3675,14296901,340148,1696,...,107004,0,0,2,107004,8429.776533,7148450.5,0.007484419,8.0,2
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,1.6225,53954015,579732,3016,...,16,0,0,0,16,17889.262268,0.0,2.965488e-07,8.0,1
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,1.335,9272654,484502,2318,...,0,0,0,2,0,4000.28214,4636327.0,0.0,8.0,3
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,6,1.27,283826,23339,65,...,0,0,0,0,0,4366.553846,0.0,0.0,8.0,3
5,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,2.355,7773640,347300,656,...,1,0,0,0,1,11850.060976,0.0,1.286399e-07,8.0,1


In [None]:
# Build the CPC data mart: KPI group per campaign joined with its media/device spend shares
datamart_columns = ['advertiser_id', 'advertiser_name', 'site_id', 'site_name', 'statYear', 'statMonth',
                    'adCost', 'click', 'buyConvCnt', 'buyCostConvCnt', 'cpc_cut']
group_cpc = group_cpc[datamart_columns].astype({'statYear': str, 'statMonth': str})

# Same campaign key format as campaign_slot_ratio (advertiser_site_year_month)
group_cpc['campaign'] = (
    group_cpc['advertiser_name'] + "_" + group_cpc['site_name']
    + "_" + group_cpc['statYear'] + "_" + group_cpc['statMonth']
)

cpc_df = group_cpc.merge(campaign_slot_ratio, how = 'left', on = 'campaign')
# Campaigns without a share row (total ad cost of 0) have no daum_M value and are dropped
cpc_df = cpc_df.dropna(subset = ['daum_M'])

print(cpc_df.shape) # recorded run: 995*18
cpc_df.head()

(995, 18)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpc_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,2,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,53954015,3016,0,16,1,에스티유니타스_중고등_스카이에듀OFF_2020_12,0.0,0.0,0.0091,0.0127,0.6548,0.3234
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,9272654,2318,2,0,3,에스티유니타스_중고등_스카이에듀OFF_2020_5,0.0,0.0,0.2901,0.2486,0.3332,0.1281
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,6,283826,65,0,0,3,에스티유니타스_중고등_스카이에듀OFF_2020_6,0.0,0.0,0.2268,0.2159,0.3735,0.1838
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,7773640,656,0,1,1,에스티유니타스_중고등_스카이에듀OFF_2021_1,0.0,0.0,0.0,0.0,0.6782,0.3218


In [None]:
# Data mart (site level): keep only the rows of one specific site
cpc_site = cpc_df.loc[cpc_df['site_name'] == '수험1팀_공단기']
print(cpc_site.shape) # recorded run: 13*18
cpc_site.head()

(13, 18)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpc_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
230,13,에스티유니타스,85,수험1팀_공단기,2020,10,38489625,73855,2136,448139096,2,에스티유니타스_수험1팀_공단기_2020_10,0.0,0.0,0.3245,0.2045,0.2359,0.235
231,13,에스티유니타스,85,수험1팀_공단기,2020,11,65689145,75026,2422,513680372,1,에스티유니타스_수험1팀_공단기_2020_11,0.0,0.0,0.3787,0.3328,0.1523,0.1362
232,13,에스티유니타스,85,수험1팀_공단기,2020,12,100097113,82381,3857,853139673,1,에스티유니타스_수험1팀_공단기_2020_12,0.0,0.0,0.2481,0.4462,0.143,0.1627
233,13,에스티유니타스,85,수험1팀_공단기,2020,4,13730661,23081,0,0,2,에스티유니타스_수험1팀_공단기_2020_4,0.0,0.0,0.0395,0.0954,0.6194,0.2458
234,13,에스티유니타스,85,수험1팀_공단기,2020,5,15888339,26323,145,0,2,에스티유니타스_수험1팀_공단기_2020_5,0.0,0.0,0.0172,0.0681,0.6887,0.226


In [None]:
# Data mart (advertiser level): keep only the rows of one specific advertiser
cpc_ad = cpc_df.loc[cpc_df['advertiser_name'] == '에스티유니타스']
print(cpc_ad.shape) # recorded run: 355*18
cpc_ad.head()

(355, 18)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpc_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,2,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,53954015,3016,0,16,1,에스티유니타스_중고등_스카이에듀OFF_2020_12,0.0,0.0,0.0091,0.0127,0.6548,0.3234
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,9272654,2318,2,0,3,에스티유니타스_중고등_스카이에듀OFF_2020_5,0.0,0.0,0.2901,0.2486,0.3332,0.1281
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,6,283826,65,0,0,3,에스티유니타스_중고등_스카이에듀OFF_2020_6,0.0,0.0,0.2268,0.2159,0.3735,0.1838
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,7773640,656,0,1,1,에스티유니타스_중고등_스카이에듀OFF_2021_1,0.0,0.0,0.0,0.0,0.6782,0.3218


#### 1-2. 전환수 최대화(CPA)

In [None]:
# Keep only sites with enough history: at least 5 distinct non-zero CPA values
summary_df_fin_cpa = summary_df_fin[summary_df_fin.cpa != 0] # drop rows where the KPI is 0
summary_df_fin_cpa_tmp = summary_df_fin_cpa[['site_id', 'cpa']].drop_duplicates()

# Number of distinct CPA values per site
summary_df_fin_cpa_cnt = (
    summary_df_fin_cpa_tmp.groupby(['site_id'])
                          .agg({'cpa': 'count'})
                          .reset_index()
                          .rename(columns = {'cpa': 'cnt'})
)
summary_df_fin_cpa_cnt = summary_df_fin_cpa_cnt.loc[summary_df_fin_cpa_cnt['cnt'] >= 5]

# Left join then drop unmatched rows, i.e. keep only rows of qualifying sites
summary_df_fin_cpa2 = (
    summary_df_fin_cpa.merge(summary_df_fin_cpa_cnt, how = 'left', on = 'site_id')
                      .dropna(subset = ['cnt'])
)

In [None]:
# Assign a KPI group (tercile) per site. CPA is reversed: group 3 is the lowest-CPA third.
# NOTE(review): sites were counted by site_id above but are grouped by site_name here --
# confirm the two are one-to-one.
# NOTE(review): with duplicates='drop', pd.qcut may end up with fewer than three bins, in
# which case the three fixed labels would no longer match -- confirm this cannot occur.
group_cpa = []

for site in summary_df_fin_cpa2.site_name.unique():
    # Copy only this site's rows (previously the whole frame was copied on every iteration).
    summary_df_fin_cpa2_tmp = summary_df_fin_cpa2[summary_df_fin_cpa2.site_name == site].copy()
    summary_df_fin_cpa2_tmp['cpa_cut'] = pd.qcut(summary_df_fin_cpa2_tmp.cpa, q = 3, labels = [3, 2, 1], duplicates = 'drop')
    group_cpa.append(summary_df_fin_cpa2_tmp)

group_cpa = pd.concat(group_cpa)
print(group_cpa.shape) # recorded run: 265*21
group_cpa.head()

(265, 21)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,avgImpressionRank,adCost,impression,click,...,directBuyCostConvCnt,indirectBuyConvCnt,indirectBuyCostConvCnt,buyConvCnt,buyCostConvCnt,cpc,cpa,roas,cnt,cpa_cut
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,1.3675,14296901,340148,1696,...,107004,0,0,2,107004,8429.776533,7148450.0,0.007484,5.0,1
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,1.335,9272654,484502,2318,...,0,0,0,2,0,4000.28214,4636327.0,0.0,5.0,1
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,1.3025,59513313,2351398,4410,...,1410742,2,27000,48,1437742,13495.082313,1239861.0,0.024158,5.0,2
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,1.1575,43443813,2128592,3903,...,1605037,4,73000,60,1678037,11130.877018,724063.6,0.038625,5.0,3
5,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,4,1.633333,11053426,716829,1522,...,810507,0,0,31,810507,7262.434954,356562.1,0.073326,5.0,3


In [None]:
# Build the CPA data mart: KPI value and group per campaign joined with its media/device spend shares
datamart_columns = ['advertiser_id', 'advertiser_name', 'site_id', 'site_name', 'statYear', 'statMonth',
                    'adCost', 'click', 'buyConvCnt', 'buyCostConvCnt', 'cpa', 'cpa_cut']
group_cpa = group_cpa[datamart_columns].astype({'statYear': str, 'statMonth': str})

# Same campaign key format as campaign_slot_ratio (advertiser_site_year_month)
group_cpa['campaign'] = (
    group_cpa['advertiser_name'] + "_" + group_cpa['site_name']
    + "_" + group_cpa['statYear'] + "_" + group_cpa['statMonth']
)

cpa_df = group_cpa.merge(campaign_slot_ratio, how = 'left', on = 'campaign')
# Campaigns without a share row (total ad cost of 0) have no daum_M value and are dropped
cpa_df = cpa_df.dropna(subset = ['daum_M'])

print(cpa_df.shape) # recorded run: 265*19
cpa_df.head()

(265, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpa,cpa_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,7148450.0,1,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,9272654,2318,2,0,4636327.0,1,에스티유니타스_중고등_스카이에듀OFF_2020_5,0.0,0.0,0.2901,0.2486,0.3332,0.1281
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,59513313,4410,48,1437742,1239861.0,2,에스티유니타스_중고등_스카이에듀OFF_2021_2,0.0,0.0,0.0452,0.08,0.6867,0.1881
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,43443813,3903,60,1678037,724063.6,3,에스티유니타스_중고등_스카이에듀OFF_2021_3,0.0,0.0,0.0313,0.0906,0.6218,0.2563
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,4,11053426,1522,31,810507,356562.1,3,에스티유니타스_중고등_스카이에듀OFF_2021_4,0.0,0.0,0.0,0.1065,0.6537,0.2398


In [None]:
# Data mart (site level): keep only the rows of one specific site
cpa_site = cpa_df.loc[cpa_df['site_name'] == '수험1팀_공단기']
print(cpa_site.shape) # recorded run: 12*19
cpa_site.head()

(12, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpa,cpa_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
166,13,에스티유니타스,85,수험1팀_공단기,2020,10,38489625,73855,2136,448139096,18019.48736,2,에스티유니타스_수험1팀_공단기_2020_10,0.0,0.0,0.3245,0.2045,0.2359,0.235
167,13,에스티유니타스,85,수험1팀_공단기,2020,11,65689145,75026,2422,513680372,27121.860033,1,에스티유니타스_수험1팀_공단기_2020_11,0.0,0.0,0.3787,0.3328,0.1523,0.1362
168,13,에스티유니타스,85,수험1팀_공단기,2020,12,100097113,82381,3857,853139673,25952.064558,1,에스티유니타스_수험1팀_공단기_2020_12,0.0,0.0,0.2481,0.4462,0.143,0.1627
169,13,에스티유니타스,85,수험1팀_공단기,2020,5,15888339,26323,145,0,109574.751724,1,에스티유니타스_수험1팀_공단기_2020_5,0.0,0.0,0.0172,0.0681,0.6887,0.226
170,13,에스티유니타스,85,수험1팀_공단기,2020,6,18454757,72203,887,0,20805.81398,2,에스티유니타스_수험1팀_공단기_2020_6,0.0,0.0,0.0654,0.0454,0.5792,0.3101


In [None]:
# Data mart (advertiser level): keep only the rows of one specific advertiser
cpa_ad = cpa_df.loc[cpa_df['advertiser_name'] == '에스티유니타스']
print(cpa_ad.shape) # recorded run: 265*19
cpa_ad.head()

(265, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,cpa,cpa_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,7148450.0,1,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,5,9272654,2318,2,0,4636327.0,1,에스티유니타스_중고등_스카이에듀OFF_2020_5,0.0,0.0,0.2901,0.2486,0.3332,0.1281
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,59513313,4410,48,1437742,1239861.0,2,에스티유니타스_중고등_스카이에듀OFF_2021_2,0.0,0.0,0.0452,0.08,0.6867,0.1881
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,43443813,3903,60,1678037,724063.6,3,에스티유니타스_중고등_스카이에듀OFF_2021_3,0.0,0.0,0.0313,0.0906,0.6218,0.2563
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,4,11053426,1522,31,810507,356562.1,3,에스티유니타스_중고등_스카이에듀OFF_2021_4,0.0,0.0,0.0,0.1065,0.6537,0.2398


#### 1-3. 매출 최대화(ROAS)

In [None]:
# Keep only sites with enough history: at least 5 distinct non-zero ROAS values
summary_df_fin_roas = summary_df_fin[summary_df_fin.roas != 0] # drop rows where the KPI is 0
summary_df_fin_roas_tmp = summary_df_fin_roas[['site_id', 'roas']].drop_duplicates()

# Number of distinct ROAS values per site
summary_df_fin_roas_cnt = (
    summary_df_fin_roas_tmp.groupby(['site_id'])
                           .agg({'roas': 'count'})
                           .reset_index()
                           .rename(columns = {'roas': 'cnt'})
)
summary_df_fin_roas_cnt = summary_df_fin_roas_cnt.loc[summary_df_fin_roas_cnt['cnt'] >= 5]

# Left join then drop unmatched rows, i.e. keep only rows of qualifying sites
summary_df_fin_roas2 = (
    summary_df_fin_roas.merge(summary_df_fin_roas_cnt, how = 'left', on = 'site_id')
                       .dropna(subset = ['cnt'])
)

In [None]:
# Assign a KPI group (tercile) per site: the lowest-ROAS third gets label 1, the highest gets 3.
# NOTE(review): sites were counted by site_id above but are grouped by site_name here --
# confirm the two are one-to-one.
# NOTE(review): with duplicates='drop', pd.qcut may end up with fewer than three bins, in
# which case the three fixed labels would no longer match -- confirm this cannot occur.
group_roas = []

for site in summary_df_fin_roas2.site_name.unique():
    # Copy only this site's rows (previously the whole frame was copied on every iteration).
    summary_df_fin_roas2_tmp = summary_df_fin_roas2[summary_df_fin_roas2.site_name == site].copy()
    summary_df_fin_roas2_tmp['roas_cut'] = pd.qcut(summary_df_fin_roas2_tmp.roas, q = 3, labels = [1, 2, 3], duplicates = 'drop')
    group_roas.append(summary_df_fin_roas2_tmp)

group_roas = pd.concat(group_roas)
print(group_roas.shape) # recorded run: 183*21
group_roas.head()

(183, 21)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,avgImpressionRank,adCost,impression,click,...,directBuyCostConvCnt,indirectBuyConvCnt,indirectBuyCostConvCnt,buyConvCnt,buyCostConvCnt,cpc,cpa,roas,cnt,roas_cut
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,1.3675,14296901,340148,1696,...,107004,0,0,2,107004,8429.776533,7148450.0,0.007484419,6.0,2
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,1.6225,53954015,579732,3016,...,16,0,0,0,16,17889.262268,0.0,2.965488e-07,6.0,1
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,2.355,7773640,347300,656,...,1,0,0,0,1,11850.060976,0.0,1.286399e-07,6.0,1
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,1.3025,59513313,2351398,4410,...,1410742,2,27000,48,1437742,13495.082313,1239861.0,0.02415833,6.0,2
5,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,1.1575,43443813,2128592,3903,...,1605037,4,73000,60,1678037,11130.877018,724063.6,0.03862545,6.0,3


In [None]:
# Build the ROAS data mart: KPI value and group per campaign joined with its media/device spend shares
datamart_columns = ['advertiser_id', 'advertiser_name', 'site_id', 'site_name', 'statYear', 'statMonth',
                    'adCost', 'click', 'buyConvCnt', 'buyCostConvCnt', 'roas', 'roas_cut']
group_roas = group_roas[datamart_columns].astype({'statYear': str, 'statMonth': str})

# Same campaign key format as campaign_slot_ratio (advertiser_site_year_month)
group_roas['campaign'] = (
    group_roas['advertiser_name'] + "_" + group_roas['site_name']
    + "_" + group_roas['statYear'] + "_" + group_roas['statMonth']
)

roas_df = group_roas.merge(campaign_slot_ratio, how = 'left', on = 'campaign')
# Campaigns without a share row (total ad cost of 0) have no daum_M value and are dropped
roas_df = roas_df.dropna(subset = ['daum_M'])

print(roas_df.shape) # recorded run: 183*19
roas_df.head()

(183, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,roas,roas_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,0.007484419,2,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,53954015,3016,0,16,2.965488e-07,1,에스티유니타스_중고등_스카이에듀OFF_2020_12,0.0,0.0,0.0091,0.0127,0.6548,0.3234
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,7773640,656,0,1,1.286399e-07,1,에스티유니타스_중고등_스카이에듀OFF_2021_1,0.0,0.0,0.0,0.0,0.6782,0.3218
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,59513313,4410,48,1437742,0.02415833,2,에스티유니타스_중고등_스카이에듀OFF_2021_2,0.0,0.0,0.0452,0.08,0.6867,0.1881
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,43443813,3903,60,1678037,0.03862545,3,에스티유니타스_중고등_스카이에듀OFF_2021_3,0.0,0.0,0.0313,0.0906,0.6218,0.2563


In [None]:
# Data mart (site level): keep only the rows of one specific site
roas_site = roas_df.loc[roas_df['site_name'] == '수험1팀_공단기']
print(roas_site.shape) # recorded run: 8*19
roas_site.head()

(8, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,roas,roas_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
116,13,에스티유니타스,85,수험1팀_공단기,2020,10,38489625,73855,2136,448139096,11.643114,2,에스티유니타스_수험1팀_공단기_2020_10,0.0,0.0,0.3245,0.2045,0.2359,0.235
117,13,에스티유니타스,85,수험1팀_공단기,2020,11,65689145,75026,2422,513680372,7.819867,1,에스티유니타스_수험1팀_공단기_2020_11,0.0,0.0,0.3787,0.3328,0.1523,0.1362
118,13,에스티유니타스,85,수험1팀_공단기,2020,12,100097113,82381,3857,853139673,8.52312,1,에스티유니타스_수험1팀_공단기_2020_12,0.0,0.0,0.2481,0.4462,0.143,0.1627
119,13,에스티유니타스,85,수험1팀_공단기,2020,9,27190323,75156,1181,303785747,11.172569,2,에스티유니타스_수험1팀_공단기_2020_9,0.0,0.0,0.1788,0.1603,0.3493,0.3116
120,13,에스티유니타스,85,수험1팀_공단기,2021,1,87098171,105042,5634,1076226063,12.356471,3,에스티유니타스_수험1팀_공단기_2021_1,0.0,0.0,0.2032,0.3534,0.2291,0.2143


In [None]:
# Data mart (advertiser level): keep only the rows of one specific advertiser
roas_ad = roas_df.loc[roas_df['advertiser_name'] == '에스티유니타스']
print(roas_ad.shape) # recorded run: 183*19
roas_ad.head()

(183, 19)


Unnamed: 0,advertiser_id,advertiser_name,site_id,site_name,statYear,statMonth,adCost,click,buyConvCnt,buyCostConvCnt,roas,roas_cut,campaign,daum_M,daum_P,google_M,google_P,naver_M,naver_P
0,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,11,14296901,1696,2,107004,0.007484419,2,에스티유니타스_중고등_스카이에듀OFF_2020_11,0.0,0.0,0.146,0.1845,0.5269,0.1426
1,13,에스티유니타스,104,중고등_스카이에듀OFF,2020,12,53954015,3016,0,16,2.965488e-07,1,에스티유니타스_중고등_스카이에듀OFF_2020_12,0.0,0.0,0.0091,0.0127,0.6548,0.3234
2,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,1,7773640,656,0,1,1.286399e-07,1,에스티유니타스_중고등_스카이에듀OFF_2021_1,0.0,0.0,0.0,0.0,0.6782,0.3218
3,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,2,59513313,4410,48,1437742,0.02415833,2,에스티유니타스_중고등_스카이에듀OFF_2021_2,0.0,0.0,0.0452,0.08,0.6867,0.1881
4,13,에스티유니타스,104,중고등_스카이에듀OFF,2021,3,43443813,3903,60,1678037,0.03862545,3,에스티유니타스_중고등_스카이에듀OFF_2021_3,0.0,0.0,0.0313,0.0906,0.6218,0.2563


### 2. 모형 구축

#### 2-1. 클릭수 최대화(CPC)

##### 2-1-1. 클릭수(전체)

In [None]:
# Prepare the dataset: the KPI group is the single feature and the six media/device
# spend shares are the multi-output target.
# NOTE(review): `click_df` / `click_cut` are not defined anywhere in the visible notebook;
# the data-prep cells produce `cpc_df` with a `cpc_cut` column, and its recorded 995 rows
# match the 796/199 split recorded below. The alias makes the notebook runnable
# top-to-bottom -- confirm this is the intended source table.
click_df = cpc_df.rename(columns = {'cpc_cut': 'click_cut'})

click_X = click_df[['click_cut']]
click_y = click_df[['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']]

# 5-fold cross-validation with shuffling and a fixed seed
cv = KFold(n_splits = 5, shuffle = True, random_state = 3)

# Linear regression used as a multi-output regressor; the fitted estimator of each fold is kept
click_model = LinearRegression()
click_results = cross_validate(click_model, click_X, click_y, scoring = 'r2', cv = cv, n_jobs = -1, return_estimator = True)

print('cross-val-score:', click_results['test_score'])
print('cross-val-score.mean:{:.3f}'.format(click_results['test_score'].mean())) # recorded run: r2 = -0.052

In [None]:
# # 변수 중요도 그래프: click_cnt >>> cpc > click
# sum = 0

# for i in range(len(click_results['estimator'])):
#     sum += click_results['estimator'][i].coef_.sum(axis = 0)

# click_importance = pd.DataFrame(abs(sum), index = click_X.columns, columns = ["importance"])
# click_importance = click_importance.sort_values(by = 'importance', ascending = False)

# click_importance.plot.bar(stacked = False, rot = 0, figsize = (20, 6))
# plt.rc('font', size = 10)
# plt.title('Feature Importance')
# plt.xticks(rotation = 90)
# plt.ylabel('importance')
# plt.show()

In [None]:
# Split into training and test sets (80/20, fixed seed)
X_train_click, X_test_click, y_train_click, y_test_click = train_test_split(
    click_X, click_y, test_size = 0.2, random_state = 2)

# Recorded run: 796 training rows, then 199 test rows
for features, targets in ((X_train_click, y_train_click), (X_test_click, y_test_click)):
    print(features.shape, targets.shape)

In [None]:
# Final model: fit on the training split and score on the held-out split
click_reg = LinearRegression()
click_reg.fit(X_train_click, y_train_click)
click_pred = click_reg.predict(X_test_click)

# R^2 under both multi-output averaging schemes
# (recorded run: uniform_average = -0.25, variance_weighted = -0.008)
for averaging in ('uniform_average', 'variance_weighted'):
    print(f'{averaging}:', r2_score(y_test_click, click_pred, multioutput = averaging))

In [None]:
# Persist the fitted model to disk
joblib.dump(click_reg, '/home/anaconda3/da/data/media mix_data9/mix_model_click.pkl')

In [None]:
# Convert the raw predictions into proportions that sum to 1 per row
# (softmax is not used because it changes too gently)
click_pd_limit = 0.001

# Predictions below the threshold (including negative ones) are treated as 0
clipped_pred = np.where(click_pred < click_pd_limit, 0, click_pred)

# Row totals; a total of 0 is replaced with a tiny value so the division below is defined
row_totals = clipped_pred.sum(axis = 1, keepdims = True)
click_pred_sum = np.where(row_totals == 0, 0.00001, row_totals)

click_pred2 = pd.DataFrame(clipped_pred)

# Share of each media/device within its row; zeroed predictions stay exactly 0
click_pred_pb = pd.DataFrame(
    np.where(clipped_pred == 0, 0, clipped_pred / click_pred_sum),
    columns = click_y.columns,
)

In [None]:
# Predicted media/device spend share per KPI group
media_columns = ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']

# Put the test features next to their predicted shares (both re-indexed from 0)
click_pred_dt = pd.concat(
    [X_test_click.reset_index(drop = True), click_pred_pb.reindex(columns = click_y.columns)],
    axis = 1,
)
click_pred_pivot = click_pred_dt.groupby(['click_cut']).agg(dict.fromkeys(media_columns, 'mean'))

click_pred_pivot.plot.barh(stacked = True, figsize = (20, 8),
                           color = ['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'])
plt.rc('font', size = 12)
plt.title('Media Proportion by Click(pred)')
plt.xticks(rotation = 90)
plt.legend(loc = 'upper right')
plt.show()

In [None]:
# Display the mean predicted share table per KPI group
click_pred_pivot

##### 2-1-2. 클릭수(사이트)

In [None]:
# Prepare the site-level dataset; a site-level model is built when the site has
# 12+ months of spend data.
# NOTE(review): `click_site` / `click_cut` are not defined anywhere in the visible notebook;
# the data-prep cells produce `cpc_site` with a `cpc_cut` column. The alias makes the
# notebook runnable top-to-bottom -- confirm this is the intended source table.
click_site = cpc_site.rename(columns = {'cpc_cut': 'click_cut'})

click_X_site = click_site[['click_cut']]
click_y_site = click_site[['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']]

# 5-fold cross-validation with shuffling and a fixed seed
cv = KFold(n_splits = 5, shuffle = True, random_state = 3)

# Linear regression used as a multi-output regressor; the fitted estimator of each fold is kept
click_model_site = LinearRegression()
click_results_site = cross_validate(click_model_site, click_X_site, click_y_site, scoring = 'r2', cv = cv, n_jobs = -1,
                                    return_estimator = True)

print('cross-val-score:', click_results_site['test_score'])
print('cross-val-score.mean:{:.3f}'.format(click_results_site['test_score'].mean())) # recorded run: r2 = -208.138

In [None]:
# Final model. Per the author, the site-level data is too small to split,
# so the model is fitted and scored on the same rows (the R^2 below is in-sample).
click_reg_site = LinearRegression()
click_reg_site.fit(click_X_site, click_y_site)
click_pred_site = click_reg_site.predict(click_X_site)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=click_y_site, y_pred=click_pred_site, multioutput='uniform_average'),
)  # author's recorded result: r2 = 0.52
print(
    'variance_weighted:',
    r2_score(y_true=click_y_site, y_pred=click_pred_site, multioutput='variance_weighted'),
)  # author's recorded result: r2 = 0.31

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
click_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
click_pred2_site = np.array(click_pred_site)
click_pred2_site[click_pred2_site < click_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
click_pred_sum_site = click_pred2_site.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
click_pred_sum_site = np.where(click_pred_sum_site == 0, 0.00001, click_pred_sum_site)

click_pred2_site = pd.DataFrame(click_pred2_site)

# Share = value / row total, with zeroed entries staying exactly 0.
click_pred_pb_site = pd.DataFrame(
    np.where(click_pred2_site == 0, 0, click_pred2_site / click_pred_sum_site),
    columns=click_y_site.columns,
)

In [None]:
# Predicted media ad-cost share per group (site-level model).
# The features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
click_pred_dt_site = pd.concat(
    [
        click_X_site.reset_index(drop=True),
        pd.DataFrame(click_pred_pb_site, columns=click_y_site.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each click_cut group.
click_pred_pivot_site = click_pred_dt_site.groupby(['click_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
click_pred_pivot_site.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by Click(pred - site)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per click_cut group (site-level model).
click_pred_pivot_site

In [None]:
# Ad-hoc check (disabled): predict with the site-level click model for a single input value of 3.
# data = pd.DataFrame([3], index = [0])
# click_reg_site.predict(data)

In [None]:
# Average efficiency per mediaDevice for the rows with click_cut == 3.
# NOTE(review): the group value 3 is hard-coded — confirm it is the intended group.
click_site2 = click_site.loc[click_site.click_cut == 3, ['campaign', 'click_cut']]

# Left-merge on campaign, then keep only the rows that found a match,
# i.e. campaigns that belong to the selected group.
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
summary_df2 = pd.merge(summary_df, click_site2, how='left', on=['campaign'])
summary_df2 = summary_df2[summary_df2.click_cut.notnull()].copy()

# Combined totals = direct + indirect purchase-conversion columns.
summary_df2['buyConvCnt'] = summary_df2.directBuyConvCnt + summary_df2.indirectBuyConvCnt
summary_df2['buyCostConvCnt'] = summary_df2.directBuyCostConvCnt + summary_df2.indirectBuyCostConvCnt

summary_click_site = summary_df2.groupby(['mediaDevice']).agg({
    'adCost': 'sum', 'click': 'sum', 'buyConvCnt': 'sum', 'buyCostConvCnt': 'sum'}).reset_index()

# Ratio metrics; a zero denominator yields 0 instead of inf/NaN.
# cpc: ad cost per click
summary_click_site['cpc'] = np.where(summary_click_site.click == 0, 0, summary_click_site.adCost / summary_click_site.click)
# cpa: ad cost per conversion
summary_click_site['cpa'] = np.where(summary_click_site.buyConvCnt == 0, 0, summary_click_site.adCost / summary_click_site.buyConvCnt)
# cvr: conversions per click
summary_click_site['cvr'] = np.where(summary_click_site.click == 0, 0, summary_click_site.buyConvCnt / summary_click_site.click)
# ct: buyCostConvCnt per conversion (labelled "average order value" by the author)
summary_click_site['ct'] = np.where(summary_click_site.buyConvCnt == 0, 0, summary_click_site.buyCostConvCnt / summary_click_site.buyConvCnt)

summary_click_site

##### 2-1-3. 클릭수(광고주)

In [None]:
# Advertiser-level dataset: one predictor (click_cut) and six media target columns.
# NOTE(review): the original comment on the next line spoke of a 'site'-level model
# (12+ months of run data); it looks copied from the site section, since this cell
# selects from click_ad — confirm the intended criterion.
click_X_ad = click_ad[['click_cut']]
click_y_ad = click_ad[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
click_model_ad = LinearRegression()
click_results_ad = cross_validate(
    estimator=click_model_ad,
    X=click_X_ad,
    y=click_y_ad,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', click_results_ad['test_score'])
# Author's recorded result: r2 = -9.843
print('cross-val-score.mean:{:.3f}'.format(np.mean(click_results_ad['test_score'])))

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train_click_ad, X_test_click_ad, y_train_click_ad, y_test_click_ad = train_test_split(
    click_X_ad,
    click_y_ad,
    test_size=0.2,
    random_state=2,
)

print(X_train_click_ad.shape, y_train_click_ad.shape)  # author's note: 261
print(X_test_click_ad.shape, y_test_click_ad.shape)  # author's note: 66

In [None]:
# Final model: fit on the training split, then predict the held-out test split.
click_reg_ad = LinearRegression()
click_reg_ad.fit(X_train_click_ad, y_train_click_ad)
click_pred_ad = click_reg_ad.predict(X_test_click_ad)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=y_test_click_ad, y_pred=click_pred_ad, multioutput='uniform_average'),
)  # author's recorded result: r2 = -0.04
print(
    'variance_weighted:',
    r2_score(y_true=y_test_click_ad, y_pred=click_pred_ad, multioutput='variance_weighted'),
)  # author's recorded result: r2 = -0.07

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
click_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
click_pred2_ad = np.array(click_pred_ad)
click_pred2_ad[click_pred2_ad < click_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
click_pred_sum_ad = click_pred2_ad.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
click_pred_sum_ad = np.where(click_pred_sum_ad == 0, 0.00001, click_pred_sum_ad)

click_pred2_ad = pd.DataFrame(click_pred2_ad)

# Share = value / row total, with zeroed entries staying exactly 0.
click_pred_pb_ad = pd.DataFrame(
    np.where(click_pred2_ad == 0, 0, click_pred2_ad / click_pred_sum_ad),
    columns=click_y_ad.columns,
)

In [None]:
# Predicted media ad-cost share per group (advertiser-level model).
# The test features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
click_pred_dt_ad = pd.concat(
    [
        X_test_click_ad.reset_index(drop=True),
        pd.DataFrame(click_pred_pb_ad, columns=click_y_ad.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each click_cut group.
click_pred_pivot_ad = click_pred_dt_ad.groupby(['click_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
click_pred_pivot_ad.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by Click(pred - ad)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per click_cut group (advertiser-level model).
click_pred_pivot_ad

#### 2-2. 전환수 최대화(CPA)


##### 2-2-1. 전환수(전체)

In [None]:
# Dataset: one predictor (cpa_cut) and six media target columns.
cpa_X = cpa_df[['cpa_cut']]
cpa_y = cpa_df[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
cpa_model = LinearRegression()
cpa_results = cross_validate(
    estimator=cpa_model,
    X=cpa_X,
    y=cpa_y,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', cpa_results['test_score'])
# Author's recorded result: r2 = -0.041
print('cross-val-score.mean:{:.3f}'.format(np.mean(cpa_results['test_score'])))

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train_cpa, X_test_cpa, y_train_cpa, y_test_cpa = train_test_split(
    cpa_X,
    cpa_y,
    test_size=0.2,
    random_state=2,
)

print(X_train_cpa.shape, y_train_cpa.shape)  # author's note: 212
print(X_test_cpa.shape, y_test_cpa.shape)  # author's note: 53

In [None]:
# Final model: fit on the training split, then predict the held-out test split.
cpa_reg = LinearRegression()
cpa_reg.fit(X_train_cpa, y_train_cpa)
cpa_pred = cpa_reg.predict(X_test_cpa)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=y_test_cpa, y_pred=cpa_pred, multioutput='uniform_average'),
)  # author's recorded result: r2 = -0.036
print(
    'variance_weighted:',
    r2_score(y_true=y_test_cpa, y_pred=cpa_pred, multioutput='variance_weighted'),
)  # author's recorded result: r2 = -0.029

In [None]:
# Persist the fitted CPA-share regression so it can be reloaded without refitting.
joblib.dump(
    value=cpa_reg,
    filename='/home/anaconda3/da/data/media mix_data9/mix_model_cpa.pkl',
)

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
cpa_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
cpa_pred2 = np.array(cpa_pred)
cpa_pred2[cpa_pred2 < cpa_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
cpa_pred_sum = cpa_pred2.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
cpa_pred_sum = np.where(cpa_pred_sum == 0, 0.00001, cpa_pred_sum)

cpa_pred2 = pd.DataFrame(cpa_pred2)

# Share = value / row total, with zeroed entries staying exactly 0.
cpa_pred_pb = pd.DataFrame(
    np.where(cpa_pred2 == 0, 0, cpa_pred2 / cpa_pred_sum),
    columns=cpa_y.columns,
)

In [None]:
# Predicted media ad-cost share per group.
# The test features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
cpa_pred_dt = pd.concat(
    [
        X_test_cpa.reset_index(drop=True),
        pd.DataFrame(cpa_pred_pb, columns=cpa_y.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each cpa_cut group.
cpa_pred_pivot = cpa_pred_dt.groupby(['cpa_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
cpa_pred_pivot.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by CPA(pred)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per cpa_cut group.
cpa_pred_pivot

##### 2-2-2. 전환수(사이트)

In [None]:
# Site-level dataset: one predictor (cpa_cut) and six media target columns.
cpa_X_site = cpa_site[['cpa_cut']]
cpa_y_site = cpa_site[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
cpa_model_site = LinearRegression()
cpa_results_site = cross_validate(
    estimator=cpa_model_site,
    X=cpa_X_site,
    y=cpa_y_site,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', cpa_results_site['test_score'])
# Author's recorded result: r2 = -13.676
print('cross-val-score.mean:{:.3f}'.format(np.mean(cpa_results_site['test_score'])))

In [None]:
# Final model. Per the author, the site-level data is too small to split,
# so the model is fitted and scored on the same rows (the R^2 below is in-sample).
cpa_reg_site = LinearRegression()
cpa_reg_site.fit(cpa_X_site, cpa_y_site)
cpa_pred_site = cpa_reg_site.predict(cpa_X_site)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=cpa_y_site, y_pred=cpa_pred_site, multioutput='uniform_average'),
)  # author's recorded result: r2 = 0.38
print(
    'variance_weighted:',
    r2_score(y_true=cpa_y_site, y_pred=cpa_pred_site, multioutput='variance_weighted'),
)  # author's recorded result: r2 = 0.04

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
cpa_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
cpa_pred2_site = np.array(cpa_pred_site)
cpa_pred2_site[cpa_pred2_site < cpa_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
cpa_pred_sum_site = cpa_pred2_site.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
cpa_pred_sum_site = np.where(cpa_pred_sum_site == 0, 0.00001, cpa_pred_sum_site)

cpa_pred2_site = pd.DataFrame(cpa_pred2_site)

# Share = value / row total, with zeroed entries staying exactly 0.
cpa_pred_pb_site = pd.DataFrame(
    np.where(cpa_pred2_site == 0, 0, cpa_pred2_site / cpa_pred_sum_site),
    columns=cpa_y_site.columns,
)

In [None]:
# Predicted media ad-cost share per group (site-level model).
# The features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
cpa_pred_dt_site = pd.concat(
    [
        cpa_X_site.reset_index(drop=True),
        pd.DataFrame(cpa_pred_pb_site, columns=cpa_y_site.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each cpa_cut group.
cpa_pred_pivot_site = cpa_pred_dt_site.groupby(['cpa_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
cpa_pred_pivot_site.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by CPA(pred - site)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per cpa_cut group (site-level model).
cpa_pred_pivot_site

In [None]:
# Average efficiency per mediaDevice for the rows with cpa_cut == 3.
# NOTE(review): the group value 3 is hard-coded — confirm it is the intended group.
cpa_site2 = cpa_site.loc[cpa_site.cpa_cut == 3, ['campaign', 'cpa_cut']]

# Left-merge on campaign, then keep only the rows that found a match,
# i.e. campaigns that belong to the selected group.
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
summary_df2 = pd.merge(summary_df, cpa_site2, how='left', on=['campaign'])
summary_df2 = summary_df2[summary_df2.cpa_cut.notnull()].copy()

# Combined totals = direct + indirect purchase-conversion columns.
summary_df2['buyConvCnt'] = summary_df2.directBuyConvCnt + summary_df2.indirectBuyConvCnt
summary_df2['buyCostConvCnt'] = summary_df2.directBuyCostConvCnt + summary_df2.indirectBuyCostConvCnt

summary_cpa_site = summary_df2.groupby(['mediaDevice']).agg({
    'adCost': 'sum', 'click': 'sum', 'buyConvCnt': 'sum', 'buyCostConvCnt': 'sum'}).reset_index()

# Ratio metrics; a zero denominator yields 0 instead of inf/NaN.
# cpc: ad cost per click
summary_cpa_site['cpc'] = np.where(summary_cpa_site.click == 0, 0, summary_cpa_site.adCost / summary_cpa_site.click)
# cpa: ad cost per conversion
summary_cpa_site['cpa'] = np.where(summary_cpa_site.buyConvCnt == 0, 0, summary_cpa_site.adCost / summary_cpa_site.buyConvCnt)
# cvr: conversions per click
summary_cpa_site['cvr'] = np.where(summary_cpa_site.click == 0, 0, summary_cpa_site.buyConvCnt / summary_cpa_site.click)
# ct: buyCostConvCnt per conversion (labelled "average order value" by the author)
summary_cpa_site['ct'] = np.where(summary_cpa_site.buyConvCnt == 0, 0, summary_cpa_site.buyCostConvCnt / summary_cpa_site.buyConvCnt)

summary_cpa_site

##### 2-2-3. 전환수(광고주)

In [None]:
# Advertiser-level dataset: one predictor (cpa_cut) and six media target columns.
cpa_X_ad = cpa_ad[['cpa_cut']]
cpa_y_ad = cpa_ad[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
cpa_model_ad = LinearRegression()
cpa_results_ad = cross_validate(
    estimator=cpa_model_ad,
    X=cpa_X_ad,
    y=cpa_y_ad,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', cpa_results_ad['test_score'])
# Author's recorded result: r2 = -0.041
print('cross-val-score.mean:{:.3f}'.format(np.mean(cpa_results_ad['test_score'])))

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train_cpa_ad, X_test_cpa_ad, y_train_cpa_ad, y_test_cpa_ad = train_test_split(
    cpa_X_ad,
    cpa_y_ad,
    test_size=0.2,
    random_state=2,
)

print(X_train_cpa_ad.shape, y_train_cpa_ad.shape)  # author's note: 185
print(X_test_cpa_ad.shape, y_test_cpa_ad.shape)  # author's note: 47

In [None]:
# Final model: fit on the training split, then predict the held-out test split.
cpa_reg_ad = LinearRegression()
cpa_reg_ad.fit(X_train_cpa_ad, y_train_cpa_ad)
cpa_pred_ad = cpa_reg_ad.predict(X_test_cpa_ad)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=y_test_cpa_ad, y_pred=cpa_pred_ad, multioutput='uniform_average'),
)  # author's recorded result: r2 = -0.013
print(
    'variance_weighted:',
    r2_score(y_true=y_test_cpa_ad, y_pred=cpa_pred_ad, multioutput='variance_weighted'),
)  # author's recorded result: r2 = -0.024

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
cpa_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
cpa_pred2_ad = np.array(cpa_pred_ad)
cpa_pred2_ad[cpa_pred2_ad < cpa_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
cpa_pred_sum_ad = cpa_pred2_ad.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
cpa_pred_sum_ad = np.where(cpa_pred_sum_ad == 0, 0.00001, cpa_pred_sum_ad)

cpa_pred2_ad = pd.DataFrame(cpa_pred2_ad)

# Share = value / row total, with zeroed entries staying exactly 0.
cpa_pred_pb_ad = pd.DataFrame(
    np.where(cpa_pred2_ad == 0, 0, cpa_pred2_ad / cpa_pred_sum_ad),
    columns=cpa_y_ad.columns,
)

In [None]:
# Predicted media ad-cost share per group (advertiser-level model).
# The test features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
cpa_pred_dt_ad = pd.concat(
    [
        X_test_cpa_ad.reset_index(drop=True),
        pd.DataFrame(cpa_pred_pb_ad, columns=cpa_y_ad.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each cpa_cut group.
cpa_pred_pivot_ad = cpa_pred_dt_ad.groupby(['cpa_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
cpa_pred_pivot_ad.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by CPA(pred - ad)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per cpa_cut group (advertiser-level model).
cpa_pred_pivot_ad

#### 2-3. 매출

##### 2-3-1. 매출(전체)

In [None]:
# Dataset: one predictor (roas_cut) and six media target columns.
roas_X = roas_df[['roas_cut']]
roas_y = roas_df[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
roas_model = LinearRegression()
roas_results = cross_validate(
    estimator=roas_model,
    X=roas_X,
    y=roas_y,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', roas_results['test_score'])
# Author's recorded result: r2 = -0.051
print('cross-val-score.mean:{:.3f}'.format(np.mean(roas_results['test_score'])))

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train_roas, X_test_roas, y_train_roas, y_test_roas = train_test_split(
    roas_X,
    roas_y,
    test_size=0.2,
    random_state=2,
)

print(X_train_roas.shape, y_train_roas.shape)  # author's note: 146
print(X_test_roas.shape, y_test_roas.shape)  # author's note: 37

In [None]:
# Final model: fit on the training split, then predict the held-out test split.
roas_reg = LinearRegression()
roas_reg.fit(X_train_roas, y_train_roas)
roas_pred = roas_reg.predict(X_test_roas)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=y_test_roas, y_pred=roas_pred, multioutput='uniform_average'),
)  # author's recorded result: r2 = -0.09
print(
    'variance_weighted:',
    r2_score(y_true=y_test_roas, y_pred=roas_pred, multioutput='variance_weighted'),
)  # author's recorded result: r2 = -0.17

In [None]:
# Persist the fitted ROAS-share regression so it can be reloaded without refitting.
joblib.dump(
    value=roas_reg,
    filename='/home/anaconda3/da/data/media mix_data9/mix_model_roas.pkl',
)

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
roas_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
roas_pred2 = np.array(roas_pred)
roas_pred2[roas_pred2 < roas_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
roas_pred_sum = roas_pred2.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
roas_pred_sum = np.where(roas_pred_sum == 0, 0.00001, roas_pred_sum)

roas_pred2 = pd.DataFrame(roas_pred2)

# Share = value / row total, with zeroed entries staying exactly 0.
roas_pred_pb = pd.DataFrame(
    np.where(roas_pred2 == 0, 0, roas_pred2 / roas_pred_sum),
    columns=roas_y.columns,
)

In [None]:
# Predicted media ad-cost share per group.
# The test features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
roas_pred_dt = pd.concat(
    [
        X_test_roas.reset_index(drop=True),
        pd.DataFrame(roas_pred_pb, columns=roas_y.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each roas_cut group.
roas_pred_pivot = roas_pred_dt.groupby(['roas_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
roas_pred_pivot.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by ROAS(pred)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per roas_cut group.
roas_pred_pivot

##### 2-3-2. 매출(사이트)

In [None]:
# Site-level dataset: one predictor (roas_cut) and six media target columns.
roas_X_site = roas_site[['roas_cut']]
roas_y_site = roas_site[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
roas_model_site = LinearRegression()
roas_results_site = cross_validate(
    estimator=roas_model_site,
    X=roas_X_site,
    y=roas_y_site,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', roas_results_site['test_score'])
# Author's recorded result: r2 = nan ### not enough data!!!
print('cross-val-score.mean:{:.3f}'.format(np.mean(roas_results_site['test_score'])))

##### 2-3-3. 매출(광고주)

In [None]:
# Advertiser-level dataset: one predictor (roas_cut) and six media target columns.
roas_X_ad = roas_ad[['roas_cut']]
roas_y_ad = roas_ad[
    ['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P']
]

# 5-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=3)

# One LinearRegression fitted on all six targets at once (multi-output regression).
roas_model_ad = LinearRegression()
roas_results_ad = cross_validate(
    estimator=roas_model_ad,
    X=roas_X_ad,
    y=roas_y_ad,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    return_estimator=True,  # keep the fitted estimator of every fold
)

print('cross-val-score:', roas_results_ad['test_score'])
# Author's recorded result: r2 = -0.032
print('cross-val-score.mean:{:.3f}'.format(np.mean(roas_results_ad['test_score'])))

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train_roas_ad, X_test_roas_ad, y_train_roas_ad, y_test_roas_ad = train_test_split(
    roas_X_ad,
    roas_y_ad,
    test_size=0.2,
    random_state=2,
)

print(X_train_roas_ad.shape, y_train_roas_ad.shape)  # author's note: 146
print(X_test_roas_ad.shape, y_test_roas_ad.shape)  # author's note: 37

In [None]:
# Final model: fit on the training split, then predict the held-out test split.
roas_reg_ad = LinearRegression()
roas_reg_ad.fit(X_train_roas_ad, y_train_roas_ad)
roas_pred_ad = roas_reg_ad.predict(X_test_roas_ad)

# R^2 across the six outputs, aggregated two ways.
print(
    'uniform_average:',
    r2_score(y_true=y_test_roas_ad, y_pred=roas_pred_ad, multioutput='uniform_average'),
)  # author's recorded result: r2 = -0.069
print(
    'variance_weighted:',
    r2_score(y_true=y_test_roas_ad, y_pred=roas_pred_ad, multioutput='variance_weighted'),
)  # author's recorded result: r2 = -0.057

In [None]:
# Turn the raw regression outputs into per-row shares.
# (Softmax is not used: per the author it changes too gently.)
roas_pd_limit = 0.001

# Work on a copy of the predictions; anything estimated below the limit becomes 0.
roas_pred2_ad = np.array(roas_pred_ad)
roas_pred2_ad[roas_pred2_ad < roas_pd_limit] = 0

# Row totals, kept as an (n, 1) column so they broadcast across the outputs.
roas_pred_sum_ad = roas_pred2_ad.sum(axis=1, keepdims=True)
# A row whose outputs were all zeroed has total 0; swap in a tiny non-zero
# value so the division below is defined (such rows end up as all-zero shares).
roas_pred_sum_ad = np.where(roas_pred_sum_ad == 0, 0.00001, roas_pred_sum_ad)

roas_pred2_ad = pd.DataFrame(roas_pred2_ad)

# Share = value / row total, with zeroed entries staying exactly 0.
roas_pred_pb_ad = pd.DataFrame(
    np.where(roas_pred2_ad == 0, 0, roas_pred2_ad / roas_pred_sum_ad),
    columns=roas_y_ad.columns,
)

In [None]:
# Predicted media ad-cost share per group (advertiser-level model).
# The test features get a fresh 0..n-1 index so they line up row-by-row
# with the prediction frame when concatenated side by side.
roas_pred_dt_ad = pd.concat(
    [
        X_test_roas_ad.reset_index(drop=True),
        pd.DataFrame(roas_pred_pb_ad, columns=roas_y_ad.columns),
    ],
    axis=1,
)

# Mean predicted share of every media column within each roas_cut group.
roas_pred_pivot_ad = roas_pred_dt_ad.groupby(['roas_cut']).agg(
    dict.fromkeys(['daum_M', 'daum_P', 'google_M', 'google_P', 'naver_M', 'naver_P'], 'mean')
)

# Horizontal stacked bars: one bar per group, one colour per media column.
roas_pred_pivot_ad.plot.barh(
    stacked=True,
    figsize=(20, 8),
    color=['gold', 'orange', 'tomato', 'orangered', 'lightgreen', 'limegreen'],
)
plt.rc('font', size=12)
plt.title('Media Proportion by Revenue(pred - ad)')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Display the table of mean predicted shares per roas_cut group (advertiser-level model).
roas_pred_pivot_ad

In [None]:
# Average efficiency per mediaDevice for the rows with roas_cut == 3.
# NOTE(review): the group value 3 is hard-coded — confirm it is the intended group.
# Alternative (disabled): use only the rows that were part of the training split.
#roas_ad2 = roas_ad.loc[X_train_roas_ad.index,]
roas_ad2 = roas_ad.loc[roas_ad.roas_cut == 3, ['campaign', 'roas_cut']]

# Left-merge on campaign, then keep only the rows that found a match,
# i.e. campaigns that belong to the selected group.
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
summary_df2 = pd.merge(summary_df, roas_ad2, how='left', on=['campaign'])
summary_df2 = summary_df2[summary_df2.roas_cut.notnull()].copy()

# Combined totals = direct + indirect purchase-conversion columns.
summary_df2['buyConvCnt'] = summary_df2.directBuyConvCnt + summary_df2.indirectBuyConvCnt
summary_df2['buyCostConvCnt'] = summary_df2.directBuyCostConvCnt + summary_df2.indirectBuyCostConvCnt

summary_roas_ad = summary_df2.groupby(['mediaDevice']).agg({
    'adCost': 'sum', 'click': 'sum', 'buyConvCnt': 'sum', 'buyCostConvCnt': 'sum'}).reset_index()

# Ratio metrics; a zero denominator yields 0 instead of inf/NaN.
# cpc: ad cost per click
summary_roas_ad['cpc'] = np.where(summary_roas_ad.click == 0, 0, summary_roas_ad.adCost / summary_roas_ad.click)
# cpa: ad cost per conversion
summary_roas_ad['cpa'] = np.where(summary_roas_ad.buyConvCnt == 0, 0, summary_roas_ad.adCost / summary_roas_ad.buyConvCnt)
# cvr: conversions per click
summary_roas_ad['cvr'] = np.where(summary_roas_ad.click == 0, 0, summary_roas_ad.buyConvCnt / summary_roas_ad.click)
# ct: buyCostConvCnt per conversion (labelled "average order value" by the author)
summary_roas_ad['ct'] = np.where(summary_roas_ad.buyConvCnt == 0, 0, summary_roas_ad.buyCostConvCnt / summary_roas_ad.buyConvCnt)

summary_roas_ad

In [None]:
# Show the (rows, columns) size of the roas_ad2 selection.
roas_ad2.shape