In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(".."))

# Project root: the parent of the resolved current working directory.
BASE_DIR = Path().resolve().parent

# Data folders, all located under <project root>/data.
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
INTERIM_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Pickle files read by this notebook, all inside the processed folder.
LOG_PRC_PATH = PROCESSED_DIR.joinpath("log_prc.pkl")
MART_PRC_PATH = PROCESSED_DIR.joinpath("mart_prc.pkl")
TPS_PRC_PATH = PROCESSED_DIR.joinpath("tps_prc.pkl")




In [None]:
# Load the three processed tables in one pass (order: log, mart, tps).
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source.
log, mart, tps = (
    pd.read_pickle(pkl_path)
    for pkl_path in (LOG_PRC_PATH, MART_PRC_PATH, TPS_PRC_PATH)
)

In [None]:
# Keep only the columns used later in this notebook.
log_cols = ['sha2_hash', 'asset', 'disp_rtm_sec', 'strt_dt', 'use_tms']
mart_cols = ['full_asset_id', 'asset_prod', 'genre', 'asset_nm', 'super_asset_nm']

log_base = log.loc[:, log_cols]
mart_base = mart.loc[:, mart_cols]


In [None]:
# Attach asset metadata to every log row (left join: log rows without a
# matching asset are kept with missing metadata).
# NOTE(review): if `full_asset_id` is not unique in `mart_base`, log rows
# will be duplicated by this join - confirm key uniqueness.
log_with_meta = pd.merge(
    log_base,
    mart_base,
    how='left',
    left_on='asset',
    right_on='full_asset_id',
)

# The right-hand key duplicates `asset`, so it is removed after the join.
log_mart = log_with_meta.drop(columns=['full_asset_id'])

In [None]:
# Print the structural summary of the merged frame.
log_mart.info()

In [None]:
# Preview the first 3 rows of the merged frame.
log_mart.head(3)

In [None]:
# Single-column frame holding the genre of every log row.
genre_df = log_mart[['genre']]

# Distinct genre values, re-indexed from 0.
genre_distinct = genre_df.drop_duplicates()
unique_genres = genre_distinct.reset_index(drop=True)

# Show up to the first 25 distinct genres.
unique_genres.head(25)

In [None]:
# Mart rows whose genre is exactly '에로틱'.
is_erotic = mart['genre'].eq('에로틱')
erotic_df = mart.loc[is_erotic]

# Distinct (title, genre) pairs among those rows, re-indexed from 0.
erotic_top = (
    erotic_df
    .loc[:, ['super_asset_nm', 'genre']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Show up to the first 20 distinct titles.
erotic_top.head(20)

In [None]:
# 1) Trot keywords, matched against the program title (`super_asset_nm`).
trot_keywords = [
    '트로트', '트롯', '성인가요', '사랑의 콜센타', '뽕숭아학당',
    '화요일은 밤이 좋아', '불타는 장미단', '미스터 로또', '트랄랄라 브라더스'
]

# Joined into one regex alternation; none of the keywords above contains a
# regex metacharacter, so each is matched literally.
pattern = '|'.join(trot_keywords)

# 2) Row level: is this row a trot program? (temporary helper column)
#    True only when the title matches a keyword AND the genre is '연예오락';
#    rows with a missing title count as False (na=False).
log_mart['is_trot_program'] = (
    log_mart['super_asset_nm'].str.contains(pattern, na=False)
    & (log_mart['genre'] == '연예오락')
)

# 3) User level: one row of flags per `sha2_hash`.
user_flags = (
    log_mart
    .groupby('sha2_hash')
    .agg(
        # 1 if the user has at least one row with genre '에로틱'
        is_adult=('genre', lambda x: (x == '에로틱').any()),

        # 1 if the user has at least one row flagged as a trot program
        is_trot=('is_trot_program', 'any'),

        # sorted list of the distinct non-null genres the user has rows for
        watched_genres=('genre', lambda x: sorted(x.dropna().unique())),

        # number of the user's rows whose asset_prod is 'RVOD'
        rvod_count=('asset_prod', lambda x: (x == 'RVOD').sum())
    )
    .astype({
        'is_adult': int,
        'is_trot': int
    })
    .reset_index()
)

# 4) Remove any user-level columns left over from a previous run of this
#    cell, then merge the fresh flags back onto every log row.
#    The list is derived from `user_flags` itself so that every merged
#    column (including `watched_genres`) is covered; otherwise a re-run
#    would produce `_x` / `_y` suffixed duplicates.
user_level_cols = [c for c in user_flags.columns if c != 'sha2_hash']
log_mart = log_mart.drop(columns=user_level_cols, errors='ignore')

log_mart = log_mart.merge(
    user_flags,
    on='sha2_hash',
    how='left'
)

# Drop the temporary helper column and the unused asset name.
# errors='ignore' keeps the cell re-runnable after `asset_nm` is already gone.
log_mart = log_mart.drop(columns=['is_trot_program', 'asset_nm'], errors='ignore')


In [None]:
# Preview the first 4 rows after the user-level flags were merged in.
log_mart.head(4)

In [None]:
# Preview the first 10 rows of the same frame.
log_mart.head(10)

In [None]:
# Convert the start timestamp into a 'YYYYMM' string column named `p_mt`.
# The guard makes the cell safe to re-run: after the first run `strt_dt`
# no longer exists (it has been renamed), so there is nothing left to do.
if 'strt_dt' in log_mart.columns:
    # Rows without a start timestamp cannot be assigned to a month.
    log_mart = log_mart.dropna(subset=['strt_dt'])
    log_mart['strt_dt'] = log_mart['strt_dt'].dt.strftime('%Y%m').astype('str')
    log_mart = log_mart.rename(columns={'strt_dt': 'p_mt'})

In [None]:
# List the distinct month keys present in the log.
log_mart['p_mt'].unique()

In [None]:
# Cast the TPS month key to string so it can be joined with log_mart['p_mt'].
# NOTE(review): assumes the stringified values look like 'YYYYMM' (e.g. the
# column is an integer such as 202301); a float or datetime column would
# stringify differently and never match - confirm against the data.
tps['p_mt'] = tps['p_mt'].astype('str')

In [None]:
# Check the dtype of the TPS month key after the cast.
tps['p_mt'].dtypes

In [None]:
# Top genre per user, ranked by time.
# NOTE(review): the ranking sums `disp_rtm_sec`; the completion-rate cell
# describes that column as the full program length and `use_tms` as the
# actual watch time - confirm which one is intended here.

# Total disp_rtm_sec per (user, genre) pair.
genre_seconds = (
    log_mart
    .groupby(['sha2_hash', 'genre'], observed=True)['disp_rtm_sec']
    .sum()
    .reset_index()
)

# Within each user, order genres from largest to smallest total.
genre_seconds_ranked = genre_seconds.sort_values(
    by=['sha2_hash', 'disp_rtm_sec'],
    ascending=[True, False],
)

# Keep the first (largest) genre row of every user.
top_genre_by_time = (
    genre_seconds_ranked
    .groupby('sha2_hash')
    .head(1)
    .rename(columns={'genre': 'top_genre', 'disp_rtm_sec': 'top_genre_watch_sec'})
)


In [None]:
# Preview the per-user top genre by time.
top_genre_by_time.head(10)

In [None]:
# Top genre per user, ranked by number of log rows (watch count).

# Number of log rows per (user, genre) pair.
genre_counts = (
    log_mart
    .groupby(['sha2_hash', 'genre'], observed=True)
    .size()
    .reset_index(name='watch_cnt')
)

# Within each user, order genres from most to least watched.
genre_counts_ranked = genre_counts.sort_values(
    by=['sha2_hash', 'watch_cnt'],
    ascending=[True, False],
)

# Keep the first (most watched) genre row of every user.
top_genre_by_cnt = (
    genre_counts_ranked
    .groupby('sha2_hash')
    .head(1)
    .rename(columns={'genre': 'top_genre_cnt'})
)


In [None]:
# Preview the per-user top genre by watch count.
top_genre_by_cnt.head(10)

In [None]:
# Completion rate = actual watch time (use_tms) / full program length (disp_rtm_sec).
# Non-positive program lengths are masked to NaN first: dividing by 0 would
# yield inf, which would wrongly pass the ">= threshold" filter below and
# stay in `complete_rate`.
COMPLETE_RATE_THRESHOLD = 0.7
runtime_sec = log_mart['disp_rtm_sec'].where(log_mart['disp_rtm_sec'] > 0)
log_mart['complete_rate'] = log_mart['use_tms'] / runtime_sec

# Favorite genre per user: among rows watched to at least the threshold,
# the genre with the largest summed disp_rtm_sec.
# Rows with a NaN completion rate fail the comparison and are excluded.
top_genre_by_complete = (
    log_mart[log_mart['complete_rate'] >= COMPLETE_RATE_THRESHOLD]
    .groupby(['sha2_hash', 'genre'], observed=True)['disp_rtm_sec']
    .sum()
    .reset_index()
    .sort_values(['sha2_hash', 'disp_rtm_sec'], ascending=[True, False])
    .groupby('sha2_hash')
    .head(1)
    .rename(columns={'genre': 'favorite_genre'})
)


In [None]:
# Attach each user's favorite genre to every log row.
# Any `favorite_genre` column left by a previous run of this cell is dropped
# first; otherwise the merge would create `favorite_genre_x` / `favorite_genre_y`.
# Users with no qualifying rows get a missing value (left join).
log_mart = (
    log_mart
    .drop(columns=['favorite_genre'], errors='ignore')
    .merge(
        top_genre_by_complete[['sha2_hash', 'favorite_genre']],
        on='sha2_hash',
        how='left'
    )
)


In [None]:
# Preview the log frame after `complete_rate` and `favorite_genre` were added.
log_mart.head(10)

In [None]:
# Keep only the join keys (user, month) and the CH_FAV_RNK1 column from TPS.
tps_cols = ['sha2_hash', 'CH_FAV_RNK1', 'p_mt']
tps_base = tps.loc[:, tps_cols]

In [None]:
# Down-sample the log only (10% of rows, fixed seed for reproducibility).
log_mart_sample = log_mart.sample(frac=0.1, random_state=42)

# Keep the TPS rows for every user present in the sampled log instead of an
# independent 10% random sample: sampling both sides separately would throw
# away roughly 90% of the matching TPS rows before the left join and leave
# CH_FAV_RNK1 spuriously missing.
sampled_users = log_mart_sample['sha2_hash'].unique()
tps_base_sample = tps_base[tps_base['sha2_hash'].isin(sampled_users)]


In [None]:
# Left-join the sampled TPS rows onto the sampled log by user and month;
# log rows without a TPS match keep a missing CH_FAV_RNK1.
# NOTE(review): if (sha2_hash, p_mt) is not unique in the TPS sample, log
# rows will be duplicated - confirm key uniqueness.
join_keys = ['sha2_hash', 'p_mt']
log_mart_tps = pd.merge(
    log_mart_sample,
    tps_base_sample,
    how='left',
    on=join_keys,
)


In [None]:
# Preview the joined log + TPS sample.
log_mart_tps.head(10)

In [None]:
# Preview the full TPS table (its `p_mt` column was cast to string earlier).
tps.head(10)