In [1]:
from pathlib import Path
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(".."))

BASE_DIR = Path().resolve().parent

DATA_DIR = BASE_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
INTERIM_DIR = DATA_DIR / "interim"
PROCESSED_DIR = DATA_DIR / "processed"

MART_RAW_PATH = RAW_DIR / "mart_raw.csv"
LOG_RAW_PATH = RAW_DIR / "log_raw.pkl"

LOG_INT_PATH = INTERIM_DIR / "log_int.pkl"
MART_INT_PATH = INTERIM_DIR / "mart_int.pkl"
TPS_INT_PATH = INTERIM_DIR / "tps_int.csv"



In [None]:
LOG_PRC_PATH = PROCESSED_DIR / "log_prc.pkl"
MART_PRC_PATH = PROCESSED_DIR / "mart_prc.pkl"
TPS_PRC_PATH = PROCESSED_DIR / "tps_prc.pkl"

In [2]:
from src.validate import check_id_matching

from src.clean import (
    drop_high_null_cols,
    drop_mart_cols,
    drop_log_cols,
    get_column_summary,
    clean_mart,
    clean_log,
)

from src.encoder import (
    encode_asset_prod,
    encode_screen_tp,
    encode_publctn_bit
)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


In [None]:
mart_raw = pd.read_csv(MART_RAW_PATH, 
                       sep = ",", 
                       encoding = 'utf-8', 
                       engine="python",
                       on_bad_lines="skip")
mart_raw.info()

In [None]:
mart_raw, dropped_by_null = drop_high_null_cols(mart_raw, threshold=90)

print(f"Dropped {len(dropped_by_null)}")
dropped_by_null


In [None]:
mart_raw_sum = get_column_summary(mart_raw)
mart_raw_sum

In [None]:
mart_raw = drop_mart_cols(mart_raw)

In [None]:
mart_clean = clean_mart(mart_raw)

In [None]:
mart_clean_sum = get_column_summary(mart_clean)
mart_clean_sum

In [None]:
log_raw = pd.read_pickle(LOG_RAW_PATH)
log_raw.info()

In [None]:
log_raw_sum = get_column_summary(log_raw)
log_raw_sum

In [None]:
log_raw = drop_log_cols(log_raw)

In [None]:
log_clean = clean_log(log_raw)

In [None]:
log_clean_sum = get_column_summary(log_clean)
log_clean_sum

In [None]:
mart_int_path = "../data/interim/mart_int.pkl"
log_int_path = "../data/interim/log_int.pkl"
mart_raw.to_pickle(mart_int_path)
log_raw.to_pickle(log_int_path)
print(f" 파일 저장 완료: {mart_int_path}")
print(f" 파일 저장 완료: {log_int_path}")


- interim

In [3]:
log_int = pd.read_pickle(LOG_INT_PATH)
log_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50804390 entries, 0 to 50804389
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   sha2_hash  object        
 1   asset      object        
 2   use_tms    float64       
 3   disp_rtm   object        
 4   strt_dt    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 1.9+ GB


In [4]:
tps_int = pd.read_csv(TPS_INT_PATH,  
                        sep = "|", 
                        encoding = 'utf-8')
tps_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21719882 entries, 0 to 21719881
Data columns (total 40 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   Unnamed: 0                 int64  
 1   sha2_hash                  object 
 2   SVC_USE_DAYS_GRP           int64  
 3   MEDIA_NM_GRP               object 
 4   PROD_NM_GRP                object 
 5   PROD_OLD_YN                int64  
 6   PROD_ONE_PLUS_YN           int64  
 7   AGMT_KIND_NM               object 
 8   STB_RES_1M_YN              int64  
 9   SVOD_SCRB_CNT_GRP          int64  
 10  PAID_CHNL_CNT_GRP          int64  
 11  SCRB_PATH_NM_GRP           object 
 12  INHOME_RATE                float64
 13  AGMT_END_SEG               int64  
 14  TOTAL_USED_DAYS            int64  
 15  TV_SCRB                    int64  
 16  ANALOG_SCRB                int64  
 17  DIGITAL_SCRB               int64  
 18  TOTAL_INTERNET_SCRB        int64  
 19  GIGA_INTERNET_SCRB         int64  
 20  

In [5]:
tps_int.head(5)

Unnamed: 0.1,Unnamed: 0,sha2_hash,SVC_USE_DAYS_GRP,MEDIA_NM_GRP,PROD_NM_GRP,PROD_OLD_YN,PROD_ONE_PLUS_YN,AGMT_KIND_NM,STB_RES_1M_YN,SVOD_SCRB_CNT_GRP,PAID_CHNL_CNT_GRP,SCRB_PATH_NM_GRP,INHOME_RATE,AGMT_END_SEG,TOTAL_USED_DAYS,TV_SCRB,ANALOG_SCRB,DIGITAL_SCRB,TOTAL_INTERNET_SCRB,GIGA_INTERNET_SCRB,BUNDLE_YN,DIGITAL_GIGA_YN,DIGITAL_ALOG_YN,TV_I_CNT,CH_LAST_DAYS_BF_GRP,VOC_TOTAL_MONTH1_YN,VOC_STOP_CANCEL_MONTH1_YN,AGE_GRP10,EMAIL_RECV_CLS_NM,SMS_SEND_CLS_NM,CH_HH_AVG_MONTH1,CH_25_RATIO_MONTH1,CH_25_RATIO_MEAN_3MM,CH_FAV_RNK1,KIDS_USE_PV_MONTH1,NFX_USE_YN,YTB_USE_YN,p_mt,cancel_yn,time
0,0,6c2fb1fb0b1e316975157671e03d0e2eb3d250a7373321...,0,UHD,이코노미,0,0,신규,0,0,0,직영몰,10.0,13,45,1,0,1,0,0,0,0,0,1,0,1,0,50.0,전체거부,전체거부,4.46,3.63,3.63,기타,2,0,0,202303,유지,1
1,1,39b9ff560dbcbe4e138f04fd082d55d921c5528f1cc279...,0,UHD,베이직,0,0,신규,0,0,0,현장경로,10.0,13,45,1,0,1,1,1,1,1,0,2,0,1,0,70.0,수신,수신,4.57,5.67,5.67,MBC,1,0,0,202303,유지,1
2,2,db168b085b5c0cd2d21faab70ceb4ec36c5409aea1f8f4...,0,UHD,베이직,0,0,신규,0,0,0,현장경로,20.0,13,44,1,0,1,1,1,1,1,0,2,0,1,0,70.0,전체거부,수신,6.32,0.36,0.36,기타,0,0,0,202303,유지,1
3,3,59a02dfe6d4e5e4ffc1b295dc0ebf0fdf7ffc37321f653...,0,HD,이코노미,0,1,신규,0,0,0,I/B,0.0,13,3006,2,0,2,1,0,1,0,0,3,0,1,0,40.0,수신,수신,1.18,1.85,1.85,SBS,0,0,0,202303,유지,1
4,4,6f415993f9b6968f3b7bd8b2188f8651dfdc7e50e3022a...,0,UHD,베이직,0,0,신규,0,0,0,현장경로,10.0,13,42,1,0,1,1,0,1,0,0,2,0,1,0,60.0,전체거부,전체거부,3.05,0.55,0.55,연합뉴스TV,0,0,1,202303,유지,1


In [6]:
mart_int = pd.read_pickle(MART_INT_PATH)
mart_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   actr_disp          470060 non-null  object 
 1   asset_nm           470064 non-null  object 
 2   asset_prod         470064 non-null  object 
 3   broad_ymd          466682 non-null  object 
 4   category           468827 non-null  object 
 5   created            470064 non-null  int64  
 6   created_by         470064 non-null  object 
 7   crt_ymd            467690 non-null  object 
 8   ct_cl              470064 non-null  object 
 9   cts_id             468751 non-null  object 
 10  epsd_id            469976 non-null  object 
 11  epsd_no            469720 non-null  float64
 12  full_asset_id      470064 non-null  object 
 13  genre              470064 non-null  object 
 14  genre_of_ct_cl     470064 non-null  object 
 15  genre_of_ct_cl_cd  470064 non-null  int64  
 16  pr

In [7]:
mart_int['asset_prod'].value_counts().head(10)
mart_int['asset_prod'], uniques = pd.factorize(mart_int['asset_prod'])

uniques


Index(['FOD', 'RVOD', 'SVOD'], dtype='object')

In [8]:
mart_int = encode_asset_prod(mart_int)

In [9]:
mart_int['screen_tp'].value_counts().head(10)
mart_int['screen_tp'], uniques = pd.factorize(mart_int['screen_tp'])

uniques


Index(['HD', 'SD', 'UHD'], dtype='object')

In [10]:
mart_int = encode_screen_tp(mart_int)

In [11]:
mart_int = encode_publctn_bit(mart_int)

In [12]:
mart_int.head(5)

Unnamed: 0,actr_disp,asset_nm,asset_prod,broad_ymd,category,created,created_by,crt_ymd,ct_cl,cts_id,epsd_id,epsd_no,full_asset_id,genre,genre_of_ct_cl,genre_of_ct_cl_cd,product_tp,prpgt_pri,publctn_rt,rate,rlse_year,screen_tp,studio_nm,super_asset_id
0,"양택조,김영환",건강 체조,-1,,지역콘텐츠/CCS충북/건강 체조,20211126152916,VODIN:2.0.3,2021-11-26,기타,CCS000001,건강 체조_1,1.0,CCS|CCS00000000000000101,기타,기타,0,10.0,1,1,0,2010.0,-1,CCS충북방송,null-00001
1,-,자연을 닮은 농부들,-1,,지역콘텐츠/CCS충북/의림지뜰 사람들,20211223151741,VODIN:2.0.3,2021-12-23,기타,CCS000006,의림지뜰 사람들_1,1.0,CCS|CCS00000000000000601,다큐,기타,0,10.0,1,1,15,2021.0,-1,CCS충북방송,null-00001
2,-,가을의 추억,-1,,지역콘텐츠/CCS충북/의림지뜰 사람들,20211225171925,VODIN:2.0.3,2021-12-25,기타,CCS000007,의림지뜰 사람들_2,2.0,CCS|CCS00000000000000701,다큐,기타,0,10.0,1,1,15,2021.0,-1,CCS충북방송,null-00002
3,-,덤벙김치 담그는 날,-1,,지역콘텐츠/CCS충북/의림지뜰 사람들,20211225172138,VODIN:2.0.3,2021-12-25,기타,CCS000008,의림지뜰 사람들_3,3.0,CCS|CCS00000000000000801,다큐,기타,0,10.0,1,1,15,2021.0,-1,CCS충북방송,null-00003
4,-,새로운 도전의 시작,-1,,지역콘텐츠/CCS충북/의림지뜰 사람들,20211225172359,VODIN:2.0.3,2021-12-25,기타,CCS000009,의림지뜰 사람들_4,4.0,CCS|CCS00000000000000901,다큐,기타,0,10.0,1,1,15,2021.0,-1,CCS충북방송,null-00004


In [13]:
mart_int_sum = get_column_summary(mart_int)
mart_int_sum   


Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,object,4,0.0,41757
1,asset_nm,object,0,0.0,360181
2,asset_prod,int8,0,0.0,1
3,broad_ymd,object,3382,0.72,10575
4,category,object,1237,0.26,20122
5,created,int64,0,0.0,465546
6,created_by,object,0,0.0,11
7,crt_ymd,object,2374,0.51,2218
8,ct_cl,object,0,0.0,15
9,cts_id,object,1313,0.28,468747


In [15]:
l_users = list(log_int['sha2_hash'].unique())
t_users = list(tps_int['sha2_hash'].unique())

In [16]:
m_ids = list(mart_int['full_asset_id'].unique())
l_ids = list(log_int['asset'].unique())

In [17]:
tps_log_match = check_id_matching(
    tps_int['sha2_hash'],
    log_int['sha2_hash'],
    dataset_type="tps_vs_log"
)

1. 공통 ID 개수 : 581,223개
2. Log 매칭률   : 71.13% (Log 중 TPS 정보가 존재하는 비율)
3. TPS 매칭률  : 26.90% (TPS 중 Log 정보가 존재하는 비율)
 경고: Log에는 있으나 TPS에 없는 ID가 235,896개 있습니다.
   예시: ['b3b8b09680e9db5b72508e43f7d91b90ba85068c5aced694e0ff2a60dd912d41', 'adbc26cc589a65b5a09aceb031342dc33cafbf402989bf4485ced71c18d13920', '9dd7adf439c99bc18f2e1a6d1023a0934a50e6aee1c471adfb5dbebb3feb1901', '923645666e2616cd7bd6d759ef9e00fad240622d5c399940c59eb54d80958164', 'abad4b9c5043c6f803ca1d17085520400cc1ecf4089309111378b463f2139cb2']


In [18]:
mart_log_match = check_id_matching(
    mart_int['full_asset_id'],
    log_int['asset'],
    dataset_type="mart_vs_log"
)

1. 공통 ID 개수 : 438,961개
2. Log 매칭률   : 96.87% (Log 중 Mart 정보가 존재하는 비율)
3. Mart 매칭률  : 93.38% (Mart 중 Log 정보가 존재하는 비율)
 경고: Log에는 있으나 Mart에 없는 ID가 14,206개 있습니다.
   예시: [nan, nan, nan, nan, nan]


In [None]:
# valid_mart_log = log_int[
#     log_int['asset'].isin(mart_int['full_asset_id'])
# ]

# valid_tps_log = log_int[
#     log_int['sha2_hash'].isin(tps_int['sha2_hash'])
# ]

In [None]:
# mart_prc_path = "../data/interim/mart_int.pkl"
# log_prc_path  = "../data/interim/log_int.pkl"
# tps_prc_path  = "../data/interim/tps_int.pkl"

# mart_int.to_pickle(mart_prc_path)
# tps_int.to_pickle(tps_prc_path)

# valid_log = pd.concat([valid_mart_log, valid_tps_log]).drop_duplicates().copy()
# valid_log.to_pickle(log_prc_path)

# print(f" 파일 저장 완료: {mart_prc_path}")
# print(f" 파일 저장 완료: {log_prc_path}")
# print(f" 파일 저장 완료: {tps_prc_path}")


- processed