In [1]:
from pathlib import Path
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(".."))

BASE_DIR = Path().resolve().parent

DATA_DIR = BASE_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
INTERIM_DIR = DATA_DIR / "interim"
PROCESSED_DIR = DATA_DIR / "processed"

MART_RAW_PATH = RAW_DIR / "mart_raw.csv"
LOG_RAW_PATH = RAW_DIR / "log_raw.pkl"
TPS_RAW_PATH = RAW_DIR / "tps_raw.csv"

LOG_INT_PATH = INTERIM_DIR / "log_int.pkl"
MART_INT_PATH = INTERIM_DIR / "mart_int.pkl"
TPS_INT_PATH = INTERIM_DIR / "tps_int.pkl"



In [2]:
LOG_PRC_PATH = PROCESSED_DIR / "log_prc.pkl"
MART_PRC_PATH = PROCESSED_DIR / "mart_prc.pkl"
TPS_PRC_PATH = PROCESSED_DIR / "tps_prc.pkl"

In [3]:
from src.validate import check_id_matching

from src.clean import (
    drop_high_null_cols,
    drop_mart_cols,
    drop_log_cols,
    get_column_summary,
    clean_mart,
    clean_log,
)

from src.encoder import (
    encode_asset_prod,
    encode_screen_tp,
    encode_publctn_bit
)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


In [4]:
mart_raw = pd.read_csv(MART_RAW_PATH, 
                       sep = ",", 
                       encoding = 'utf-8', 
                       engine="python",
                       on_bad_lines="skip")
mart_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 88 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   asset_id            470064 non-null  object 
 1   actr_disp           470060 non-null  object 
 2   asset_nm            470064 non-null  object 
 3   asset_prod          470064 non-null  object 
 4   aud                 0 non-null       float64
 5   audience_cnt        0 non-null       float64
 6   broad_ymd           466682 non-null  object 
 7   category            468827 non-null  object 
 8   chapter             468852 non-null  object 
 9   created             470064 non-null  int64  
 10  created_by          470064 non-null  object 
 11  crt_ymd             467690 non-null  object 
 12  ct_cl               470064 non-null  object 
 13  cts_id              468751 non-null  object 
 14  description         470064 non-null  object 
 15  director            470036 non-nul

In [5]:
mart_raw, dropped_by_null = drop_high_null_cols(mart_raw, threshold=90)

print(f"Dropped {len(dropped_by_null)}")
dropped_by_null


Dropped 23


['aud',
 'audience_cnt',
 'disp_as_lst_chnc',
 'disp_as_new',
 'dsbtr_nm',
 'grade_score',
 'hash_tag',
 'mobile_watch_url',
 'one_line_review',
 'orgnl_air_dt',
 'prdcrs',
 'preview_file_nm',
 'preview_rate',
 'preview_rtm',
 'seasn_fin_fl',
 'seasn_prem_fl',
 'show_tp',
 'star_score',
 'sub_title',
 'svc_applied',
 'ttl_lng',
 'ttl_mdm',
 'ttl_sort_nm']

In [6]:
mart_raw_sum = get_column_summary(mart_raw)
mart_raw_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,asset_id,object,0,0.0,470064
1,actr_disp,object,4,0.0,41757
2,asset_nm,object,0,0.0,360181
3,asset_prod,object,0,0.0,3
4,broad_ymd,object,3382,0.72,10575
5,category,object,1237,0.26,20122
6,chapter,object,1212,0.26,17483
7,created,int64,0,0.0,465546
8,created_by,object,0,0.0,11
9,crt_ymd,object,2374,0.51,2218


In [7]:
mart_raw = drop_mart_cols(mart_raw)

In [8]:
mart_clean = clean_mart(mart_raw)

In [9]:
mart_clean_sum = get_column_summary(mart_clean)
mart_clean_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,0,0.0,41758
1,asset_nm,category,0,0.0,360181
2,asset_prod,category,0,0.0,3
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,0,0.0,20123
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,0,0.0,468748


In [10]:
mart_int_path = "../data/interim/mart_int.pkl"
mart_clean.to_pickle(mart_int_path)
print(f" 파일 저장 완료: {mart_int_path}")


 파일 저장 완료: ../data/interim/mart_int.pkl


In [11]:
log_raw = pd.read_pickle(LOG_RAW_PATH)
log_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50804390 entries, 0 to 50804389
Data columns (total 9 columns):
 #   Column          Dtype         
---  ------          -----         
 0   sha2_hash       object        
 1   asset           object        
 2   asset_nm        object        
 3   CT_CL           object        
 4   genre_of_ct_cl  object        
 5   use_tms         float64       
 6   disp_rtm        object        
 7   strt_dt         datetime64[ns]
 8   category        object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 3.4+ GB


In [12]:
log_raw_sum = get_column_summary(log_raw)
log_raw_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,object,0,0.0,817119
1,asset,object,13997,0.03,439170
2,asset_nm,object,1157048,2.28,340030
3,CT_CL,object,1157048,2.28,15
4,genre_of_ct_cl,object,1157048,2.28,59
5,use_tms,float64,1,0.0,20494
6,disp_rtm,object,1157048,2.28,359
7,strt_dt,datetime64[ns],1,0.0,17365219
8,category,object,1183154,2.33,19419


In [13]:
log_raw = drop_log_cols(log_raw)

In [14]:
log_clean = clean_log(log_raw)

In [15]:
log_clean_sum = get_column_summary(log_clean)
log_clean_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,category,0,0.0,817119
1,asset,category,0,0.0,439171
2,use_tms,Int64,1,0.0,20494
3,strt_dt,datetime64[ns],1,0.0,17365219
4,disp_rtm_sec,Int64,1157048,2.28,352


In [16]:
log_int_path = "../data/interim/log_int.pkl"
log_clean.to_pickle(log_int_path)
print(f" 파일 저장 완료: {log_int_path}")

 파일 저장 완료: ../data/interim/log_int.pkl


In [None]:
tps_raw = pd.read_csv(TPS_RAW_PATH,  
                        sep = "|", 
                        encoding = 'utf-8')
tps_raw.info()

In [None]:
tps_raw.info()

In [None]:
tps_raw.head(3)

In [None]:
obj_cols = tps_raw.select_dtypes(include="object").columns
tps_raw[obj_cols] = tps_raw[obj_cols].astype("category")


In [None]:
tps_raw["AGE_GRP10"] = tps_raw["AGE_GRP10"].astype("category")
tps_raw["KIDS_USE_PV_MONTH1"] = tps_raw["KIDS_USE_PV_MONTH1"].astype("float32")

In [None]:
tps_raw_sum = get_column_summary(tps_raw)
tps_raw_sum

In [None]:
tps_int_path = "../data/interim/tps_int.pkl"
tps_raw.to_pickle(tps_int_path)
print(f" 파일 저장 완료: {tps_int_path}")

- interim

In [17]:
log_int = pd.read_pickle(LOG_INT_PATH)
log_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50804390 entries, 0 to 50804389
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   sha2_hash     category      
 1   asset         category      
 2   use_tms       Int64         
 3   strt_dt       datetime64[ns]
 4   disp_rtm_sec  Int64         
dtypes: Int64(2), category(2), datetime64[ns](1)
memory usage: 1.6 GB


In [18]:
tps_int = pd.read_pickle(TPS_INT_PATH)
tps_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21719882 entries, 0 to 21719881
Data columns (total 39 columns):
 #   Column                     Dtype   
---  ------                     -----   
 0   sha2_hash                  category
 1   SVC_USE_DAYS_GRP           int64   
 2   MEDIA_NM_GRP               category
 3   PROD_NM_GRP                category
 4   PROD_OLD_YN                int64   
 5   PROD_ONE_PLUS_YN           int64   
 6   AGMT_KIND_NM               category
 7   STB_RES_1M_YN              int64   
 8   SVOD_SCRB_CNT_GRP          int64   
 9   PAID_CHNL_CNT_GRP          int64   
 10  SCRB_PATH_NM_GRP           category
 11  INHOME_RATE                float64 
 12  AGMT_END_SEG               int64   
 13  TOTAL_USED_DAYS            int64   
 14  TV_SCRB                    int64   
 15  ANALOG_SCRB                int64   
 16  DIGITAL_SCRB               int64   
 17  TOTAL_INTERNET_SCRB        int64   
 18  GIGA_INTERNET_SCRB         int64   
 19  BUNDLE_YN          

In [19]:
mart_int = pd.read_pickle(MART_INT_PATH)
mart_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   actr_disp          470064 non-null  category      
 1   asset_nm           470064 non-null  category      
 2   asset_prod         470064 non-null  category      
 3   broad_ymd          466679 non-null  datetime64[ns]
 4   category           470064 non-null  category      
 5   created            470064 non-null  datetime64[ns]
 6   created_by         470064 non-null  category      
 7   crt_ymd            467340 non-null  datetime64[ns]
 8   ct_cl              470064 non-null  category      
 9   cts_id             470064 non-null  category      
 10  epsd_id            470064 non-null  category      
 11  epsd_no            470064 non-null  category      
 12  full_asset_id      470064 non-null  category      
 13  genre              470064 non-null  category

In [20]:
mart_int['asset_prod'].value_counts().head(10)
mart_int['asset_prod'], uniques = pd.factorize(mart_int['asset_prod'])

uniques


CategoricalIndex(['FOD', 'RVOD', 'SVOD'], categories=['FOD', 'RVOD', 'SVOD'], ordered=False, dtype='category')

In [21]:
mart_int = encode_asset_prod(mart_int)

In [22]:
mart_int['screen_tp'].value_counts().head(10)
mart_int['screen_tp'], uniques = pd.factorize(mart_int['screen_tp'])

uniques


CategoricalIndex(['HD', 'unknown', 'SD', 'UHD'], categories=['HD', 'SD', 'UHD', 'unknown'], ordered=False, dtype='category')

In [23]:
mart_int = encode_screen_tp(mart_int)

In [24]:
mart_int = encode_publctn_bit(mart_int)

In [25]:
mart_int.head(5)

Unnamed: 0,actr_disp,asset_nm,asset_prod,broad_ymd,category,created,created_by,crt_ymd,ct_cl,cts_id,epsd_id,epsd_no,full_asset_id,genre,genre_of_ct_cl,genre_of_ct_cl_cd,product_tp,prpgt_pri,publctn_rt,rate,rlse_year,screen_tp,studio_nm,super_asset_id,super_asset_nm,ttl
0,"양택조,김영환",건강 체조,-1,NaT,지역콘텐츠/CCS충북/건강 체조,1970-01-01 05:36:51.126152916,VODIN:2.0.3,2021-11-26,기타,CCS000001,건강 체조_1,1.0,CCS|CCS00000000000000101,기타,기타,0,10,1,1,0,1970-01-01 00:00:00.000002010,-1,CCS충북방송,null-00001,건강 체조,건강 체조
1,-,자연을 닮은 농부들,-1,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.223151741,VODIN:2.0.3,2021-12-23,기타,CCS000006,의림지뜰 사람들_1,1.0,CCS|CCS00000000000000601,다큐,기타,0,10,1,1,15,1970-01-01 00:00:00.000002021,-1,CCS충북방송,null-00001,자연을 닮은 농부들,자연을 닮은 농부들
2,-,가을의 추억,-1,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.225171925,VODIN:2.0.3,2021-12-25,기타,CCS000007,의림지뜰 사람들_2,2.0,CCS|CCS00000000000000701,다큐,기타,0,10,1,1,15,1970-01-01 00:00:00.000002021,-1,CCS충북방송,null-00002,가을의 추억,가을의 추억
3,-,덤벙김치 담그는 날,-1,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.225172138,VODIN:2.0.3,2021-12-25,기타,CCS000008,의림지뜰 사람들_3,3.0,CCS|CCS00000000000000801,다큐,기타,0,10,1,1,15,1970-01-01 00:00:00.000002021,-1,CCS충북방송,null-00003,덤벙김치 담그는 날,덤벙김치 담그는 날
4,-,새로운 도전의 시작,-1,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.225172359,VODIN:2.0.3,2021-12-25,기타,CCS000009,의림지뜰 사람들_4,4.0,CCS|CCS00000000000000901,다큐,기타,0,10,1,1,15,1970-01-01 00:00:00.000002021,-1,CCS충북방송,null-00004,새로운 도전의 시작,새로운 도전의 시작


In [26]:
mart_int_sum = get_column_summary(mart_int)
mart_int_sum   


Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,0,0.0,41758
1,asset_nm,category,0,0.0,360181
2,asset_prod,int8,0,0.0,1
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,0,0.0,20123
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,0,0.0,468748


In [27]:
l_users = list(log_int['sha2_hash'].unique())
t_users = list(tps_int['sha2_hash'].unique())

In [28]:
m_ids = list(mart_int['full_asset_id'].unique())
l_ids = list(log_int['asset'].unique())

In [29]:
tps_log_match = check_id_matching(
    tps_int['sha2_hash'],
    log_int['sha2_hash'],
    dataset_type="tps_vs_log"
)

1. 공통 ID 개수 : 581,223개
2. Log 매칭률   : 71.13% (Log 중 TPS 정보가 존재하는 비율)
3. TPS 매칭률  : 26.90% (TPS 중 Log 정보가 존재하는 비율)
 경고: Log에는 있으나 TPS에 없는 ID가 235,896개 있습니다.
   예시: ['0bf3eb33594b68bcac00d57fdde368921e6a9eed3cf86c0c714c9da343fbc366', '79a462dda78505d586abc8a5f0d408f8d47827a2005196eb8031036cd190ed7d', '87b20809ee31fda00b53406baa0cdd55f6c4bec01aeea0062f4eca8fcdaf8ed1', 'c0d237250b160a3e0a7e1640316bf0bb2e8ff7d481ecd4f6ca1b165b3cf9565e', 'b1c74e41716fd4c9a27aecca4832c00e9c50093b11a2ed039306183cdd978c8a']


In [30]:
mart_log_match = check_id_matching(
    mart_int['full_asset_id'],
    log_int['asset'],
    dataset_type="mart_vs_log"
)

1. 공통 ID 개수 : 438,961개
2. Log 매칭률   : 99.95% (Log 중 Mart 정보가 존재하는 비율)
3. Mart 매칭률  : 93.38% (Mart 중 Log 정보가 존재하는 비율)
 경고: Log에는 있으나 Mart에 없는 ID가 210개 있습니다.
   예시: ['cjc|M5168583LSVM89023601', 'CJIQ2205241711083557', 'CCS|CCS10000000000091701', 'CJPF2205241556083509', 'cjc|M5126592LSVK72696001']


In [32]:
log_prc_path  = "../data/processed/log_prc.pkl"
mart_prc_path = "../data/processed/mart_prc.pkl"
tps_prc_path  = "../data/processed/tps_prc.pkl"
log_int.to_pickle(log_prc_path)
mart_int.to_pickle(mart_prc_path)
tps_int.to_pickle(tps_prc_path)
print(f" 파일 저장 완료: {log_prc_path}")
print(f" 파일 저장 완료: {mart_prc_path}")
print(f" 파일 저장 완료: {tps_prc_path}")

 파일 저장 완료: ../data/processed/log_prc.pkl
 파일 저장 완료: ../data/processed/mart_prc.pkl
 파일 저장 완료: ../data/processed/tps_prc.pkl
