In [1]:
from pathlib import Path
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(".."))

BASE_DIR = Path().resolve().parent

DATA_DIR = BASE_DIR / "data"
RAW_DIR = DATA_DIR / "raw"
INTERIM_DIR = DATA_DIR / "interim"
PROCESSED_DIR = DATA_DIR / "processed"

MART_RAW_PATH = RAW_DIR / "mart_raw.csv"
LOG_RAW_PATH = RAW_DIR / "log_raw.pkl"
TPS_RAW_PATH = RAW_DIR / "tps_raw.csv"

LOG_INT_PATH = INTERIM_DIR / "log_int.pkl"
MART_INT_PATH = INTERIM_DIR / "mart_int.pkl"
TPS_INT_PATH = INTERIM_DIR / "tps_int.pkl"



In [2]:
mart_raw = pd.read_csv(MART_RAW_PATH, 
                       sep = ",", 
                       encoding = 'utf-8', 
                       engine="python",
                       on_bad_lines="skip")
mart_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 88 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   asset_id            470064 non-null  object 
 1   actr_disp           470060 non-null  object 
 2   asset_nm            470064 non-null  object 
 3   asset_prod          470064 non-null  object 
 4   aud                 0 non-null       float64
 5   audience_cnt        0 non-null       float64
 6   broad_ymd           466682 non-null  object 
 7   category            468827 non-null  object 
 8   chapter             468852 non-null  object 
 9   created             470064 non-null  int64  
 10  created_by          470064 non-null  object 
 11  crt_ymd             467690 non-null  object 
 12  ct_cl               470064 non-null  object 
 13  cts_id              468751 non-null  object 
 14  description         470064 non-null  object 
 15  director            470036 non-nul

In [4]:
log_raw = pd.read_pickle(LOG_INT_PATH)
log_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50804389 entries, 0 to 50804389
Data columns (total 5 columns):
 #   Column        Dtype   
---  ------        -----   
 0   sha2_hash     category
 1   asset         category
 2   use_tms       Int64   
 3   strt_dt       int64   
 4   disp_rtm_sec  Int64   
dtypes: Int64(2), category(2), int64(1)
memory usage: 2.0 GB


In [None]:
tps_raw = pd.read_csv(TPS_RAW_PATH,  
                        sep = "|", 
                        encoding = 'utf-8')
tps_raw.info()

In [None]:
import pandas as pd

# =========================
# 1. TPS
# =========================
tps_raw = pd.read_csv(
    TPS_RAW_PATH,
    sep="|",
    encoding="utf-8"
)

# categorical 있으면 문자열로 변환 (parquet 안정성)
for col in tps_raw.select_dtypes(include="category").columns:
    tps_raw[col] = tps_raw[col].astype(str)

tps_raw.to_parquet(
    "tps_raw.parquet",
    index=False
)

print("✅ tps_raw.parquet 저장 완료")








In [2]:
LOG_PRC_PATH = PROCESSED_DIR / "log_prc.pkl"
MART_PRC_PATH = PROCESSED_DIR / "mart_prc.pkl"
TPS_PRC_PATH = PROCESSED_DIR / "tps_prc.pkl"

In [3]:
from src.validate import check_id_matching

from src.clean import (
    drop_high_null_cols,
    drop_mart_cols,
    drop_log_cols,
    get_column_summary,
    clean_mart,
    clean_log,
)

from src.encoder import (
    encode_asset_prod,
    encode_screen_tp,
    encode_publctn_bit
)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


In [10]:
mart_raw = pd.read_csv(MART_RAW_PATH, 
                       sep = ",", 
                       encoding = 'utf-8', 
                       engine="python",
                       on_bad_lines="skip")
mart_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 88 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   asset_id            470064 non-null  object 
 1   actr_disp           470060 non-null  object 
 2   asset_nm            470064 non-null  object 
 3   asset_prod          470064 non-null  object 
 4   aud                 0 non-null       float64
 5   audience_cnt        0 non-null       float64
 6   broad_ymd           466682 non-null  object 
 7   category            468827 non-null  object 
 8   chapter             468852 non-null  object 
 9   created             470064 non-null  int64  
 10  created_by          470064 non-null  object 
 11  crt_ymd             467690 non-null  object 
 12  ct_cl               470064 non-null  object 
 13  cts_id              468751 non-null  object 
 14  description         470064 non-null  object 
 15  director            470036 non-nul

In [11]:
mart_raw, dropped_by_null = drop_high_null_cols(mart_raw, threshold=90)

print(f"Dropped {len(dropped_by_null)}")
dropped_by_null


Dropped 23


['aud',
 'audience_cnt',
 'disp_as_lst_chnc',
 'disp_as_new',
 'dsbtr_nm',
 'grade_score',
 'hash_tag',
 'mobile_watch_url',
 'one_line_review',
 'orgnl_air_dt',
 'prdcrs',
 'preview_file_nm',
 'preview_rate',
 'preview_rtm',
 'seasn_fin_fl',
 'seasn_prem_fl',
 'show_tp',
 'star_score',
 'sub_title',
 'svc_applied',
 'ttl_lng',
 'ttl_mdm',
 'ttl_sort_nm']

In [12]:
mart_raw_sum = get_column_summary(mart_raw)
mart_raw_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,asset_id,object,0,0.0,470064
1,actr_disp,object,4,0.0,41757
2,asset_nm,object,0,0.0,360181
3,asset_prod,object,0,0.0,3
4,broad_ymd,object,3382,0.72,10575
5,category,object,1237,0.26,20122
6,chapter,object,1212,0.26,17483
7,created,int64,0,0.0,465546
8,created_by,object,0,0.0,11
9,crt_ymd,object,2374,0.51,2218


In [13]:
mart_raw = drop_mart_cols(mart_raw)

In [14]:
mart_clean = clean_mart(mart_raw)

In [15]:
mart_clean_sum = get_column_summary(mart_clean)
mart_clean_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,0,0.0,41758
1,asset_nm,category,0,0.0,360181
2,asset_prod,category,0,0.0,3
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,0,0.0,20123
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,0,0.0,468748


In [16]:
mart_int_path = "../data/interim/mart_int.pkl"
mart_clean.to_pickle(mart_int_path)
print(f" 파일 저장 완료: {mart_int_path}")


 파일 저장 완료: ../data/interim/mart_int.pkl


In [4]:
log_raw = pd.read_pickle(LOG_RAW_PATH)
log_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50804390 entries, 0 to 50804389
Data columns (total 9 columns):
 #   Column          Dtype         
---  ------          -----         
 0   sha2_hash       object        
 1   asset           object        
 2   asset_nm        object        
 3   CT_CL           object        
 4   genre_of_ct_cl  object        
 5   use_tms         float64       
 6   disp_rtm        object        
 7   strt_dt         datetime64[ns]
 8   category        object        
dtypes: datetime64[ns](1), float64(1), object(7)
memory usage: 3.4+ GB


In [5]:
log_raw_sum = get_column_summary(log_raw)
log_raw_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,object,0,0.0,817119
1,asset,object,13997,0.03,439170
2,asset_nm,object,1157048,2.28,340030
3,CT_CL,object,1157048,2.28,15
4,genre_of_ct_cl,object,1157048,2.28,59
5,use_tms,float64,1,0.0,20494
6,disp_rtm,object,1157048,2.28,359
7,strt_dt,datetime64[ns],1,0.0,17365219
8,category,object,1183154,2.33,19419


In [6]:
log_raw = drop_log_cols(log_raw)

In [7]:
log_clean = clean_log(log_raw)

In [8]:
log_clean['strt_dt'] = log_clean['strt_dt'].dt.strftime('%Y%m')
log_clean = log_clean.dropna(subset=['strt_dt'])
log_clean['strt_dt'] = log_clean['strt_dt'].astype('int64')

In [9]:
log_clean_sum = get_column_summary(log_clean)
log_clean_sum

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,category,0,0.0,817118
1,asset,category,0,0.0,439171
2,use_tms,Int64,0,0.0,20494
3,strt_dt,int64,0,0.0,9
4,disp_rtm_sec,Int64,1157047,2.28,352


In [17]:
log_int_path = "../data/interim/log_int.pkl"
log_clean.to_pickle(log_int_path)
print(f" 파일 저장 완료: {log_int_path}")

 파일 저장 완료: ../data/interim/log_int.pkl


In [None]:
tps_raw = pd.read_csv(TPS_RAW_PATH,  
                        sep = "|", 
                        encoding = 'utf-8')
tps_raw.info()

In [None]:
tps_raw.info()

In [None]:
tps_raw.head(3)

In [None]:
obj_cols = tps_raw.select_dtypes(include="object").columns
tps_raw[obj_cols] = tps_raw[obj_cols].astype("category")


In [None]:
tps_raw["AGE_GRP10"] = tps_raw["AGE_GRP10"].astype("category")
tps_raw["KIDS_USE_PV_MONTH1"] = tps_raw["KIDS_USE_PV_MONTH1"].astype("float32")

In [None]:
tps_raw_sum = get_column_summary(tps_raw)
tps_raw_sum

In [None]:
tps_int_path = "../data/interim/tps_int.pkl"
tps_raw.to_pickle(tps_int_path)
print(f" 파일 저장 완료: {tps_int_path}")

- interim

In [18]:
log_int = pd.read_pickle(LOG_INT_PATH)
log_int.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50804389 entries, 0 to 50804389
Data columns (total 5 columns):
 #   Column        Dtype   
---  ------        -----   
 0   sha2_hash     category
 1   asset         category
 2   use_tms       Int64   
 3   strt_dt       int64   
 4   disp_rtm_sec  Int64   
dtypes: Int64(2), category(2), int64(1)
memory usage: 2.0 GB


In [None]:
tps_int = pd.read_pickle(TPS_INT_PATH)
tps_int.info()

In [19]:
mart_int = pd.read_pickle(MART_INT_PATH)
mart_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   actr_disp          470064 non-null  category      
 1   asset_nm           470064 non-null  category      
 2   asset_prod         470064 non-null  category      
 3   broad_ymd          466679 non-null  datetime64[ns]
 4   category           470064 non-null  category      
 5   created            470064 non-null  datetime64[ns]
 6   created_by         470064 non-null  category      
 7   crt_ymd            467340 non-null  datetime64[ns]
 8   ct_cl              470064 non-null  category      
 9   cts_id             470064 non-null  category      
 10  disp_rtm           470064 non-null  object        
 11  epsd_id            470064 non-null  category      
 12  epsd_no            470064 non-null  category      
 13  full_asset_id      470064 non-null  category

In [20]:
mart_int['screen_tp'].value_counts().head(10)
mart_int['screen_tp'], uniques = pd.factorize(mart_int['screen_tp'])

uniques


CategoricalIndex(['HD', 'unknown', 'SD', 'UHD'], categories=['HD', 'SD', 'UHD', 'unknown'], ordered=False, dtype='category')

In [21]:
mart_int = encode_screen_tp(mart_int)

In [22]:
mart_int = encode_publctn_bit(mart_int)

In [None]:
mart_int.head(5)

In [23]:
mart_int_sum = get_column_summary(mart_int)
mart_int_sum   


Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,0,0.0,41758
1,asset_nm,category,0,0.0,360181
2,asset_prod,category,0,0.0,3
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,0,0.0,20123
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,0,0.0,468748


In [None]:
l_users = list(log_int['sha2_hash'].unique())
t_users = list(tps_int['sha2_hash'].unique())

In [None]:
m_ids = list(mart_int['full_asset_id'].unique())
l_ids = list(log_int['asset'].unique())

In [None]:
tps_log_match = check_id_matching(
    tps_int['sha2_hash'],
    log_int['sha2_hash'],
    dataset_type="tps_vs_log"
)

In [24]:
mart_log_match = check_id_matching(
    mart_int['full_asset_id'],
    log_int['asset'],
    dataset_type="mart_vs_log"
)

1. 공통 ID 개수 : 438,961개
2. Log 매칭률   : 99.95% (Log 중 Mart 정보가 존재하는 비율)
3. Mart 매칭률  : 93.38% (Mart 중 Log 정보가 존재하는 비율)
 경고: Log에는 있으나 Mart에 없는 ID가 210개 있습니다.
   예시: ['cjc|M5126166LSVK72734901', 'www.hchoice.co.kr|M5187962LFOM46592401', 'CJQA2211301727084678', 'cjc|CJPS2305251829085541', 'cjc|M5138767LSVL35826201']


In [25]:
mart_prc_path = "../data/processed/mart_prc.pkl"
mart_int.to_pickle(mart_prc_path)
log_prc_path  = "../data/processed/log_prc.pkl"
log_int.to_pickle(log_prc_path)
print(f" 파일 저장 완료: {log_prc_path}")
print(f" 파일 저장 완료: {mart_prc_path}")

 파일 저장 완료: ../data/processed/log_prc.pkl
 파일 저장 완료: ../data/processed/mart_prc.pkl


In [None]:
tps_prc_path  = "../data/processed/tps_prc.pkl"
tps_int.to_pickle(tps_prc_path)

print(f" 파일 저장 완료: {tps_prc_path}")