In [3]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import pandas as pd
from src.load import load_csv,load_pickle
from src.config import RAW_MART_PATH,RAW_LOG_PATH,INTERIM_MART_PATH
from src.clean import (
    drop_high_null_cols,
    drop_unused_mart_cols,
    drop_unused_log_cols,
    get_column_summary,
    clean_mart,
    clean_log
)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


In [4]:
mart_df = load_csv(RAW_MART_PATH)
mart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470064 entries, 0 to 470063
Data columns (total 88 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   asset_id            470064 non-null  object 
 1   actr_disp           470060 non-null  object 
 2   asset_nm            470064 non-null  object 
 3   asset_prod          470064 non-null  object 
 4   aud                 0 non-null       float64
 5   audience_cnt        0 non-null       float64
 6   broad_ymd           466682 non-null  object 
 7   category            468827 non-null  object 
 8   chapter             468852 non-null  object 
 9   created             470064 non-null  int64  
 10  created_by          470064 non-null  object 
 11  crt_ymd             467690 non-null  object 
 12  ct_cl               470064 non-null  object 
 13  cts_id              468751 non-null  object 
 14  description         470064 non-null  object 
 15  director            470036 non-nul

In [5]:
mart_df, dropped_by_null = drop_high_null_cols(mart_df, threshold=90)

print(f"Dropped {len(dropped_by_null)}")
dropped_by_null


Dropped 23


['aud',
 'audience_cnt',
 'disp_as_lst_chnc',
 'disp_as_new',
 'dsbtr_nm',
 'grade_score',
 'hash_tag',
 'mobile_watch_url',
 'one_line_review',
 'orgnl_air_dt',
 'prdcrs',
 'preview_file_nm',
 'preview_rate',
 'preview_rtm',
 'seasn_fin_fl',
 'seasn_prem_fl',
 'show_tp',
 'star_score',
 'sub_title',
 'svc_applied',
 'ttl_lng',
 'ttl_mdm',
 'ttl_sort_nm']

In [6]:
mart_df = drop_unused_mart_cols(mart_df)


In [7]:
col_summary = get_column_summary(mart_df)
col_summary


Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,object,4,0.0,41757
1,asset_nm,object,0,0.0,360181
2,asset_prod,object,0,0.0,3
3,broad_ymd,object,3382,0.72,10575
4,category,object,1237,0.26,20122
5,created,int64,0,0.0,465546
6,created_by,object,0,0.0,11
7,crt_ymd,object,2374,0.51,2218
8,ct_cl,object,0,0.0,15
9,cts_id,object,1313,0.28,468747


In [8]:
clean_mart_df = clean_mart(mart_df)

In [9]:
clean_mart_summary = get_column_summary(clean_mart_df)
clean_mart_summary


Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,4,0.0,41757
1,asset_nm,category,0,0.0,360181
2,asset_prod,category,0,0.0,3
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,1237,0.26,20122
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,1313,0.28,468747


In [10]:
clean_mart_df.head(3)

Unnamed: 0,actr_disp,asset_nm,asset_prod,broad_ymd,category,created,created_by,crt_ymd,ct_cl,cts_id,epsd_id,epsd_no,full_asset_id,genre,genre_of_ct_cl,genre_of_ct_cl_cd,lt_inclsn_fl,product_tp,prpgt_pri,publctn_rt,rate,rlse_year,screen_tp,studio_nm,super_asset_id
0,"양택조,김영환",건강 체조,FOD,NaT,지역콘텐츠/CCS충북/건강 체조,1970-01-01 05:36:51.126152916,VODIN:2.0.3,2021-11-26,기타,CCS000001,건강 체조_1,1.0,CCS|CCS00000000000000101,기타,기타,0,0,10,1,TV,0,1970-01-01 00:00:00.000002010,HD,CCS충북방송,null-00001
1,-,자연을 닮은 농부들,FOD,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.223151741,VODIN:2.0.3,2021-12-23,기타,CCS000006,의림지뜰 사람들_1,1.0,CCS|CCS00000000000000601,다큐,기타,0,0,10,1,TV,15,1970-01-01 00:00:00.000002021,HD,CCS충북방송,null-00001
2,-,가을의 추억,FOD,NaT,지역콘텐츠/CCS충북/의림지뜰 사람들,1970-01-01 05:36:51.225171925,VODIN:2.0.3,2021-12-25,기타,CCS000007,의림지뜰 사람들_2,2.0,CCS|CCS00000000000000701,다큐,기타,0,0,10,1,TV,15,1970-01-01 00:00:00.000002021,HD,CCS충북방송,null-00002


In [11]:
clean_mart_df['publctn_rt'].unique()

['TV', NaN, 'TV,인터넷,모바일', 'TV,모바일', 'TV,모바일,인터넷', 'TV,인터넷', 'TV,TV,모바일,모바일,인터넷,인터넷', '모바일,인터넷', 'TV,TV']
Categories (8, object): ['TV', 'TV,TV', 'TV,TV,모바일,모바일,인터넷,인터넷', 'TV,모바일', 'TV,모바일,인터넷', 'TV,인터넷', 'TV,인터넷,모바일', '모바일,인터넷']

In [12]:
interim_path = "../data/interim/interim_mart.pkl"
clean_mart_df.to_pickle(interim_path)

print(f" 파일 저장 완료: {interim_path}")


 파일 저장 완료: ../data/interim/interim_mart.pkl


In [13]:
from src.load import load_pickle

In [14]:
log_df  = load_pickle(RAW_LOG_PATH)

In [15]:
log_df.head(10)

Unnamed: 0,sha2_hash,asset,asset_nm,CT_CL,genre_of_ct_cl,use_tms,disp_rtm,strt_dt,category
0,992c0dd6bafc5df33e86ece4885d574d25288d06530c65...,cjc|M4767613LSGK41566601,날 녹여주오 04회,TV드라마,외화 시리즈,3660.0,01:01,2023-01-23 17:08:58,CJ ENM/CJENM구작/날 녹여주오
1,ea62338ac5b6b11cf02ef8bf1889d1a063cec2c2493937...,cjc|M5140475LSGL08601501,압꾸정,영화,코미디,2.0,01:51,2023-01-23 22:28:46,영화/(HD)극장동시상영관
2,36aa302f3705794f5c2e5a971f38c8ef3c5a789e915c94...,cjc|M5079740LSVK14589601,어웨이크(2021),영화,공포/스릴러,7.0,01:14,2023-01-23 21:09:40,영화월정액관/무비n시리즈/범죄공포
3,d1ff76342bbc4f23f82c318b1f7a0ffc78f0b5ad2d1314...,cjc|M4574740LFOL08934201,동글동글 동물친구 시즌2 01회,키즈,기타,60.0,00:01,2023-01-23 09:05:10,키즈어린이/영어-놀이학습/동글동글 동물친구 시즌2
4,8cf3e070a173520f59e24b246ef83a1140c26d260c4ddd...,cjc|M5055522LFOI39238101,1박2일 시즌4 127회(22/05/29),TV 연예/오락,기타,157.0,01:19,2023-01-23 21:30:28,KBS/(HD)KBS 연예오락/1박2일 시즌4
5,7989cd8e8272b9dbfe94a7253809a7d64d22ce0449ca45...,cjc|M4818070LSVH73774901,학려화정 60회.,TV드라마,외화 시리즈,7.0,00:42,2023-01-23 03:24:27,해외드라마/무비n시리즈/학려화정
6,6c2fcacab6bba01b83d912bb6241f00d0296a3dd9b3add...,cjc|M5148050LSGL25460601,(할인)탑건: 매버릭,영화,액션/어드벤쳐,7571.0,02:10,2023-01-23 19:26:08,오늘의추천/반값할인관1
7,6939dee90e40ace55463b17e418084f13f5262886aec3e...,cjc|M5042788LSGL28317101,(HD)런닝맨 638회(23/01/22),TV 연예/오락,기타,5280.0,01:28,2023-01-23 10:16:48,SBS/(HD)SBS 연예오락/(HD)런닝맨
8,2b365d07914ab58d3a99e014ba7bc24f0ea7335ecf24cb...,cjc|M5035408LFOJ17822601,(HD)복면가왕 367회(22/07/31),TV 연예/오락,기타,5079.0,01:25,2023-01-23 19:29:43,MBC/(HD)MBC 연예오락/(HD)복면가왕
9,158e438e7d68ed90deb7dd3bc747ac552ba5a9032071a3...,cjc|M5010409LSVI64860301,(더빙)겜브링TV NEW 어드벤처 06회(G),TV애니메이션,기타,1140.0,00:19,2023-01-23 10:01:47,애니메이션/게임애니팩토리/(더빙)겜브링TV NEW 어드벤처


In [16]:
log_df_summary = get_column_summary(log_df)
log_df_summary

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,object,0,0.0,817119
1,asset,object,13997,0.03,439170
2,asset_nm,object,1157048,2.28,340030
3,CT_CL,object,1157048,2.28,15
4,genre_of_ct_cl,object,1157048,2.28,59
5,use_tms,float64,1,0.0,20494
6,disp_rtm,object,1157048,2.28,359
7,strt_dt,datetime64[ns],1,0.0,17365219
8,category,object,1183154,2.33,19419


In [17]:

log_df= drop_unused_log_cols(log_df)

In [18]:
clean_log_df = clean_log(log_df)

In [19]:
clean_log_summary = get_column_summary(clean_log_df)
clean_log_summary

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,category,0,0.0,817119
1,asset,category,13997,0.03,439170
2,use_tms,Int64,1,0.0,20494
3,strt_dt,datetime64[ns],1,0.0,17365219
4,disp_rtm_sec,Int64,1157048,2.28,352


In [20]:
interim_path = "../data/interim/interim_log.pkl"
clean_log_df.to_pickle(interim_path)

print(f" 파일 저장 완료: {interim_path}")


 파일 저장 완료: ../data/interim/interim_log.pkl


- interim

In [21]:
from src.load import load_interim_file

In [22]:
interim_mart = load_interim_file(file_type="mart")

In [23]:
interim_log = load_interim_file(file_type="log")




In [24]:
interim_log.head(10)

Unnamed: 0,sha2_hash,asset,use_tms,strt_dt,disp_rtm_sec
0,992c0dd6bafc5df33e86ece4885d574d25288d06530c65...,cjc|M4767613LSGK41566601,3660,2023-01-23 17:08:58,61
1,ea62338ac5b6b11cf02ef8bf1889d1a063cec2c2493937...,cjc|M5140475LSGL08601501,2,2023-01-23 22:28:46,111
2,36aa302f3705794f5c2e5a971f38c8ef3c5a789e915c94...,cjc|M5079740LSVK14589601,7,2023-01-23 21:09:40,74
3,d1ff76342bbc4f23f82c318b1f7a0ffc78f0b5ad2d1314...,cjc|M4574740LFOL08934201,60,2023-01-23 09:05:10,1
4,8cf3e070a173520f59e24b246ef83a1140c26d260c4ddd...,cjc|M5055522LFOI39238101,157,2023-01-23 21:30:28,79
5,7989cd8e8272b9dbfe94a7253809a7d64d22ce0449ca45...,cjc|M4818070LSVH73774901,7,2023-01-23 03:24:27,42
6,6c2fcacab6bba01b83d912bb6241f00d0296a3dd9b3add...,cjc|M5148050LSGL25460601,7571,2023-01-23 19:26:08,130
7,6939dee90e40ace55463b17e418084f13f5262886aec3e...,cjc|M5042788LSGL28317101,5280,2023-01-23 10:16:48,88
8,2b365d07914ab58d3a99e014ba7bc24f0ea7335ecf24cb...,cjc|M5035408LFOJ17822601,5079,2023-01-23 19:29:43,85
9,158e438e7d68ed90deb7dd3bc747ac552ba5a9032071a3...,cjc|M5010409LSVI64860301,1140,2023-01-23 10:01:47,19


In [25]:
interim_log_summary = get_column_summary(interim_log)
interim_log_summary




Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,sha2_hash,category,0,0.0,817119
1,asset,category,13997,0.03,439170
2,use_tms,Int64,1,0.0,20494
3,strt_dt,datetime64[ns],1,0.0,17365219
4,disp_rtm_sec,Int64,1157048,2.28,352


In [26]:
interim_mart_summary = get_column_summary(interim_mart)
interim_mart_summary

Unnamed: 0,column,dtype,null_cnt,null_rate,unique_cnt
0,actr_disp,category,4,0.0,41757
1,asset_nm,category,0,0.0,360181
2,asset_prod,category,0,0.0,3
3,broad_ymd,datetime64[ns],3385,0.72,10572
4,category,category,1237,0.26,20122
5,created,datetime64[ns],0,0.0,465546
6,created_by,category,0,0.0,11
7,crt_ymd,datetime64[ns],2724,0.58,2184
8,ct_cl,category,0,0.0,15
9,cts_id,category,1313,0.28,468747


In [27]:
interim_mart['asset_prod'].value_counts().head(10)
interim_mart['asset_prod'], uniques = pd.factorize(interim_mart['asset_prod'])

uniques


CategoricalIndex(['FOD', 'RVOD', 'SVOD'], categories=['FOD', 'RVOD', 'SVOD'], ordered=False, dtype='category')

In [28]:
from src.schema import ASSET_PROD_MAP

def encode_asset_prod(df: pd.DataFrame):
    df['asset_prod_code'] = (
        df['asset_prod']
        .map(ASSET_PROD_MAP)
        .fillna(-1)
        .astype("int8")
    )
    return df

In [29]:
interim_mart['screen_tp'].value_counts().head(10)
interim_mart['screen_tp'], uniques = pd.factorize(interim_mart['screen_tp'])

uniques


CategoricalIndex(['HD', 'SD', 'UHD'], categories=['HD', 'SD', 'UHD'], ordered=False, dtype='category')

In [30]:
def encode_screen_tp(df: pd.DataFrame):
    df['screen_tp_code'] = (
        df['screen_tp']
        .map(SCREEN_TP_MAP)
        .fillna(-1)
        .astype("int8")
    )
    return df

In [31]:
interim_mart['screen_tp'].unique()

array([ 0, -1,  1,  2])