# 월별 VOD 시청기록 전처리 + 영화 시청기록 1월~11월 병합

<br><hr>

## 00. 기본 설정

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warning output.
# NOTE(review): a blanket "ignore" filter also hides deprecation warnings
# raised by the libraries used in later cells.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 한글 font 설정
import platform
import matplotlib.font_manager as fm

# Work around broken Korean glyphs in matplotlib figures.
# ------------------------------------------------------------------------------------
# Korean-capable font family for each supported operating system, keyed by
# the value returned by platform.system(). Any other platform keeps the
# font matplotlib is already configured with.
korean_font_by_os = {
    'Darwin': 'AppleGothic',     # macOS
    'Windows': 'Malgun Gothic',  # Windows
}

korean_font = korean_font_by_os.get(platform.system())
if korean_font is not None:
    plt.rc('font', family=korean_font)

# Keep the minus sign from breaking when a Korean font is active.
plt.rcParams['axes.unicode_minus'] = False

In [3]:
# Render inline figures in "retina" format so text and lines look sharp.
#
# IPython.display.set_matplotlib_formats is deprecated (IPython 7.23+) in
# favour of the matplotlib_inline implementation, so try that first and
# fall back to the old location on environments that do not ship it.
try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
    from IPython.display import set_matplotlib_formats

set_matplotlib_formats("retina")

<br><hr>

## 01. 데이터 불러오기

In [5]:
# Load the 2023-11 VOD viewing log directly from its zipped CSV.
zip_file_path = "../data/vods/202311_VOD.zip"
m11 = pd.read_csv(filepath_or_buffer=zip_file_path)

# (rows, columns) -- 6,416,074 viewing records
row_count, column_count = m11.shape
print((row_count, column_count))

(6416074, 9)


In [6]:
# Preview the first row to check the column layout.
m11.head(1)

Unnamed: 0,sha2_hash,asset,asset_nm,CT_CL,genre_of_ct_cl,use_tms,disp_rtm,strt_dt,category
0,f4234153897c4c350b6b6d45cb0aea9720604033e99e46...,cjc|M5151337LFON18981801,(FREE)천룡팔부: 교봉전(무료),기타,기타,15,02:10,20231118114124,프리미엄 무료관/무료영화/무료 영화관


<br><hr>

## 02. 전처리

In [8]:
# Number of viewing records per subscriber (hashed ID), largest count first.
subscriber_ids = m11['sha2_hash']
subsr_value_counts = subscriber_ids.value_counts()
subsr_value_counts

# 300,098 distinct subscribers

sha2_hash
e97fd19c59c982cec8893fbbe72b4452214f6c0354da06fe52c84f449f6319be    8776
f86007dc2cc79681797a0e54481eeaab58d2d4884fb0f5efbc91aebd97ae6cb3    7631
fd26111ff64bd03c0c101ea5e1605a2990aa22dfa28f39b9033e4d725eb50f55    6995
40c9accc74b3e87a22e8fa30a0482c543f5c1a938f54ea71d2ef4b0ffbfad734    6704
dcffd93da703598c248d2bddf65061a4ccd34692628a5151bd2ddee7911efdd5    6443
                                                                    ... 
84a0360cf74ec4c36aac6be1e1005575191658c8a04811eb7819e733691c1204       1
5992cac24ee17d3fdb03550d6b80b716a6822e2cff962b5d803704d659e63944       1
1700a0d597a69a67afbf05de4f32f14c3dcf99c715e7e35d8f39687a543c542a       1
7309e4db35dd5585cfe98f64cc7c6f309937a81830d97bd40e91f5703f9bba89       1
9e6cd41990d791c8877f45e2e9bcef4f2b80e1fb93ddb490da2dbff41722b7fe       1
Name: count, Length: 300098, dtype: int64

In [9]:
# Subscribers with exactly one viewing record.
watched_once = subsr_value_counts.eq(1)
unique_count = watched_once.sum()
unique_count

# 61,172 of 300,098 subscribers -- roughly one in five

61172

In [13]:
# Subscribers with at most `max_views` viewing records.
# The threshold is named so the comment and the comparison cannot drift apart.
max_views = 27
unique_count = (subsr_value_counts <= max_views).sum()
unique_count
# 256,235 of 300,098 subscribers have 27 or fewer viewing records

256235

In [17]:
# Subscribers with at most `max_views` viewing records.
# The threshold is named so the comment and the comparison cannot drift apart.
max_views = 5
unique_count = (subsr_value_counts <= max_views).sum()
unique_count
# 153,190 of 300,098 subscribers have 5 or fewer viewing records

153190

<br><hr>

## 03. 1월~11월 영화 시청기록 병합

In [41]:
# Zipped CSV viewing logs for 2023-11, 2023-10 and 2023-09.
m11_file_path = "../data/vods/202311_VOD.zip"
m10_file_path = "../data/vods/202310_VOD.zip"
m09_file_path = "../data/vods/202309_VOD.zip"

# Read the three months in the same order as the paths above.
m11, m10, m09 = (
    pd.read_csv(monthly_path)
    for monthly_path in (m11_file_path, m10_file_path, m09_file_path)
)

# (rows, columns) of each month, newest first.
for monthly_log in (m11, m10, m09):
    print(monthly_log.shape)

(6416074, 9)
(1178159, 9)
(6919315, 9)


In [42]:
# Distinct content-class labels (CT_CL) found in each month, newest first.
for monthly_log in (m11, m10, m09):
    print(monthly_log['CT_CL'].unique())

['기타' 'TV드라마' 'TV애니메이션' '키즈' '영화' 'TV 연예/오락' 'TV 시사/교양' '성인' '다큐' '우리동네'
 '라이프' '스포츠' '교육' nan '미분류']
['TV애니메이션' '키즈' 'TV드라마' 'TV 시사/교양' '영화' '다큐' 'TV 연예/오락' '성인' '기타' '우리동네'
 '라이프' '스포츠' '미분류' nan '교육' '공연/음악']
['TV드라마' 'TV 연예/오락' '영화' '성인' 'TV애니메이션' '키즈' 'TV 시사/교양' '기타' '우리동네' '다큐'
 '라이프' '미분류' '교육' '스포츠' nan '공연/음악']


In [43]:
# Keep only the rows whose content class (CT_CL) is '영화' (movie).
m11_movies = m11.loc[m11['CT_CL'].eq('영화')]
m10_movies = m10.loc[m10['CT_CL'].eq('영화')]
m09_movies = m09.loc[m09['CT_CL'].eq('영화')]

# Report how many movie rows each month contributes.
for label, movies in (
    ('m11 shape:', m11_movies),
    ('m10 shape:', m10_movies),
    ('m09 shape:', m09_movies),
):
    print(label, movies.shape)

m11 shape: (1422513, 9)
m10 shape: (250182, 9)
m09 shape: (1373404, 9)


In [44]:
# Zipped CSV viewing logs for 2023-08, 2023-07 and 2023-06.
m08_file_path = "../data/vods/202308_VOD.zip"
m07_file_path = "../data/vods/202307_VOD.zip"
m06_file_path = "../data/vods/202306_VOD.zip"

# Read the three months in the same order as the paths above.
m08, m07, m06 = (
    pd.read_csv(monthly_path)
    for monthly_path in (m08_file_path, m07_file_path, m06_file_path)
)

# (rows, columns) of each month, newest first.
for monthly_log in (m08, m07, m06):
    print(monthly_log.shape)

In [45]:
# Keep only the rows whose content class (CT_CL) is '영화' (movie).
m08_movies = m08.loc[m08['CT_CL'].eq('영화')]
m07_movies = m07.loc[m07['CT_CL'].eq('영화')]
m06_movies = m06.loc[m06['CT_CL'].eq('영화')]

# Report how many movie rows each month contributes.
for label, movies in (
    ('m08 shape:', m08_movies),
    ('m07 shape:', m07_movies),
    ('m06 shape:', m06_movies),
):
    print(label, movies.shape)

m08 shape: (1579962, 9)
m07 shape: (1890441, 9)
m06 shape: (1849426, 9)


In [46]:
# Zipped CSV viewing logs for 2023-05, 2023-04 and 2023-03.
m05_file_path = "../data/vods/202305_VOD.zip"
m04_file_path = "../data/vods/202304_VOD.zip"
m03_file_path = "../data/vods/202303_VOD.zip"

# Read the three months in the same order as the paths above.
m05, m04, m03 = (
    pd.read_csv(monthly_path)
    for monthly_path in (m05_file_path, m04_file_path, m03_file_path)
)

# (rows, columns) of each month, newest first.
for monthly_log in (m05, m04, m03):
    print(monthly_log.shape)

(7712374, 9)
(1971241, 9)
(7662099, 9)


In [47]:
# Keep only the rows whose content class (CT_CL) is '영화' (movie).
m05_movies = m05.loc[m05['CT_CL'].eq('영화')]
m04_movies = m04.loc[m04['CT_CL'].eq('영화')]
m03_movies = m03.loc[m03['CT_CL'].eq('영화')]

# Report how many movie rows each month contributes.
for label, movies in (
    ('m05 shape:', m05_movies),
    ('m04 shape:', m04_movies),
    ('m03 shape:', m03_movies),
):
    print(label, movies.shape)

m05 shape: (1775975, 9)
m04 shape: (463937, 9)
m03 shape: (1878906, 9)


In [48]:
# Zipped CSV viewing logs for 2023-02 and 2023-01.
m02_file_path = "../data/vods/202302_VOD.zip"
m01_file_path = "../data/vods/202301_VOD.zip"

# Read the two months in the same order as the paths above.
m02, m01 = (
    pd.read_csv(monthly_path)
    for monthly_path in (m02_file_path, m01_file_path)
)

# (rows, columns) of each month, newest first.
for monthly_log in (m02, m01):
    print(monthly_log.shape)

(7350110, 9)
(6325398, 9)


In [49]:
# Keep only the rows whose content class (CT_CL) is '영화' (movie).
m02_movies = m02.loc[m02['CT_CL'].eq('영화')]
m01_movies = m01.loc[m01['CT_CL'].eq('영화')]

# Report how many movie rows each month contributes.
for label, movies in (
    ('m02 shape:', m02_movies),
    ('m01 shape:', m01_movies),
):
    print(label, movies.shape)

m02 shape: (1593025, 9)
m01 shape: (1257149, 9)


In [50]:
# The eleven monthly movie frames, ordered from November back to January.
dataframes = [
    m11_movies,
    m10_movies,
    m09_movies,
    m08_movies,
    m07_movies,
    m06_movies,
    m05_movies,
    m04_movies,
    m03_movies,
    m02_movies,
    m01_movies,
]

In [51]:
# Stack the monthly movie frames row-wise and renumber the index from 0.
combined_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

# (rows, columns) of the merged frame.
merged_shape = combined_df.shape
print(merged_shape)

(15334920, 9)


In [52]:
# Write the merged movie viewing log to a UTF-8 CSV without the row index.
output_csv_path = '../data/2023_VOD_movies.csv'
combined_df.to_csv(output_csv_path, encoding='utf8', index=False)