In [20]:
from pathlib import Path
import pandas as pd
import sys
import os
# Make the parent of the working directory importable from this notebook.
sys.path.append(os.path.abspath(".."))

# Project root: the parent of the directory the notebook is run from.
BASE_DIR = Path(".").resolve().parent

# Data directory layout under the project root.
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
INTERIM_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Pickled frames that the loading cell reads.
LOG_PRC_PATH = PROCESSED_DIR.joinpath("log_prc.pkl")
MART_PRC_PATH = PROCESSED_DIR.joinpath("mart_prc.pkl")
TPS_PRC_PATH = PROCESSED_DIR.joinpath("tps_prc.pkl")



In [21]:
# Remove both display limits so frames render without truncation.
# Caution: with max_rows unset, displaying a very large frame directly tries
# to render every row — always go through .head() / .sample() first.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

In [22]:
# Load the three preprocessed frames from the processed-data directory.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by this project's own pipeline.
log, mart, tps = (
    pd.read_pickle(pkl_path)
    for pkl_path in (LOG_PRC_PATH, MART_PRC_PATH, TPS_PRC_PATH)
)

In [None]:
# Print the schema summary (columns, dtypes, memory usage) of the log frame.
log.info()

In [None]:
# Print the schema summary (columns, dtypes, memory usage) of the mart frame.
mart.info()

In [None]:
# Print the schema summary (columns, dtypes, memory usage) of the tps frame.
tps.info()

In [23]:
# Log columns kept for the analysis.
log_base = log.loc[:, ['sha2_hash', 'use_tms', 'asset']]

# Mart columns joined onto the log; full_asset_id is the join key.
mart_base = mart.loc[:, [
    'full_asset_id',
    'actr_disp',
    'genre',
    'asset_nm',
    'super_asset_nm',
    'ttl',
    'asset_prod',
]]


In [24]:
# Attach mart metadata to every log row. The left join keeps all log rows,
# including those whose asset has no match in the mart.
log_mart = pd.merge(
    log_base,
    mart_base,
    how='left',
    left_on='asset',
    right_on='full_asset_id',
)
# The mart-side key is no longer needed once the join is done.
log_mart = log_mart.drop(columns=['full_asset_id'])

In [25]:
# Store the asset id as a categorical column.
log_mart['asset'] = log_mart['asset'].astype('category')
# Print the schema and memory footprint of the joined frame.
log_mart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50804390 entries, 0 to 50804389
Data columns (total 9 columns):
 #   Column          Dtype   
---  ------          -----   
 0   sha2_hash       category
 1   use_tms         Int64   
 2   asset           category
 3   actr_disp       category
 4   genre           category
 5   asset_nm        category
 6   super_asset_nm  category
 7   ttl             category
 8   asset_prod      float64 
dtypes: Int64(1), category(7), float64(1)
memory usage: 2.0 GB


덕질 변수 1번

In [26]:
# Keywords that mark a program as trot-related (genre terms and show titles).
trot_keywords = [
    '트로트',
    '트롯',
    '성인가요',
    '사랑의 콜센타',
    '뽕숭아학당',
    '화요일은 밤이 좋아',
    '전국노래자랑',
    '불타는 장미단',
    '미스터 로또',
    '트랄랄라 브라더스',
]

# Single regex alternation that matches any one of the keywords.
pattern = '|'.join(trot_keywords)

# Rows whose program name contains at least one keyword; the column is cast
# to str before matching and non-matching / missing values count as False.
is_trot_title = log_mart['super_asset_nm'].astype(str).str.contains(pattern, na=False)
tps_trot = log_mart[is_trot_title]

# Preview the matched program names with their genre.
tps_trot[['super_asset_nm', 'genre']].head(20)



Unnamed: 0,super_asset_nm,genre
18,미스터트롯2,연예오락
70,미스터트롯2,연예오락
130,미스터트롯2,연예오락
174,더트롯 연예뉴스,기타
205,미스터트롯2,연예오락
283,불타는 트롯맨,연예오락
352,미스터트롯2,연예오락
455,미스터트롯2,연예오락
509,미스터트롯2,연예오락
513,미스터트롯2,연예오락


In [27]:
# Customer attributes taken from tps: age band and cancellation status.
tps_base = tps.loc[:, ['sha2_hash', 'AGE_GRP10', 'cancel_yn']]

# One row per customer: keep the first occurrence of each sha2_hash.
tps_base_unique = tps_base.drop_duplicates(subset='sha2_hash')

# Index by sha2_hash once and derive both lookup Series from it.
_tps_by_hash = tps_base_unique.set_index('sha2_hash')
age_map = _tps_by_hash['AGE_GRP10']
churn_map = _tps_by_hash['cancel_yn']

# Attach age band and cancellation status to every log row via its sha2_hash.
log_mart['AGE_GRP10'] = log_mart['sha2_hash'].map(age_map)
log_mart['is_churn'] = log_mart['sha2_hash'].map(churn_map)

# Drop log rows for which either attribute is missing.
log_mart = log_mart.dropna(subset=['AGE_GRP10', 'is_churn'])

# With the missing values gone, the age band can be stored as a plain int.
log_mart['AGE_GRP10'] = log_mart['AGE_GRP10'].astype(int)


In [None]:
# Re-check the schema after AGE_GRP10 and is_churn were added and rows dropped.
log_mart.info()

In [30]:
# Restrict the log to the 50 / 60 / 70 age bands.
target_age = [50, 60, 70]
in_target_age = log_mart['AGE_GRP10'].isin(target_age)
target_df = log_mart[in_target_age]

# Within that segment, keep rows whose program name contains a trot keyword.
trot_pattern = '|'.join(trot_keywords)
title_text = target_df['super_asset_nm'].astype(str)
trot_df = target_df[title_text.str.contains(trot_pattern, na=False)]

# Distinct (viewer, program, asset_prod, age band) combinations among those rows.
trot_assets = (
    trot_df
    .loc[:, ['sha2_hash', 'super_asset_nm', 'asset_prod', 'AGE_GRP10']]
    .drop_duplicates()
)

# Preview the result.
trot_assets.head(20)


Unnamed: 0,sha2_hash,super_asset_nm,asset_prod,AGE_GRP10
130,44e254acf49adf83f8daf31aa05d027a7437f3ac630ea8...,미스터트롯2,-1.0,70
174,7c2b61c7cb47f101d491526ffc1a5f0e44a35786110641...,더트롯 연예뉴스,-1.0,60
352,bae744192cf442e00bbe3aa73e72ba26bc08d266c78812...,미스터트롯2,-1.0,50
455,16ec6424238f0dd1388353a1e3c95978c63a0881776361...,미스터트롯2,-1.0,50
1119,b752d439aa0ff5b1ba9f9c26203fd05f0c6bf529ceaeaf...,미스터트롯2,-1.0,60
1527,5fcdc60fb7cb6f5110b97cd7bb3240c0c153b30b58f192...,미스터트롯2,-1.0,60
1562,848f240ded54c63ce581caefa3113311a571bc0afbea12...,미스터트롯2,-1.0,50
1612,5643783a94d34a9421cb0ad472227812687b95677e2866...,사랑의 콜센타,-1.0,70
2178,c2025db945f598db66fc984a1923266f35019c077b3d09...,미스터트롯,-1.0,60
2186,3da78c6bcd784cb40a0ecc862b52542f0e03b45f3f7d30...,미스터트롯2,-1.0,50


In [32]:
# List the distinct asset_prod values that occur among the trot rows.
trot_assets['asset_prod'].unique()

array([-1.])

In [18]:
# Count missing asset_prod values under the -1 code so they appear in the shares.
trot_assets['asset_prod'] = trot_assets['asset_prod'].fillna(-1)

# Share of each asset_prod value, largest first.
asset_share = trot_assets['asset_prod'].value_counts(normalize=True, sort=True)
asset_ratio = asset_share.reset_index().set_axis(['asset_prod', 'ratio'], axis=1)

# Optional: readable label for each asset_prod code.
ASSET_PROD_MAP = {0: "FOD", 1: "RVOD", 2: "SVOD", -1: "Unknown"}
asset_ratio = asset_ratio.assign(
    asset_prod_name=asset_ratio['asset_prod'].map(ASSET_PROD_MAP)
)

# Show the result.
asset_ratio


Unnamed: 0,asset_prod,ratio,asset_prod_name
0,-1.0,1.0,Unknown


In [None]:
# Age bands that make up the target segment.
target_age = [50, 60, 70]

# Take an explicit copy: later cells add feature columns to target_df, and
# assigning into a bare slice of log_mart triggers SettingWithCopyWarning.
target_df = log_mart[
    log_mart['AGE_GRP10'].isin(target_age)
].copy()

In [None]:
# Share of target-segment rows per genre, largest first.
genre_share = target_df['genre'].value_counts(normalize=True)
genre_ratio = genre_share.reset_index().set_axis(['genre', 'ratio'], axis=1)

# The ten genres with the largest share.
top10_genre = genre_ratio.iloc[:10]


In [None]:
import matplotlib.pyplot as plt

# Korean-capable font for the labels. 'Malgun Gothic' is a Windows font, so
# other platforms will need a different family here.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Bar chart of the ten largest genre shares in the target segment.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(top10_genre['genre'], top10_genre['ratio'])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('50~70대 시청 장르 비율 TOP 10')
fig.tight_layout()
plt.show()


In [None]:
# Total use_tms per age band, returned as a flat two-column frame.
age_use_tms = target_df.groupby('AGE_GRP10', as_index=False)['use_tms'].sum()


In [None]:
import matplotlib.pyplot as plt

# Bar chart of total viewing time per age band.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(age_use_tms['AGE_GRP10'], age_use_tms['use_tms'])
ax.set_title('50~70대 연령대별 시청시간(use_tms)')
ax.set_xlabel('Age Group')
ax.set_ylabel('Total Viewing Time')
fig.tight_layout()
plt.show()


In [None]:
# Target-segment rows whose genre is '연예오락'.
is_entertainment = target_df['genre'].eq('연예오락')
ent_df = target_df[is_entertainment]


In [None]:
# Row count per program within the '연예오락' rows, largest first.
program_counts = ent_df['super_asset_nm'].value_counts()
ent_top = program_counts.reset_index().set_axis(['program', 'count'], axis=1)

# Show the ten programs with the most rows.
ent_top.head(10)


In [None]:
# Genre focus: 1 when the row's program name contains any trot keyword.
# Substring matching is required here — an exact-equality test against the
# keyword list would miss titles such as '미스터트롯2' that merely contain a
# keyword. The mask is built from target_df itself so it covers exactly the
# rows it is assigned to.
genre_focus_pattern = '|'.join(trot_keywords)
target_df['genre_focus'] = (
    target_df['super_asset_nm']
    .astype(str)
    .str.contains(genre_focus_pattern, na=False)
    .astype(int)
)


In [None]:
# Number of distinct programs each viewer watched, broadcast back onto every
# row of that viewer. transform() keeps target_df's row index, so the
# assignment below aligns row by row; a per-viewer Series indexed by
# sha2_hash would not align with the frame's row index.
repeat_watch = (
    target_df
    .groupby('sha2_hash', observed=True)['super_asset_nm']
    .transform('nunique')
)

# 1 when the viewer watched more than 3 distinct programs.
# NOTE(review): the original comment described the rule as "3 or more", but
# the comparison is strictly greater than 3 — confirm which is intended.
target_df['repeat_watch'] = (repeat_watch > 3).astype(int)


In [None]:
# 1 when the row's use_tms exceeds the threshold.
# use_tms is a nullable Int64 column, so the comparison yields <NA> for
# missing values and casting <NA> to int raises; missing viewing time is
# therefore treated as "not high".
HIGH_WATCH_THRESHOLD = 100000  # same unit as use_tms
target_df['high_watch'] = (
    target_df['use_tms']
    .gt(HIGH_WATCH_THRESHOLD)
    .fillna(False)
    .astype(int)
)


In [None]:
# Spot-check the first rows after the feature columns were added.
target_df.head(4)

In [None]:
# A row is flagged as fandom only when all three behaviour flags equal 1.
fandom_flags = target_df[['genre_focus', 'repeat_watch', 'high_watch']]
target_df['is_fandom'] = fandom_flags.eq(1).all(axis=1).astype(int)


In [None]:
# Print the schema summary of the target-segment frame with its feature columns.
target_df.info()

In [None]:
# List the distinct values of is_churn before it is encoded numerically.
target_df['is_churn'].unique()

In [None]:
# Encode the cancellation status as a 0/1 churn flag:
#   '해지' (cancelled) -> 1, '유지' (retained) -> 0
# so that the mean of is_churn is a churn rate. Values outside the mapping
# become NaN.
# NOTE: this cell is not idempotent — re-running it on the already encoded
# column maps every value to NaN; rebuild target_df first.
CHURN_LABEL_TO_FLAG = {'해지': 1, '유지': 0}
target_df['is_churn'] = (
    target_df['is_churn']
    .astype(str)
    .map(CHURN_LABEL_TO_FLAG)
)

In [None]:

# Mean of is_churn for non-fandom (0) and fandom (1) rows.
fandom_churn = target_df['is_churn'].groupby(target_df['is_fandom']).mean()
print(fandom_churn)


In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [None]:
# Features fed to the clustering.
cluster_features = ['genre_focus', 'repeat_watch', 'high_watch', 'use_tms']

# Missing values count as 0; cast to float so the nullable Int64 use_tms
# column reaches scikit-learn as a plain numeric array.
X = target_df[cluster_features].fillna(0).astype(float)

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit 4 clusters with mini-batches and label every row.
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=100000, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Store each row's label on the frame: the summary cells that follow group
# by target_df['cluster'], which does not exist unless it is written here.
target_df['cluster'] = cluster_labels

In [None]:
# The silhouette coefficient is built from pairwise distances, which grows
# quadratically with the number of rows; score a fixed-seed random subsample
# instead of the whole matrix.
SILHOUETTE_SAMPLE_SIZE = 50000
score = silhouette_score(
    X_scaled,
    cluster_labels,
    sample_size=min(SILHOUETTE_SAMPLE_SIZE, len(X_scaled)),
    random_state=42,
)
print("Silhouette Score:", score)

In [None]:
# Per-cluster means. Expects target_df to carry a 'cluster' column holding
# each row's cluster label.
cluster_groups = target_df.groupby('cluster')
summary_columns = cluster_features + ['is_fandom', 'is_churn']
cluster_summary = cluster_groups[summary_columns].mean()
cluster_churn = cluster_groups['is_churn'].mean()

# Number of rows assigned to each cluster.
target_df['cluster'].value_counts()



In [None]:
import matplotlib.pyplot as plt

# Korean-capable font for the labels ('Malgun Gothic' is a Windows font).
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Row count per cluster label, ordered by label.
# NOTE(review): the chart labels say "customers", but this counts rows of
# target_df — confirm whether one row corresponds to one customer.
cluster_counts = target_df['cluster'].value_counts().sort_index()

# Bar chart of the counts.
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(cluster_counts.index.astype(str), cluster_counts.values, color='skyblue')
ax.set_title('군집별 고객 수')
ax.set_xlabel('Cluster')
ax.set_ylabel('고객 수')
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
plt.show()



In [None]:
# Per-cluster mean of every behaviour flag, viewing time and churn flag.
summary_columns = [
    'genre_focus',
    'repeat_watch',
    'high_watch',
    'is_fandom',
    'use_tms',
    'is_churn',
]
cluster_summary = target_df.groupby('cluster')[summary_columns].mean()
print(cluster_summary)


In [None]:
# Mean of is_churn within each cluster.
cluster_churn = target_df['is_churn'].groupby(target_df['cluster']).mean()
print(cluster_churn)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

# 1) Features used for clustering.
cluster_features = ['genre_focus', 'repeat_watch', 'high_watch', 'use_tms']

# 2) Work on a fixed-seed random sample of 50,000 rows, because the full
#    frame is too large; missing feature values count as 0.
sample_df = target_df.sample(n=50000, random_state=42)
X = sample_df[cluster_features].fillna(0)

# 3) Standardize each feature.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 4) Fit MiniBatchKMeans with 4 clusters and take the fitted labels.
# NOTE(review): batch_size (500,000) is larger than the 50,000-row sample.
batch_size = 500000
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=batch_size, random_state=42)
cluster_labels = kmeans.fit(X_scaled).labels_

# 5) Silhouette score of the sampled clustering.
score = silhouette_score(X_scaled, cluster_labels)
print("Silhouette Score (샘플링 5만, 배치 50만):", score)

# 6) Optionally keep the labels next to the sampled rows.
sample_df = sample_df.assign(cluster=cluster_labels)
