In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
file_path = 'C:/py_src/awake/data/'

In [3]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
# merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_fin_eda.csv', low_memory=False)
# youtube_videos = pd.read_csv(file_path + 'youtube_videos_eda.csv')
youtube_videos = dd.read_csv(file_path + 'youtube_videos_eda.csv')

## 계정 데이터 분석

In [4]:
# 불필요정보 제거 - y값 제거(다른 모델)
merge_df_users_fin = merge_df_users_fin.drop('y_label',axis=1)
## y값 : merge_df_users_fin['subscribers_count']

### 기간별 피처 생성

In [5]:
# 주별, 월별, 분기별 변수 생성

# 구독자 관련 변수
merge_df_users_fin['weekly_subscribers_gained'] = merge_df_users_fin['subscribers_gained'].rolling(window=7).sum()
merge_df_users_fin['monthly_subscribers_gained'] = merge_df_users_fin['subscribers_gained'].rolling(window=30).sum()
merge_df_users_fin['quarterly_subscribers_gained'] = merge_df_users_fin['subscribers_gained'].rolling(window=90).sum()

merge_df_users_fin['weekly_subscribers_lost'] = merge_df_users_fin['subscribers_lost'].rolling(window=7).sum()
merge_df_users_fin['monthly_subscribers_lost'] = merge_df_users_fin['subscribers_lost'].rolling(window=30).sum()
merge_df_users_fin['quarterly_subscribers_lost'] = merge_df_users_fin['subscribers_lost'].rolling(window=90).sum()

merge_df_users_fin['weekly_net_subscribers_change'] = merge_df_users_fin['weekly_subscribers_gained'] - merge_df_users_fin['weekly_subscribers_lost']
merge_df_users_fin['monthly_net_subscribers_change'] = merge_df_users_fin['monthly_subscribers_gained'] - merge_df_users_fin['monthly_subscribers_lost']
merge_df_users_fin['quarterly_net_subscribers_change'] = merge_df_users_fin['quarterly_subscribers_gained'] - merge_df_users_fin['quarterly_subscribers_lost']

# 수익 관련 변수
merge_df_users_fin['weekly_estimated_revenue'] = merge_df_users_fin['estimated_revenue'].rolling(window=7).sum()
merge_df_users_fin['monthly_estimated_revenue'] = merge_df_users_fin['estimated_revenue'].rolling(window=30).sum()
merge_df_users_fin['quarterly_estimated_revenue'] = merge_df_users_fin['estimated_revenue'].rolling(window=90).sum()

merge_df_users_fin['weekly_estimated_ad_revenue'] = merge_df_users_fin['estimated_ad_revenue'].rolling(window=7).sum()
merge_df_users_fin['monthly_estimated_ad_revenue'] = merge_df_users_fin['estimated_ad_revenue'].rolling(window=30).sum()
merge_df_users_fin['quarterly_estimated_ad_revenue'] = merge_df_users_fin['estimated_ad_revenue'].rolling(window=90).sum()

merge_df_users_fin['weekly_revenue_per_subscriber'] = merge_df_users_fin['weekly_estimated_revenue'] / (merge_df_users_fin['weekly_subscribers_gained'] + 1)
merge_df_users_fin['monthly_revenue_per_subscriber'] = merge_df_users_fin['monthly_estimated_revenue'] / (merge_df_users_fin['monthly_subscribers_gained'] + 1)
merge_df_users_fin['quarterly_revenue_per_subscriber'] = merge_df_users_fin['quarterly_estimated_revenue'] / (merge_df_users_fin['quarterly_subscribers_gained'] + 1)

# 시청 시간 관련 변수
merge_df_users_fin['weekly_avg_view_duration'] = merge_df_users_fin['averageViewDuration'].rolling(window=7).mean()
merge_df_users_fin['monthly_avg_view_duration'] = merge_df_users_fin['averageViewDuration'].rolling(window=30).mean()
merge_df_users_fin['quarterly_avg_view_duration'] = merge_df_users_fin['averageViewDuration'].rolling(window=90).mean()

merge_df_users_fin['weekly_total_view_time'] = merge_df_users_fin['estimatedMinutesWatched'].rolling(window=7).sum()
merge_df_users_fin['monthly_total_view_time'] = merge_df_users_fin['estimatedMinutesWatched'].rolling(window=30).sum()
merge_df_users_fin['quarterly_total_view_time'] = merge_df_users_fin['estimatedMinutesWatched'].rolling(window=90).sum()

merge_df_users_fin['weekly_view_time_per_user'] = merge_df_users_fin['weekly_total_view_time'] / (merge_df_users_fin['weekly_subscribers_gained'] + 1)
merge_df_users_fin['monthly_view_time_per_user'] = merge_df_users_fin['monthly_total_view_time'] / (merge_df_users_fin['monthly_subscribers_gained'] + 1)
merge_df_users_fin['quarterly_view_time_per_user'] = merge_df_users_fin['quarterly_total_view_time'] / (merge_df_users_fin['quarterly_subscribers_gained'] + 1)

# 광고 관련 변수
merge_df_users_fin['weekly_playback_rate'] = merge_df_users_fin['playback_based_cpm'].rolling(window=7).mean()
merge_df_users_fin['monthly_playback_rate'] = merge_df_users_fin['playback_based_cpm'].rolling(window=30).mean()
merge_df_users_fin['quarterly_playback_rate'] = merge_df_users_fin['playback_based_cpm'].rolling(window=90).mean()

merge_df_users_fin['weekly_ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'] / (merge_df_users_fin['weekly_total_view_time'] + 1)
merge_df_users_fin['monthly_ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'] / (merge_df_users_fin['monthly_total_view_time'] + 1)
merge_df_users_fin['quarterly_ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'] / (merge_df_users_fin['quarterly_total_view_time'] + 1)

merge_df_users_fin['weekly_revenue_per_ad_impression'] = merge_df_users_fin['weekly_estimated_ad_revenue'] / (merge_df_users_fin['ad_impressions'].rolling(window=7).sum() + 1)
merge_df_users_fin['monthly_revenue_per_ad_impression'] = merge_df_users_fin['monthly_estimated_ad_revenue'] / (merge_df_users_fin['ad_impressions'].rolling(window=30).sum() + 1)
merge_df_users_fin['quarterly_revenue_per_ad_impression'] = merge_df_users_fin['quarterly_estimated_ad_revenue'] / (merge_df_users_fin['ad_impressions'].rolling(window=90).sum() + 1)

In [6]:
# null값 대체
merge_df_users_fin = merge_df_users_fin.fillna(0) ## NaN
merge_df_users_fin = merge_df_users_fin.replace([np.inf, -np.inf], 0) ## inf

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [7]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:].drop('subscribers_count') ## y값 제거

In [8]:
# 데이터 분할
train_data = merge_df_users_fin[merge_df_users_fin['date'] <= '2024-02-11']
test_data = merge_df_users_fin[merge_df_users_fin['date'] > '2024-02-11']

In [9]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(84473, 124)
(21210, 124)


### 변수선택

상관분석

In [11]:
# 상관계수 절대값이 0.3 이상
corr_df = train_data[['subscribers_count'] + list(x_col)].corr()
selected_features_by_corr = list(corr_df[corr_df['subscribers_count'] >= 0.3].iloc[:,0].keys()) + list(corr_df[corr_df['subscribers_count'] <= -0.3].iloc[:,0].keys())
selected_features_by_corr.remove('subscribers_count')
print(selected_features_by_corr)
print(len(selected_features_by_corr))

['redViews', 'SUBSCRIBED', 'estimatedMinutesWatched', 'estimated_revenue', 'estimated_ad_revenue', 'estimated_red_partner_revenue', 'gross_revenue', 'cpm', 'subscribers_lost', 'monetized_playbacks', 'revenue_per_ad_impression', 'watched_time_rate', 'gross_revenue_per_ad_impression', 'playback_rate', 'monthly_subscribers_gained', 'quarterly_subscribers_gained', 'weekly_subscribers_lost', 'monthly_subscribers_lost', 'quarterly_subscribers_lost', 'weekly_estimated_revenue', 'monthly_estimated_revenue', 'quarterly_estimated_revenue', 'weekly_estimated_ad_revenue', 'monthly_estimated_ad_revenue', 'quarterly_estimated_ad_revenue', 'weekly_total_view_time', 'monthly_total_view_time', 'quarterly_total_view_time', 'quarterly_revenue_per_ad_impression', 'unplayback_rate']
30


RandomForest

In [12]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = rf_model.feature_importances_
feature_importance_rf = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_rf = list(feature_importance_rf[feature_importance_rf >= 0.001].keys())

# 중요한 변수 출력
print(f"랜덤 포레스트로 선정된 변수: \n{selected_features_by_rf}")

랜덤 포레스트로 선정된 변수: 
['weekly_subscribers_lost', 'quarterly_estimated_ad_revenue', 'monthly_revenue_per_ad_impression', 'subscriber_decrease_rate', 'subscribers_lost', 'playback_rate', 'weekly_revenue_per_ad_impression', 'weekly_net_subscribers_change', 'unplayback_rate', 'quarterly_subscribers_lost', 'monthly_net_subscribers_change', 'age18-24.female', 'age13-17.female', 'quarterly_net_subscribers_change', 'cpm', 'quarterly_revenue_per_subscriber', 'age25-34.female', 'monthly_subscribers_lost', 'redViews', 'quarterly_avg_view_duration', 'quarterly_subscribers_gained', 'quarterly_estimated_revenue', 'SUBSCRIBED', 'gross_revenue_per_ad_impression', 'UNSUBSCRIBED', 'likes', 'quarterly_total_view_time', 'subscriber_increase_rate', 'weekly_estimated_ad_revenue', 'quarterly_revenue_per_ad_impression', 'monthly_subscribers_gained', 'weekly_estimated_revenue', 'subscriber_view_time_rate', 'monthly_total_view_time', 'revenue_per_subscriber', 'age25-34.male', 'unsubscribed_view_time_rate', 'estima

Gradient Boosting

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting 모델 학습
gbm_model = GradientBoostingRegressor(random_state=42)
gbm_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = gbm_model.feature_importances_
feature_importance_gbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_gbm = list(feature_importance_gbm[feature_importance_gbm >= 0.001].keys())

# 중요한 변수 출력
print(f"Gradient Boosting으로 선정된 변수: \n{selected_features_by_gbm}")

Gradient Boosting으로 선정된 변수: 
['weekly_subscribers_lost', 'subscribers_lost', 'subscriber_decrease_rate', 'quarterly_estimated_ad_revenue', 'quarterly_estimated_revenue', 'gross_revenue_per_ad_impression', 'quarterly_net_subscribers_change', 'monthly_subscribers_lost', 'SUBSCRIBED', 'monthly_estimated_ad_revenue', 'cpm', 'monthly_net_subscribers_change', 'weekly_net_subscribers_change', 'age13-17.female', 'weekly_estimated_revenue', 'revenue_per_subscriber', 'quarterly_subscribers_lost', 'subscriber_increase_rate', 'age25-34.female', 'playback_rate', 'monthly_ad_revenue_rate', 'quarterly_subscribers_gained', 'likes', 'quarterly_total_view_time', 'quarterly_playback_rate', 'monthly_subscribers_gained', 'estimated_ad_revenue', 'subscriber_view_time_rate', 'quarterly_avg_view_duration', 'monthly_total_view_time', 'unplayback_rate', 'quarterly_ad_revenue_rate', 'age18-24.female']


LightGBM

In [14]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(f"LightGBM으로 선정된 변수: \n{selected_features_by_lgbm}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28461
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 112
[LightGBM] [Info] Start training from score 65433.322872
LightGBM으로 선정된 변수: 
['subscriber_decrease_rate', 'subscribers_lost', 'weekly_subscribers_lost', 'subscriber_increase_rate', 'subscriber_view_time_rate', 'quarterly_net_subscribers_change', 'weekly_net_subscribers_change', 'monthly_net_subscribers_change', 'quarterly_playback_rate', 'quarterly_avg_view_duration', 'monthly_subscribers_lost', 'quarterly_subscribers_lost', 'age18-24.female', 'SUBSCRIBED', 'quarterly_total_view_time', 'quarterly_estimated_revenue', 'quarterly_ad_revenue_rate', 'revenue_per_subscriber', 'quarterly_estimated_ad_revenue', 'avg_view_duration_rate', 'quarterly_revenue_per_ad_impression', 'quarterly_view_time_per_user', '

XGBoost

In [15]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

XGBoost로 선정된 변수: 
['quarterly_estimated_revenue', 'quarterly_estimated_ad_revenue', 'weekly_subscribers_lost', 'quarterly_net_subscribers_change', 'weekly_estimated_ad_revenue', 'quarterly_subscribers_lost', 'weekly_net_subscribers_change', 'quarterly_total_view_time', 'playback_rate', 'subscribers_lost', 'unplayback_rate', 'age18-24.female', 'subscriber_decrease_rate', 'SUBSCRIBED', 'estimated_revenue', 'age25-34.female', 'cpm', 'quarterly_revenue_per_ad_impression', 'revenue_per_unsubscribed_view', 'monthly_net_subscribers_change', 'quarterly_playback_rate', 'subscribed_view_time_rate', 'subscriber_increase_rate', 'quarterly_avg_view_duration', 'male_viewer_rate', 'revenue_per_red_view', 'monthly_playback_rate', 'weekly_estimated_revenue', 'quarterly_revenue_per_subscriber', 'subscriber_retention_rate', 'estimated_red_partner_revenue', 'views', 'unsubscribed_view_time_rate']


In [16]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': x_col,
    'rf_importance': rf_model.feature_importances_,
    'gbm_importance': gbm_model.feature_importances_,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [17]:
# 최종 변수 선택
final_selected_features = list(set(list(importances_df[importances_df['mean_importance'] >= 0.004]['features']) + selected_features_by_corr))

In [21]:
# 최종 모델링 데이터셋
merge_df_users_final = merge_df_users_fin[list(unique_col) + final_selected_features + ['subscribers_count']]

In [22]:
# merge_df_users_final.to_csv('C:/py_src/awake/data/merge_df_users_final2.csv', encoding='utf-8-sig', index=False)

## 콘텐츠 데이터 분석

In [7]:
youtube_videos.columns

Index(['youtube_user_id', 'video', 'end_date', 'views', 'redViews', 'comments',
       'likes', 'dislikes', 'shares', 'estimatedMinutesWatched',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'averageViewPercentage', 'videosAddedToPlaylists',
       'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue',
       'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm',
       'subscribersGained', 'subscribersLost', 'monetizedPlaybacks',
       'adImpressions', 'cardClickRate', 'cardTeaserClickRate',
       'cardImpressions', 'cardTeaserImpressions', 'cardClicks',
       'cardTeaserClicks', 'like_rate', 'comment_rate', 'share_rate',
       'dislike_rate', 'total_engage_rate', 'positive_engage_rate',
       'comment_to_like_rate', 'like_to_dislike_ratio',
       'net_subscribers_change', 'subscribers_conversion_rate',
       'subscribers_gained_per_card_click',
       'subscribers_gained_per_playlist_add',
       'card_click_to_subscriber

In [8]:
# 불필요정보 제거 - y값 제거(다른 모델)
youtube_videos = youtube_videos.drop('y_label',axis=1)
## y값 : youtube_videos['subscribers_count']

### 기간별 피처 생성

In [12]:
# 주별, 월별, 분기별 변수 생성

# 조회수 및 시청 시간 관련
youtube_videos['weekly_views'] = youtube_videos['views'].rolling(window=7).sum()
youtube_videos['monthly_views'] = youtube_videos['views'].rolling(window=30).sum()
youtube_videos['quarterly_views'] = youtube_videos['views'].rolling(window=90).sum()

youtube_videos['weekly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=7).sum()
youtube_videos['monthly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=30).sum()
youtube_videos['quarterly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=90).sum()

youtube_videos['weekly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=7).mean()
youtube_videos['monthly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=30).mean()
youtube_videos['quarterly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=90).mean()

youtube_videos['weekly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=7).mean()
youtube_videos['monthly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=30).mean()
youtube_videos['quarterly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=90).mean()


# 참여도 관련
youtube_videos['weekly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=7).sum()
youtube_videos['monthly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=30).sum()
youtube_videos['quarterly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=90).sum()

youtube_videos['weekly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=7).sum()
youtube_videos['monthly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=30).sum()
youtube_videos['quarterly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=90).sum()

# 수익 및 광고 관련
youtube_videos['weekly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=7).sum()
youtube_videos['monthly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=30).sum()
youtube_videos['quarterly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=90).sum()

youtube_videos['weekly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=7).sum()
youtube_videos['monthly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=30).sum()
youtube_videos['quarterly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=90).sum()

youtube_videos['weekly_revenue_per_ad_impression'] = youtube_videos['weekly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=7).sum() + 1)
youtube_videos['monthly_revenue_per_ad_impression'] = youtube_videos['monthly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=30).sum() + 1)
youtube_videos['quarterly_revenue_per_ad_impression'] = youtube_videos['quarterly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=90).sum() + 1)

# 카드 및 티저 관련
youtube_videos['weekly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=7).mean()
youtube_videos['monthly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=30).mean()
youtube_videos['quarterly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=90).mean()

youtube_videos['weekly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=7).mean()
youtube_videos['monthly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=30).mean()
youtube_videos['quarterly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=90).mean()

youtube_videos['weekly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=7).sum()
youtube_videos['monthly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=30).sum()
youtube_videos['quarterly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=90).sum()

youtube_videos['weekly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=7).sum()
youtube_videos['monthly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=30).sum()
youtube_videos['quarterly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=90).sum()

# 참여도 관련
youtube_videos['weekly_total_engagement'] = (youtube_videos['likes'].rolling(window=7).sum() +
                                 youtube_videos['dislikes'].rolling(window=7).sum() +
                                 youtube_videos['comments'].rolling(window=7).sum() +
                                 youtube_videos['shares'].rolling(window=7).sum())

youtube_videos['monthly_total_engagement'] = (youtube_videos['likes'].rolling(window=30).sum() +
                                  youtube_videos['dislikes'].rolling(window=30).sum() +
                                  youtube_videos['comments'].rolling(window=30).sum() +
                                  youtube_videos['shares'].rolling(window=30).sum())

youtube_videos['quarterly_total_engagement'] = (youtube_videos['likes'].rolling(window=90).sum() +
                                    youtube_videos['dislikes'].rolling(window=90).sum() +
                                    youtube_videos['comments'].rolling(window=90).sum() +
                                    youtube_videos['shares'].rolling(window=90).sum())

youtube_videos['weekly_engagement_rate'] = youtube_videos['weekly_total_engagement'] / (youtube_videos['weekly_views'] + 1)
youtube_videos['monthly_engagement_rate'] = youtube_videos['monthly_total_engagement'] / (youtube_videos['monthly_views'] + 1)
youtube_videos['quarterly_engagement_rate'] = youtube_videos['quarterly_total_engagement'] / (youtube_videos['quarterly_views'] + 1)

youtube_videos['weekly_playlist_change_rate'] = (youtube_videos['weekly_videos_added'] - youtube_videos['weekly_videos_removed']) / (youtube_videos['weekly_videos_added'] + 1)
youtube_videos['monthly_playlist_change_rate'] = (youtube_videos['monthly_videos_added'] - youtube_videos['monthly_videos_removed']) / (youtube_videos['monthly_videos_added'] + 1)
youtube_videos['quarterly_playlist_change_rate'] = (youtube_videos['quarterly_videos_added'] - youtube_videos['quarterly_videos_removed']) / (youtube_videos['quarterly_videos_added'] + 1)

In [13]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [18]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:].drop('net_subscribers_change') ## y값 제거

In [21]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [24]:
len(test_data)

In [22]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(Delayed('int-3a56412b-1e5c-455b-81ab-88ecae1c65ed'), 131)
(Delayed('int-31826ee8-6b78-4632-afb2-66ff59fbe08b'), 131)


### 변수선택

상관분석

In [None]:
# 상관계수 절대값이 0.3 이상
corr_df = train_data[['net_subscribers_change'] + list(x_col)].corr()
selected_features_by_corr = list(corr_df[corr_df['net_subscribers_change'] >= 0.3].iloc[:,0].keys()) + list(corr_df[corr_df['net_subscribers_change'] <= -0.3].iloc[:,0].keys())
selected_features_by_corr.remove('net_subscribers_change')
print(selected_features_by_corr)
print(len(selected_features_by_corr))

['redViews', 'SUBSCRIBED', 'estimatedMinutesWatched', 'estimated_revenue', 'estimated_ad_revenue', 'estimated_red_partner_revenue', 'gross_revenue', 'cpm', 'subscribers_lost', 'monetized_playbacks', 'revenue_per_ad_impression', 'watched_time_rate', 'gross_revenue_per_ad_impression', 'playback_rate', 'monthly_subscribers_gained', 'quarterly_subscribers_gained', 'weekly_subscribers_lost', 'monthly_subscribers_lost', 'quarterly_subscribers_lost', 'weekly_estimated_revenue', 'monthly_estimated_revenue', 'quarterly_estimated_revenue', 'weekly_estimated_ad_revenue', 'monthly_estimated_ad_revenue', 'quarterly_estimated_ad_revenue', 'weekly_total_view_time', 'monthly_total_view_time', 'quarterly_total_view_time', 'quarterly_revenue_per_ad_impression', 'unplayback_rate']
30


RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = rf_model.feature_importances_
feature_importance_rf = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_rf = list(feature_importance_rf[feature_importance_rf >= 0.001].keys())

# 중요한 변수 출력
print(f"랜덤 포레스트로 선정된 변수: \n{selected_features_by_rf}")

랜덤 포레스트로 선정된 변수: 
['weekly_subscribers_lost', 'quarterly_estimated_ad_revenue', 'monthly_revenue_per_ad_impression', 'subscriber_decrease_rate', 'subscribers_lost', 'playback_rate', 'weekly_revenue_per_ad_impression', 'weekly_net_subscribers_change', 'unplayback_rate', 'quarterly_subscribers_lost', 'monthly_net_subscribers_change', 'age18-24.female', 'age13-17.female', 'quarterly_net_subscribers_change', 'cpm', 'quarterly_revenue_per_subscriber', 'age25-34.female', 'monthly_subscribers_lost', 'redViews', 'quarterly_avg_view_duration', 'quarterly_subscribers_gained', 'quarterly_estimated_revenue', 'SUBSCRIBED', 'gross_revenue_per_ad_impression', 'UNSUBSCRIBED', 'likes', 'quarterly_total_view_time', 'subscriber_increase_rate', 'weekly_estimated_ad_revenue', 'quarterly_revenue_per_ad_impression', 'monthly_subscribers_gained', 'weekly_estimated_revenue', 'subscriber_view_time_rate', 'monthly_total_view_time', 'revenue_per_subscriber', 'age25-34.male', 'unsubscribed_view_time_rate', 'estima

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting 모델 학습
gbm_model = GradientBoostingRegressor(random_state=42)
gbm_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = gbm_model.feature_importances_
feature_importance_gbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_gbm = list(feature_importance_gbm[feature_importance_gbm >= 0.001].keys())

# 중요한 변수 출력
print(f"Gradient Boosting으로 선정된 변수: \n{selected_features_by_gbm}")

Gradient Boosting으로 선정된 변수: 
['weekly_subscribers_lost', 'subscribers_lost', 'subscriber_decrease_rate', 'quarterly_estimated_ad_revenue', 'quarterly_estimated_revenue', 'gross_revenue_per_ad_impression', 'quarterly_net_subscribers_change', 'monthly_subscribers_lost', 'SUBSCRIBED', 'monthly_estimated_ad_revenue', 'cpm', 'monthly_net_subscribers_change', 'weekly_net_subscribers_change', 'age13-17.female', 'weekly_estimated_revenue', 'revenue_per_subscriber', 'quarterly_subscribers_lost', 'subscriber_increase_rate', 'age25-34.female', 'playback_rate', 'monthly_ad_revenue_rate', 'quarterly_subscribers_gained', 'likes', 'quarterly_total_view_time', 'quarterly_playback_rate', 'monthly_subscribers_gained', 'estimated_ad_revenue', 'subscriber_view_time_rate', 'quarterly_avg_view_duration', 'monthly_total_view_time', 'unplayback_rate', 'quarterly_ad_revenue_rate', 'age18-24.female']


LightGBM

In [None]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(f"LightGBM으로 선정된 변수: \n{selected_features_by_lgbm}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28461
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 112
[LightGBM] [Info] Start training from score 65433.322872
LightGBM으로 선정된 변수: 
['subscriber_decrease_rate', 'subscribers_lost', 'weekly_subscribers_lost', 'subscriber_increase_rate', 'subscriber_view_time_rate', 'quarterly_net_subscribers_change', 'weekly_net_subscribers_change', 'monthly_net_subscribers_change', 'quarterly_playback_rate', 'quarterly_avg_view_duration', 'monthly_subscribers_lost', 'quarterly_subscribers_lost', 'age18-24.female', 'SUBSCRIBED', 'quarterly_total_view_time', 'quarterly_estimated_revenue', 'quarterly_ad_revenue_rate', 'revenue_per_subscriber', 'quarterly_estimated_ad_revenue', 'avg_view_duration_rate', 'quarterly_revenue_per_ad_impression', 'quarterly_view_time_per_user', '

XGBoost

In [None]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[x_col], train_data['subscribers_count'])

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

XGBoost로 선정된 변수: 
['quarterly_estimated_revenue', 'quarterly_estimated_ad_revenue', 'weekly_subscribers_lost', 'quarterly_net_subscribers_change', 'weekly_estimated_ad_revenue', 'quarterly_subscribers_lost', 'weekly_net_subscribers_change', 'quarterly_total_view_time', 'playback_rate', 'subscribers_lost', 'unplayback_rate', 'age18-24.female', 'subscriber_decrease_rate', 'SUBSCRIBED', 'estimated_revenue', 'age25-34.female', 'cpm', 'quarterly_revenue_per_ad_impression', 'revenue_per_unsubscribed_view', 'monthly_net_subscribers_change', 'quarterly_playback_rate', 'subscribed_view_time_rate', 'subscriber_increase_rate', 'quarterly_avg_view_duration', 'male_viewer_rate', 'revenue_per_red_view', 'monthly_playback_rate', 'weekly_estimated_revenue', 'quarterly_revenue_per_subscriber', 'subscriber_retention_rate', 'estimated_red_partner_revenue', 'views', 'unsubscribed_view_time_rate']


In [None]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': x_col,
    'rf_importance': rf_model.feature_importances_,
    'gbm_importance': gbm_model.feature_importances_,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [None]:
# 최종 변수 선택
final_selected_features = list(set(list(importances_df[importances_df['mean_importance'] >= 0.004]['features']) + selected_features_by_corr))

In [None]:
# 최종 모델링 데이터셋
merge_df_users_final = merge_df_users_fin[list(unique_col) + final_selected_features + ['subscribers_count']]

In [None]:
# merge_df_users_final.to_csv('C:/py_src/awake/data/merge_df_users_final2.csv', encoding='utf-8-sig', index=False)

In [100]:
youtube_videos_final = youtube_videos[list(unique_col) + final_selected_features + ['y_label']]

In [101]:
# youtube_videos_final.to_csv('C:/py_src/awake/data/youtube_videos_final.csv', encoding='utf-8-sig', index=False)