In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import os
from dotenv import load_dotenv

In [2]:
file_path = 'C:/py_src/awake/data/'

In [3]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_fin.csv', low_memory=False)
# youtube_videos = pd.read_csv(file_path + 'youtube_videos.csv')

## 계정 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [7]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:].drop('estimated_ad_revenue') ## y값 제거

In [8]:
# 데이터 분할
train_data = merge_df_users_fin[merge_df_users_fin['date'] <= '2024-02-11']
test_data = merge_df_users_fin[merge_df_users_fin['date'] > '2024-02-11']

In [9]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(84473, 57)
(21210, 57)


### 모델 기법 적용

In [10]:
# 각 모델 정의
rf_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    'RandomForest': rf_model,
    'GBM': gbm_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

In [9]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

for model in models:
    # 모델 정의 및 학습
    models[model].fit(train_data[x_col], train_data['estimated_ad_revenue'])

    # 예측
    y_pred = models[model].predict(test_data[x_col])

    # MSE 계산
    mse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred)

    # RMSE 계산
    rmse = np.sqrt(mse)

    # R² 값 계산
    r2 = r2_score(test_data['estimated_ad_revenue'], y_pred)

    # Adjusted R² 계산
    n = len(test_data)  # 샘플 수
    p = test_data.shape[1]  # 독립 변수(특성) 수
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    # 결과 출력
    print(model)
    print(f"R² 값: {r2:.4f}")
    print(f"Adjusted R² 값: {adjusted_r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

RandomForest
R² 값: 0.9936
Adjusted R² 값: 0.9936
MSE: 47091405.1252
RMSE: 6862.3178
----------------------------------------

GBM
R² 값: 0.9974
Adjusted R² 값: 0.9974
MSE: 18992570.5474
RMSE: 4358.0466
----------------------------------------

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11454
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 45
[LightGBM] [Info] Start training from score 14667.681766
LightGBM
R² 값: 0.9223
Adjusted R² 값: 0.9221
MSE: 571114131.6523
RMSE: 23897.9943
----------------------------------------

XGBoost
R² 값: 0.9663
Adjusted R² 값: 0.9662
MSE: 248007649.8104
RMSE: 15748.2586
----------------------------------------



In [92]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

스태킹

In [14]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# 스태킹 모델 정의
stacking_model = StackingRegressor(
    estimators=[
    ('random_forest', models['RandomForest']),
    ('gbm', models['GBM']),
    ('lightgbm', models['LightGBM']),
    ('xgboost', models['XGBoost'])
    ],
    final_estimator=Ridge()
)

# 스태킹 모델 학습
stacking_model.fit(train_data[x_col], train_data['estimated_ad_revenue'])

# 예측
y_pred = stacking_model.predict(test_data[x_col])

# 성능 평가
# R² 값 계산
r2 = r2_score(test_data['estimated_ad_revenue'], y_pred)

# Adjusted R² 계산
n = len(test_data)  # 샘플 수
p = test_data.shape[1]  # 독립 변수(특성) 수
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# MSE 계산
mse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred)

# RMSE 계산
rmse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred, squared=False)

print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
print(f"스태킹 앙상블 모델 Adjusted R2: {adjusted_r2:.4f}")
print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11454
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 45
[LightGBM] [Info] Start training from score 14667.681766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11453
[LightGBM] [Info] Number of data points in the train set: 67578, number of used features: 45
[LightGBM] [Info] Start training from score 17638.612776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11455
[LightGBM] [Info] Number of data points in the train set: 67578, number of used features: 45
[LightGBM] [Info



### 실제데이터 결과 확인

In [16]:
# 예측 결과 확인
y_pred = stacking_model.predict(merge_df_users_fin[x_col])
merge_df_users_fin['predict'] = y_pred

In [61]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_df = merge_df_users_fin[['youtube_user_id','date','channel_title','estimated_ad_revenue','predict']]

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값
result_df['date'] = pd.to_datetime(result_df['date']) 

result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
result_df['3_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-90).rolling(window=90).sum()
result_df['6_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-180).rolling(window=180).sum()
result_df['12_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-365).rolling(window=365).sum()

# 계정별 기간에 따른 광고 수익 예측값 비교
# result_df_final = result_df.groupby(['youtube_user_id'])[['estimated_ad_revenue','predict','1_month_future_predict','3_month_future_predict','6_month_future_predict','12_month_future_predict']].sum().reset_index()

# 계정별로 최종 평균값을 계산
result_df_final = result_df.groupby('youtube_user_id').agg({
    'estimated_ad_revenue': 'mean', ## 1일 평균 광고수익
    'predict': 'mean', ## 1일 평균 광고수익 예측
    '1_month_future_predict': 'mean',
    '3_month_future_predict': 'mean',
    '6_month_future_predict': 'mean',
    '12_month_future_predict': 'mean'
}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['date'] = pd.to_datetime(result_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['3_month_future_pre

In [162]:
result_df[result_df['youtube_user_id'] == '63d2239450eb530dfd137d1e']

Unnamed: 0,youtube_user_id,date,channel_title,estimated_ad_revenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
43071,63d2239450eb530dfd137d1e,2023-03-26,이고 EGO,297.806,44.758478,,,,
43072,63d2239450eb530dfd137d1e,2023-03-27,이고 EGO,252.163,-71.941941,,,,
43073,63d2239450eb530dfd137d1e,2023-03-28,이고 EGO,236.273,12.215801,,,,
43074,63d2239450eb530dfd137d1e,2023-03-29,이고 EGO,269.039,49.785959,,,,
43075,63d2239450eb530dfd137d1e,2023-03-30,이고 EGO,503.468,128.037987,,,,
...,...,...,...,...,...,...,...,...,...
43471,63d2239450eb530dfd137d1e,2024-04-29,이고 EGO,104.513,-133.691835,,,,
43472,63d2239450eb530dfd137d1e,2024-04-30,이고 EGO,80.377,-143.835546,,,,
43473,63d2239450eb530dfd137d1e,2024-05-01,이고 EGO,28.881,-160.208379,,,,
43474,63d2239450eb530dfd137d1e,2024-05-02,이고 EGO,53.548,-153.921827,,,,


In [148]:
result_df_final[result_df_final['predict'] > 0].sort_values('predict')

Unnamed: 0,youtube_user_id,estimated_ad_revenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
48,62b718bc507271632b8f0ce4,339.897065,15.990677,4.692295e+02,-2.621279e+02,-3.614523e+03,
201,64bce3c5616bd20e3037a1cf,323.219489,23.331282,-1.257770e+02,-6.636114e+03,-1.884004e+04,
98,63d2239450eb530dfd137d1e,291.510282,37.633269,8.248158e+02,1.640833e+03,-5.892702e+03,
76,63315973ef33d840a0999698,367.495198,47.274911,2.638626e+03,1.863610e+04,3.914999e+04,
151,6417c62789085e280d0e410b,350.699542,47.398836,-8.481259e+02,-1.499391e+04,-3.482298e+04,
...,...,...,...,...,...,...,...
142,640339ac118c0f5858818694,157229.117648,159944.491440,4.721605e+06,1.344499e+07,2.493102e+07,
57,62d11f080b4c4c7502a5be3d,326569.531929,332969.983834,9.928727e+06,3.084473e+07,6.230265e+07,
84,639bb8dcd603b8138e33780b,342046.202626,348007.035268,1.123271e+07,3.696964e+07,1.215998e+08,
108,63eb4f87ee122e631992279f,343644.047387,350449.210011,9.967546e+06,3.010190e+07,5.184346e+07,


In [65]:
result_df[result_df['youtube_user_id'].isin(result_df_final[result_df_final['predict'] < 0]['youtube_user_id'])]['channel_title'].unique()

array(['고도람 Go!doram', '0', '세남자 물고기', '름쿠 ᴘʟᴀʏʟɪꜱᴛ', '달고캠핑',
       '고군 Gohgoon', '루깬미', '임삐나', '바른걸음연구소', '임퓨의 비트메이킹 클래스',
       '혜성네일_comet', '마파TV', '담순언니 Twins Vlog', '유익한 균튜버', '1분뉴스',
       'the sence', 'MINLEE 민리', '성한준', '다먹어라이언',
       '하부유튜브 Minor / (Lower) YouTube', 'sa lly', '수집의 수집', '도아이 Doh-I',
       'OBL - 온라인 농부, 사자가 되다', '서유 SEOYU DANCE', 'ORlGN 오리진', '윈플즈TV',
       'DDONIE 또니 / 러브크레센트', '슈로시안 SUROSIAN', '김밈서', '드론브이로그 DroneVlog',
       'gahyun 가현', 'Mein 미인', '김두부', '은는이가', '인썸니아TV', '한나임한나Hannaim',
       '수란쿤', '기자 황덕현 KIJA HWANG', '콜로니', '어웨이커 | 크리에이터 이코노미',
       'GMENCY 멘시의 마인크래프트', '로컬필름 LOCAL FILM', '꾸앤끄', '키키낙낙',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '탬니몰리', 'WORKS.D PLAYLIST',
       '강포동하우스', '태다린tae_darin', '홈바부부_HOMBA BOOBOO', 'TJ 영상채널',
       'MORE김모어', '청도시네마', '두꼽이Challenge', '여행윤Tripyun', '핸슥슥',
       '연디아 채널 Yeondia Channel', 'D_tail_디테', '김희영', '김우다', '닷츠 DOTS',
       '약사 이진수💊', '모리녀', '자수의숲jasooforest', 'SBM&E Official',
       'Yeren

In [115]:
# 상위, 하위 20개 계정
top_user_id_20_6_month = result_df_final[result_df_final['6_month_future_predict'] > 0].sort_values(['6_month_future_predict'],ascending=False)['youtube_user_id'].iloc[:20]
bottom_user_id_20_6_month = result_df_final[result_df_final['6_month_future_predict'] > 0].sort_values(['6_month_future_predict'],ascending=False)['youtube_user_id'].iloc[-20:]

In [112]:
print(len(result_df[result_df['youtube_user_id'].isin(top_user_id_20_6_month)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(top_user_id_20_6_month)]['channel_title'].unique())

20
['abbapraise 아바프레이즈' '0' 'Jeffreyxking' '팀브라더스' 'kiu기우쌤'
 '수빙수tv sooBingsoo' '너굴몬' '미니멀영어 Minimal English' '日本ジヌ【니혼지누】ー韓国に関する全て'
 '뻘짓연구소' '정가거부' '북토크' '집구석구석꿀팁, 집꿀' '나연이즈백 LPGA Na Yeon Choi' '유네린NERIN'
 '뷰드름 유튜버 인씨' '빅민 GAME' 'OSSC' '이현우의 MLBTV' '하원장 강동현']


In [113]:
print(len(result_df[result_df['youtube_user_id'].isin(bottom_user_id_20_6_month)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(bottom_user_id_20_6_month)]['channel_title'].unique())

20
['띠혜 ddihye' '0' '소리미의 신화방송' '나는 불독' 'assesta' '배우GO' '굥플레이스 맛집투어'
 '이숲soop' '원의 독백' '채림처럼firstcherry' '모하지연 MOHAJIYEON' '오토컨테이너 스튜디오'
 '부반TV_부에 반하다' '미디하는남자' '이고 EGO' '맛집남자 foodman' '-mentalholder 멘탈홀더 tv'
 '너드 슬로리 SloLee' '캠핑 릴리아빠' '여리여리YeoriYeori']


In [117]:
len(set(top_user_id_20_6_month) & set(top_user_id_20_6_month) & set(top_user_id_20_6_month))

16

## 콘텐츠 데이터 분석

In [26]:
# 최종 콘텐츠 분석 데이터셋
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

In [27]:
# 잘못된값 처리
youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                               youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                               youtube_videos['estimatedRevenue'])

In [28]:
# 버그로 사용된 수치값 대체
youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

### 중요지표확인

In [29]:
youtube_videos.columns

Index(['youtube_user_id', 'video', 'end_date', 'views', 'redViews', 'comments',
       'likes', 'dislikes', 'shares', 'estimatedMinutesWatched',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'averageViewPercentage', 'videosAddedToPlaylists',
       'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue',
       'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm',
       'subscribersGained', 'subscribersLost', 'monetizedPlaybacks',
       'adImpressions', 'cardClickRate', 'cardTeaserClickRate',
       'cardImpressions', 'cardTeaserImpressions', 'cardClicks',
       'cardTeaserClicks'],
      dtype='object')

In [30]:
# y값 설정
# youtube_videos['estimatedRevenue']
# youtube_videos['net_subscribers_change']
# youtube_videos['engage_rate']
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']
youtube_videos['engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares']) / youtube_videos['views']

In [31]:
# 조회수당 수익
youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views']

# 구독자당 수익
youtube_videos['revenue_per_Subscriber'] = youtube_videos['estimatedRevenue'] / youtube_videos['subscribersGained']

# YouTube Premium 수익
youtube_videos['estimatedRedPartnerRevenue']

# 수익 다변화 비율
youtube_videos['revenue_diversification_ratio'] = (youtube_videos['grossRevenue'] - youtube_videos['estimatedRevenue']) / youtube_videos['estimatedRevenue']

# 구독자 증가율
youtube_videos['subscriber_growth_rate'] = youtube_videos['subscribersGained'] / (youtube_videos['subscribersGained'] + youtube_videos['subscribersLost'])

# 구독자 감소율
youtube_videos['subscriber_loss_rate'] = youtube_videos['subscribersLost'] / (youtube_videos['subscribersGained'] + youtube_videos['subscribersLost'])

# 구독자 유지율
youtube_videos['subscriber_retention_rate'] = (youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']) / youtube_videos['subscribersGained']

# 콘텐츠당 구독자 증가율
youtube_videos['subscriber_gain_per_content'] = youtube_videos['subscribersGained']/ youtube_videos['videosAddedToPlaylists']

# 구독자당 시청 시간
youtube_videos['watch_time_per_subscriber'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['subscribersGained']

# 광고재생률
youtube_videos['ad_playback_rate'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['adImpressions']

상관분석

In [32]:
corr_df = youtube_videos[youtube_videos.columns[3:]].corr()
corr_df[['estimatedRevenue','net_subscribers_change','engage_rate']]

Unnamed: 0,estimatedRevenue,net_subscribers_change,engage_rate
views,0.318394,0.6975,0.017541
redViews,0.412902,0.414674,0.011004
comments,0.104338,0.129862,0.06993
likes,0.24478,0.568465,0.017327
dislikes,0.267804,0.740121,0.015847
shares,0.268022,0.585073,0.022817
estimatedMinutesWatched,0.699625,0.560338,0.012668
estimatedRedMinutesWatched,0.682464,0.169521,0.002882
averageViewDuration,0.08022,-0.004228,0.008303
averageViewPercentage,-0.001652,0.00843,0.020841


### 기간별 피처 생성

In [33]:
# 주별 피처 생성(콘텐츠별로 계산)
youtube_videos['weekly_revenue_per_view_std'] = youtube_videos.groupby('video')['revenue_per_view'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_revenue_per_Subscriber_std'] = youtube_videos.groupby('video')['revenue_per_Subscriber'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_estimatedRedPartnerRevenue_std'] = youtube_videos.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_revenue_diversification_ratio_std'] = youtube_videos.groupby('video')['revenue_diversification_ratio'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_subscriber_growth_rate_std'] = youtube_videos.groupby('video')['subscriber_growth_rate'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_subscriber_loss_rate_std'] = youtube_videos.groupby('video')['subscriber_loss_rate'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_subscriber_retention_rate_std'] = youtube_videos.groupby('video')['subscriber_retention_rate'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_subscriber_gain_per_content_std'] = youtube_videos.groupby('video')['subscriber_gain_per_content'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_watch_time_per_subscriber_std'] = youtube_videos.groupby('video')['watch_time_per_subscriber'].transform(lambda x: x.rolling(window=7).std())
youtube_videos['weekly_ad_playback_rate_std'] = youtube_videos.groupby('video')['ad_playback_rate'].transform(lambda x: x.rolling(window=7).std())

youtube_videos['weekly_revenue_per_view_trd'] = youtube_videos.groupby('video')['revenue_per_view'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_revenue_per_Subscriber_trd'] = youtube_videos.groupby('video')['revenue_per_Subscriber'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_estimatedRedPartnerRevenue_trd'] = youtube_videos.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_revenue_diversification_ratio_trd'] = youtube_videos.groupby('video')['revenue_diversification_ratio'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_subscriber_growth_rate_trd'] = youtube_videos.groupby('video')['subscriber_growth_rate'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_subscriber_loss_rate_trd'] = youtube_videos.groupby('video')['subscriber_loss_rate'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_subscriber_retention_rate_trd'] = youtube_videos.groupby('video')['subscriber_retention_rate'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_subscriber_gain_per_content_trd'] = youtube_videos.groupby('video')['subscriber_gain_per_content'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_watch_time_per_subscriber_trd'] = youtube_videos.groupby('video')['watch_time_per_subscriber'].transform(lambda x: (x - x.shift(7)) / x.shift(7))
youtube_videos['weekly_ad_playback_rate_trd'] = youtube_videos.groupby('video')['ad_playback_rate'].transform(lambda x: (x - x.shift(7)) / x.shift(7))

# 월별 피처 생성(콘텐츠별로 계산)
youtube_videos['monthly_revenue_per_view_std'] = youtube_videos.groupby('video')['revenue_per_view'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_revenue_per_Subscriber_std'] = youtube_videos.groupby('video')['revenue_per_Subscriber'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_estimatedRedPartnerRevenue_std'] = youtube_videos.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_revenue_diversification_ratio_std'] = youtube_videos.groupby('video')['revenue_diversification_ratio'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_subscriber_growth_rate_std'] = youtube_videos.groupby('video')['subscriber_growth_rate'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_subscriber_loss_rate_std'] = youtube_videos.groupby('video')['subscriber_loss_rate'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_subscriber_retention_rate_std'] = youtube_videos.groupby('video')['subscriber_retention_rate'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_subscriber_gain_per_content_std'] = youtube_videos.groupby('video')['subscriber_gain_per_content'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_watch_time_per_subscriber_std'] = youtube_videos.groupby('video')['watch_time_per_subscriber'].transform(lambda x: x.rolling(window=30).std())
youtube_videos['monthly_ad_playback_rate_std'] = youtube_videos.groupby('video')['ad_playback_rate'].transform(lambda x: x.rolling(window=30).std())

youtube_videos['monthly_revenue_per_view_trd'] = youtube_videos.groupby('video')['revenue_per_view'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_revenue_per_Subscriber_trd'] = youtube_videos.groupby('video')['revenue_per_Subscriber'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_estimatedRedPartnerRevenue_trd'] = youtube_videos.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_revenue_diversification_ratio_trd'] = youtube_videos.groupby('video')['revenue_diversification_ratio'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_subscriber_growth_rate_trd'] = youtube_videos.groupby('video')['subscriber_growth_rate'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_subscriber_loss_rate_trd'] = youtube_videos.groupby('video')['subscriber_loss_rate'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_subscriber_retention_rate_trd'] = youtube_videos.groupby('video')['subscriber_retention_rate'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_subscriber_gain_per_content_trd'] = youtube_videos.groupby('video')['subscriber_gain_per_content'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_watch_time_per_subscriber_trd'] = youtube_videos.groupby('video')['watch_time_per_subscriber'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
youtube_videos['monthly_ad_playback_rate_trd'] = youtube_videos.groupby('video')['ad_playback_rate'].transform(lambda x: (x - x.shift(30)) / x.shift(30))

In [34]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

In [35]:
# youtube_videos.to_csv('C:/py_src/awake/data/youtube_videos_credit.csv', encoding='utf-8-sig', index=False)

In [4]:
# youtube_videos = dd.read_csv('C:/py_src/awake/data/youtube_videos_credit.csv')
youtube_videos = pd.read_csv(file_path + 'youtube_videos_credit.csv')

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [5]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
remove_col = ['estimatedRevenue','estimatedAdRevenue','grossRevenue','net_subscribers_change','subscribersGained','subscribersLost','engage_rate','views']
x_col = youtube_videos.columns.drop(list(unique_col) + remove_col) ## y값 제거, y값 변수 관여 변수 제거

In [6]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [7]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(6538190, 82)
(2031321, 82)


### 모델 기법 적용

In [8]:
# 각 모델 정의
# rf_model = RandomForestRegressor(random_state=42)
# gbm_model = GradientBoostingRegressor(random_state=42)
# lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    # 'RandomForest': rf_model,
    # 'GBM': gbm_model,
    # 'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

In [None]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [9]:
feature_importances_dict = {}

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

for target in ['estimatedRevenue','net_subscribers_change','engage_rate']:
    # 모델 정의 및 학습
    models['XGBoost'].fit(train_data[x_col], train_data[target]) ## 'estimatedRevenue','net_subscribers_change','engage_rate

    # 변수 중요도
    feature_importances_dict[target] = models['XGBoost'].feature_importances_

    # 예측
    y_pred = models['XGBoost'].predict(test_data[x_col])

    # MSE 계산
    mse = mean_squared_error(test_data[target], y_pred)

    # RMSE 계산
    rmse = np.sqrt(mse)

    # R² 값 계산
    r2 = r2_score(test_data[target], y_pred)

    # Adjusted R² 계산
    n = len(test_data)  # 샘플 수
    p = test_data.shape[1]  # 독립 변수(특성) 수
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    # 결과 출력
    print(target)
    print(f"R² 값: {r2:.4f}")
    print(f"Adjusted R² 값: {adjusted_r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

estimatedRevenue
R² 값: 0.8992
Adjusted R² 값: 0.8992
MSE: 2389651.9695
RMSE: 1545.8499
----------------------------------------

net_subscribers_change
R² 값: 0.7636
Adjusted R² 값: 0.7636
MSE: 161.2195
RMSE: 12.6972
----------------------------------------

engage_rate
R² 값: 0.8314
Adjusted R² 값: 0.8314
MSE: 0.0013
RMSE: 0.0365
----------------------------------------



### 변수 중요도 확인
- 각 y값에 대한 중요도 평균

In [11]:
importances_df = pd.DataFrame({
    'features': x_col,
    'estimatedRevenue_importance': feature_importances_dict['estimatedRevenue'],
    'net_subscribers_change_importance': feature_importances_dict['net_subscribers_change'],
    'engage_rate_importance': feature_importances_dict['engage_rate']
    })

importances_df['mean_importance'] = importances_df[['estimatedRevenue_importance', 'net_subscribers_change_importance', 'engage_rate_importance']].mean(axis=1)

최종 지표 선정

In [12]:
# 신용평가 최종 지표 선정
final_col = list(importances_df.sort_values(['mean_importance'],ascending=False).iloc[:20]['features'])
final_col

['dislikes',
 'estimatedRedPartnerRevenue',
 'likes',
 'redViews',
 'monthly_revenue_diversification_ratio_trd',
 'shares',
 'revenue_diversification_ratio',
 'estimatedMinutesWatched',
 'revenue_per_view',
 'comments',
 'averageViewDuration',
 'playbackBasedCpm',
 'averageViewPercentage',
 'monthly_watch_time_per_subscriber_std',
 'watch_time_per_subscriber',
 'adImpressions',
 'monetizedPlaybacks',
 'monthly_estimatedRedPartnerRevenue_std',
 'subscriber_gain_per_content',
 'videosRemovedFromPlaylists']

In [None]:
# # 신용평가 최종 지표
# final_col = ['dislikes', 'estimatedRedPartnerRevenue', 'likes', 'redViews', 'monthly_revenue_diversification_ratio_trd', 'shares', 'revenue_diversification_ratio', 'estimatedMinutesWatched',
#              'revenue_per_view', 'comments', 'averageViewDuration', 'playbackBasedCpm', 'averageViewPercentage', 'monthly_watch_time_per_subscriber_std', 'watch_time_per_subscriber',
#              'adImpressions', 'monetizedPlaybacks', 'monthly_estimatedRedPartnerRevenue_std', 'subscriber_gain_per_content', 'videosRemovedFromPlaylists']

In [14]:
# # 신용 평가 최종 지표 활용 모델 성능 확인
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# for target in ['estimatedRevenue','net_subscribers_change','engage_rate']:
#     # 모델 정의 및 학습
#     models['XGBoost'].fit(train_data[final_col], train_data[target]) ## 'estimatedRevenue','net_subscribers_change','engage_rate

#     # 변수 중요도
#     feature_importances_dict[target] = models['XGBoost'].feature_importances_

#     # 예측
#     y_pred = models['XGBoost'].predict(test_data[final_col])

#     # MSE 계산
#     mse = mean_squared_error(test_data[target], y_pred)

#     # RMSE 계산
#     rmse = np.sqrt(mse)

#     # R² 값 계산
#     r2 = r2_score(test_data[target], y_pred)

#     # Adjusted R² 계산
#     n = len(test_data)  # 샘플 수
#     p = test_data.shape[1]  # 독립 변수(특성) 수
#     adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

#     # 결과 출력
#     print(target)
#     print(f"R² 값: {r2:.4f}")
#     print(f"Adjusted R² 값: {adjusted_r2:.4f}")
#     print(f"MSE: {mse:.4f}")
#     print(f"RMSE: {rmse:.4f}")
#     print("----------------------------------------")
#     print("")

estimatedRevenue
R² 값: 0.9054
Adjusted R² 값: 0.9054
MSE: 2242811.2569
RMSE: 1497.6018
----------------------------------------

net_subscribers_change
R² 값: 0.8259
Adjusted R² 값: 0.8259
MSE: 118.7260
RMSE: 10.8961
----------------------------------------

engage_rate
R² 값: 0.8329
Adjusted R² 값: 0.8329
MSE: 0.0013
RMSE: 0.0363
----------------------------------------



### 가중치 설정
- 최종 지표 활용

In [13]:
importances_df_final = importances_df[importances_df['features'].isin(final_col)][['features','mean_importance']].reset_index(drop=True).sort_values('mean_importance', ascending=False)
importances_df_final['weight'] = importances_df_final['mean_importance'] / importances_df_final['mean_importance'].sum() ## 가중치 설정
importances_df_final

Unnamed: 0,features,mean_importance,weight
3,dislikes,0.1383,0.15978
9,estimatedRedPartnerRevenue,0.131571,0.152006
2,likes,0.084882,0.098065
0,redViews,0.052842,0.061049
19,monthly_revenue_diversification_ratio_trd,0.042292,0.04886
4,shares,0.038931,0.044977
14,revenue_diversification_ratio,0.038377,0.044338
5,estimatedMinutesWatched,0.036728,0.042432
13,revenue_per_view,0.036697,0.042397
1,comments,0.035313,0.040798


평가요소 분류

In [14]:
# 상환이력
eval_col1 = ['dislikes','likes','shares','comments','redViews']
importances_df_final[importances_df_final['features'].isin(eval_col1)]['weight'].sum()

np.float32(0.4046696)

In [15]:
# 부채수준
eval_col2 = ['estimatedRedPartnerRevenue','monthly_estimatedRedPartnerRevenue_std','revenue_per_view']
importances_df_final[importances_df_final['features'].isin(eval_col2)]['weight'].sum()

np.float32(0.2153935)

In [16]:
# 신용거래기간
eval_col3 = ['subscriber_gain_per_content','videosRemovedFromPlaylists','monthly_watch_time_per_subscriber_std','watch_time_per_subscriber']
importances_df_final[importances_df_final['features'].isin(eval_col3)]['weight'].sum()

np.float32(0.09121278)

In [17]:
# 신용형태
eval_col4 = ['revenue_diversification_ratio','monthly_revenue_diversification_ratio_trd','playbackBasedCpm','monetizedPlaybacks','adImpressions']
importances_df_final[importances_df_final['features'].isin(eval_col4)]['weight'].sum()

np.float32(0.17576551)

In [18]:
# 비금융/마이데이터
eval_col5 = ['averageViewDuration','averageViewPercentage','estimatedMinutesWatched']
importances_df_final[importances_df_final['features'].isin(eval_col5)]['weight'].sum()

np.float32(0.11295858)

가중치 적용

In [19]:
# 가중치 실제값 적용
# 상환이력
youtube_videos['score1'] = (youtube_videos[eval_col1] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col1)]['weight'])).sum(axis=1)

In [20]:
# 부채수준
youtube_videos['score2'] = (youtube_videos[eval_col2] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col2)]['weight'])).sum(axis=1)

In [21]:
# 신용거래기간
youtube_videos['score3'] = (youtube_videos[eval_col3] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col3)]['weight'])).sum(axis=1)

In [22]:
# 신용형태
youtube_videos['score4'] = (youtube_videos[eval_col4] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col4)]['weight'])).sum(axis=1)

In [23]:
# 비금융/마이데이터
youtube_videos['score5'] = (youtube_videos[eval_col5] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col5)]['weight'])).sum(axis=1)

In [24]:
# 계정별 신용점수 확인 - 계정별 평가요소 합계
credit_df = youtube_videos.groupby('youtube_user_id')[['score1','score2','score3','score4','score5']].sum().reset_index()

In [25]:
# 불필요 및 이상치 제거
credit_df = credit_df[~credit_df['youtube_user_id'].isin(['639bb8dcd603b8138e33780b'])].reset_index(drop=True)
# '639bb8dcd603b8138e33780b' ## 없는 계정 및 이상치

In [26]:
from sklearn.preprocessing import MinMaxScaler

# 1. MinMaxScaler 적용
scaler = MinMaxScaler()
# credit_df['score_scale'] = scaler.fit_transform(credit_df[['score']])
credit_df['score1_scale'] = scaler.fit_transform(credit_df[['score1']])
credit_df['score2_scale'] = scaler.fit_transform(credit_df[['score2']])
credit_df['score3_scale'] = scaler.fit_transform(credit_df[['score3']])
credit_df['score4_scale'] = scaler.fit_transform(credit_df[['score4']])
credit_df['score5_scale'] = scaler.fit_transform(credit_df[['score5']])

# credit_df['score_final'] = scaler.fit_transform(credit_df[['score_scale']]) * 1000
# credit_df['score1_final'] = scaler.fit_transform(credit_df[['score1_scale']]) * 350 # 평가요소 0.284 * 0 이 아닌 비율 0.12 --> 0.03 : 0.075 * 0.35 --> 0.03  : 0.15
# credit_df['score2_final'] = scaler.fit_transform(credit_df[['score2_scale']]) * 280 # 평가요소 0.245 * 0 이 아닌 비율 0.46 --> 0.11 : 0.27 * 0.28 --> 0.076 : 0.38
# credit_df['score3_final'] = scaler.fit_transform(credit_df[['score3_scale']]) * 120 # 평가요소 0.123 * 0 이 아닌 비율 0.25 --> 0.03 : 0.075 * 0.12 --> 0.009 : 0.04
# credit_df['score4_final'] = scaler.fit_transform(credit_df[['score4_scale']]) * 180 # 평가요소 0.275 * 0 이 아닌 비율 0.58 --> 0.16 : 0.4 * 0.18 --> 0.072 : 0.36
# credit_df['score5_final'] = scaler.fit_transform(credit_df[['score5_scale']]) * 80 # 평가요소 0.073 * 0 이 아닌 비율 0.9 --> 0.07 : 0.18 * 0.08 --> 0.014 : 0.07

credit_df['score1_final'] = scaler.fit_transform(credit_df[['score1_scale']]) * 430
credit_df['score2_final'] = scaler.fit_transform(credit_df[['score2_scale']]) * 410
credit_df['score3_final'] = scaler.fit_transform(credit_df[['score3_scale']]) * 50
credit_df['score4_final'] = scaler.fit_transform(credit_df[['score4_scale']]) * 60
credit_df['score5_final'] = scaler.fit_transform(credit_df[['score5_scale']]) * 50

In [27]:
# 평가요소 분류 활용 스코어링
credit_df['credit_score'] = credit_df[['score1_final','score2_final','score3_final','score4_final','score5_final']].sum(axis=1)

In [28]:
# 계정 확인 테이블
user_info_df = merge_df_users_fin[['youtube_user_id','channel_title']].drop_duplicates().reset_index(drop=True)
user_info_df = user_info_df[user_info_df['channel_title']!='0'].drop_duplicates().reset_index(drop=True)
user_info_df = user_info_df[~user_info_df['channel_title'].isnull()].reset_index(drop=True)
user_info_df = user_info_df.drop_duplicates(['youtube_user_id']).reset_index(drop=True)
user_info_df = user_info_df.drop_duplicates(['channel_title']).reset_index(drop=True)

In [29]:
# 신용평가점수 테이블
credit_df_fin = pd.merge(credit_df[['youtube_user_id','credit_score']],user_info_df,how='left',on='youtube_user_id')
credit_df_fin = credit_df_fin[['youtube_user_id','channel_title','credit_score']]
credit_df_fin = credit_df_fin[~credit_df_fin['channel_title'].isnull()].reset_index(drop=True)

In [30]:
credit_df_fin.sort_values(['credit_score'],ascending=False)

Unnamed: 0,youtube_user_id,channel_title,credit_score
90,63d77c9650eb530dfd139f8b,kiu기우쌤,811.211271
98,63eb4f87ee122e631992279f,수빙수tv sooBingsoo,607.087625
128,6401e117d746c60e1271fdef,앙찡,501.772071
31,629f6ca6eaf5732d6df0611e,Mind Patting마음토닥,487.574932
199,64da8a9ef638790e0f74bae7,잼스기타,394.203490
...,...,...,...
48,62bc1aca507271632b940e2e,도아이 Doh-I,0.049373
20,6287a9cefb15712a8cbb098e,혜성네일_comet,0.044330
68,6314a287a0673403176d3c35,gahyun 가현,0.024643
57,62d11f9f0b4c4c7502a5c1b6,DDONIE 또니 / 러브크레센트,0.019391


### 변동계수
- y값 활용 : 'estimatedRevenue', 'net_subscribers_change', 'engage_rate'
- 0값이 많아 왜곡된 변동계수를 계산할 수 있음 --> 0값 제거후 변동계수 산출

In [157]:
# 각 y값의 0비율 확인
# 'estimatedRevenue', 'net_subscribers_change', 'engage_rate'
print('estimatedRevenue의 0비율 : ', round((youtube_videos['estimatedRevenue']==0).sum() / len(youtube_videos),2))
print('net_subscribers_change의 0비율 : ', round((youtube_videos['net_subscribers_change']==0).sum() / len(youtube_videos),2))
print('engage_rate의 0비율 : ', round((youtube_videos['engage_rate']==0).sum() / len(youtube_videos),2))

estimatedRevenue의 0비율 :  0.32
net_subscribers_change의 0비율 :  0.9
engage_rate의 0비율 :  0.71


In [31]:
# estimatedRevenue
youtube_videos['weekly_estimatedRevenue_cv'] = youtube_videos.groupby('video')['estimatedRevenue'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
youtube_videos['monthly_estimatedRevenue_cv'] = youtube_videos.groupby('video')['estimatedRevenue'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

# net_subscribers_change
youtube_videos['weekly_net_subscribers_change_cv'] = youtube_videos[youtube_videos['net_subscribers_change']!=0].groupby('video')['net_subscribers_change'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
youtube_videos['monthly_net_subscribers_change_cv'] = youtube_videos[youtube_videos['net_subscribers_change']!=0].groupby('video')['net_subscribers_change'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

# engage_rate
youtube_videos['weekly_engage_rate_cv'] = youtube_videos[youtube_videos['engage_rate']!=0].groupby('video')['engage_rate'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
youtube_videos['monthly_engage_rate_cv'] = youtube_videos[youtube_videos['engage_rate']!=0].groupby('video')['engage_rate'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

cv_col = ['weekly_estimatedRevenue_cv','monthly_estimatedRevenue_cv','weekly_net_subscribers_change_cv','monthly_net_subscribers_change_cv','weekly_engage_rate_cv','monthly_engage_rate_cv']

In [32]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

In [33]:
# 계정별 변동계수 평균
coefvar_df = youtube_videos.groupby('youtube_user_id')[cv_col].mean().reset_index()

In [34]:
# 최종 신용평가 테이블
credit_coef_df = pd.merge(credit_df_fin, coefvar_df, how='left', on='youtube_user_id')
# credit_df_final = credit_df_final[~credit_df_final['channel_title'].isnull()].reset_index(drop=True)

In [100]:
credit_coef_df[credit_coef_df['monthly_engage_rate_cv'] > 0.005].sort_values('monthly_engage_rate_cv')[['youtube_user_id','channel_title','monthly_engage_rate_cv']].iloc[-20:]

Unnamed: 0,youtube_user_id,channel_title,monthly_engage_rate_cv
151,64399d0b659261656b3f0681,日本ジヌ【니혼지누】ー韓国に関する全て,0.220213
209,6508ff021120b40b4427a4fc,뛰뛰빵빵 김옥순,0.226618
127,6401da7ad746c60e1271fdd6,히스커버리 역사채널,0.227237
135,640a007613bc6a0e24f95b24,뚜니랑,0.233161
47,62b8660e507271632b906efc,모염 moyeom,0.233943
193,64d1c7e51e9bad0e238f45a0,쿜쿜쿜,0.234603
26,6294ab84fe241a32a48ada00,오디디 코미디,0.237432
37,62a890be9d41c93ff9129a22,황나겸,0.238419
199,64da8a9ef638790e0f74bae7,잼스기타,0.240656
153,6444cfbea224e30e2de2b264,뻘짓연구소,0.241346


In [124]:
credit_coef_df

Unnamed: 0,youtube_user_id,channel_title,credit_score,weekly_estimatedRevenue_cv,monthly_estimatedRevenue_cv,weekly_net_subscribers_change_cv,monthly_net_subscribers_change_cv,weekly_engage_rate_cv,monthly_engage_rate_cv,cv_score
0,627cb611aa6f212355e0b617,성팩 SPAAK,13.349272,0.490795,0.574736,0.040273,0.035230,0.117803,0.100668,0.073249
1,627f59ccaa39226247c60b01,고도람 Go!doram,0.476567,0.802761,0.809237,0.000000,0.000000,0.009414,0.001670,0.086583
2,6287228afb15712a8cb931d7,세남자 물고기,0.524080,0.240621,0.369765,0.012032,0.010465,0.075025,0.062087,0.041407
3,6287229efb15712a8cb93225,띠혜 ddihye,79.980751,0.298516,0.386913,0.036342,0.019539,0.188688,0.149371,0.058409
4,628722c8fb15712a8cb9326e,소리미의 신화방송,5.023647,1.286501,1.491383,0.001229,0.000380,0.032041,0.015675,0.150872
...,...,...,...,...,...,...,...,...,...,...
229,65cc401305bf1c0baa425146,주피코,144.009275,0.344102,0.314013,0.294535,0.176194,0.227461,0.146422,0.083122
230,65e7b773d8da110bb072e2b5,신크TV,1.788091,0.783319,0.254329,0.000160,0.000000,0.001754,0.000000,0.055447
231,65f7b17ed8da110bb0733b7b,Yerendipity예렌디피티,0.227405,0.276907,0.429779,0.018331,0.002414,0.035343,0.003394,0.041031
232,65fecf7ed8da110bb0736199,JN테크리뷰,6.575062,0.595728,0.234931,0.016795,0.000595,0.072448,0.008624,0.049775


In [125]:
# 가중치 역으로 활용하여 최종 변동 계수 도출
# (1 - 0.36) + (1 - 0.3) + (1 - 0.34) 활용 스케일링
# 0.32 / 0.35 / 0.33 가중치 적용
credit_coef_df['cv_score'] = (
                              (((credit_coef_df['weekly_estimatedRevenue_cv'] + credit_coef_df['monthly_estimatedRevenue_cv']) / 2) * 0.32) + 
                              (((credit_coef_df['weekly_net_subscribers_change_cv'] + credit_coef_df['monthly_net_subscribers_change_cv']) / 2) * 0.35) + 
                              (((credit_coef_df['weekly_engage_rate_cv'] + credit_coef_df['monthly_engage_rate_cv']) / 2) * 0.33)
                              )

In [None]:
# 가중치 역으로 활용하여 최종 변동 계수 도출
# (1 - 0.36) + (1 - 0.3) + (1 - 0.34) 활용 스케일링
# 0.32 / 0.35 / 0.33 가중치 적용
credit_coef_df['cv_score'] = (
                              (((credit_coef_df['weekly_estimatedRevenue_cv'] + credit_coef_df['monthly_estimatedRevenue_cv']) / 2) * 0.32) + 
                              (((credit_coef_df['weekly_net_subscribers_change_cv'] + credit_coef_df['monthly_net_subscribers_change_cv']) / 2) * 0.35) + 
                              (((credit_coef_df['weekly_engage_rate_cv'] + credit_coef_df['monthly_engage_rate_cv']) / 2) * 0.33)
                              )

In [126]:
# 최종 데이터셋 확인
credit_df_final = credit_coef_df[['youtube_user_id','channel_title','credit_score','cv_score']]

In [127]:
credit_df_final.sort_values('credit_score',ascending=False)

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score
90,63d77c9650eb530dfd139f8b,kiu기우쌤,811.211271,0.273353
98,63eb4f87ee122e631992279f,수빙수tv sooBingsoo,607.087625,0.236631
128,6401e117d746c60e1271fdef,앙찡,501.772071,0.277777
31,629f6ca6eaf5732d6df0611e,Mind Patting마음토닥,487.574932,0.343168
199,64da8a9ef638790e0f74bae7,잼스기타,394.203490,0.270385
...,...,...,...,...
48,62bc1aca507271632b940e2e,도아이 Doh-I,0.049373,0.017712
20,6287a9cefb15712a8cbb098e,혜성네일_comet,0.044330,0.238565
68,6314a287a0673403176d3c35,gahyun 가현,0.024643,0.000314
57,62d11f9f0b4c4c7502a5c1b6,DDONIE 또니 / 러브크레센트,0.019391,0.001429


In [133]:
credit_df_final[(credit_df_final['cv_score']>0.5) & (credit_df_final['cv_score']<=0.7)].sort_values('credit_score',ascending=False)

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score
143,64243f6f194dfa2b2aedadac,황헬린 탈출기,13.271173,0.510975


In [134]:
credit_df_final[(credit_df_final['cv_score']>0.7)].sort_values('credit_score',ascending=False)

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score


### 실제데이터 결과 확인

In [147]:
credit_df_final[(credit_df_final['credit_score'] >= 200) & (credit_df_final['credit_score'] < 400)].sort_values(['credit_score','cv_score'], ascending=[False,True])

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score
199,64da8a9ef638790e0f74bae7,잼스기타,394.20349,0.270385
151,64399d0b659261656b3f0681,日本ジヌ【니혼지누】ー韓国に関する全て,387.543536,0.245119
181,649b0fb29247f326464c53f6,뷰드름 유튜버 인씨,293.308033,0.233153
55,62d11f080b4c4c7502a5be3d,abbapraise 아바프레이즈,286.126228,0.285314
201,64dc7156f638790e0f74d1a2,축구 읽어주는 여자 쵱내,217.654294,0.25002
153,6444cfbea224e30e2de2b264,뻘짓연구소,216.400031,0.252299
176,6486b00519c22b644dded32c,그롬마쉬TV,202.103948,0.210104


In [151]:
credit_df_final.sort_values('cv_score')

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score
68,6314a287a0673403176d3c35,gahyun 가현,0.024643,0.000314
12,628724d8fb15712a8cb93456,루깬미,0.013068,0.000554
57,62d11f9f0b4c4c7502a5c1b6,DDONIE 또니 / 러브크레센트,0.019391,0.001429
94,63e0fa9aee122e631991e13d,한나임한나Hannaim,0.058399,0.001482
146,6427b4a01d589972c84adf22,키키낙낙,0.102775,0.003566
...,...,...,...,...
133,6405796d118c0f58588198ab,채채ChaeChae,8.441781,0.465178
182,64a524ff9247f326464d2d44,AllaproTV,6.145893,0.474736
203,64ed848edcc0250e17c4278f,법무법인 슈가스퀘어,0.693733,0.482350
86,63d0c34350eb530dfd1370ee,은는이가,1.358575,0.486428


In [150]:
credit_df_final[(credit_df_final['credit_score'] < 200)].sort_values(['credit_score','cv_score'], ascending=[False,True]).iloc[-20:]

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score
130,64024042118c0f58588183b7,로컬필름 LOCAL FILM,0.31394,0.013572
13,62872523fb15712a8cb93479,임삐나,0.311566,0.150228
196,64d92952f638790e0f74a1d9,연디아 채널 Yeondia Channel,0.311366,0.231423
191,64c7fc2a1951980e344809f0,두꼽이Challenge,0.293137,0.041765
202,64ddab69f638790e0f74e9c5,D_tail_디테,0.242014,0.014242
54,62d11e8d0b4c4c7502a5bb11,ORlGN 오리진,0.235628,0.036915
231,65f7b17ed8da110bb0733b7b,Yerendipity예렌디피티,0.227405,0.123092
27,629b694beaf5732d6deae948,담순언니 Twins Vlog,0.149267,0.30906
40,62aafbccc6d9f158d06a7f03,하부유튜브 Minor / (Lower) YouTube,0.140438,0.052532
226,65b26f8405bf1c0baa41b80d,SBM&E Official,0.124665,0.189288
