In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
file_path = 'C:/py_src/awake/data/'

In [8]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_final3.csv', low_memory=False)
youtube_videos = pd.read_csv(file_path + 'youtube_videos.csv')

## 계정 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [7]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:].drop('estimated_ad_revenue') ## y값 제거

In [8]:
# 데이터 분할
train_data = merge_df_users_fin[merge_df_users_fin['date'] <= '2024-02-11']
test_data = merge_df_users_fin[merge_df_users_fin['date'] > '2024-02-11']

In [9]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(84473, 57)
(21210, 57)


### 모델 기법 적용

In [10]:
# 각 모델 정의
rf_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    'RandomForest': rf_model,
    'GBM': gbm_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

In [9]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

for model in models:
    # 모델 정의 및 학습
    models[model].fit(train_data[x_col], train_data['estimated_ad_revenue'])

    # 예측
    y_pred = models[model].predict(test_data[x_col])

    # MSE 계산
    mse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred)

    # RMSE 계산
    rmse = np.sqrt(mse)

    # R² 값 계산
    r2 = r2_score(test_data['estimated_ad_revenue'], y_pred)

    # Adjusted R² 계산
    n = len(test_data)  # 샘플 수
    p = test_data.shape[1]  # 독립 변수(특성) 수
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

    # 결과 출력
    print(model)
    print(f"R² 값: {r2:.4f}")
    print(f"Adjusted R² 값: {adjusted_r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

RandomForest
R² 값: 0.9936
Adjusted R² 값: 0.9936
MSE: 47091405.1252
RMSE: 6862.3178
----------------------------------------

GBM
R² 값: 0.9974
Adjusted R² 값: 0.9974
MSE: 18992570.5474
RMSE: 4358.0466
----------------------------------------

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11454
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 45
[LightGBM] [Info] Start training from score 14667.681766
LightGBM
R² 값: 0.9223
Adjusted R² 값: 0.9221
MSE: 571114131.6523
RMSE: 23897.9943
----------------------------------------

XGBoost
R² 값: 0.9663
Adjusted R² 값: 0.9662
MSE: 248007649.8104
RMSE: 15748.2586
----------------------------------------



In [92]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

스태킹

In [14]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# 스태킹 모델 정의
stacking_model = StackingRegressor(
    estimators=[
    ('random_forest', models['RandomForest']),
    ('gbm', models['GBM']),
    ('lightgbm', models['LightGBM']),
    ('xgboost', models['XGBoost'])
    ],
    final_estimator=Ridge()
)

# 스태킹 모델 학습
stacking_model.fit(train_data[x_col], train_data['estimated_ad_revenue'])

# 예측
y_pred = stacking_model.predict(test_data[x_col])

# 성능 평가
# R² 값 계산
r2 = r2_score(test_data['estimated_ad_revenue'], y_pred)

# Adjusted R² 계산
n = len(test_data)  # 샘플 수
p = test_data.shape[1]  # 독립 변수(특성) 수
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# MSE 계산
mse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred)

# RMSE 계산
rmse = mean_squared_error(test_data['estimated_ad_revenue'], y_pred, squared=False)

print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
print(f"스태킹 앙상블 모델 Adjusted R2: {adjusted_r2:.4f}")
print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11454
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 45
[LightGBM] [Info] Start training from score 14667.681766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11453
[LightGBM] [Info] Number of data points in the train set: 67578, number of used features: 45
[LightGBM] [Info] Start training from score 17638.612776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11455
[LightGBM] [Info] Number of data points in the train set: 67578, number of used features: 45
[LightGBM] [Info



### 실제데이터 결과 확인

In [16]:
# 예측 결과 확인
y_pred = stacking_model.predict(merge_df_users_fin[x_col])
merge_df_users_fin['predict'] = y_pred

In [61]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_df = merge_df_users_fin[['youtube_user_id','date','channel_title','estimated_ad_revenue','predict']]

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값
result_df['date'] = pd.to_datetime(result_df['date']) 

result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
result_df['3_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-90).rolling(window=90).sum()
result_df['6_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-180).rolling(window=180).sum()
result_df['12_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-365).rolling(window=365).sum()

# 계정별 기간에 따른 광고 수익 예측값 비교
# result_df_final = result_df.groupby(['youtube_user_id'])[['estimated_ad_revenue','predict','1_month_future_predict','3_month_future_predict','6_month_future_predict','12_month_future_predict']].sum().reset_index()

# 계정별로 최종 평균값을 계산
result_df_final = result_df.groupby('youtube_user_id').agg({
    'estimated_ad_revenue': 'mean', ## 1일 평균 광고수익
    'predict': 'mean', ## 1일 평균 광고수익 예측
    '1_month_future_predict': 'mean',
    '3_month_future_predict': 'mean',
    '6_month_future_predict': 'mean',
    '12_month_future_predict': 'mean'
}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['date'] = pd.to_datetime(result_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['3_month_future_pre

In [162]:
result_df[result_df['youtube_user_id'] == '63d2239450eb530dfd137d1e']

Unnamed: 0,youtube_user_id,date,channel_title,estimated_ad_revenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
43071,63d2239450eb530dfd137d1e,2023-03-26,이고 EGO,297.806,44.758478,,,,
43072,63d2239450eb530dfd137d1e,2023-03-27,이고 EGO,252.163,-71.941941,,,,
43073,63d2239450eb530dfd137d1e,2023-03-28,이고 EGO,236.273,12.215801,,,,
43074,63d2239450eb530dfd137d1e,2023-03-29,이고 EGO,269.039,49.785959,,,,
43075,63d2239450eb530dfd137d1e,2023-03-30,이고 EGO,503.468,128.037987,,,,
...,...,...,...,...,...,...,...,...,...
43471,63d2239450eb530dfd137d1e,2024-04-29,이고 EGO,104.513,-133.691835,,,,
43472,63d2239450eb530dfd137d1e,2024-04-30,이고 EGO,80.377,-143.835546,,,,
43473,63d2239450eb530dfd137d1e,2024-05-01,이고 EGO,28.881,-160.208379,,,,
43474,63d2239450eb530dfd137d1e,2024-05-02,이고 EGO,53.548,-153.921827,,,,


In [148]:
result_df_final[result_df_final['predict'] > 0].sort_values('predict')

Unnamed: 0,youtube_user_id,estimated_ad_revenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
48,62b718bc507271632b8f0ce4,339.897065,15.990677,4.692295e+02,-2.621279e+02,-3.614523e+03,
201,64bce3c5616bd20e3037a1cf,323.219489,23.331282,-1.257770e+02,-6.636114e+03,-1.884004e+04,
98,63d2239450eb530dfd137d1e,291.510282,37.633269,8.248158e+02,1.640833e+03,-5.892702e+03,
76,63315973ef33d840a0999698,367.495198,47.274911,2.638626e+03,1.863610e+04,3.914999e+04,
151,6417c62789085e280d0e410b,350.699542,47.398836,-8.481259e+02,-1.499391e+04,-3.482298e+04,
...,...,...,...,...,...,...,...
142,640339ac118c0f5858818694,157229.117648,159944.491440,4.721605e+06,1.344499e+07,2.493102e+07,
57,62d11f080b4c4c7502a5be3d,326569.531929,332969.983834,9.928727e+06,3.084473e+07,6.230265e+07,
84,639bb8dcd603b8138e33780b,342046.202626,348007.035268,1.123271e+07,3.696964e+07,1.215998e+08,
108,63eb4f87ee122e631992279f,343644.047387,350449.210011,9.967546e+06,3.010190e+07,5.184346e+07,


In [65]:
result_df[result_df['youtube_user_id'].isin(result_df_final[result_df_final['predict'] < 0]['youtube_user_id'])]['channel_title'].unique()

array(['고도람 Go!doram', '0', '세남자 물고기', '름쿠 ᴘʟᴀʏʟɪꜱᴛ', '달고캠핑',
       '고군 Gohgoon', '루깬미', '임삐나', '바른걸음연구소', '임퓨의 비트메이킹 클래스',
       '혜성네일_comet', '마파TV', '담순언니 Twins Vlog', '유익한 균튜버', '1분뉴스',
       'the sence', 'MINLEE 민리', '성한준', '다먹어라이언',
       '하부유튜브 Minor / (Lower) YouTube', 'sa lly', '수집의 수집', '도아이 Doh-I',
       'OBL - 온라인 농부, 사자가 되다', '서유 SEOYU DANCE', 'ORlGN 오리진', '윈플즈TV',
       'DDONIE 또니 / 러브크레센트', '슈로시안 SUROSIAN', '김밈서', '드론브이로그 DroneVlog',
       'gahyun 가현', 'Mein 미인', '김두부', '은는이가', '인썸니아TV', '한나임한나Hannaim',
       '수란쿤', '기자 황덕현 KIJA HWANG', '콜로니', '어웨이커 | 크리에이터 이코노미',
       'GMENCY 멘시의 마인크래프트', '로컬필름 LOCAL FILM', '꾸앤끄', '키키낙낙',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '탬니몰리', 'WORKS.D PLAYLIST',
       '강포동하우스', '태다린tae_darin', '홈바부부_HOMBA BOOBOO', 'TJ 영상채널',
       'MORE김모어', '청도시네마', '두꼽이Challenge', '여행윤Tripyun', '핸슥슥',
       '연디아 채널 Yeondia Channel', 'D_tail_디테', '김희영', '김우다', '닷츠 DOTS',
       '약사 이진수💊', '모리녀', '자수의숲jasooforest', 'SBM&E Official',
       'Yeren

In [115]:
# 상위, 하위 20개 계정
top_user_id_20_6_month = result_df_final[result_df_final['6_month_future_predict'] > 0].sort_values(['6_month_future_predict'],ascending=False)['youtube_user_id'].iloc[:20]
bottom_user_id_20_6_month = result_df_final[result_df_final['6_month_future_predict'] > 0].sort_values(['6_month_future_predict'],ascending=False)['youtube_user_id'].iloc[-20:]

In [112]:
print(len(result_df[result_df['youtube_user_id'].isin(top_user_id_20_1_month)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(top_user_id_20_1_month)]['channel_title'].unique())

20
['abbapraise 아바프레이즈' '0' 'Jeffreyxking' '팀브라더스' 'kiu기우쌤'
 '수빙수tv sooBingsoo' '너굴몬' '미니멀영어 Minimal English' '日本ジヌ【니혼지누】ー韓国に関する全て'
 '뻘짓연구소' '정가거부' '북토크' '집구석구석꿀팁, 집꿀' '나연이즈백 LPGA Na Yeon Choi' '유네린NERIN'
 '뷰드름 유튜버 인씨' '빅민 GAME' 'OSSC' '이현우의 MLBTV' '하원장 강동현']


In [113]:
print(len(result_df[result_df['youtube_user_id'].isin(bottom_user_id_20_1_month)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(bottom_user_id_20_1_month)]['channel_title'].unique())

20
['띠혜 ddihye' '0' '소리미의 신화방송' '나는 불독' 'assesta' '배우GO' '굥플레이스 맛집투어'
 '이숲soop' '원의 독백' '채림처럼firstcherry' '모하지연 MOHAJIYEON' '오토컨테이너 스튜디오'
 '부반TV_부에 반하다' '미디하는남자' '이고 EGO' '맛집남자 foodman' '-mentalholder 멘탈홀더 tv'
 '너드 슬로리 SloLee' '캠핑 릴리아빠' '여리여리YeoriYeori']


In [117]:
len(set(top_user_id_20_1_month) & set(top_user_id_20_3_month) & set(top_user_id_20_6_month))

16

## 콘텐츠 데이터 분석

In [11]:
# 최종 콘텐츠 분석 데이터셋
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

In [14]:
# 잘못된값 처리
youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                               youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                               youtube_videos['estimatedRevenue'])

In [15]:
# 버그로 사용된 수치값 대체
youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

### 중요지표확인

In [17]:
youtube_videos.columns

Index(['youtube_user_id', 'video', 'end_date', 'views', 'redViews', 'comments',
       'likes', 'dislikes', 'shares', 'estimatedMinutesWatched',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'averageViewPercentage', 'videosAddedToPlaylists',
       'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue',
       'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm',
       'subscribersGained', 'subscribersLost', 'monetizedPlaybacks',
       'adImpressions', 'cardClickRate', 'cardTeaserClickRate',
       'cardImpressions', 'cardTeaserImpressions', 'cardClicks',
       'cardTeaserClicks'],
      dtype='object')

In [None]:
# y값 설정
youtube_videos['estimatedRevenue']
youtube_videos['estimatedAdRevenue']
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']

In [None]:
# 조회수당 수익
# 구독자당 수익
# 수익 대비 조회수 비율
# 구독자 증가율
# 구독자 감소율
# 콘텐츠 업로드 빈도
# 콘텐츠당 구독자 증가율
# YouTube Premium 수익
# 수익 다변화 비율
# 구독자 유지율
# 참여도 비율
# 구독자당 시청 시간
# 광고 재생률
# 구독자당 수익
# 시청 시간 변동성
# 광고 수익

In [None]:
# 파생변수1 - 참여도 관련
youtube_videos['like_rate'] = youtube_videos['likes'] / youtube_videos['views'] ## 좋아요 비율 
youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
youtube_videos['share_rate'] = youtube_videos['shares'] / youtube_videos['views'] ## 공유 비율  
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율
youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
youtube_videos['positive_engage_rate'] = (youtube_videos['likes'] + youtube_videos['shares']) / youtube_videos['views'] ## 긍정적 참여율
youtube_videos['comment_to_like_rate'] = youtube_videos['comments'] / youtube_videos['likes'] ## 댓글/좋아요 비율
youtube_videos['like_to_dislike_ratio'] = youtube_videos['likes'] / (youtube_videos['dislikes']) ## 좋아요/싫어요 비율

In [None]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [164]:
youtube_videos.columns

Index(['youtube_user_id', 'video', 'end_date', 'weekly_watch_time',
       'weekly_estimated_revenue', 'dislikes', 'revenue_per_playlist_add',
       'weekly_engagement_rate', 'monetizedPlaybacks',
       'monthly_estimated_revenue', 'playlist_related_revenue_rate',
       'quarterly_avg_view_percentage', 'weekly_playlist_change_rate',
       'quarterly_views', 'grossRevenue', 'ad_revenue_rate',
       'quarterly_ad_impressions', 'quarterly_watch_time',
       'playback_based_cpm_rate', 'monthly_watch_time',
       'subscribers_lost_per_playlist_remove', 'watched_time_rate',
       'subscribersLost', 'likes', 'playlist_removal_rate',
       'positive_engage_rate', 'comments', 'revenue_per_view',
       'monthly_ad_impressions', 'like_rate',
       'subscribers_gained_per_playlist_add', 'weekly_videos_removed', 'cpm',
       'playbackBasedCpm', 'weekly_ad_impressions', 'estimatedMinutesWatched',
       'videosAddedToPlaylists', 'videosRemovedFromPlaylists',
       'revenue_per_minute_wa

In [4]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:].drop(['estimatedAdRevenue']) ## y값 제거

In [5]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [6]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(6538190, 53)
(2031321, 53)


### 모델 기법 적용

In [7]:
# 각 모델 정의
rf_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    'RandomForest': rf_model,
    'GBM': gbm_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

In [None]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 모델 정의 및 학습
models['XGBoost'].fit(train_data[x_col], train_data['estimatedAdRevenue'])

# 예측
y_pred = models['XGBoost'].predict(test_data[x_col])

# MSE 계산
mse = mean_squared_error(test_data['estimatedAdRevenue'], y_pred)

# RMSE 계산
rmse = np.sqrt(mse)

# R² 값 계산
r2 = r2_score(test_data['estimatedAdRevenue'], y_pred)

# Adjusted R² 계산
n = len(test_data)  # 샘플 수
p = test_data.shape[1]  # 독립 변수(특성) 수
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# 결과 출력
print('XGBoost')
print(f"R² 값: {r2:.4f}")
print(f"Adjusted R² 값: {adjusted_r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print("----------------------------------------")
print("")

XGBoost
R² 값: 0.8577
Adjusted R² 값: 0.8577
MSE: 1831523.5852
RMSE: 1353.3379
----------------------------------------



In [20]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# for model in models:
#     # 모델 정의 및 학습
#     models[model].fit(train_data[x_col], train_data['estimatedAdRevenue'])

#     # 예측
#     y_pred = models[model].predict(test_data[x_col])

#     # MSE 계산
#     mse = mean_squared_error(test_data['estimatedAdRevenue'], y_pred)

#     # RMSE 계산
#     rmse = np.sqrt(mse)

#     # R² 값 계산
#     r2 = r2_score(test_data['estimatedAdRevenue'], y_pred)

#     # Adjusted R² 계산
#     n = len(test_data)  # 샘플 수
#     p = test_data.shape[1]  # 독립 변수(특성) 수
#     adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

#     # 결과 출력
#     print(model)
#     print(f"R² 값: {r2:.4f}")
#     print(f"Adjusted R² 값: {adjusted_r2:.4f}")
#     print(f"MSE: {mse:.4f}")
#     print(f"RMSE: {rmse:.4f}")
#     print("----------------------------------------")
#     print("")

In [None]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

In [None]:
x_col

Index(['weekly_estimated_revenue', 'weekly_estimated_ad_revenue',
       'subscriber_increase_rate', 'monthly_revenue_per_ad_impression',
       'revenue_per_subscriber', 'quarterly_avg_view_duration',
       'quarterly_estimated_revenue', 'monthly_net_subscribers_change',
       'estimated_revenue', 'subscriber_view_time_rate',
       'quarterly_total_view_time', 'weekly_revenue_per_ad_impression',
       'monthly_subscribers_lost', 'subscribers_lost', 'watched_time_rate',
       'age18-24.female', 'quarterly_subscribers_gained', 'gross_revenue',
       'gross_revenue_per_ad_impression', 'weekly_net_subscribers_change',
       'playback_rate', 'age25-34.female', 'estimated_red_partner_revenue',
       'redViews', 'quarterly_estimated_ad_revenue',
       'quarterly_net_subscribers_change', 'estimatedMinutesWatched',
       'subscriber_decrease_rate', 'weekly_subscribers_lost',
       'estimated_ad_revenue', 'quarterly_subscribers_lost',
       'monthly_subscribers_gained', 'age13-17.fe

스태킹

In [None]:
# from sklearn.ensemble import StackingRegressor
# from sklearn.linear_model import Ridge

# # 스태킹 모델 정의
# stacking_model = StackingRegressor(
#     estimators=[
#     ('random_forest', models['RandomForest']),
#     # ('gbm', models['GBM']),
#     # ('lightgbm', models['LightGBM']),
#     ('xgboost', models['XGBoost'])
#     ],
#     final_estimator=Ridge()
# )

# # 스태킹 모델 학습
# stacking_model.fit(train_data[x_col], train_data['estimatedAdRevenue'])

# # 예측
# y_pred = stacking_model.predict(test_data[x_col])

# # 성능 평가
# # R² 값 계산
# r2 = r2_score(test_data['estimatedAdRevenue'], y_pred)

# # MSE 계산
# mse = mean_squared_error(test_data['estimatedAdRevenue'], y_pred)

# # RMSE 계산
# rmse = mean_squared_error(test_data['estimatedAdRevenue'], y_pred, squared=False)

# print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
# print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
# print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

스태킹 앙상블 모델 R2: 0.9661
스태킹 앙상블 모델 MSE: 1998305072.9542
스태킹 앙상블 모델 RMSE: 44702.4057




### 실제데이터 결과 확인

In [14]:
# 예측 결과 확인
# y_pred = stacking_model.predict(merge_df_users_fin[x_col])
# merge_df_users_fin['predict'] = y_pred

y_pred = models['XGBoost'].predict(youtube_videos[x_col])
youtube_videos['predict'] = y_pred

In [15]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'estimatedAdRevenue', 'predict']]

# 계정별 콘텐츠의 구독자 순증감 1일 합계
result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
    'estimatedAdRevenue': 'sum',
    'predict': 'sum'
})

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값 계산
result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

# Shift와 Rolling 연산을 위한 그룹별 처리
result_contents_df['1_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-30).rolling(window=30).sum())
result_contents_df['3_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-90).rolling(window=90).sum())
result_contents_df['6_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-180).rolling(window=180).sum())
result_contents_df['12_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-365).rolling(window=365).sum())

# 계정별로 최종 평균값을 계산
result_contents_df_final = result_contents_df.groupby('youtube_user_id').agg({
    'estimatedAdRevenue': 'mean',
    'predict': 'mean',
    '1_month_future_predict': 'mean',
    '3_month_future_predict': 'mean',
    '6_month_future_predict': 'mean',
    '12_month_future_predict': 'mean'
}).reset_index()


In [21]:
result_contents_df_final[result_contents_df_final['predict'] > 0]

Unnamed: 0,youtube_user_id,estimatedAdRevenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
0,627cb611aa6f212355e0b617,7776.284805,7865.420898,238511.455694,615687.204672,1.276685e+06,
1,627f59ccaa39226247c60b01,22.858974,28.559135,857.206126,2612.180373,4.973132e+03,
2,6287228afb15712a8cb931d7,118.569240,119.170174,2175.118191,243.520620,3.429281e+02,
3,6287229efb15712a8cb93225,319.104255,412.594482,10763.081399,22723.167320,6.502604e+04,
4,628722c8fb15712a8cb9326e,584.817873,616.643494,17610.511524,48419.635411,8.952879e+04,
...,...,...,...,...,...,...,...
244,65cc401305bf1c0baa425146,1177.050562,1004.203003,31746.118715,,,
245,65e7b773d8da110bb072e2b5,2419.026120,2434.887939,80514.173564,,,
246,65f7b17ed8da110bb0733b7b,150.755880,154.831406,,,,
247,65fecf7ed8da110bb0736199,14030.064715,14011.172852,,,,


In [33]:
top_user_id_20_6_month = result_contents_df_final[~result_contents_df_final['6_month_future_predict'].isnull()].sort_values(['6_month_future_predict']).iloc[-20:]['youtube_user_id']
bottom_user_id_20_6_month = result_contents_df_final[~result_contents_df_final['6_month_future_predict'].isnull()].sort_values(['6_month_future_predict']).iloc[:20]['youtube_user_id']

In [44]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(top_user_id_20_6_month)]['channel_title'].unique()

array(['Mind Patting마음토닥', '0', '모염 moyeom', 'abbapraise 아바프레이즈', '유경몬',
       '콜드쉽 Coldsheep', '월텍남 - 월스트리트 테크남', '팀브라더스', 'kiu기우쌤', '비됴클래스',
       '수빙수tv sooBingsoo', '만능혁키', '석시원 커플 SeokSiWon Couple', '너굴몬',
       '코인덕 차트아지', '미니멀영어 Minimal English', '윤순의 평범치 않은 생활',
       '日本ジヌ【니혼지누】ー韓国に関する全て', '뻘짓연구소', '중년독수리의 대리여행', '북토크'], dtype=object)

In [45]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(bottom_user_id_20_6_month)]['channel_title'].unique()

array(['세남자 물고기', '0', '름쿠 ᴘʟᴀʏʟɪꜱᴛ', '마파TV', 'the sence', '다먹어라이언',
       'sa lly', '도아이 Doh-I', 'ORlGN 오리진', '윈플즈TV', 'DDONIE 또니 / 러브크레센트',
       'gahyun 가현', '인썸니아TV', '한나임한나Hannaim', '로컬필름 LOCAL FILM', '꾸앤끄',
       '나나무비', '키키낙낙', '탬니몰리', 'WORKS.D PLAYLIST'], dtype=object)

In [73]:
result_contents_df_final[result_contents_df_final['estimatedAdRevenue']!=0].sort_values(['predict'])

Unnamed: 0,youtube_user_id,estimatedAdRevenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
21,6287a9cefb15712a8cbb098e,7.386412,8.279515,2.809071e+02,,,
28,629b694beaf5732d6deae948,11.077663,13.263415,3.710075e+02,1.012134e+03,,
204,64c7fc2a1951980e344809f0,11.443656,13.787192,6.959918e+01,1.633820e+02,,
79,6353096a5a3ac10b5fe8376b,17.244489,25.759954,7.142252e+02,2.043089e+03,3.939559e+03,
47,62b3c78d507271632b8ade02,11.523946,28.041706,8.220679e+02,2.572409e+03,5.192340e+03,
...,...,...,...,...,...,...,...
173,645ec17eef566f0e136a9880,152430.860845,152357.812500,4.481848e+06,1.380492e+07,2.768449e+07,
57,62d11f080b4c4c7502a5be3d,311574.550930,311694.593750,9.304204e+06,2.896338e+07,5.822894e+07,
108,63eb4f87ee122e631992279f,323549.499186,324879.625000,9.218749e+06,2.773670e+07,4.785841e+07,
84,639bb8dcd603b8138e33780b,367042.582846,354395.218750,1.178914e+07,4.438877e+07,,


In [69]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'] == '62b3c78d507271632b8ade02']

Unnamed: 0,youtube_user_id,date,channel_id,channel_title,phone_num,report_user_id,published_at,viewCount,subscriberCount,videoCount,...,weekly_estimated_revenue,weekly_total_view_time,adult_viewer_rate,quarterly_subscribers_lost,weekly_subscribers_lost,red_revenue_rate,monthly_subscribers_lost,estimatedMinutesWatched,monthly_estimated_revenue,estimated_ad_revenue
19500,62b3c78d507271632b8ade02,2023-03-26,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,12343.587,15887.0,89.0,670.0,38.0,0.000675,198.0,7018.0,45030.278,15.456
19501,62b3c78d507271632b8ade02,2023-03-27,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,13218.991,20900.0,82.4,669.0,33.0,0.000542,194.0,6728.0,46175.592,39.989
19502,62b3c78d507271632b8ade02,2023-03-28,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,15056.954,28481.0,87.7,671.0,33.0,0.000726,191.0,9125.0,47926.061,10.500
19503,62b3c78d507271632b8ade02,2023-03-29,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,14141.809,34739.0,87.8,663.0,31.0,0.000144,187.0,8050.0,48978.677,32.627
19504,62b3c78d507271632b8ade02,2023-03-30,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,16089.837,43657.0,86.8,658.0,31.0,0.000483,187.0,10313.0,50708.798,25.967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19900,62b3c78d507271632b8ade02,2024-04-29,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,5087.921,8462.0,92.7,617.0,22.0,0.216267,143.0,1098.0,24905.241,17.482
19901,62b3c78d507271632b8ade02,2024-04-30,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,5167.600,8697.0,90.1,616.0,24.0,0.169105,143.0,1379.0,24246.425,0.000
19902,62b3c78d507271632b8ade02,2024-05-01,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,5266.899,8908.0,89.1,608.0,25.0,0.187596,140.0,1232.0,24194.967,5.887
19903,62b3c78d507271632b8ade02,2024-05-02,UC67YKqgxk9eGSVBz2mt35aw,수집의 수집,010-9641-1969,0,2011-02-16 19:31:52.000,11620880.0,14400.0,553.0,...,5279.221,9066.0,90.3,606.0,25.0,0.188938,142.0,1271.0,24210.491,6.763
