In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Directory that holds the input CSV files.
# Set the AWAKE_DATA_DIR environment variable to point somewhere else; the
# original local path stays as the fallback so existing setups keep working.
# os.path.join(..., '') guarantees a trailing separator because later cells
# build file names with `file_path + <name>`.
file_path = os.path.join(os.environ.get('AWAKE_DATA_DIR') or 'C:/py_src/awake/data/', '')

In [3]:
# Load the account-level and content-level analysis datasets from `file_path`.
merge_df_users_fin = pd.read_csv(f'{file_path}merge_df_users_final2.csv', low_memory=False)
youtube_videos = pd.read_csv(f'{file_path}youtube_videos_final2.csv')

## 계정 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [4]:
# Column bookkeeping for the account dataset.
# The first 11 columns are set aside (presumably identifiers/metadata - TODO
# confirm); every later column except the target is used as a feature.
account_columns = merge_df_users_fin.columns
unique_col = account_columns[:11]
x_col = account_columns[11:].drop('subscribers_count')  # remove the target (y)

In [5]:
# Chronological train/test split: rows dated on or before the cutoff are used
# for training, later rows are held out for evaluation.
# The dates are parsed before comparing so the split does not depend on the
# column's string format (comparing raw strings is only correct for
# zero-padded ISO 'YYYY-MM-DD' values).
account_split_date = pd.Timestamp('2024-02-11')
account_dates = pd.to_datetime(merge_df_users_fin['date'])
train_data = merge_df_users_fin[account_dates <= account_split_date]
test_data = merge_df_users_fin[account_dates > account_split_date]

In [6]:
# Check how the rows are distributed between the two splits (train first, test second).
print(train_data.shape, test_data.shape, sep='\n')

(84473, 56)
(21210, 56)


### 모델 기법 적용

In [7]:
# Candidate regressors for the account-level target, keyed by display name.
# All of them share random_state=42 so repeated runs give the same fits.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GBM': GradientBoostingRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
}

# Keep the individual estimators reachable under their original names.
rf_model, gbm_model, lgbm_model, xgb_model = models.values()

In [8]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fit each candidate on the training window and score it on the held-out
# window; R², MSE and RMSE are printed per model.
X_train, y_train = train_data[x_col], train_data['subscribers_count']
X_test, y_test = test_data[x_col], test_data['subscribers_count']

for model_name, estimator in models.items():
    # Train, then predict the held-out period
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Error metrics (RMSE is simply the square root of MSE)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Report
    print(model_name)
    print(f"R² 값: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

RandomForest
R² 값: 0.9157
MSE: 4975277385.4485
RMSE: 70535.6462
----------------------------------------

GBM
R² 값: 0.8720
MSE: 7554219339.4865
RMSE: 86915.0122
----------------------------------------

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11208
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 44
[LightGBM] [Info] Start training from score 65433.322872
LightGBM
R² 값: 0.8862
MSE: 6715269272.1564
RMSE: 81946.7466
----------------------------------------

XGBoost
R² 값: 0.9474
MSE: 3100811798.8810
RMSE: 55684.9333
----------------------------------------



In [10]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

스태킹

In [11]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Stacking ensemble: RandomForest and XGBoost act as base learners and a
# Ridge regression combines their predictions. GBM and LightGBM are left out
# of the stack.
stacking_model = StackingRegressor(
    estimators=[
        ('random_forest', models['RandomForest']),
        ('xgboost', models['XGBoost']),
    ],
    final_estimator=Ridge(),
)

# Train on the training window
stacking_model.fit(train_data[x_col], train_data['subscribers_count'])

# Predict the held-out window
y_pred = stacking_model.predict(test_data[x_col])

# Evaluation metrics on the held-out window
r2 = r2_score(test_data['subscribers_count'], y_pred)
mse = mean_squared_error(test_data['subscribers_count'], y_pred)

# RMSE is derived from MSE instead of `mean_squared_error(..., squared=False)`:
# that flag was deprecated in scikit-learn 1.4 and removed in 1.6, and taking
# the square root matches how the per-model evaluation cell computes RMSE.
rmse = np.sqrt(mse)

print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

스태킹 앙상블 모델 R2: 0.9661
스태킹 앙상블 모델 MSE: 1998305072.9543
스태킹 앙상블 모델 RMSE: 44702.4057




### 실제데이터 결과 확인

In [12]:
# Score every account row (training and held-out periods alike) with the
# stacking model and store the prediction next to the observed values.
# NOTE(review): this adds a 'predict' column to merge_df_users_fin in place, so
# re-running the earlier column-selection cell afterwards would pick it up as a
# feature.
y_pred = stacking_model.predict(merge_df_users_fin.loc[:, x_col])
merge_df_users_fin['predict'] = y_pred

In [13]:
# Per-account comparison of observed vs. predicted subscriber counts.
# .copy() makes result_df an independent frame, so the column assignments
# below no longer raise SettingWithCopyWarning.
result_df = merge_df_users_fin[
    ['youtube_user_id', 'date', 'channel_title', 'subscriberCount', 'subscribers_count', 'predict']
].copy()
result_df['date'] = pd.to_datetime(result_df['date'])

# Look-ahead horizons expressed in rows.
# NOTE(review): 30/90/180/365 rows equal days only if each account has exactly
# one row per day - TODO confirm against the dataset.
forecast_horizons = {'1_month': 30, '3_month': 90, '6_month': 180, '12_month': 365}

# For each row, average the predictions of the following n rows of the SAME
# account. The shift and the rolling window both run inside each account's
# date-ordered group, so a window can never span two accounts (previously the
# rolling step ran on the whole column after the grouped shift). Assignment
# aligns on the index, so result_df keeps its original row order.
predict_by_user = (
    result_df.sort_values(['youtube_user_id', 'date'])
    .groupby('youtube_user_id')['predict']
)
for horizon, n_rows in forecast_horizons.items():
    result_df[f'{horizon}_future_predict'] = predict_by_user.transform(
        lambda s, n=n_rows: s.shift(-n).rolling(window=n).mean()
    )

# Per-account means of the observed value, the prediction and each horizon
result_df_final = (
    result_df.groupby(['youtube_user_id'])[
        ['subscribers_count', 'predict',
         '1_month_future_predict', '3_month_future_predict',
         '6_month_future_predict', '12_month_future_predict']
    ]
    .mean()
    .reset_index()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['date'] = pd.to_datetime(result_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['3_month_future_pr

In [14]:
# Classify accounts per horizon: "plus" when the mean look-ahead prediction is
# above the mean observed subscriber count, "minus" when it is below.
# Only accounts with a positive mean prediction are considered; the filter is
# evaluated once here instead of being recomputed inside every expression.


def _split_by_forecast(frame, forecast_col):
    """Return (plus_ids, minus_ids) for one horizon.

    plus_ids  - youtube_user_id of rows where subscribers_count < forecast_col
    minus_ids - youtube_user_id of rows where subscribers_count > forecast_col
    Rows whose forecast is NaN satisfy neither comparison and are left out.
    """
    observed = frame['subscribers_count']
    forecast = frame[forecast_col]
    plus_ids = frame.loc[observed < forecast, 'youtube_user_id']
    minus_ids = frame.loc[observed > forecast, 'youtube_user_id']
    return plus_ids, minus_ids


positive_predict_df = result_df_final[result_df_final['predict'] > 0]

# 1 month
result_1_month_plus_user_id, result_1_month_minus_user_id = _split_by_forecast(
    positive_predict_df, '1_month_future_predict')

# 3 months
result_3_month_plus_user_id, result_3_month_minus_user_id = _split_by_forecast(
    positive_predict_df, '3_month_future_predict')

# 6 months
result_6_month_plus_user_id, result_6_month_minus_user_id = _split_by_forecast(
    positive_predict_df, '6_month_future_predict')

# 12 months
result_12_month_plus_user_id, result_12_month_minus_user_id = _split_by_forecast(
    positive_predict_df, '12_month_future_predict')

In [15]:
# Channel titles of the accounts flagged "minus" at the 12-month horizon:
# first how many distinct titles there are, then the titles themselves.
minus_12_month_titles = result_df.loc[
    result_df['youtube_user_id'].isin(result_12_month_minus_user_id), 'channel_title'
].unique()
print(len(minus_12_month_titles))
print(minus_12_month_titles)

7
['유익한 균튜버' '0' '모하지연 MOHAJIYEON' '-mentalholder 멘탈홀더 tv' '지미 geemi.'
 '프롬수지 fromsuzy' 'fromsuzy 프롬수지']


In [16]:
# Steadily increasing accounts: the mean look-ahead prediction grows strictly
# from 1 -> 3 -> 6 -> 12 months. Only accounts with a positive mean prediction
# are considered; the filtered frame is built once and reused.
rising_candidates = result_df_final[result_df_final['predict'] > 0]
increase_user_id = rising_candidates.loc[
    (rising_candidates['1_month_future_predict'] < rising_candidates['3_month_future_predict'])
    & (rising_candidates['3_month_future_predict'] < rising_candidates['6_month_future_predict'])
    & (rising_candidates['6_month_future_predict'] < rising_candidates['12_month_future_predict']),
    'youtube_user_id',
]

# Show the channel titles that belong to those accounts
result_df.loc[result_df['youtube_user_id'].isin(increase_user_id), 'channel_title'].unique()

array(['낭만아저씨코디TV', '0', '오디디 코미디', '모하지연 MOHAJIYEON', '콜드쉽 Coldsheep',
       '팀브라더스', '맛집남자 foodman', '-mentalholder 멘탈홀더 tv', '황헬린 탈출기',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지'], dtype=object)

In [17]:
# Steadily decreasing accounts: the mean look-ahead prediction shrinks strictly
# from 1 -> 3 -> 6 -> 12 months. Only accounts with a positive mean prediction
# are considered; the filtered frame is built once and reused.
falling_candidates = result_df_final[result_df_final['predict'] > 0]
decrease_user_id = falling_candidates.loc[
    (falling_candidates['1_month_future_predict'] > falling_candidates['3_month_future_predict'])
    & (falling_candidates['3_month_future_predict'] > falling_candidates['6_month_future_predict'])
    & (falling_candidates['6_month_future_predict'] > falling_candidates['12_month_future_predict']),
    'youtube_user_id',
]

# Show the channel titles that belong to those accounts
result_df.loc[result_df['youtube_user_id'].isin(decrease_user_id), 'channel_title'].unique()

array([], dtype=object)

In [18]:
# Channel titles of the steadily increasing accounts.
result_df.loc[result_df['youtube_user_id'].isin(increase_user_id), 'channel_title'].unique()

array(['낭만아저씨코디TV', '0', '오디디 코미디', '모하지연 MOHAJIYEON', '콜드쉽 Coldsheep',
       '팀브라더스', '맛집남자 foodman', '-mentalholder 멘탈홀더 tv', '황헬린 탈출기',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지'], dtype=object)

## 콘텐츠 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [19]:
# Column bookkeeping for the content (video) dataset.
# The first 3 columns are set aside (presumably identifiers - TODO confirm);
# every later column except the target is used as a feature.
video_columns = youtube_videos.columns
unique_col = video_columns[:3]
x_col = video_columns[3:].drop(['net_subscribers_change'])  # remove the target (y)

In [20]:
# Chronological train/test split of the video rows: rows whose end_date is on
# or before the cutoff are used for training, later rows are held out.
# The dates are parsed before comparing so the split does not depend on the
# column's string format (comparing raw strings is only correct for
# zero-padded ISO 'YYYY-MM-DD' values).
video_split_date = pd.Timestamp('2024-02-11')
video_end_dates = pd.to_datetime(youtube_videos['end_date'])
train_data = youtube_videos[video_end_dates <= video_split_date]
test_data = youtube_videos[video_end_dates > video_split_date]

In [21]:
# Check how the video rows are distributed between the two splits (train first, test second).
print(train_data.shape, test_data.shape, sep='\n')

(6538190, 62)
(2031321, 62)


### 모델 기법 적용

In [22]:
# Candidate regressors for the content-level target, keyed by display name.
# GBM and LightGBM are not instantiated in this section.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
}

# Keep the individual estimators reachable under their original names.
rf_model, xgb_model = models.values()

### 모델 성능 평가

XGBoost

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train only the XGBoost regressor on the content data and score it on the
# held-out window.
content_regressor = models['XGBoost']
content_regressor.fit(train_data[x_col], train_data['net_subscribers_change'])

# Predict the held-out window
y_pred = content_regressor.predict(test_data[x_col])

# Error metrics (RMSE is simply the square root of MSE)
y_true = test_data['net_subscribers_change']
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

# Report
print('XGBoost')
print(f"R² 값: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print("----------------------------------------")
print("")

XGBoost
R² 값: 0.7614
MSE: 162.7376
RMSE: 12.7569
----------------------------------------



In [24]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# for model in models:
#     # 모델 정의 및 학습
#     models[model].fit(train_data[x_col], train_data['net_subscribers_change'])

#     # 예측
#     y_pred = models[model].predict(test_data[x_col])

#     # MSE 계산
#     mse = mean_squared_error(test_data['net_subscribers_change'], y_pred)

#     # RMSE 계산
#     rmse = np.sqrt(mse)

#     # R² 값 계산
#     r2 = r2_score(test_data['net_subscribers_change'], y_pred)

#     # 결과 출력
#     print(model)
#     print(f"R² 값: {r2:.4f}")
#     print(f"MSE: {mse:.4f}")
#     print(f"RMSE: {rmse:.4f}")
#     print("----------------------------------------")
#     print("")

In [25]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

스태킹

In [26]:
# NOTE(review): disabled stacking experiment for the content model. Before
# re-enabling it:
#   - the target below is still 'subscribers_count' (copied from the account
#     section); the content section's target is 'net_subscribers_change';
#   - `mean_squared_error(..., squared=False)` no longer exists in
#     scikit-learn >= 1.6, use np.sqrt(mse) instead;
#   - it rebinds `stacking_model`, replacing the account-level model.
# from sklearn.ensemble import StackingRegressor
# from sklearn.linear_model import Ridge

# # Define the stacking model
# stacking_model = StackingRegressor(
#     estimators=[
#     ('random_forest', models['RandomForest']),
#     # ('gbm', models['GBM']),
#     # ('lightgbm', models['LightGBM']),
#     ('xgboost', models['XGBoost'])
#     ],
#     final_estimator=Ridge()
# )

# # Train the stacking model
# stacking_model.fit(train_data[x_col], train_data['subscribers_count'])

# # Predict
# y_pred = stacking_model.predict(test_data[x_col])

# # Evaluate
# # R² score
# r2 = r2_score(test_data['subscribers_count'], y_pred)

# # MSE
# mse = mean_squared_error(test_data['subscribers_count'], y_pred)

# # RMSE
# rmse = mean_squared_error(test_data['subscribers_count'], y_pred, squared=False)

# print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
# print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
# print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

### 실제데이터 결과 확인

In [27]:
# Score every video row (training and held-out periods alike) with the fitted
# XGBoost model and store the prediction next to the observed values.
# NOTE(review): this adds a 'predict' column to youtube_videos in place, so
# re-running the earlier column-selection cell afterwards would pick it up as a
# feature.
y_pred = models['XGBoost'].predict(youtube_videos.loc[:, x_col])
youtube_videos['predict'] = y_pred

In [28]:
# Observed vs. predicted net subscriber change, per video row.
result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'net_subscribers_change', 'predict']]

# Collapse to one row per (account, end_date): sum the observed and predicted
# net change over all of that account's videos for the date.
daily_sum_spec = {'net_subscribers_change': 'sum', 'predict': 'sum'}
result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg(daily_sum_spec)

# Parse the date column
result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

# Look-ahead totals per account: for each row, the sum of the predictions in
# the following n rows of the same account (shift and rolling both run inside
# the group). Rows without n complete following rows come out as NaN.
for horizon, n_rows in (('1_month', 30), ('3_month', 90), ('6_month', 180), ('12_month', 365)):
    result_contents_df[f'{horizon}_future_predict'] = (
        result_contents_df.groupby('youtube_user_id')['predict']
        .transform(lambda s, n=n_rows: s.shift(-n).rolling(window=n).sum())
    )

# Per-account mean of the daily values and of each look-ahead total
mean_columns = [
    'net_subscribers_change',
    'predict',
    '1_month_future_predict',
    '3_month_future_predict',
    '6_month_future_predict',
    '12_month_future_predict',
]
result_contents_df_final = (
    result_contents_df.groupby('youtube_user_id')
    .agg({column: 'mean' for column in mean_columns})
    .reset_index()
)



In [76]:
# Display the per-account content summary (one row per youtube_user_id).
result_contents_df_final

Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
0,627cb611aa6f212355e0b617,14.698492,14.807063,451.798290,1235.781848,3059.494297,
1,627f59ccaa39226247c60b01,0.087719,0.065694,1.464700,4.562360,9.341878,
2,6287228afb15712a8cb931d7,0.811558,0.845420,26.440269,80.305818,147.027722,
3,6287229efb15712a8cb93225,86.398496,89.691544,2557.540167,5313.285963,17462.534853,
4,628722c8fb15712a8cb9326e,1.538847,1.536410,44.785220,134.212667,236.541541,
...,...,...,...,...,...,...,...
244,65cc401305bf1c0baa425146,3875.524390,4295.879883,169077.137456,,,
245,65e7b773d8da110bb072e2b5,4.064516,3.525848,90.618017,,,
246,65f7b17ed8da110bb0733b7b,5.714286,5.675271,,,,
247,65fecf7ed8da110bb0736199,26.113636,25.640135,,,,


In [34]:
# Final result set: each account's mean subscriber count from the account
# analysis, left-joined with its content-level summary on youtube_user_id
# (accounts without content rows keep NaN in the content columns).
final_result_df = result_df_final[['youtube_user_id', 'subscribers_count']].merge(
    result_contents_df_final,
    how='left',
    on='youtube_user_id',
)

In [41]:
# Growth ratio per horizon: the look-ahead predicted net change divided by the
# account's subscribers_count value.
subscriber_base = final_result_df['subscribers_count']
for horizon in ('1_month', '3_month', '6_month', '12_month'):
    final_result_df[f'{horizon}_future_ratio'] = (
        final_result_df[f'{horizon}_future_predict'] / subscriber_base
    )

In [68]:
# Number of accounts with no 12-month ratio.
final_result_df['12_month_future_ratio'].isna().sum()

250

In [66]:
# Accounts ordered by 12-month ratio, highest first (NaN ratios sort last).
final_result_df.sort_values(by=['12_month_future_ratio'], ascending=False)

Unnamed: 0,youtube_user_id,subscribers_count,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict,1_month_future_ratio,3_month_future_ratio,6_month_future_ratio,12_month_future_ratio
0,627cb611aa6f212355e0b617,118163.451852,14.698492,14.807063,451.798290,1235.781848,3059.494297,,0.003824,0.010458,0.025892,
1,627f59ccaa39226247c60b01,5062.320988,0.087719,0.065694,1.464700,4.562360,9.341878,,0.000289,0.000901,0.001845,
2,6287228afb15712a8cb931d7,3041.802469,0.811558,0.845420,26.440269,80.305818,147.027722,,0.008692,0.026401,0.048336,
3,6287229efb15712a8cb93225,85975.167901,86.398496,89.691544,2557.540167,5313.285963,17462.534853,,0.029747,0.061800,0.203111,
4,628722c8fb15712a8cb9326e,1597.286420,1.538847,1.536410,44.785220,134.212667,236.541541,,0.028038,0.084025,0.148090,
...,...,...,...,...,...,...,...,...,...,...,...,...
245,65cc401305bf1c0baa425146,257643.593220,3875.524390,4295.879883,169077.137456,,,,0.656244,,,
246,65e7b773d8da110bb072e2b5,2992.901235,4.064516,3.525848,90.618017,,,,0.030278,,,
247,65f7b17ed8da110bb0733b7b,2572.145679,5.714286,5.675271,,,,,,,,
248,65fecf7ed8da110bb0736199,33770.382716,26.113636,25.640135,,,,,,,,


In [71]:
# Channel titles of the 20 accounts with the highest 6-month ratio.
top20_6_month_ids = (
    final_result_df.sort_values(['6_month_future_ratio'], ascending=False)['youtube_user_id'].iloc[:20]
)
merge_df_users_fin.loc[
    merge_df_users_fin['youtube_user_id'].isin(top20_6_month_ids), 'channel_title'
].unique()

array(['오디디 코미디', '0', '황나겸', '다먹어라이언', '래아TV', '슈로시안 SUROSIAN',
       '채림처럼firstcherry', '드론브이로그 DroneVlog', '군대위키', '월텍남 - 월스트리트 테크남',
       '콤므', '맛집남자 foodman', '엔트리뷰 [누구나 재미있는 테크리뷰]', 'Dalhae달달해', '단곰',
       '너굴몬', 'GMENCY 멘시의 마인크래프트', '코인덕 차트아지', '日本ジヌ【니혼지누】ー韓国に関する全て',
       "파파스캠핑 papa's camp | a korean camper", '중년독수리의 대리여행'], dtype=object)

In [77]:
def _top50_channel_titles(ratio_col):
    """Return the set of channel titles owned by the 50 accounts that rank
    highest on `ratio_col` in final_result_df."""
    top_ids = final_result_df.sort_values([ratio_col], ascending=False)['youtube_user_id'].iloc[:50]
    owned_by_top = merge_df_users_fin['youtube_user_id'].isin(top_ids)
    return set(merge_df_users_fin.loc[owned_by_top, 'channel_title'].unique())


# Channel titles that are in the top 50 at the 1-, 3- and 6-month horizons alike.
set.intersection(
    _top50_channel_titles('1_month_future_ratio'),
    _top50_channel_titles('3_month_future_ratio'),
    _top50_channel_titles('6_month_future_ratio'),
)

{'0',
 'GMENCY 멘시의 마인크래프트',
 '日本ジヌ【니혼지누】ー韓国に関する全て',
 '군대위키',
 '너굴몬',
 '다먹어라이언',
 '단곰',
 '래아TV',
 '맛집남자 foodman',
 '슈로시안 SUROSIAN',
 '오늘도희다 HEEDA',
 '월텍남 - 월스트리트 테크남',
 '중년독수리의 대리여행',
 '채림처럼firstcherry',
 '채찍단',
 '코인덕 차트아지',
 '콤므',
 '탬니몰리',
 "파파스캠핑 papa's camp | a korean camper",
 '황나겸'}