In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
file_path = 'C:/py_src/awake/data/'

In [3]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_final2.csv', low_memory=False)
youtube_videos = pd.read_csv(file_path + 'youtube_videos_final2.csv')

## 계정 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [7]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:].drop('subscribers_count') ## y값 제거

In [8]:
# 데이터 분할
train_data = merge_df_users_fin[merge_df_users_fin['date'] <= '2024-02-11']
test_data = merge_df_users_fin[merge_df_users_fin['date'] > '2024-02-11']

In [9]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(84473, 56)
(21210, 56)


### 모델 기법 적용

In [10]:
# 각 모델 정의
rf_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    'RandomForest': rf_model,
    'GBM': gbm_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

In [9]:
# # 모델별 교차 검증 결과 저장
# results = {}

# for model_name, model in models.items():
#     print(f"\n{model_name} 모델 성능 평가 중...")
    
#     # 교차 검증
#     cv_scores = cross_val_score(model, train_data[x_col], train_data['subscribers_count'], cv=5, scoring='neg_mean_squared_error')
    
#     # 평균 RMSE 계산
#     rmse_scores = np.sqrt(-cv_scores)  # neg_mean_squared_error는 음수이므로 양수로 변환 후 제곱근
#     mean_rmse = rmse_scores.mean()
#     print(f"{model_name} 교차 검증 평균 RMSE: {mean_rmse}")
    
#     # 결과 저장
#     results[model_name] = mean_rmse

### 모델 성능 평가

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

for model in models:
    # 모델 정의 및 학습
    models[model].fit(train_data[x_col], train_data['subscribers_count'])

    # 예측
    y_pred = models[model].predict(test_data[x_col])

    # MSE 계산
    mse = mean_squared_error(test_data['subscribers_count'], y_pred)

    # RMSE 계산
    rmse = np.sqrt(mse)

    # R² 값 계산
    r2 = r2_score(test_data['subscribers_count'], y_pred)

    # 결과 출력
    print(model)
    print(f"R² 값: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

RandomForest
R² 값: 0.9157
MSE: 4975277385.4485
RMSE: 70535.6462
----------------------------------------

GBM
R² 값: 0.8720
MSE: 7554219339.4865
RMSE: 86915.0122
----------------------------------------

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11208
[LightGBM] [Info] Number of data points in the train set: 84473, number of used features: 44
[LightGBM] [Info] Start training from score 65433.322872
LightGBM
R² 값: 0.8862
MSE: 6715269272.1564
RMSE: 81946.7466
----------------------------------------

XGBoost
R² 값: 0.9474
MSE: 3100811798.8810
RMSE: 55684.9333
----------------------------------------



In [92]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

In [60]:
x_col

Index(['weekly_estimated_revenue', 'weekly_estimated_ad_revenue',
       'subscriber_increase_rate', 'monthly_revenue_per_ad_impression',
       'revenue_per_subscriber', 'quarterly_avg_view_duration',
       'quarterly_estimated_revenue', 'monthly_net_subscribers_change',
       'estimated_revenue', 'subscriber_view_time_rate',
       'quarterly_total_view_time', 'weekly_revenue_per_ad_impression',
       'monthly_subscribers_lost', 'subscribers_lost', 'watched_time_rate',
       'age18-24.female', 'quarterly_subscribers_gained', 'gross_revenue',
       'gross_revenue_per_ad_impression', 'weekly_net_subscribers_change',
       'playback_rate', 'age25-34.female', 'estimated_red_partner_revenue',
       'redViews', 'quarterly_estimated_ad_revenue',
       'quarterly_net_subscribers_change', 'estimatedMinutesWatched',
       'subscriber_decrease_rate', 'weekly_subscribers_lost',
       'estimated_ad_revenue', 'quarterly_subscribers_lost',
       'monthly_subscribers_gained', 'age13-17.fe

스태킹

In [12]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# 스태킹 모델 정의
stacking_model = StackingRegressor(
    estimators=[
    ('random_forest', models['RandomForest']),
    # ('gbm', models['GBM']),
    # ('lightgbm', models['LightGBM']),
    ('xgboost', models['XGBoost'])
    ],
    final_estimator=Ridge()
)

# 스태킹 모델 학습
stacking_model.fit(train_data[x_col], train_data['subscribers_count'])

# 예측
y_pred = stacking_model.predict(test_data[x_col])

# 성능 평가
# R² 값 계산
r2 = r2_score(test_data['subscribers_count'], y_pred)

# MSE 계산
mse = mean_squared_error(test_data['subscribers_count'], y_pred)

# RMSE 계산
rmse = mean_squared_error(test_data['subscribers_count'], y_pred, squared=False)

print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

스태킹 앙상블 모델 R2: 0.9661
스태킹 앙상블 모델 MSE: 1998305072.9542
스태킹 앙상블 모델 RMSE: 44702.4057




### 실제데이터 결과 확인

In [56]:
# 예측 결과 확인
y_pred = stacking_model.predict(merge_df_users_fin[x_col])
merge_df_users_fin['predict'] = y_pred

In [57]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_df = merge_df_users_fin[['youtube_user_id','date','channel_title','subscriberCount','subscribers_count','predict']]

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값
result_df['date'] = pd.to_datetime(result_df['date']) 

result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).mean()
result_df['3_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-90).rolling(window=90).mean()
result_df['6_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-180).rolling(window=180).mean()
result_df['12_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-365).rolling(window=365).mean()

# 계정별 기간에 따른 구독자수 예측값 비교
result_df_final = result_df.groupby(['youtube_user_id'])[['subscribers_count','predict','1_month_future_predict','3_month_future_predict','6_month_future_predict','12_month_future_predict']].mean().reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['date'] = pd.to_datetime(result_df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['3_month_future_pr

In [129]:
# 기간 따른 예측 결과
# 1개월
result_1_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['1_month_future_predict']]['youtube_user_id']
result_1_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['1_month_future_predict']]['youtube_user_id']

# 3개월
result_3_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']]['youtube_user_id']
result_3_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']]['youtube_user_id']

# 6개월
result_6_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']]['youtube_user_id']
result_6_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']]['youtube_user_id']

# 12개월
result_12_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['12_month_future_predict']]['youtube_user_id']
result_12_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['12_month_future_predict']]['youtube_user_id']

In [144]:
print(len(result_df[result_df['youtube_user_id'].isin(result_12_month_minus_user_id)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(result_12_month_minus_user_id)]['channel_title'].unique())

7
['유익한 균튜버' '0' '모하지연 MOHAJIYEON' '-mentalholder 멘탈홀더 tv' '지미 geemi.'
 '프롬수지 fromsuzy' 'fromsuzy 프롬수지']


In [159]:
# 계속 증가
increase_user_id = result_df_final[result_df_final['predict'] > 0][(result_df_final[result_df_final['predict'] > 0]['1_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['3_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['6_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['12_month_future_predict'])]['youtube_user_id']
result_df[result_df['youtube_user_id'].isin(increase_user_id)]['channel_title'].unique()

array(['낭만아저씨코디TV', '0', '오디디 코미디', '모하지연 MOHAJIYEON', '콜드쉽 Coldsheep',
       '팀브라더스', '맛집남자 foodman', '-mentalholder 멘탈홀더 tv', '황헬린 탈출기',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지'], dtype=object)

In [163]:
# 계속 감소
decrease_user_id = result_df_final[result_df_final['predict'] > 0][(result_df_final[result_df_final['predict'] > 0]['1_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['3_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['6_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['12_month_future_predict'])]['youtube_user_id']
result_df[result_df['youtube_user_id'].isin(decrease_user_id)]['channel_title'].unique()

array([], dtype=object)

In [161]:
result_df[result_df['youtube_user_id'].isin(increase_user_id)]['channel_title'].unique()

array(['낭만아저씨코디TV', '0', '오디디 코미디', '모하지연 MOHAJIYEON', '콜드쉽 Coldsheep',
       '팀브라더스', '맛집남자 foodman', '-mentalholder 멘탈홀더 tv', '황헬린 탈출기',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지'], dtype=object)

## 콘텐츠 데이터 분석

### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [14]:
youtube_videos.columns

Index(['youtube_user_id', 'video', 'end_date', 'quarterly_avg_view_percentage',
       'playlist_removal_rate', 'ad_revenue_rate',
       'weekly_avg_view_percentage', 'playback_rate', 'dislikes', 'views',
       'comment_rate', 'quarterly_avg_view_duration',
       'weekly_total_engagement', 'playlist_addition_rate',
       'positive_engage_rate', 'share_rate', 'estimatedRedPartnerRevenue',
       'weekly_watch_time', 'monthly_total_engagement',
       'watch_time_loss_per_playlist_remove', 'monthly_watch_time',
       'quarterly_estimated_revenue', 'subscribers_lost_per_playlist_remove',
       'subscribers_gained_per_playlist_add', 'averageViewDuration',
       'monthly_playlist_change_rate', 'shares', 'weekly_playlist_change_rate',
       'weekly_engagement_rate', 'dislike_rate', 'videosRemovedFromPlaylists',
       'redViews', 'averageViewPercentage', 'weekly_videos_removed',
       'monthly_views', 'subscribers_conversion_rate',
       'estimatedMinutesWatched', 'like_rate', 'vid

In [4]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:].drop(['net_subscribers_change']) ## y값 제거

In [5]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [6]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(6538190, 62)
(2031321, 62)


### 모델 기법 적용

In [7]:
# 각 모델 정의
rf_model = RandomForestRegressor(random_state=42)
gbm_model = GradientBoostingRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)
xgb_model = xgb.XGBRegressor(random_state=42)

# 모델 리스트
models = {
    'RandomForest': rf_model,
    'GBM': gbm_model,
    'LightGBM': lgbm_model,
    'XGBoost': xgb_model
}

### 모델 성능 평가

XGBoost

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 모델 정의 및 학습
models['XGBoost'].fit(train_data[x_col], train_data['net_subscribers_change'])

# 예측
y_pred = models['XGBoost'].predict(test_data[x_col])

# MSE 계산
mse = mean_squared_error(test_data['net_subscribers_change'], y_pred)

# RMSE 계산
rmse = np.sqrt(mse)

# R² 값 계산
r2 = r2_score(test_data['net_subscribers_change'], y_pred)

# 결과 출력
print('XGBoost')
print(f"R² 값: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print("----------------------------------------")
print("")

XGBoost
R² 값: 0.7614
MSE: 162.7376
RMSE: 12.7569
----------------------------------------



In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

for model in models:
    # 모델 정의 및 학습
    models[model].fit(train_data[x_col], train_data['net_subscribers_change'])

    # 예측
    y_pred = models[model].predict(test_data[x_col])

    # MSE 계산
    mse = mean_squared_error(test_data['net_subscribers_change'], y_pred)

    # RMSE 계산
    rmse = np.sqrt(mse)

    # R² 값 계산
    r2 = r2_score(test_data['net_subscribers_change'], y_pred)

    # 결과 출력
    print(model)
    print(f"R² 값: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("----------------------------------------")
    print("")

In [None]:
# # 비선형모델 활용 변수 선정
# importances_df = pd.DataFrame({
#     'features': x_col,
#     'rf_importance': models['RandomForest'].feature_importances_,
#     'gbm_importance': models['GBM'].feature_importances_,
#     'lgbm_importance': models['LightGBM'].feature_importances_,
#     'xgb_importance': models['XGBoost'].feature_importances_    
# })
# importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
# importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

### 모델 성능 개선

In [10]:
x_col

Index(['quarterly_avg_view_percentage', 'playlist_removal_rate',
       'ad_revenue_rate', 'weekly_avg_view_percentage', 'playback_rate',
       'dislikes', 'views', 'comment_rate', 'quarterly_avg_view_duration',
       'weekly_total_engagement', 'playlist_addition_rate',
       'positive_engage_rate', 'share_rate', 'estimatedRedPartnerRevenue',
       'weekly_watch_time', 'monthly_total_engagement',
       'watch_time_loss_per_playlist_remove', 'monthly_watch_time',
       'quarterly_estimated_revenue', 'subscribers_lost_per_playlist_remove',
       'subscribers_gained_per_playlist_add', 'averageViewDuration',
       'monthly_playlist_change_rate', 'shares', 'weekly_playlist_change_rate',
       'weekly_engagement_rate', 'dislike_rate', 'videosRemovedFromPlaylists',
       'redViews', 'averageViewPercentage', 'weekly_videos_removed',
       'monthly_views', 'subscribers_conversion_rate',
       'estimatedMinutesWatched', 'like_rate', 'videosAddedToPlaylists',
       'likes', 'weekly_e

스태킹

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# 스태킹 모델 정의
stacking_model = StackingRegressor(
    estimators=[
    ('random_forest', models['RandomForest']),
    # ('gbm', models['GBM']),
    # ('lightgbm', models['LightGBM']),
    ('xgboost', models['XGBoost'])
    ],
    final_estimator=Ridge()
)

# 스태킹 모델 학습
stacking_model.fit(train_data[x_col], train_data['subscribers_count'])

# 예측
y_pred = stacking_model.predict(test_data[x_col])

# 성능 평가
# R² 값 계산
r2 = r2_score(test_data['subscribers_count'], y_pred)

# MSE 계산
mse = mean_squared_error(test_data['subscribers_count'], y_pred)

# RMSE 계산
rmse = mean_squared_error(test_data['subscribers_count'], y_pred, squared=False)

print(f"스태킹 앙상블 모델 R2: {r2:.4f}")
print(f"스태킹 앙상블 모델 MSE: {mse:.4f}")
print(f"스태킹 앙상블 모델 RMSE: {rmse:.4f}")

스태킹 앙상블 모델 R2: 0.9661
스태킹 앙상블 모델 MSE: 1998305072.9542
스태킹 앙상블 모델 RMSE: 44702.4057




### 실제데이터 결과 확인

In [11]:
# 예측 결과 확인
# y_pred = stacking_model.predict(merge_df_users_fin[x_col])
# merge_df_users_fin['predict'] = y_pred

y_pred = models['XGBoost'].predict(youtube_videos[x_col])
youtube_videos['predict'] = y_pred

In [47]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_df = youtube_videos[['youtube_user_id', 'video', 'end_date','net_subscribers_change','predict']]

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값
result_df['end_date'] = pd.to_datetime(result_df['end_date']) 

result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
result_df['3_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-90).rolling(window=90).sum()
result_df['6_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-180).rolling(window=180).sum()
result_df['12_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-365).rolling(window=365).sum()

result_df_final = result_df.groupby('youtube_user_id')[['net_subscribers_change','predict','1_month_future_predict','3_month_future_predict','6_month_future_predict','12_month_future_predict']].mean().reset_index()

# # 계정별 기간에 따른 구독자수 예측값 비교
# result_df_final = result_df.groupby('youtube_user_id').agg({
#     'net_subscribers_change': 'sum',
#     'predict': 'sum',
#     '1_month_future_predict': 'mean',
#     '3_month_future_predict': 'mean',
#     '6_month_future_predict': 'mean',
#     '12_month_future_predict': 'mean'
# }).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['end_date'] = pd.to_datetime(result_df['end_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['1_month_future_predict'] = result_df.groupby('youtube_user_id')['predict'].shift(-30).rolling(window=30).sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['3_month_fu

In [48]:
result_df_final

Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
0,627cb611aa6f212355e0b617,0.379747,0.382552,12.385390,116.729057,129.459735,291.270128
1,627f59ccaa39226247c60b01,0.004092,0.003064,7.216854,28.098924,38.695281,100.541398
2,6287228afb15712a8cb931d7,0.062367,0.064970,2.104034,19.590236,188.309656,99.094312
3,6287229efb15712a8cb93225,0.431036,0.447465,15.172724,46.000699,109.600360,290.532449
4,628722c8fb15712a8cb9326e,0.017652,0.017624,2.686011,10.595387,41.708335,89.416130
...,...,...,...,...,...,...,...
244,65cc401305bf1c0baa425146,26.399153,29.262514,699.265555,849.958527,1009.570118,1167.163210
245,65e7b773d8da110bb072e2b5,0.040671,0.035281,36.399463,86.325874,108.807077,160.637183
246,65f7b17ed8da110bb0733b7b,0.110803,0.110047,22.619763,41.335566,164.753147,346.371367
247,65fecf7ed8da110bb0736199,0.130583,0.128215,39.495797,120.498499,213.384922,407.093474


In [38]:
result_df_final

Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
0,627cb611aa6f212355e0b617,5850,5893.210938,0.412846,1.296990,0.719221,0.798000
1,627f59ccaa39226247c60b01,35,26.211802,0.240562,0.312210,0.214974,0.275456
2,6287228afb15712a8cb931d7,323,336.477325,0.070134,0.217669,1.046165,0.271491
3,6287229efb15712a8cb93225,34473,35786.925781,0.505757,0.511119,0.608891,0.795979
4,628722c8fb15712a8cb9326e,614,613.027466,0.089534,0.117727,0.231713,0.244976
...,...,...,...,...,...,...,...
244,65cc401305bf1c0baa425146,317793,352262.156250,23.308852,9.443984,5.608723,3.197707
245,65e7b773d8da110bb072e2b5,252,218.602585,1.213315,0.959176,0.604484,0.440102
246,65f7b17ed8da110bb0733b7b,280,278.088257,0.753992,0.459284,0.915295,0.948963
247,65fecf7ed8da110bb0736199,1149,1128.165894,1.316527,1.338872,1.185472,1.115325


In [None]:
# 기간 따른 예측 결과
# 1개월
result_1_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['1_month_future_predict']]['youtube_user_id']
result_1_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['1_month_future_predict']]['youtube_user_id']

# 3개월
result_3_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']]['youtube_user_id']
result_3_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']]['youtube_user_id']

# 6개월
result_6_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']]['youtube_user_id']
result_6_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']]['youtube_user_id']

# 12개월
result_12_month_plus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] < result_df_final[result_df_final['predict'] > 0]['12_month_future_predict']]['youtube_user_id']
result_12_month_minus_user_id = result_df_final[result_df_final['predict'] > 0][result_df_final[result_df_final['predict'] > 0]['subscribers_count'] > result_df_final[result_df_final['predict'] > 0]['12_month_future_predict']]['youtube_user_id']

In [None]:
print(len(result_df[result_df['youtube_user_id'].isin(result_12_month_minus_user_id)]['channel_title'].unique()))
print(result_df[result_df['youtube_user_id'].isin(result_12_month_minus_user_id)]['channel_title'].unique())

7
['유익한 균튜버' '0' '모하지연 MOHAJIYEON' '-mentalholder 멘탈홀더 tv' '지미 geemi.'
 '프롬수지 fromsuzy' 'fromsuzy 프롬수지']


In [None]:
# 계속 증가
increase_user_id = result_df_final[result_df_final['predict'] > 0][(result_df_final[result_df_final['predict'] > 0]['1_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['3_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['6_month_future_predict'] < result_df_final[result_df_final['predict'] > 0]['12_month_future_predict'])]['youtube_user_id']
result_df[result_df['youtube_user_id'].isin(increase_user_id)]['channel_title'].unique()

array(['낭만아저씨코디TV', '0', '오디디 코미디', '모하지연 MOHAJIYEON', '콜드쉽 Coldsheep',
       '팀브라더스', '맛집남자 foodman', '-mentalholder 멘탈홀더 tv', '황헬린 탈출기',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지'], dtype=object)

In [None]:
# 계속 감소
decrease_user_id = result_df_final[result_df_final['predict'] > 0][(result_df_final[result_df_final['predict'] > 0]['1_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['3_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['3_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['6_month_future_predict']) & 
                                                                   (result_df_final[result_df_final['predict'] > 0]['6_month_future_predict'] > result_df_final[result_df_final['predict'] > 0]['12_month_future_predict'])]['youtube_user_id']
result_df[result_df['youtube_user_id'].isin(decrease_user_id)]['channel_title'].unique()

array([], dtype=object)