In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
# Directory that holds the EDA-stage CSV exports. It is kept as a string ending
# in a path separator because the loading cell builds paths with
# `file_path + '<name>.csv'`.
# Set the AWAKE_DATA_DIR environment variable to read from another location;
# when it is unset or empty, the original default directory is used.
file_path = os.path.join(os.environ.get('AWAKE_DATA_DIR') or 'C:/py_src/awake/data/', '')

In [3]:
# Load the account and content analysis datasets.
users_csv = file_path + 'merge_df_users_fin_eda.csv'
videos_csv = file_path + 'youtube_videos_eda.csv'

merge_df_users_fin = pd.read_csv(users_csv, low_memory=False)
youtube_videos = pd.read_csv(videos_csv)

## 계정 데이터 분석

In [None]:
# Preview the first five rows of the account dataset.
merge_df_users_fin.head(n=5)

### 데이터 분할

In [21]:
# Organize columns: the first 11 are set aside in `unique_col`; everything after
# them except the final column becomes the candidate predictor list.
# (presumably the final column is `y_label` -- verify against the CSV)
account_columns = merge_df_users_fin.columns
unique_col = account_columns[:11]
x_col = account_columns[11:-1]

In [22]:
# Split the data.
# Metrics that were used to build the y label are removed from the predictors.
label_source_cols = [
    'total_engage_rate',
    'net_subscribers_change',
    'revenue_per_view',
    'averageViewPercentage',
    'gross_revenue_per_ad_impression',
]
X = merge_df_users_fin[x_col].drop(columns=label_source_cols)
y = merge_df_users_fin['y_label']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Check class imbalance: label counts for the train split, then the test split.
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

y_label
1    80302
0     4244
Name: count, dtype: int64
y_label
1    20096
0     1041
Name: count, dtype: int64


### 언더샘플링

In [24]:
# Undersampling: resample the training split with a seeded random undersampler.
rus = RandomUnderSampler(random_state=42)
resampled_pair = rus.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

### 변수선택

t-test

In [25]:
from scipy.stats import ttest_ind

# Significance level a feature must beat to be kept.
TTEST_ALPHA = 0.05

# Split the resampled training rows into the y=0 and y=1 groups.
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# Run Welch's t-test (unequal variances) on every feature and record
# (column, p-value) pairs.
p_values = []
for col in X_train_resampled.columns:
    result = ttest_ind(group_0[col], group_1[col], equal_var=False)
    p_values.append((col, result.pvalue))

# A NaN p-value never satisfies `p < alpha`, so such a feature would be dropped
# without any trace. Report them so the exclusion is visible.
# NOTE(review): the saved output of this cell contains RuntimeWarning fragments
# from the mean/variance arithmetic, which suggests some columns hold inf/NaN
# values or are constant within a group -- confirm on the data.
untestable_features = [col for col, p_val in p_values if pd.isna(p_val)]
if untestable_features:
    print("Features with undefined p-value (excluded):", untestable_features)

# Keep the features whose p-value is below the significance level.
selected_features_by_ttest = [col for col, p_val in p_values if p_val < TTEST_ALPHA]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'UNSUBSCRIBED', 'SUBSCRIBED', 'comments', 'likes', 'shares', 'dislikes', 'estimatedMinutesWatched', 'averageViewDuration', 'estimated_revenue', 'estimated_ad_revenue', 'estimated_red_partner_revenue', 'gross_revenue', 'cpm', 'playback_based_cpm', 'subscribers_count', 'subscribers_gained', 'subscribers_lost', 'ad_impressions', 'monetized_playbacks', 'age13-17.female', 'age13-17.male', 'age18-24.female', 'age18-24.male', 'age25-34.female', 'age25-34.male', 'age35-44.female', 'age35-44.male', 'age45-54.female', 'age55-64.female', 'age55-64.male', 'age65-.female', 'age65-.male', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'positive_engage_rate', 'like_to_dislike_ratio', 'subscriber_increase_rate', 'subscriber_decrease_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'unsubscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_subscriber', 'revenue_per_red_view', 'ad_reve

  a_zero_mean = a - mean
  d = mean1 - mean2
  estimate = m1-m2


Lasso

In [42]:
# StandardScaler comes from the notebook's import cell; only LassoCV is new here.
from sklearn.linear_model import LassoCV

# Build the frame from the features that passed the t-test.
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# Scale the data before fitting the L1-penalized model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV setup: raised iteration cap and an explicit alpha grid.
# NOTE(review): LassoCV is a regressor fitted here on the 0/1 label; consider
# whether an L1-penalized classifier would suit the task better.
lasso = LassoCV(
    cv=5,
    random_state=42,
    max_iter=10000,
    alphas=[0.1, 0.05, 0.01, 0.005, 0.001],
)
lasso.fit(X_train_scaled, y_train_resampled)

# Keep the features whose fitted coefficient is non-zero.
nonzero_coef_mask = lasso.coef_ != 0
selected_features_by_lasso = X_train_ttest_selected.columns[nonzero_coef_mask]
print("Selected Features by Lasso:", selected_features_by_lasso)

Selected Features by Lasso: Index(['redViews', 'SUBSCRIBED', 'comments', 'likes', 'shares',
       'estimatedMinutesWatched', 'averageViewDuration', 'estimated_revenue',
       'estimated_red_partner_revenue', 'gross_revenue', 'cpm',
       'subscribers_count', 'subscribers_gained', 'subscribers_lost',
       'ad_impressions', 'monetized_playbacks', 'age13-17.female',
       'age18-24.female', 'age18-24.male', 'age25-34.female', 'age25-34.male',
       'age35-44.female', 'age65-.female', 'like_rate', 'comment_rate',
       'dislike_rate', 'positive_engage_rate', 'like_to_dislike_ratio',
       'subscriber_increase_rate', 'subscriber_decrease_rate',
       'subscribers_conversion_rate', 'subscribed_view_rate',
       'unsubscribed_view_rate', 'revenue_per_subscriber',
       'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate',
       'revenue_per_minute_watched', 'avg_view_duration_rate',
       'watched_view_rate', 'subscribed_view_time_rate',
       'unsubscribed_view_time_r

In [51]:
# Number of features kept by the Lasso step.
len(selected_features_by_lasso)

44

In [44]:
# Features that passed the t-test but were removed by the Lasso selection.
set(selected_features_by_ttest).difference(selected_features_by_lasso)

{'UNSUBSCRIBED',
 'adult_viewer_rate',
 'age13-17.male',
 'age35-44.male',
 'age45-54.female',
 'age55-64.female',
 'age55-64.male',
 'age65-.male',
 'cpm_to_revenue_ratio',
 'dislikes',
 'estimated_ad_revenue',
 'female_viewer_rate',
 'male_viewer_rate',
 'older_viewer_rate',
 'playback_based_cpm',
 'revenue_per_ad_impression',
 'revenue_per_playback',
 'revenue_per_subscribed_view',
 'revenue_per_unsubscribed_view',
 'share_rate',
 'unplayback_rate',
 'views',
 'watched_time_rate',
 'youth_viewer_rate'}

RandomForest

In [32]:
# RandomForestClassifier is already imported in the notebook's import cell.
from sklearn.model_selection import cross_val_score

# Minimum RandomForest importance a feature needs to survive the final selection.
IMPORTANCE_THRESHOLD = 0.005

# Restrict the resampled training frame to the Lasso-selected features.
X_train_lasso_selected = X_train_resampled[selected_features_by_lasso]

# 5-fold cross-validated accuracy of a seeded RandomForest.
# NOTE(review): the folds are drawn from the same undersampled rows that the
# t-test and Lasso cells used to choose these features, so this score is
# optimistic; it is not an estimate on the untouched test split.
model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(
    model,
    X_train_lasso_selected,
    y_train_resampled,
    cv=5,
    scoring='accuracy',
)

# Fit on the full resampled training set to obtain the feature importances.
model.fit(X_train_lasso_selected, y_train_resampled)

# Tabulate the importances, most important first.
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_features_by_lasso, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Final selection: every feature whose importance exceeds the threshold.
# This is a cutoff on importance, not a fixed top-N.
above_threshold = feature_importances['importance'] > IMPORTANCE_THRESHOLD
final_selected_features = list(feature_importances.loc[above_threshold, 'feature'])
print("Final Selected Features by RandomForest:", final_selected_features)

# Print the cross-validation result.
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

Final Selected Features by RandomForest: ['estimated_revenue', 'estimatedMinutesWatched', 'subscribers_gained', 'comments', 'revenue_per_red_view', 'positive_engage_rate', 'revenue_per_minute_watched', 'likes', 'shares', 'estimated_red_partner_revenue', 'gross_revenue', 'like_rate', 'subscribers_lost', 'revenue_per_subscriber', 'cpm', 'subscriber_increase_rate', 'monetized_playbacks', 'playback_rate', 'redViews', 'red_revenue_rate', 'subscribers_count', 'SUBSCRIBED']
Cross-Validation Accuracy: 0.98


In [52]:
# Show the importance table produced by the RandomForest cell.
feature_importances ## features are selected with an importance cutoff of 0.005

Unnamed: 0,feature,importance
7,estimated_revenue,0.124202
5,estimatedMinutesWatched,0.078844
12,subscribers_gained,0.064941
2,comments,0.060625
34,revenue_per_red_view,0.054205
26,positive_engage_rate,0.053293
37,revenue_per_minute_watched,0.05308
3,likes,0.051197
4,shares,0.049286
8,estimated_red_partner_revenue,0.046729


In [56]:
# Number of features kept by the RandomForest importance cutoff.
# NOTE(review): the saved output of this cell (32) does not match the 22
# features printed by the RandomForest cell above; the execution counts are out
# of order, so the two outputs likely come from different runs -- re-run to confirm.
len(final_selected_features)

32

## 콘텐츠 데이터 분석

In [None]:
# Preview the first five rows of the content (video) dataset.
youtube_videos.head(n=5)

### 데이터 분할

In [31]:
# Organize columns: the first 3 are set aside in `unique_col`; everything after
# them except the final column becomes the candidate predictor list.
# (presumably the final column is `y_label` -- verify against the CSV)
video_columns = youtube_videos.columns
unique_col = video_columns[:3]
x_col = video_columns[3:-1]

In [35]:
# Split the data.
# NOTE(review): `y_col` is not assigned in any cell shown in this notebook, so
# this cell raises NameError on a fresh kernel (Restart & Run All). It presumably
# held the list of video metrics used to build `y_label`, mirroring the explicit
# list dropped in the account section -- define it in a cell above before use.
X = youtube_videos[x_col].drop(columns=y_col) ## remove the metrics used for labeling y
y = youtube_videos['y_label']
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Drop the names of the full feature frame and label series now that the split
# exists (presumably to let the memory be reclaimed -- the video dataset is large).
del X, y

In [37]:
# Check class imbalance: label counts for the train split, then the test split.
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

y_label
1    6513105
0     342503
Name: count, dtype: int64
y_label
1    1627933
0      85970
Name: count, dtype: int64


### 언더샘플링

In [38]:
# Undersampling: resample the training split with a seeded random undersampler.
rus = RandomUnderSampler(random_state=42)
resampled_pair = rus.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

### 변수선택

t-test

In [39]:
from scipy.stats import ttest_ind

# Significance level a feature must beat to be kept.
TTEST_ALPHA = 0.05

# Split the resampled training rows into the y=0 and y=1 groups.
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# Run Welch's t-test (unequal variances) on every feature and record
# (column, p-value) pairs.
p_values = []
for col in X_train_resampled.columns:
    result = ttest_ind(group_0[col], group_1[col], equal_var=False)
    p_values.append((col, result.pvalue))

# A NaN p-value never satisfies `p < alpha`, so such a feature would be dropped
# without any trace. Report them so the exclusion is visible.
# NOTE(review): the saved output of this cell contains RuntimeWarning fragments
# from the mean/variance arithmetic, which suggests some columns hold inf/NaN
# values or are constant within a group -- confirm on the data.
untestable_features = [col for col, p_val in p_values if pd.isna(p_val)]
if untestable_features:
    print("Features with undefined p-value (excluded):", untestable_features)

# Keep the features whose p-value is below the significance level.
selected_features_by_ttest = [col for col, p_val in p_values if p_val < TTEST_ALPHA]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost', 'monetizedPlaybacks', 'adImpressions', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'positive_engage_rate', 'comment_to_like_rate', 'like_to_dislike_ratio', 'subscribers_conversion_rate', 'subscribers_gained_per_card_click', 'subscribers_gained_per_playlist_add', 'card_click_to_subscriber_conversion', 'subscribers_lost_per_playlist_remove', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'playback_based_cpm_rate

  a_zero_mean = a - mean
  d = mean1 - mean2
  estimate = m1-m2


In [47]:
# Number of features kept by the t-test step.
len(selected_features_by_ttest)

74

Lasso

In [48]:
# StandardScaler comes from the notebook's import cell; only LassoCV is new here.
from sklearn.linear_model import LassoCV

# Build the frame from the features that passed the t-test.
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# Scale the data before fitting the L1-penalized model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV setup: raised iteration cap and an explicit alpha grid.
# NOTE(review): LassoCV is a regressor fitted here on the 0/1 label; consider
# whether an L1-penalized classifier would suit the task better.
lasso = LassoCV(
    cv=5,
    random_state=42,
    max_iter=10000,
    alphas=[0.1, 0.05, 0.01, 0.005, 0.001],
)
lasso.fit(X_train_scaled, y_train_resampled)

# Keep the features whose fitted coefficient is non-zero.
nonzero_coef_mask = lasso.coef_ != 0
selected_features_by_lasso = X_train_ttest_selected.columns[nonzero_coef_mask]
print("Selected Features by Lasso:", selected_features_by_lasso)

Selected Features by Lasso: Index(['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'videosAddedToPlaylists', 'videosRemovedFromPlaylists',
       'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue',
       'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost',
       'monetizedPlaybacks', 'adImpressions', 'cardClickRate',
       'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions',
       'like_rate', 'comment_rate', 'dislike_rate', 'positive_engage_rate',
       'comment_to_like_rate', 'like_to_dislike_ratio',
       'subscribers_conversion_rate', 'subscribers_gained_per_playlist_add',
       'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate',
       'cpm_to_revenue_ratio', 'revenue_per_ad_impression',
       'playback_based_cpm_rate', 'revenue_per_playlist_add',
       'revenue_per_minute_watched', 'avg_view_duration_rate',
       'watched_time_rate', 'watched_r

In [50]:
# Number of features kept by the Lasso step.
len(selected_features_by_lasso)

55

In [53]:
# Features that passed the t-test but were removed by the Lasso selection.
set(selected_features_by_ttest).difference(selected_features_by_lasso)

{'ad_impressions_per_card_click',
 'ad_impressions_per_playlist_add',
 'ad_playbacks_per_card_click',
 'ad_revenue_per_card_click',
 'cardClicks',
 'cardTeaserClicks',
 'card_click_to_subscriber_conversion',
 'card_to_teaser_click_rate',
 'estimatedMinutesWatched',
 'estimatedRedPartnerRevenue',
 'grossrevenue_per_card_click',
 'revenue_per_card_click',
 'revenue_per_red_minute_watched',
 'share_rate',
 'subscribers_gained_per_card_click',
 'subscribers_lost_per_playlist_remove',
 'watch_time_per_card_click',
 'watched_red_time_rate',
 'watched_view_rate'}

RandomForest

In [54]:
# RandomForestClassifier is already imported in the notebook's import cell.
from sklearn.model_selection import cross_val_score

# Minimum RandomForest importance a feature needs to survive the final selection.
IMPORTANCE_THRESHOLD = 0.005

# Restrict the resampled training frame to the Lasso-selected features.
X_train_lasso_selected = X_train_resampled[selected_features_by_lasso]

# 5-fold cross-validated accuracy of a seeded RandomForest.
# NOTE(review): the folds are drawn from the same undersampled rows that the
# t-test and Lasso cells used to choose these features, so this score is
# optimistic; it is not an estimate on the untouched test split.
model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(
    model,
    X_train_lasso_selected,
    y_train_resampled,
    cv=5,
    scoring='accuracy',
)

# Fit on the full resampled training set to obtain the feature importances.
model.fit(X_train_lasso_selected, y_train_resampled)

# Tabulate the importances, most important first.
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_features_by_lasso, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Final selection: every feature whose importance exceeds the threshold.
# This is a cutoff on importance, not a fixed top-N.
above_threshold = feature_importances['importance'] > IMPORTANCE_THRESHOLD
final_selected_features = list(feature_importances.loc[above_threshold, 'feature'])
print("Final Selected Features by RandomForest:", final_selected_features)

# Print the cross-validation result.
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

Final Selected Features by RandomForest: ['positive_engage_rate', 'playlist_addition_rate', 'cpm', 'revenue_per_ad_impression', 'revenue_per_playback', 'playlist_removal_rate', 'like_rate', 'playbackBasedCpm', 'videosAddedToPlaylists', 'views', 'likes', 'videosRemovedFromPlaylists', 'subscribersGained', 'watch_time_per_playlist_add', 'shares', 'watched_time_rate', 'revenue_per_red_view', 'estimatedRevenue', 'redViews', 'averageViewDuration', 'revenue_per_minute_watched', 'watch_time_loss_per_playlist_remove', 'avg_view_duration_rate', 'revenue_per_playlist_add', 'estimatedAdRevenue', 'watched_red_view_rate', 'estimatedRedMinutesWatched', 'grossRevenue', 'cpm_to_revenue_ratio', 'comment_rate', 'playlist_related_revenue_rate', 'ad_revenue_rate', 'comments', 'subscribers_conversion_rate', 'red_revenue_rate', 'ad_playbacks_per_playlist_add', 'monetizedPlaybacks', 'adImpressions']
Cross-Validation Accuracy: 0.99


In [58]:
# Display the final feature list, ordered by descending RandomForest importance.
final_selected_features

['positive_engage_rate',
 'playlist_addition_rate',
 'cpm',
 'revenue_per_ad_impression',
 'revenue_per_playback',
 'playlist_removal_rate',
 'like_rate',
 'playbackBasedCpm',
 'videosAddedToPlaylists',
 'views',
 'likes',
 'videosRemovedFromPlaylists',
 'subscribersGained',
 'watch_time_per_playlist_add',
 'shares',
 'watched_time_rate',
 'revenue_per_red_view',
 'estimatedRevenue',
 'redViews',
 'averageViewDuration',
 'revenue_per_minute_watched',
 'watch_time_loss_per_playlist_remove',
 'avg_view_duration_rate',
 'revenue_per_playlist_add',
 'estimatedAdRevenue',
 'watched_red_view_rate',
 'estimatedRedMinutesWatched',
 'grossRevenue',
 'cpm_to_revenue_ratio',
 'comment_rate',
 'playlist_related_revenue_rate',
 'ad_revenue_rate',
 'comments',
 'subscribers_conversion_rate',
 'red_revenue_rate',
 'ad_playbacks_per_playlist_add',
 'monetizedPlaybacks',
 'adImpressions']

In [56]:
# Show the importance table produced by the RandomForest cell.
feature_importances ## features are selected with an importance cutoff of 0.005

Unnamed: 0,feature,importance
26,positive_engage_rate,0.115805
52,playlist_addition_rate,0.072534
14,cpm,0.066794
35,revenue_per_ad_impression,0.056216
45,revenue_per_playback,0.054511
53,playlist_removal_rate,0.049415
23,like_rate,0.049097
13,playbackBasedCpm,0.048472
8,videosAddedToPlaylists,0.0455
0,views,0.033676
