In [31]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
# Directory that holds the input CSV files.
# The AWAKE_DATA_DIR environment variable overrides the default so the notebook is
# not tied to one machine's absolute path. os.path.join(..., '') guarantees a
# trailing separator, because paths are built as `file_path + '<file name>'`.
file_path = os.path.join(os.environ.get('AWAKE_DATA_DIR', 'C:/py_src/awake/data/'), '')

In [3]:
# Load the account-level and content-level analysis datasets.
# Both files are read relative to `file_path`, so the data location is defined in
# exactly one place instead of being repeated as a literal.
merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_fin.csv', low_memory=False)
youtube_videos = pd.read_csv(file_path + 'youtube_videos.csv')

## 계정 데이터 분석

In [4]:
# Replace bug-induced numeric values (per the original note): negative like and
# dislike counts are set to 0; every other value is left untouched.
for count_col in ('likes', 'dislikes'):
    raw_counts = merge_df_users_fin[count_col]
    merge_df_users_fin[count_col] = raw_counts.mask(raw_counts < 0, 0)

### 파생변수

In [5]:
# Derived features 1 - engagement
# Per-view rates: each raw count divided by the view count. Zero denominators are
# not guarded here. Columns are created in a fixed order because later cells
# slice the frame's columns by position.
for rate_name, count_name in (
    ('like_rate', 'likes'),          ## like rate
    ('comment_rate', 'comments'),    ## comment rate
    ('share_rate', 'shares'),        ## share rate
    ('dislike_rate', 'dislikes'),    ## dislike rate
):
    merge_df_users_fin[rate_name] = merge_df_users_fin[count_name].div(merge_df_users_fin['views'])

## total engagement rate: (likes + comments + shares) per view
merge_df_users_fin['total_engage_rate'] = (
    merge_df_users_fin['likes']
    .add(merge_df_users_fin['comments'])
    .add(merge_df_users_fin['shares'])
    .div(merge_df_users_fin['views'])
)

## comments per like
merge_df_users_fin['comment_to_like_rate'] = merge_df_users_fin['comments'].div(merge_df_users_fin['likes'])

In [6]:
# Derived features 2 - subscribers
## subscriber gain rate: gained / current subscriber count
merge_df_users_fin['subscriber_increase_rate'] = merge_df_users_fin['subscribers_gained'].div(
    merge_df_users_fin['subscribers_count']
)
## subscriber loss rate: lost / current subscriber count
merge_df_users_fin['subscriber_decrease_rate'] = merge_df_users_fin['subscribers_lost'].div(
    merge_df_users_fin['subscribers_count']
)
## net subscriber change: gained - lost
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribers_gained'].sub(
    merge_df_users_fin['subscribers_lost']
)

In [7]:
# Derived features 3 - revenue
## revenue per view
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimated_revenue'].div(merge_df_users_fin['views'])
## ad revenue share of estimated revenue
merge_df_users_fin['ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'].div(
    merge_df_users_fin['estimated_revenue']
)
## premium (red partner) revenue share of estimated revenue
merge_df_users_fin['red_revenue_rate'] = merge_df_users_fin['estimated_red_partner_revenue'].div(
    merge_df_users_fin['estimated_revenue']
)
## revenue relative to cpm (original note)
# NOTE(review): the column is named "ratio" but is computed as a difference
# (cpm - estimated_revenue) — confirm whether a division was intended.
merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'].sub(merge_df_users_fin['estimated_revenue'])
## revenue per ad impression
merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimated_revenue'].div(
    merge_df_users_fin['ad_impressions']
)
## playback-based cpm relative to cpm
merge_df_users_fin['playback_based_cpm_rate'] = merge_df_users_fin['playback_based_cpm'].div(
    merge_df_users_fin['cpm']
)

In [8]:
# Derived features 4 - watch time
## revenue per minute watched
merge_df_users_fin['revenue_per_minute_watched'] = merge_df_users_fin['estimated_revenue'].div(
    merge_df_users_fin['estimatedMinutesWatched']
)
## average view duration divided by average view percentage
merge_df_users_fin['avg_view_duration_rate'] = merge_df_users_fin['averageViewDuration'].div(
    merge_df_users_fin['averageViewPercentage']
)
## average view percentage multiplied by minutes watched
merge_df_users_fin['watched_time_rate'] = merge_df_users_fin['averageViewPercentage'].mul(
    merge_df_users_fin['estimatedMinutesWatched']
)

In [9]:
# Derived features 5 - subscription status
# Denominator shared by both shares: SUBSCRIBED + UNSUBSCRIBED.
subscription_total = merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']

## subscriber share of watch time (per the original note)
merge_df_users_fin['subscribed_view_time_rate'] = merge_df_users_fin['SUBSCRIBED'] / subscription_total
## non-subscriber share of watch time (per the original note)
merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['UNSUBSCRIBED'] / subscription_total

In [10]:
# Derived features 6 - age and gender
# Only the raw demographic columns (named like 'age18-24.female') are candidates.
# Matching the bare substrings 'female' / 'male' against every column would also
# pick up the '*_viewer_rate' columns created below, so re-running this cell would
# add the previous totals into the new ones.
demographic_cols = [col_nm for col_nm in merge_df_users_fin.columns if col_nm.startswith('age')]

female_col = [col_nm for col_nm in demographic_cols if col_nm.endswith('.female')]
male_col = [col_nm for col_nm in demographic_cols if col_nm.endswith('.male')]

youth_col = [col_nm for col_nm in demographic_cols if '13-17' in col_nm]
adult_col = [
    col_nm for col_nm in demographic_cols
    if any(age_range in col_nm for age_range in ('18-24', '25-34', '35-44', '45-54'))
]
older_col = [col_nm for col_nm in demographic_cols if ('55-64' in col_nm) or ('65' in col_nm)]

# Row-wise sums over each group of demographic columns.
merge_df_users_fin['female_viewer_rate'] = merge_df_users_fin[female_col].sum(axis=1) ## female viewer share
merge_df_users_fin['male_viewer_rate'] = merge_df_users_fin[male_col].sum(axis=1) ## male viewer share

merge_df_users_fin['youth_viewer_rate'] = merge_df_users_fin[youth_col].sum(axis=1) ## youth (13-17) viewer share
merge_df_users_fin['adult_viewer_rate'] = merge_df_users_fin[adult_col].sum(axis=1) ## adult (18-54) viewer share
merge_df_users_fin['older_viewer_rate'] = merge_df_users_fin[older_col].sum(axis=1) ## older (55+) viewer share

### y값 설정

#### 중요 지표 표준화

In [11]:
# Metrics used to build the y label.
# (Original note: null values in these come from rows where views is 0.)
y_col = [
    'total_engage_rate',
    'net_subscribers_change',
    'averageViewPercentage',
    'revenue_per_view',
]

In [12]:
# Replace missing and infinite values in the label metrics with 0.
# Both +inf and -inf are mapped: the previous `== np.inf` comparison left -inf in
# place (a negative numerator over a zero denominator yields -inf), and a
# non-finite value must not reach the standardization step that follows.
merge_df_users_fin[y_col] = (
    merge_df_users_fin[y_col]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [13]:
# Standardization (standard scaling) of the label metrics:
# fit the scaler on the y_col columns, then transform the same columns.
scaler = StandardScaler().fit(merge_df_users_fin[y_col])
scaled_features = scaler.transform(merge_df_users_fin[y_col])

#### 다중 지표 결합

In [14]:
# Train the Isolation Forest on the standardized metrics.
# contamination=0.05 sets the expected share of outliers.
iso_forest = IsolationForest(contamination=0.05, random_state=42).fit(scaled_features)

# Anomaly score for every row of the same data.
anomaly_scores = iso_forest.decision_function(scaled_features)

In [15]:
# Threshold: 5th percentile of the anomaly scores.
# (Original note: a lower score means closer to an outlier, and the 5th percentile
# is the point where the score turns negative.)
threshold = np.percentile(anomaly_scores, 5)

# Label rows: 0 when the score is below the threshold (outlier), 1 otherwise.
is_outlier = anomaly_scores < threshold
merge_df_users_fin['y_label'] = (~is_outlier).astype(int)

In [16]:
# Check the frequency of each y_label value over the whole dataset
merge_df_users_fin['y_label'].value_counts()

y_label
1    100398
0      5285
Name: count, dtype: int64

In [17]:
# Per-account frequency of each y_label value:
# count the labels within every youtube_user_id, then flatten to a regular frame.
label_counts_by_account = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts()
y_result_df = label_counts_by_account.reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,403
1,627cb611aa6f212355e0b617,0,2
2,627f59ccaa39226247c60b01,1,388
3,627f59ccaa39226247c60b01,0,17
4,6287228afb15712a8cb931d7,1,405
...,...,...,...
430,65f7b17ed8da110bb0733b7b,1,404
431,65f7b17ed8da110bb0733b7b,0,1
432,65fecf7ed8da110bb0736199,1,405
433,66230ee6d8da110bb0744b2d,1,366


In [20]:
# Accounts with many outlier rows (original note: accounts where 20% or more of the
# daily rows are outliers).
# NOTE(review): the filter is an absolute count (>= 40 rows), not a per-account
# proportion — confirm the cutoff matches the 20% intent.
is_outlier_row = y_result_df['y_label'] == 0
has_many_outliers = y_result_df['count'] >= 40
y_result_df[is_outlier_row & has_many_outliers]

Unnamed: 0,youtube_user_id,y_label,count
22,62872523fb15712a8cb93479,0,113
54,629f6ca6eaf5732d6df0611e,0,326
62,62a35ce69d41c93ff90b5670,0,168
87,62bc1aca507271632b940e2e,0,57
92,62c4e558507271632b9cc1c7,0,68
100,62d11f080b4c4c7502a5be3d,0,77
113,62d55a5e9900f20e1f259d24,0,49
119,62fb96f62be6ae3ff3672d79,0,46
128,631a067c1babf83920070ad7,0,72
148,639bb8dcd603b8138e33780b,0,203


### 데이터 분할

In [76]:
# Column bookkeeping by position: the first 11 columns go to unique_col, the rest
# except the last column go to x_col.
# NOTE(review): the excluded last column is presumably y_label — confirm.
all_columns = merge_df_users_fin.columns
unique_col, x_col = all_columns[:11], all_columns[11:-1]

In [77]:
# Train/test split
# The metrics that were used to build y_label are removed from the features.
label_source_cols = ['total_engage_rate', 'net_subscribers_change', 'revenue_per_view', 'averageViewPercentage']
X = merge_df_users_fin.loc[:, x_col].drop(columns=label_source_cols)
y = merge_df_users_fin['y_label']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [81]:
# Check class imbalance in the train and test labels
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

y_label
1    80357
0     4189
Name: count, dtype: int64
y_label
1    20041
0     1096
Name: count, dtype: int64


### 언더샘플링

In [82]:
from imblearn.under_sampling import RandomUnderSampler

In [83]:
# Random undersampling of the training split (fixed seed)
rus = RandomUnderSampler(random_state=42)
resampled_pair = rus.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

### 변수선택

t-test

In [85]:
from scipy.stats import ttest_ind

# Split the undersampled training data into the y=0 and y=1 groups
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# Run Welch's t-test (unequal variances) on every feature
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)
    p_values.append((col, p_val))

# Keep the features with p-value < 0.05
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

# A NaN p-value fails the `p_val < 0.05` comparison, so such a feature would be
# dropped without any trace (ttest_ind propagates NaN inputs by default). Report
# these features so the exclusion is a visible decision rather than a silent one.
untestable_features = [col for col, p_val in p_values if np.isnan(p_val)]
if untestable_features:
    print("Features skipped (t-test p-value is NaN):", untestable_features)

Selected Features by t-test: ['views', 'redViews', 'UNSUBSCRIBED', 'SUBSCRIBED', 'comments', 'likes', 'shares', 'dislikes', 'estimatedMinutesWatched', 'averageViewDuration', 'estimated_revenue', 'estimated_ad_revenue', 'estimated_red_partner_revenue', 'gross_revenue', 'cpm', 'playback_based_cpm', 'subscribers_count', 'subscribers_gained', 'subscribers_lost', 'ad_impressions', 'monetized_playbacks', 'age13-17.male', 'age18-24.female', 'age18-24.male', 'age25-34.female', 'age25-34.male', 'age35-44.female', 'age45-54.female', 'age45-54.male', 'age55-64.female', 'age65-.female', 'age65-.male', 'cpm_to_revenue_ratio', 'watched_time_rate', 'male_viewer_rate', 'youth_viewer_rate', 'adult_viewer_rate', 'older_viewer_rate']


In [97]:
# Number of features kept by the t-test
len(selected_features_by_ttest)

38

Lasso

In [90]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Restrict the training data to the features kept by the t-test.
# NOTE(review): this uses X_train / y_train (the full split), not the undersampled
# X_train_resampled that the t-test used — confirm that is intended.
X_train_ttest_selected = X_train[selected_features_by_ttest]

# Scale the features. This rebinds the notebook-level name `scaler`.
scaler = StandardScaler().fit(X_train_ttest_selected)
X_train_scaled = scaler.transform(X_train_ttest_selected)

# LassoCV setup: raised iteration cap and an explicit alpha grid
lasso = LassoCV(cv=5, random_state=42, max_iter=5000, alphas=[0.1, 0.01, 0.001])
lasso.fit(X_train_scaled, y_train)

# Selected features are those whose regression coefficient is non-zero
nonzero_coef = lasso.coef_ != 0
selected_features_by_lasso = X_train_ttest_selected.columns[nonzero_coef]
print("Selected Features by Lasso:", selected_features_by_lasso)

Selected Features by Lasso: Index(['redViews', 'UNSUBSCRIBED', 'SUBSCRIBED', 'comments', 'shares',
       'estimatedMinutesWatched', 'averageViewDuration',
       'estimated_ad_revenue', 'cpm', 'playback_based_cpm',
       'subscribers_count', 'subscribers_gained', 'subscribers_lost',
       'ad_impressions', 'monetized_playbacks', 'age13-17.male',
       'age18-24.female', 'age25-34.female', 'age35-44.female',
       'age45-54.male', 'age55-64.female', 'age65-.female', 'age65-.male',
       'cpm_to_revenue_ratio', 'watched_time_rate', 'male_viewer_rate',
       'adult_viewer_rate'],
      dtype='object')


In [98]:
# Number of features kept by Lasso
len(selected_features_by_lasso)

27

In [91]:
## Features that passed the t-test but were removed by the Lasso selection
set(selected_features_by_ttest).difference(selected_features_by_lasso)

{'age18-24.male',
 'age25-34.male',
 'age45-54.female',
 'dislikes',
 'estimated_red_partner_revenue',
 'estimated_revenue',
 'gross_revenue',
 'likes',
 'older_viewer_rate',
 'views',
 'youth_viewer_rate'}

RandomForest

In [92]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest on the Lasso-selected features, with 5-fold cross-validation.
# NOTE(review): scoring is plain accuracy on X_train / y_train, where the printed
# class counts show label 1 is the large majority — confirm accuracy is the
# metric you want to rely on here.
X_train_lasso_selected = X_train[selected_features_by_lasso]

model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(
    model,
    X_train_lasso_selected,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Fit the model on the whole training split
model.fit(X_train_lasso_selected, y_train)

# Feature importances, highest first
importances = model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': selected_features_by_lasso, 'importance': importances}
).sort_values(by='importance', ascending=False)

# Final selection: the 10 most important features
top_features = feature_importances['feature'].iloc[:10]
final_selected_features = top_features.tolist()
print("Final Selected Features by RandomForest:", final_selected_features)

# Report the mean cross-validation accuracy
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

Final Selected Features by RandomForest: ['subscribers_gained', 'cpm_to_revenue_ratio', 'shares', 'averageViewDuration', 'estimatedMinutesWatched', 'redViews', 'watched_time_rate', 'comments', 'subscribers_count', 'UNSUBSCRIBED']
Cross-Validation Accuracy: 0.99


In [104]:
# NOTE(review): the trailing note mentions a 0.01 importance cutoff, while the
# preceding cell selects the top 10 features — confirm which rule applies.
feature_importances ## select variables using a 0.01 importance threshold

Unnamed: 0,feature,importance
11,subscribers_gained,0.157649
23,cpm_to_revenue_ratio,0.15663
4,shares,0.063796
6,averageViewDuration,0.054485
5,estimatedMinutesWatched,0.052089
0,redViews,0.048114
24,watched_time_rate,0.042851
3,comments,0.040425
10,subscribers_count,0.039504
1,UNSUBSCRIBED,0.038901


## 콘텐츠 분석 데이터셋

In [52]:
# Preview the content-level (per-video) dataset loaded from youtube_videos.csv
youtube_videos

Unnamed: 0,youtube_user_id,video,end_date,views,redViews,comments,likes,dislikes,shares,estimatedMinutesWatched,...,subscribersGained,subscribersLost,monetizedPlaybacks,adImpressions,cardClickRate,cardTeaserClickRate,cardImpressions,cardTeaserImpressions,cardClicks,cardTeaserClicks
0,64467ea09634a10e3709e1ea,--0HSDH6J7o,2023-04-23,3,1,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
1,64467ea09634a10e3709e1ea,--0HSDH6J7o,2023-04-30,4,0,0,0,0,0,1,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
2,64467ea09634a10e3709e1ea,--0HSDH6J7o,2023-05-01,2,1,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
3,64467ea09634a10e3709e1ea,--0HSDH6J7o,2023-05-02,3,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
4,64467ea09634a10e3709e1ea,--0HSDH6J7o,2023-05-03,1,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569506,63fb5daa2a0144119186eca8,,2024-01-28,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
8569507,64809e9719c22b644dde6c44,,2024-01-28,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
8569508,64bbf3cd616bd20e30379bf3,,2024-01-28,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
8569509,6508ff021120b40b4427a4fc,,2024-01-28,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0,0,0,0
