In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
file_path = 'C:/py_src/awake/data/'

In [3]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
merge_df_users_fin = pd.read_csv('C:/py_src/awake/data/merge_df_users_fin.csv', low_memory=False)
youtube_videos = pd.read_csv(file_path + 'youtube_videos.csv')

In [4]:
del merge_df_users_fin

## 계정 데이터 분석

In [None]:
# 버그로 사용된 수치값 대체
merge_df_users_fin['likes'] = np.where(merge_df_users_fin['likes'] < 0, 0, merge_df_users_fin['likes'])
merge_df_users_fin['dislikes'] = np.where(merge_df_users_fin['dislikes'] < 0, 0, merge_df_users_fin['dislikes'])

### 파생변수

In [6]:
# 파생변수1 - 참여도 관련
merge_df_users_fin['like_rate'] = merge_df_users_fin['likes'] / merge_df_users_fin['views'] ## 좋아요 비율 
merge_df_users_fin['comment_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['views'] ## 댓글 비율
merge_df_users_fin['share_rate'] = merge_df_users_fin['shares'] / merge_df_users_fin['views'] ## 공유 비율  
merge_df_users_fin['dislike_rate'] = merge_df_users_fin['dislikes'] / merge_df_users_fin['views'] ## 싫어요 비율
merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
merge_df_users_fin['positive_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['shares']) / merge_df_users_fin['views'] ## 긍정적 참여율
merge_df_users_fin['comment_to_like_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['likes'] ## 댓글/좋아요 비율
merge_df_users_fin['like_to_dislike_ratio'] = merge_df_users_fin['likes'] / (merge_df_users_fin['dislikes']) ## 좋아요/싫어요 비율

In [7]:
# 파생변수2 - 구독자 관련
merge_df_users_fin['subscriber_increase_rate'] = merge_df_users_fin['subscribers_gained'] / merge_df_users_fin['subscribers_count'] ## 구독자 증가율
merge_df_users_fin['subscriber_decrease_rate'] = merge_df_users_fin['subscribers_lost'] / merge_df_users_fin['subscribers_count'] ## 구독자 감소율
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribers_gained'] - merge_df_users_fin['subscribers_lost'] ## 구독자 순증가
merge_df_users_fin['subscribers_conversion_rate'] = merge_df_users_fin['subscribers_gained'] / merge_df_users_fin['views'] ## 구독자 전환율
merge_df_users_fin['subscriber_retention_rate'] = (merge_df_users_fin['SUBSCRIBED'] - merge_df_users_fin['UNSUBSCRIBED']) / merge_df_users_fin['subscribers_count'] ## 구독자 유지율
merge_df_users_fin['subscribed_view_rate'] = merge_df_users_fin['SUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 구독자 조회수 비율
merge_df_users_fin['unsubscribed_view_rate'] = merge_df_users_fin['UNSUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 비구독자 조회수 비율

In [8]:
# 파생변수3 - 수익 관련
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['views'] ## 조회수당 수익
merge_df_users_fin['revenue_per_subscribed_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 수익
merge_df_users_fin['revenue_per_unsubscribed_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 수익
merge_df_users_fin['revenue_per_subscriber'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['subscribers_count'] ## 구독자당 수익
merge_df_users_fin['revenue_per_red_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['redViews'] ## 프리미엄당 수익
merge_df_users_fin['ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'] / merge_df_users_fin['estimated_revenue'] ## 광고수익비율
merge_df_users_fin['red_revenue_rate'] = merge_df_users_fin['estimated_red_partner_revenue'] / merge_df_users_fin['estimated_revenue'] ## 프리미엄수익비율
merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'] / merge_df_users_fin['estimated_revenue'] ## cpm 대비 수익
merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['ad_impressions'] ## 광고노출당 수익
merge_df_users_fin['playback_based_cpm_rate'] = merge_df_users_fin['playback_based_cpm'] / merge_df_users_fin['cpm'] ## 재생 기반 수익

In [9]:
# 파생변수4 - 시청 시간 관련
merge_df_users_fin['revenue_per_minute_watched'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['estimatedMinutesWatched'] ## 시청 시간 당 수익
merge_df_users_fin['avg_view_duration_rate'] = merge_df_users_fin['averageViewDuration'] / merge_df_users_fin['averageViewPercentage'] ## 평균 시청 시간 비율
merge_df_users_fin['watched_time_rate'] = merge_df_users_fin['averageViewPercentage'] * merge_df_users_fin['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
merge_df_users_fin['watched_view_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['views'] ## 조회수당 시청시간
merge_df_users_fin['subscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 시청시간
merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 시청시간
merge_df_users_fin['subscriber_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['subscribers_count'] ## 구독자당 시청시간

In [10]:
# 파생변수5 - 광고 관련
merge_df_users_fin['revenue_per_playback'] = merge_df_users_fin['gross_revenue'] / merge_df_users_fin['monetized_playbacks'] ## 1회 광고재생당 총수익
merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['gross_revenue'] / merge_df_users_fin['ad_impressions'] ## 1회 광고노출당 총수익
merge_df_users_fin['playback_rate'] = merge_df_users_fin['monetized_playbacks'] / merge_df_users_fin['ad_impressions'] ## 광고노출 대비 재생율
merge_df_users_fin['unplayback_rate'] = (merge_df_users_fin['ad_impressions'] - merge_df_users_fin['monetized_playbacks']) / merge_df_users_fin['ad_impressions'] ## 광고노출 대비 비재생율

In [11]:
# 파생변수6 - 연령 및 성별 관련
female_col = [col_nm for col_nm in merge_df_users_fin.columns if 'female' in col_nm ]
male_col = [col_nm for col_nm in merge_df_users_fin.columns if ('male' in col_nm) and ('female' not in col_nm)]

youth_col = [col_nm for col_nm in merge_df_users_fin.columns if '13-17' in col_nm]
adult_col = [col_nm for col_nm in merge_df_users_fin.columns if ('18-24' in col_nm) or ('25-34' in col_nm) or ('35-44' in col_nm) or ('45-54' in col_nm)]
older_col = [col_nm for col_nm in merge_df_users_fin.columns if ('55-64' in col_nm) or ('65' in col_nm)]

merge_df_users_fin['female_viewer_rate'] = merge_df_users_fin[female_col].sum(axis=1) ## 여성 시청자 비율
merge_df_users_fin['male_viewer_rate'] = merge_df_users_fin[male_col].sum(axis=1) ## 남성 시청자 비율

merge_df_users_fin['youth_viewer_rate'] = merge_df_users_fin[youth_col].sum(axis=1) ## 청소년 시청자 비율
merge_df_users_fin['adult_viewer_rate'] = merge_df_users_fin[adult_col].sum(axis=1) ## 성인 시청자 비율
merge_df_users_fin['older_viewer_rate'] = merge_df_users_fin[older_col].sum(axis=1) ## 노인 시청자 비율

In [12]:
# null값 대체
merge_df_users_fin[merge_df_users_fin.columns[11:]] = merge_df_users_fin[merge_df_users_fin.columns[11:]].fillna(0) ## NaN
merge_df_users_fin[merge_df_users_fin.columns[11:]] = np.where(merge_df_users_fin[merge_df_users_fin.columns[11:]]==np.inf,0,merge_df_users_fin[merge_df_users_fin.columns[11:]]) ## inf

### y값 설정

#### 중요 지표 표준화

In [13]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression'] ## null값은 views가 0인 데이터

In [14]:
# null값 대체
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].fillna(0) ## NaN
merge_df_users_fin[y_col] = np.where(merge_df_users_fin[y_col]==np.inf,0,merge_df_users_fin[y_col]) ## inf

In [15]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merge_df_users_fin[y_col])

#### 다중 지표 결합

In [16]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [17]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [18]:
# 전체 데이터 y값 빈도 확인
merge_df_users_fin['y_label'].value_counts()

y_label
1    100398
0      5285
Name: count, dtype: int64

In [19]:
# 계정별 y값 빈도 데이터
y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,405
1,627f59ccaa39226247c60b01,1,402
2,627f59ccaa39226247c60b01,0,3
3,6287228afb15712a8cb931d7,1,405
4,6287229efb15712a8cb93225,1,398
...,...,...,...
394,65e7b773d8da110bb072e2b5,0,1
395,65f7b17ed8da110bb0733b7b,1,405
396,65fecf7ed8da110bb0736199,1,405
397,66230ee6d8da110bb0744b2d,1,373


In [20]:
# 계정별 일일데이터의 20% 이상 이상치 데이터인 계정 확인
y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 40]

Unnamed: 0,youtube_user_id,y_label,count
17,62872523fb15712a8cb93479,0,90
54,62a35ce69d41c93ff90b5670,0,110
78,62c4e558507271632b9cc1c7,0,50
85,62d11f080b4c4c7502a5be3d,0,400
131,639bb8dcd603b8138e33780b,0,187
139,63c9075250eb530dfd1346bd,0,46
156,63d77c9650eb530dfd139f8b,0,335
171,63eb4f87ee122e631992279f,0,302
205,640001db0abaa11316396d3b,0,223
218,64020bf4d746c60e1272055f,0,155


### 데이터 분할

In [21]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:-1]

In [22]:
# 데이터 분할
X = merge_df_users_fin[x_col].drop(columns=['total_engage_rate','net_subscribers_change','revenue_per_view','averageViewPercentage', 'gross_revenue_per_ad_impression']) ## y값 라벨링에 쓰인 지표 제거
y = merge_df_users_fin['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    80302
0     4244
Name: count, dtype: int64
y_label
1    20096
0     1041
Name: count, dtype: int64


### 언더샘플링

In [24]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

### 변수선택

t-test

In [25]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'UNSUBSCRIBED', 'SUBSCRIBED', 'comments', 'likes', 'shares', 'dislikes', 'estimatedMinutesWatched', 'averageViewDuration', 'estimated_revenue', 'estimated_ad_revenue', 'estimated_red_partner_revenue', 'gross_revenue', 'cpm', 'playback_based_cpm', 'subscribers_count', 'subscribers_gained', 'subscribers_lost', 'ad_impressions', 'monetized_playbacks', 'age13-17.female', 'age13-17.male', 'age18-24.female', 'age18-24.male', 'age25-34.female', 'age25-34.male', 'age35-44.female', 'age35-44.male', 'age45-54.female', 'age55-64.female', 'age55-64.male', 'age65-.female', 'age65-.male', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'positive_engage_rate', 'like_to_dislike_ratio', 'subscriber_increase_rate', 'subscriber_decrease_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'unsubscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_subscriber', 'revenue_per_red_view', 'ad_reve

  a_zero_mean = a - mean
  d = mean1 - mean2
  estimate = m1-m2


Lasso

In [42]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]
print("Selected Features by Lasso:", selected_features_by_lasso)

Selected Features by Lasso: Index(['redViews', 'SUBSCRIBED', 'comments', 'likes', 'shares',
       'estimatedMinutesWatched', 'averageViewDuration', 'estimated_revenue',
       'estimated_red_partner_revenue', 'gross_revenue', 'cpm',
       'subscribers_count', 'subscribers_gained', 'subscribers_lost',
       'ad_impressions', 'monetized_playbacks', 'age13-17.female',
       'age18-24.female', 'age18-24.male', 'age25-34.female', 'age25-34.male',
       'age35-44.female', 'age65-.female', 'like_rate', 'comment_rate',
       'dislike_rate', 'positive_engage_rate', 'like_to_dislike_ratio',
       'subscriber_increase_rate', 'subscriber_decrease_rate',
       'subscribers_conversion_rate', 'subscribed_view_rate',
       'unsubscribed_view_rate', 'revenue_per_subscriber',
       'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate',
       'revenue_per_minute_watched', 'avg_view_duration_rate',
       'watched_view_rate', 'subscribed_view_time_rate',
       'unsubscribed_view_time_r

In [51]:
len(selected_features_by_lasso)

44

In [44]:
## lasso 선택으로 제거된 변수
set(selected_features_by_ttest) - set(selected_features_by_lasso)

{'UNSUBSCRIBED',
 'adult_viewer_rate',
 'age13-17.male',
 'age35-44.male',
 'age45-54.female',
 'age55-64.female',
 'age55-64.male',
 'age65-.male',
 'cpm_to_revenue_ratio',
 'dislikes',
 'estimated_ad_revenue',
 'female_viewer_rate',
 'male_viewer_rate',
 'older_viewer_rate',
 'playback_based_cpm',
 'revenue_per_ad_impression',
 'revenue_per_playback',
 'revenue_per_subscribed_view',
 'revenue_per_unsubscribed_view',
 'share_rate',
 'unplayback_rate',
 'views',
 'watched_time_rate',
 'youth_viewer_rate'}

RandomForest

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# RandomForest 모델 학습 및 교차 검증
X_train_lasso_selected = X_train_resampled[selected_features_by_lasso]

model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(model, X_train_lasso_selected, y_train_resampled, cv=5, scoring='accuracy')

# 모델 학습
model.fit(X_train_lasso_selected, y_train_resampled)

# 피처 중요도 추출
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_features_by_lasso, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# 최종적으로 중요한 변수 선택 (예: 상위 10개 변수)
final_selected_features = list(feature_importances[feature_importances['importance'] > 0.005]['feature'])
print("Final Selected Features by RandomForest:", final_selected_features)

# 교차 검증 결과 출력
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

Final Selected Features by RandomForest: ['estimated_revenue', 'estimatedMinutesWatched', 'subscribers_gained', 'comments', 'revenue_per_red_view', 'positive_engage_rate', 'revenue_per_minute_watched', 'likes', 'shares', 'estimated_red_partner_revenue', 'gross_revenue', 'like_rate', 'subscribers_lost', 'revenue_per_subscriber', 'cpm', 'subscriber_increase_rate', 'monetized_playbacks', 'playback_rate', 'redViews', 'red_revenue_rate', 'subscribers_count', 'SUBSCRIBED']
Cross-Validation Accuracy: 0.98


In [52]:
feature_importances ## 0.005 기준으로 변수 선택

Unnamed: 0,feature,importance
7,estimated_revenue,0.124202
5,estimatedMinutesWatched,0.078844
12,subscribers_gained,0.064941
2,comments,0.060625
34,revenue_per_red_view,0.054205
26,positive_engage_rate,0.053293
37,revenue_per_minute_watched,0.05308
3,likes,0.051197
4,shares,0.049286
8,estimated_red_partner_revenue,0.046729


In [56]:
len(final_selected_features)

32

### 모델 기법 적용

RandomForest

In [57]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# 랜덤 포레스트 모델 교차 검증
cv_scores_rf = cross_val_score(rf_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

RandomForest Cross-Validation Accuracy: 0.98


LogisticRegression

In [58]:
from sklearn.linear_model import LogisticRegression

# 로지스틱 회귀 모델 학습
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# 로지스틱 회귀 모델 교차 검증
cv_scores_log = cross_val_score(log_reg_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
print(f"Logistic Regression Cross-Validation Accuracy: {cv_scores_log.mean():.2f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Cross-Validation Accuracy: 0.86


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GradientBoosting

In [59]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_gb = cross_val_score(gb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

GradientBoosting Cross-Validation Accuracy: 0.98


XGBoost

In [60]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

GradientBoosting Cross-Validation Accuracy: 0.98


### 모델 성능 평가

In [61]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test[final_selected_features])
y_pred_log = log_reg_model.predict(X_test[final_selected_features])
y_pred_gb = gb_model.predict(X_test[final_selected_features])
y_pred_xgb = xgb_model.predict(X_test[final_selected_features])

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_log = accuracy_score(y_test, y_pred_log)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"Logistic Regression Test Accuracy: {accuracy_log:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test[final_selected_features])[:, 1])
roc_auc_log = roc_auc_score(y_test, log_reg_model.predict_proba(X_test[final_selected_features])[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test[final_selected_features])[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features])[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"Logistic Regression ROC-AUC: {roc_auc_log:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")


RandomForest Test Accuracy: 0.97
Logistic Regression Test Accuracy: 0.85
GradientBoosting Test Accuracy: 0.97
XGBoost Test Accuracy: 0.98
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      1041
           1       1.00      0.97      0.99     20096

    accuracy                           0.97     21137
   macro avg       0.82      0.98      0.88     21137
weighted avg       0.98      0.97      0.98     21137

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.89      0.36      1041
           1       0.99      0.84      0.91     20096

    accuracy                           0.85     21137
   macro avg       0.61      0.87      0.64     21137
weighted avg       0.96      0.85      0.89     21137

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.98 

### 모델 성능 개선

언더샘플링 데이터셋별 각 모델 학습

In [63]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np

# 언더샘플링 횟수 설정
n_iterations = 10

# 각 모델의 예측 확률 저장 리스트
rf_probs = []
gb_probs = []
xgb_probs = []
models = []

for i in range(n_iterations):
    # 언더샘플링 적용
    rus = RandomUnderSampler(random_state=i)  # 각기 다른 랜덤 시드 사용
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # 랜덤포레스트 모델 학습
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    rf_probs.append(rf_model.predict_proba(X_test[final_selected_features]))
    models.append(rf_model)

    # 그라디언트 부스팅 모델 학습
    gb_model = GradientBoostingClassifier(random_state=42)
    gb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    gb_probs.append(gb_model.predict_proba(X_test[final_selected_features]))
    models.append(gb_model)

    # XGBoost 모델 학습
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    xgb_probs.append(xgb_model.predict_proba(X_test[final_selected_features]))
    models.append(xgb_model)

In [64]:
# 확률 평균을 통한 앙상블
rf_probs_avg = np.mean(np.array(rf_probs), axis=0)
gb_probs_avg = np.mean(np.array(gb_probs), axis=0)
xgb_probs_avg = np.mean(np.array(xgb_probs), axis=0)

# 최종 평균 확률 계산
ensemble_probs_avg = (rf_probs_avg + gb_probs_avg + xgb_probs_avg) / 3

# 평균 확률에 따라 최종 예측
y_pred_ensemble_avg = np.argmax(ensemble_probs_avg, axis=1)

# 성능 평가
from sklearn.metrics import accuracy_score
accuracy_ensemble_avg = accuracy_score(y_test, y_pred_ensemble_avg)
print(f"Ensemble Averaging Accuracy: {accuracy_ensemble_avg:.2f}")


Ensemble Averaging Accuracy: 0.98


In [65]:
from sklearn.metrics import classification_report

# 각 모델의 클래스별 성능 평가
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Class 0', 'Class 1']))

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=['Class 0', 'Class 1']))

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Class 0', 'Class 1']))

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble_avg, target_names=['Class 0', 'Class 1']))


Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 0       0.64      0.99      0.78      1041
     Class 1       1.00      0.97      0.99     20096

    accuracy                           0.97     21137
   macro avg       0.82      0.98      0.88     21137
weighted avg       0.98      0.97      0.98     21137


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

     Class 0       0.62      0.98      0.76      1041
     Class 1       1.00      0.97      0.98     20096

    accuracy                           0.97     21137
   macro avg       0.81      0.98      0.87     21137
weighted avg       0.98      0.97      0.97     21137


XGBoost Classification Report:
              precision    recall  f1-score   support

     Class 0       0.68      0.99      0.81      1041
     Class 1       1.00      0.98      0.99     20096

    accuracy                           0.98     21137
   macro avg     

### 실제데이터 결과 확인

In [66]:
# 각 모델의 예측 확률을 평균내어 결합
ensemble_probs = [model.predict_proba(merge_df_users_fin[final_selected_features]) for model in models]
ensemble_avg_probs = np.mean(np.array(ensemble_probs), axis=0)

# 최종 확률에 따른 예측값 도출
y_pred_ensemble = np.argmax(ensemble_avg_probs, axis=1)

In [86]:
# 예측 결과 확인
merge_df_users_fin['predict'] = y_pred_ensemble
pred_result_df = merge_df_users_fin.groupby(['youtube_user_id'])['predict'].value_counts().reset_index()
fraud_user_id = pred_result_df[pred_result_df['predict']==0][pred_result_df[pred_result_df['predict']==0]['count']>=40]['youtube_user_id'] ## 이상치라고 생각되는 계정 확인

In [98]:
len(set(merge_df_users_fin['youtube_user_id'].unique()) - set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['youtube_user_id'].unique()))

207

In [90]:
set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['channel_title'].unique())

array(['임삐나', nan, 'Mind Patting마음토닥', 'MINLEE 민리', '시골낭만아재',
       'OBL - 온라인 농부, 사자가 되다', 'abbapraise 아바프레이즈', '채림처럼firstcherry',
       '모하지연 MOHAJIYEON', 'Jeffreyxking', '콜드쉽 Coldsheep', 'kiu기우쌤',
       '수빙수tv sooBingsoo', '석시원 커플 SeokSiWon Couple', '너굴몬',
       'GMENCY 멘시의 마인크래프트', '앙찡', '코인덕 차트아지', '미니멀영어 Minimal English',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '日本ジヌ【니혼지누】ー韓国に関する全て', '뻘짓연구소',
       '벽돌할아버지 Brick grandpa', '북토크', '나연이즈백 LPGA Na Yeon Choi', '그롬마쉬TV',
       '뷰드름 유튜버 인씨', 'MerryMa 메리마', '쿜쿜쿜', "루다의 댄스 연구소 Ruda's Dance Lab",
       '빅민 GAME', '잼스기타', '축구 읽어주는 여자 쵱내', 'OSSC', '뛰뛰빵빵 김옥순', '돈냄새',
       '키나kkina', '이현우의 MLBTV', 'V I N 빈 ', '평범한 사업가', '하원장 강동현', '주피코',
       '목소리 연기자 유지컬'], dtype=object)

In [92]:
# 피처 중요도 추출
final_importances = xgb_model.feature_importances_
feature_importances_final = pd.DataFrame({'feature': final_selected_features, 'importance': final_importances})
feature_importances_final = feature_importances_final.sort_values(by='importance', ascending=False)

In [93]:
feature_importances_final

Unnamed: 0,feature,importance
0,estimated_revenue,0.551478
2,subscribers_gained,0.084448
6,revenue_per_minute_watched,0.073407
5,positive_engage_rate,0.070522
4,revenue_per_red_view,0.031695
1,estimatedMinutesWatched,0.014611
18,redViews,0.013696
23,subscribers_conversion_rate,0.011824
31,unsubscribed_view_time_rate,0.011104
16,monetized_playbacks,0.010411


## 콘텐츠 데이터 분석

In [5]:
# 최종 콘텐츠 분석 데이터셋
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

In [6]:
# 잘못된값 처리
youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                               youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                               youtube_videos['estimatedRevenue'])

In [7]:
# 버그로 사용된 수치값 대체
youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

### 파생변수

In [8]:
# 파생변수1 - 참여도 관련
youtube_videos['like_rate'] = youtube_videos['likes'] / youtube_videos['views'] ## 좋아요 비율 
youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
youtube_videos['share_rate'] = youtube_videos['shares'] / youtube_videos['views'] ## 공유 비율  
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율
youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
youtube_videos['positive_engage_rate'] = (youtube_videos['likes'] + youtube_videos['shares']) / youtube_videos['views'] ## 긍정적 참여율
youtube_videos['comment_to_like_rate'] = youtube_videos['comments'] / youtube_videos['likes'] ## 댓글/좋아요 비율
youtube_videos['like_to_dislike_ratio'] = youtube_videos['likes'] / (youtube_videos['dislikes']) ## 좋아요/싫어요 비율

In [9]:
# 파생변수2 - 구독자 관련
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost'] ## 구독자 순증가
youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율
youtube_videos['subscribers_gained_per_card_click'] = youtube_videos['subscribersGained'] / youtube_videos['cardClicks'] ## 카드 클릭당 구독자 증가
youtube_videos['subscribers_gained_per_playlist_add'] = youtube_videos['subscribersGained'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 구독자 증가
youtube_videos['card_click_to_subscriber_conversion'] = youtube_videos['cardClickRate'] / youtube_videos['subscribersGained'] ## 카드 클릭률 대비 구독자 전환율
youtube_videos['subscribers_lost_per_playlist_remove'] = youtube_videos['subscribersLost'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 구독자 손실

In [10]:
# 파생변수3 - 수익 관련
youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views'] ## 조회수당 수익
youtube_videos['revenue_per_red_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['redViews'] ## 프리미엄당 수익
youtube_videos['ad_revenue_rate'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['estimatedRevenue'] ## 광고수익비율
youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
youtube_videos['cpm_to_revenue_ratio'] = youtube_videos['cpm'] / youtube_videos['estimatedRevenue'] ## cpm 대비 수익
youtube_videos['revenue_per_ad_impression'] = youtube_videos['estimatedRevenue'] / youtube_videos['adImpressions'] ## 광고노출당 수익
youtube_videos['playback_based_cpm_rate'] = youtube_videos['playbackBasedCpm'] / youtube_videos['cpm'] ## 재생 기반 수익
youtube_videos['revenue_per_card_click'] = youtube_videos['estimatedRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 수익
youtube_videos['revenue_per_playlist_add'] = youtube_videos['estimatedRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 수익
youtube_videos['card_click_to_revenue_ratio'] = youtube_videos['cardClickRate'] / youtube_videos['estimatedRevenue'] ## 카드 클릭률 대비 수익 비율
youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익
youtube_videos['ad_revenue_per_card_click'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 광고 수익

In [11]:
# 파생변수4 - 시청 시간 관련
youtube_videos['revenue_per_minute_watched'] = youtube_videos['estimatedRevenue'] / youtube_videos['estimatedMinutesWatched'] ## 시청 시간 당 수익
youtube_videos['revenue_per_red_minute_watched'] = youtube_videos['estimatedRevenue'] / youtube_videos['estimatedRedMinutesWatched'] ## 프리미엄 이용자 시청 시간 당 수익
youtube_videos['avg_view_duration_rate'] = youtube_videos['averageViewDuration'] / youtube_videos['averageViewPercentage'] ## 평균 시청 시간 비율
youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
youtube_videos['watched_red_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedRedMinutesWatched'] ## 재생 비율 대비 프리미엄 이용자 시청 시간
youtube_videos['watched_view_rate'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['views'] ## 조회수당 시청시간
youtube_videos['watched_red_view_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간
youtube_videos['watch_time_per_card_click'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['cardClicks'] ## 카드 클릭당 시청 시간
youtube_videos['watch_time_per_playlist_add'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 시청 시간
youtube_videos['avg_view_duration_per_card_click'] = youtube_videos['averageViewDuration'] / youtube_videos['cardClicks'] ## 카드 클릭 대비 평균 재생 시간 비율
youtube_videos['watch_time_loss_per_playlist_remove'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 시청 시간 손실


In [12]:
# 파생변수5 - 광고 관련
youtube_videos['revenue_per_playback'] = youtube_videos['grossRevenue'] / youtube_videos['monetizedPlaybacks'] ## 1회 광고재생당 수익
youtube_videos['grossRevenue_per_ad_impression'] = youtube_videos['grossRevenue'] / youtube_videos['adImpressions'] ## 1회 광고노출당 총수익
youtube_videos['playback_rate'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['adImpressions'] ## 광고노출 대비 재생율
youtube_videos['unplayback_rate'] = (youtube_videos['adImpressions'] - youtube_videos['monetizedPlaybacks']) / youtube_videos['adImpressions'] ## 광고노출 대비 비재생율
youtube_videos['grossrevenue_per_card_click'] = youtube_videos['grossRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 수익
youtube_videos['grossrevenue_per_playlist_add'] = youtube_videos['grossRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 수익
youtube_videos['card_click_to_grossrevenue_ratio'] = youtube_videos['cardClickRate'] / youtube_videos['grossRevenue'] ## 카드 클릭률 대비 수익 비율
youtube_videos['net_grossrevenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['grossRevenue'] ## 플레이리스트 순추가당 수익
youtube_videos['ad_impressions_per_card_click'] = youtube_videos['adImpressions'] / youtube_videos['cardClicks'] ## 카드 클릭당 광고 노출
youtube_videos['ad_impressions_per_playlist_add'] = youtube_videos['adImpressions'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 노출
youtube_videos['ad_playbacks_per_card_click'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['cardClicks'] ## 카드 클릭 대비 광고 재생 비율
youtube_videos['ad_playbacks_per_playlist_add'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 재생 비율

In [13]:
# 파생변수6 - 카드 관련
youtube_videos['card_to_teaser_click_rate'] = youtube_videos['cardClickRate'] / youtube_videos['cardTeaserClickRate'] ## 카드 티저 클릭률 대비 카드 클릭률
youtube_videos['card_click_per_impression_rate'] = youtube_videos['cardClicks'] / youtube_videos['cardImpressions'] ## 카드 노출당 클릭 비율
youtube_videos['card_teaser_click_per_impression_rate'] = youtube_videos['cardTeaserClicks'] / youtube_videos['cardTeaserImpressions'] ## 카드 티저 노출당 카드 티저 클릭 비율
youtube_videos['total_card_teaser_click_rate'] = (youtube_videos['cardClicks'] + youtube_videos['cardTeaserClicks']) / (youtube_videos['cardImpressions'] + youtube_videos['cardTeaserImpressions']) ## 카드와 카드 티저의 총 클릭률
youtube_videos['card_conversion_rate'] = youtube_videos['cardClicks'] / youtube_videos['cardTeaserClicks'] ## 카드 클릭 전환율

In [14]:
# 파생변수7 - 비디오 관련
youtube_videos['playlist_addition_rate'] = youtube_videos['videosAddedToPlaylists'] / youtube_videos['views'] ## 플레이리스트 추가 비율
youtube_videos['playlist_removal_rate'] = youtube_videos['videosRemovedFromPlaylists'] / youtube_videos['views'] ## 플레이리스트 제거 비율
youtube_videos['net_playlist_addition_rate'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 순추가 비율
youtube_videos['playlist_engagement_rate'] = (youtube_videos['videosAddedToPlaylists'] + youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 참여도
youtube_videos['playlist_related_revenue_rate'] = youtube_videos['estimatedRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가 당 순수익

In [16]:
# null값 대체
youtube_videos[youtube_videos.columns[3:]] = youtube_videos[youtube_videos.columns[3:]].fillna(0) ## NaN
youtube_videos[youtube_videos.columns[3:]] = np.where(youtube_videos[youtube_videos.columns[3:]]==np.inf,0,youtube_videos[youtube_videos.columns[3:]]) ## inf

### y값 설정

#### 중요 지표 표준화

In [22]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate'] ## null값은 views가 0인 데이터

In [23]:
# null값 대체
youtube_videos[y_col] = youtube_videos[y_col].fillna(0) ## NaN
youtube_videos[y_col] = np.where(youtube_videos[y_col]==np.inf,0,youtube_videos[y_col]) ## inf

In [26]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(youtube_videos[y_col])

#### 다중 지표 결합

In [27]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [28]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
youtube_videos['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [29]:
# 전체 데이터 y값 빈도 확인
youtube_videos['y_label'].value_counts()

y_label
1    8141038
0     428473
Name: count, dtype: int64

In [30]:
del scaled_features, anomaly_scores

### 데이터 분할

In [31]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:-1]

In [35]:
# 데이터 분할
X = youtube_videos[x_col].drop(columns=y_col) ## y값 라벨링에 쓰인 지표 제거
y = youtube_videos['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
del X, y

In [37]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    6513105
0     342503
Name: count, dtype: int64
y_label
1    1627933
0      85970
Name: count, dtype: int64


### 언더샘플링

In [38]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

### 변수선택

t-test

In [39]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost', 'monetizedPlaybacks', 'adImpressions', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'positive_engage_rate', 'comment_to_like_rate', 'like_to_dislike_ratio', 'subscribers_conversion_rate', 'subscribers_gained_per_card_click', 'subscribers_gained_per_playlist_add', 'card_click_to_subscriber_conversion', 'subscribers_lost_per_playlist_remove', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'playback_based_cpm_rate

  a_zero_mean = a - mean
  d = mean1 - mean2
  estimate = m1-m2


In [47]:
len(selected_features_by_ttest)

74

Lasso

In [48]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]
print("Selected Features by Lasso:", selected_features_by_lasso)

Selected Features by Lasso: Index(['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'videosAddedToPlaylists', 'videosRemovedFromPlaylists',
       'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue',
       'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost',
       'monetizedPlaybacks', 'adImpressions', 'cardClickRate',
       'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions',
       'like_rate', 'comment_rate', 'dislike_rate', 'positive_engage_rate',
       'comment_to_like_rate', 'like_to_dislike_ratio',
       'subscribers_conversion_rate', 'subscribers_gained_per_playlist_add',
       'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate',
       'cpm_to_revenue_ratio', 'revenue_per_ad_impression',
       'playback_based_cpm_rate', 'revenue_per_playlist_add',
       'revenue_per_minute_watched', 'avg_view_duration_rate',
       'watched_time_rate', 'watched_r

In [50]:
len(selected_features_by_lasso)

55

In [53]:
## lasso 선택으로 제거된 변수
set(selected_features_by_ttest) - set(selected_features_by_lasso)

{'ad_impressions_per_card_click',
 'ad_impressions_per_playlist_add',
 'ad_playbacks_per_card_click',
 'ad_revenue_per_card_click',
 'cardClicks',
 'cardTeaserClicks',
 'card_click_to_subscriber_conversion',
 'card_to_teaser_click_rate',
 'estimatedMinutesWatched',
 'estimatedRedPartnerRevenue',
 'grossrevenue_per_card_click',
 'revenue_per_card_click',
 'revenue_per_red_minute_watched',
 'share_rate',
 'subscribers_gained_per_card_click',
 'subscribers_lost_per_playlist_remove',
 'watch_time_per_card_click',
 'watched_red_time_rate',
 'watched_view_rate'}

RandomForest

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# RandomForest 모델 학습 및 교차 검증
X_train_lasso_selected = X_train_resampled[selected_features_by_lasso]

model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(model, X_train_lasso_selected, y_train_resampled, cv=5, scoring='accuracy')

# 모델 학습
model.fit(X_train_lasso_selected, y_train_resampled)

# 피처 중요도 추출
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_features_by_lasso, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# 최종적으로 중요한 변수 선택 (예: 상위 10개 변수)
final_selected_features = list(feature_importances[feature_importances['importance'] > 0.005]['feature'])
print("Final Selected Features by RandomForest:", final_selected_features)

# 교차 검증 결과 출력
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

Final Selected Features by RandomForest: ['positive_engage_rate', 'playlist_addition_rate', 'cpm', 'revenue_per_ad_impression', 'revenue_per_playback', 'playlist_removal_rate', 'like_rate', 'playbackBasedCpm', 'videosAddedToPlaylists', 'views', 'likes', 'videosRemovedFromPlaylists', 'subscribersGained', 'watch_time_per_playlist_add', 'shares', 'watched_time_rate', 'revenue_per_red_view', 'estimatedRevenue', 'redViews', 'averageViewDuration', 'revenue_per_minute_watched', 'watch_time_loss_per_playlist_remove', 'avg_view_duration_rate', 'revenue_per_playlist_add', 'estimatedAdRevenue', 'watched_red_view_rate', 'estimatedRedMinutesWatched', 'grossRevenue', 'cpm_to_revenue_ratio', 'comment_rate', 'playlist_related_revenue_rate', 'ad_revenue_rate', 'comments', 'subscribers_conversion_rate', 'red_revenue_rate', 'ad_playbacks_per_playlist_add', 'monetizedPlaybacks', 'adImpressions']
Cross-Validation Accuracy: 0.99


In [58]:
final_selected_features

['positive_engage_rate',
 'playlist_addition_rate',
 'cpm',
 'revenue_per_ad_impression',
 'revenue_per_playback',
 'playlist_removal_rate',
 'like_rate',
 'playbackBasedCpm',
 'videosAddedToPlaylists',
 'views',
 'likes',
 'videosRemovedFromPlaylists',
 'subscribersGained',
 'watch_time_per_playlist_add',
 'shares',
 'watched_time_rate',
 'revenue_per_red_view',
 'estimatedRevenue',
 'redViews',
 'averageViewDuration',
 'revenue_per_minute_watched',
 'watch_time_loss_per_playlist_remove',
 'avg_view_duration_rate',
 'revenue_per_playlist_add',
 'estimatedAdRevenue',
 'watched_red_view_rate',
 'estimatedRedMinutesWatched',
 'grossRevenue',
 'cpm_to_revenue_ratio',
 'comment_rate',
 'playlist_related_revenue_rate',
 'ad_revenue_rate',
 'comments',
 'subscribers_conversion_rate',
 'red_revenue_rate',
 'ad_playbacks_per_playlist_add',
 'monetizedPlaybacks',
 'adImpressions']

In [56]:
feature_importances ## 0.005 기준으로 변수 선택

Unnamed: 0,feature,importance
26,positive_engage_rate,0.115805
52,playlist_addition_rate,0.072534
14,cpm,0.066794
35,revenue_per_ad_impression,0.056216
45,revenue_per_playback,0.054511
53,playlist_removal_rate,0.049415
23,like_rate,0.049097
13,playbackBasedCpm,0.048472
8,videosAddedToPlaylists,0.0455
0,views,0.033676


### 모델 기법 적용

RandomForest

In [59]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# # 랜덤 포레스트 모델 교차 검증
# cv_scores_rf = cross_val_score(rf_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

GradientBoosting

In [60]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# # 그라디언트 부스팅 모델 교차 검증
# cv_scores_gb = cross_val_score(gb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

XGBoost

In [61]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)

# # 그라디언트 부스팅 모델 교차 검증
# cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

### 모델 성능 평가

In [62]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test[final_selected_features])
y_pred_gb = gb_model.predict(X_test[final_selected_features])
y_pred_xgb = xgb_model.predict(X_test[final_selected_features])

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test[final_selected_features])[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test[final_selected_features])[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features])[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

RandomForest Test Accuracy: 0.98
GradientBoosting Test Accuracy: 0.96
XGBoost Test Accuracy: 0.99
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.99      0.84     85970
           1       1.00      0.98      0.99   1627933

    accuracy                           0.98   1713903
   macro avg       0.87      0.99      0.92   1713903
weighted avg       0.99      0.98      0.98   1713903

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.98      0.74     85970
           1       1.00      0.96      0.98   1627933

    accuracy                           0.96   1713903
   macro avg       0.79      0.97      0.86   1713903
weighted avg       0.98      0.96      0.97   1713903

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     85970
           1       1.00      0.9

### 모델 성능 개선

언더샘플링 데이터셋별 각 모델 학습

In [40]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np

# 언더샘플링 횟수 설정
n_iterations = 5

# 각 모델의 예측 확률 저장 리스트
rf_probs = []
gb_probs = []
xgb_probs = []
models = []

for i in range(n_iterations):
    # 언더샘플링 적용
    rus = RandomUnderSampler(random_state=i)  # 각기 다른 랜덤 시드 사용
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # 랜덤포레스트 모델 학습
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    rf_probs.append(rf_model.predict_proba(X_test[final_selected_features]))
    models.append(rf_model)

    # 그라디언트 부스팅 모델 학습
    gb_model = GradientBoostingClassifier(random_state=42)
    gb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    gb_probs.append(gb_model.predict_proba(X_test[final_selected_features]))
    models.append(gb_model)

    # XGBoost 모델 학습
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_resampled[final_selected_features], y_train_resampled)
    xgb_probs.append(xgb_model.predict_proba(X_test[final_selected_features]))
    models.append(xgb_model)

In [None]:
models

In [41]:
# 확률 평균을 통한 앙상블
rf_probs_avg = np.mean(np.array(rf_probs), axis=0)
gb_probs_avg = np.mean(np.array(gb_probs), axis=0)
xgb_probs_avg = np.mean(np.array(xgb_probs), axis=0)

# 최종 평균 확률 계산
ensemble_probs_avg = (rf_probs_avg + gb_probs_avg + xgb_probs_avg) / 3

# 평균 확률에 따라 최종 예측
y_pred_ensemble_avg = np.argmax(ensemble_probs_avg, axis=1)

# 성능 평가
from sklearn.metrics import accuracy_score
accuracy_ensemble_avg = accuracy_score(y_test, y_pred_ensemble_avg)
print(f"Ensemble Averaging Accuracy: {accuracy_ensemble_avg:.2f}")


Ensemble Averaging Accuracy: 0.98


In [45]:
from sklearn.metrics import classification_report

# 각 모델의 클래스별 성능 평가
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Class 0', 'Class 1']))

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=['Class 0', 'Class 1']))

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Class 0', 'Class 1']))

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble_avg, target_names=['Class 0', 'Class 1']))


Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 0       0.70      0.99      0.82     85970
     Class 1       1.00      0.98      0.99   1627933

    accuracy                           0.98   1713903
   macro avg       0.85      0.99      0.90   1713903
weighted avg       0.98      0.98      0.98   1713903


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

     Class 0       0.52      0.96      0.68     85970
     Class 1       1.00      0.95      0.98   1627933

    accuracy                           0.95   1713903
   macro avg       0.76      0.96      0.83   1713903
weighted avg       0.97      0.95      0.96   1713903


XGBoost Classification Report:
              precision    recall  f1-score   support

     Class 0       0.79      1.00      0.88     85970
     Class 1       1.00      0.99      0.99   1627933

    accuracy                           0.99   1713903
   macro avg     

### 실제데이터 결과 확인

In [49]:
# 각 모델의 예측 확률을 평균내어 결합
ensemble_probs = [model.predict_proba(youtube_videos[final_selected_features]) for model in models]
ensemble_avg_probs = np.mean(np.array(ensemble_probs), axis=0)

# 최종 확률에 따른 예측값 도출
y_pred_ensemble = np.argmax(ensemble_avg_probs, axis=1)

In [50]:
# 예측 결과 확인
youtube_videos['predict'] = y_pred_ensemble

In [54]:
youtube_videos[youtube_videos['predict']==0].groupby('video').aggregate()

Unnamed: 0,youtube_user_id,video,end_date,views,redViews,comments,likes,dislikes,shares,estimatedMinutesWatched,...,card_teaser_click_per_impression_rate,total_card_teaser_click_rate,card_conversion_rate,playlist_addition_rate,playlist_removal_rate,net_playlist_addition_rate,playlist_engagement_rate,playlist_related_revenue_rate,y_label,predict
70,6401da7ad746c60e1271fdd6,--0XOlJ3Lw4,2023-05-26,39,7,1,0,0,0,202,...,,0.0,,0.000000,0.000000,0.000000,0.000000,inf,1,0
72,6401da7ad746c60e1271fdd6,--0XOlJ3Lw4,2023-05-28,44,15,0,0,0,0,220,...,,0.0,,0.000000,0.000000,0.000000,0.000000,inf,1,0
100,6401da7ad746c60e1271fdd6,--0XOlJ3Lw4,2023-06-25,21,6,0,1,0,0,148,...,,0.0,,0.095238,0.095238,0.000000,0.190476,28.43203,1,0
101,6401da7ad746c60e1271fdd6,--0XOlJ3Lw4,2023-06-26,28,6,0,2,0,0,139,...,,0.0,,0.071429,0.107143,-0.035714,0.178571,51.57438,1,0
144,6401da7ad746c60e1271fdd6,--0XOlJ3Lw4,2023-08-08,64,20,3,0,0,0,372,...,,0.0,,0.000000,0.000000,0.000000,0.000000,inf,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8569349,63dbc0acee122e631991ca5b,zzza6bbJnMI,2023-07-14,7,1,0,0,0,0,74,...,,0.0,,0.000000,0.000000,0.000000,0.000000,inf,0,0
8569350,63dbc0acee122e631991ca5b,zzza6bbJnMI,2023-07-15,7,2,0,0,0,0,49,...,,0.0,,0.000000,0.000000,0.000000,0.000000,inf,1,0
8569410,63dbc0acee122e631991ca5b,zzza6bbJnMI,2023-10-21,1,0,0,0,0,0,5,...,,0.0,,0.000000,0.000000,0.000000,0.000000,,1,0
8569432,63dbc0acee122e631991ca5b,zzza6bbJnMI,2023-12-10,2,0,0,0,0,1,0,...,,0.0,,0.000000,0.000000,0.000000,0.000000,,1,0


In [None]:
youtube_videos

In [None]:
set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['channel_title'].unique())

array(['임삐나', nan, 'Mind Patting마음토닥', 'MINLEE 민리', '시골낭만아재',
       'OBL - 온라인 농부, 사자가 되다', 'abbapraise 아바프레이즈', '채림처럼firstcherry',
       '모하지연 MOHAJIYEON', 'Jeffreyxking', '콜드쉽 Coldsheep', 'kiu기우쌤',
       '수빙수tv sooBingsoo', '석시원 커플 SeokSiWon Couple', '너굴몬',
       'GMENCY 멘시의 마인크래프트', '앙찡', '코인덕 차트아지', '미니멀영어 Minimal English',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '日本ジヌ【니혼지누】ー韓国に関する全て', '뻘짓연구소',
       '벽돌할아버지 Brick grandpa', '북토크', '나연이즈백 LPGA Na Yeon Choi', '그롬마쉬TV',
       '뷰드름 유튜버 인씨', 'MerryMa 메리마', '쿜쿜쿜', "루다의 댄스 연구소 Ruda's Dance Lab",
       '빅민 GAME', '잼스기타', '축구 읽어주는 여자 쵱내', 'OSSC', '뛰뛰빵빵 김옥순', '돈냄새',
       '키나kkina', '이현우의 MLBTV', 'V I N 빈 ', '평범한 사업가', '하원장 강동현', '주피코',
       '목소리 연기자 유지컬'], dtype=object)

In [55]:
# 피처 중요도 추출
final_importances = xgb_model.feature_importances_
feature_importances_final = pd.DataFrame({'feature': final_selected_features, 'importance': final_importances})
feature_importances_final = feature_importances_final.sort_values(by='importance', ascending=False)

In [58]:
feature_importances_final

Unnamed: 0,feature,importance
1,videosAddedToPlaylists,0.173585
0,cpm,0.168345
4,videosRemovedFromPlaylists,0.108364
2,likes,0.099482
5,shares,0.092513
7,subscribersGained,0.06408
18,dislikes,0.05708
3,views,0.054972
17,comments,0.036138
9,redViews,0.034451
