In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

import os
from dotenv import load_dotenv

In [2]:
file_path = 'C:/py_src/awake/data/'

In [3]:
# 계정 / 콘텐츠 분석 데이터셋 불러오기
merge_df_users_fin = pd.read_csv(file_path + 'merge_df_users_final.csv', low_memory=False)
youtube_videos = pd.read_csv(file_path + 'youtube_videos_final.csv')

## 계정 데이터 분석

In [10]:
merge_df_users_fin.columns

Index(['youtube_user_id', 'date', 'channel_id', 'channel_title', 'phone_num',
       'report_user_id', 'published_at', 'viewCount', 'subscriberCount',
       'videoCount', 'yt_search_keyword', 'subscribers_gained', 'likes',
       'estimatedMinutesWatched', 'estimated_revenue', 'revenue_per_red_view',
       'positive_engage_rate', 'estimated_red_partner_revenue',
       'revenue_per_minute_watched', 'comments', 'shares', 'like_rate',
       'monetized_playbacks', 'gross_revenue', 'revenue_per_subscriber',
       'subscribers_lost', 'redViews', 'cpm', 'subscriber_increase_rate',
       'ad_revenue_rate', 'playback_rate', 'subscriber_view_time_rate',
       'red_revenue_rate', 'watched_view_rate', 'subscribers_conversion_rate',
       'avg_view_duration_rate', 'averageViewDuration', 'comment_rate',
       'subscriber_decrease_rate', 'SUBSCRIBED', 'subscribers_count',
       'unsubscribed_view_time_rate', 'subscriber_retention_rate',
       'like_to_dislike_ratio', 'ad_impressions', 'sub

### 데이터 분할

In [13]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:11]
x_col = merge_df_users_fin.columns[11:-1]

In [14]:
# 데이터 분할
X = merge_df_users_fin[x_col]
y = merge_df_users_fin['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    80302
0     4244
Name: count, dtype: int64
y_label
1    20096
0     1041
Name: count, dtype: int64


### 언더샘플링

In [16]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

### 모델 기법 적용

RandomForest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# 랜덤 포레스트 모델 교차 검증
cv_scores_rf = cross_val_score(rf_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

RandomForest Cross-Validation Accuracy: 0.98


GradientBoosting

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled, y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_gb = cross_val_score(gb_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

GradientBoosting Cross-Validation Accuracy: 0.98


XGBoost

In [21]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.98


### 모델 성능 평가

In [22]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test)[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

RandomForest Test Accuracy: 0.97
GradientBoosting Test Accuracy: 0.97
XGBoost Test Accuracy: 0.98
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      1041
           1       1.00      0.97      0.99     20096

    accuracy                           0.97     21137
   macro avg       0.82      0.98      0.88     21137
weighted avg       0.98      0.97      0.97     21137

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.98      0.75      1041
           1       1.00      0.97      0.98     20096

    accuracy                           0.97     21137
   macro avg       0.81      0.98      0.87     21137
weighted avg       0.98      0.97      0.97     21137

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.99      0.80      1041
           1       1.00      0.9

### 모델 성능 개선

언더샘플링 데이터셋별 각 모델 학습

In [23]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# 언더샘플링 횟수 설정
n_iterations = 10

# 각 모델의 예측 확률 저장 리스트
rf_probs = []
gb_probs = []
xgb_probs = []
models = []

for i in range(n_iterations):
    # 언더샘플링 적용
    rus = RandomUnderSampler(random_state=i)  # 각기 다른 랜덤 시드 사용
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # 랜덤포레스트 모델 학습
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)
    rf_probs.append(rf_model.predict_proba(X_test))
    models.append(rf_model)

    # 그라디언트 부스팅 모델 학습
    gb_model = GradientBoostingClassifier(random_state=42)
    gb_model.fit(X_train_resampled, y_train_resampled)
    gb_probs.append(gb_model.predict_proba(X_test))
    models.append(gb_model)

    # XGBoost 모델 학습
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_resampled, y_train_resampled)
    xgb_probs.append(xgb_model.predict_proba(X_test))
    models.append(xgb_model)

In [24]:
# 확률 평균을 통한 앙상블
rf_probs_avg = np.mean(np.array(rf_probs), axis=0)
gb_probs_avg = np.mean(np.array(gb_probs), axis=0)
xgb_probs_avg = np.mean(np.array(xgb_probs), axis=0)

# 최종 평균 확률 계산
ensemble_probs_avg = (rf_probs_avg + gb_probs_avg + xgb_probs_avg) / 3

# 평균 확률에 따라 최종 예측
y_pred_ensemble_avg = np.argmax(ensemble_probs_avg, axis=1)

# 성능 평가
from sklearn.metrics import accuracy_score
accuracy_ensemble_avg = accuracy_score(y_test, y_pred_ensemble_avg)
print(f"Ensemble Averaging Accuracy: {accuracy_ensemble_avg:.2f}")


Ensemble Averaging Accuracy: 0.98


In [25]:
from sklearn.metrics import classification_report

# 각 모델의 클래스별 성능 평가
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Class 0', 'Class 1']))

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=['Class 0', 'Class 1']))

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Class 0', 'Class 1']))

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble_avg, target_names=['Class 0', 'Class 1']))


Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 0       0.64      0.99      0.78      1041
     Class 1       1.00      0.97      0.99     20096

    accuracy                           0.97     21137
   macro avg       0.82      0.98      0.88     21137
weighted avg       0.98      0.97      0.97     21137


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

     Class 0       0.61      0.98      0.75      1041
     Class 1       1.00      0.97      0.98     20096

    accuracy                           0.97     21137
   macro avg       0.81      0.98      0.87     21137
weighted avg       0.98      0.97      0.97     21137


XGBoost Classification Report:
              precision    recall  f1-score   support

     Class 0       0.68      0.99      0.80      1041
     Class 1       1.00      0.98      0.99     20096

    accuracy                           0.98     21137
   macro avg     

### 실제데이터 결과 확인

In [26]:
x_col

Index(['subscribers_gained', 'likes', 'estimatedMinutesWatched',
       'estimated_revenue', 'revenue_per_red_view', 'positive_engage_rate',
       'estimated_red_partner_revenue', 'revenue_per_minute_watched',
       'comments', 'shares', 'like_rate', 'monetized_playbacks',
       'gross_revenue', 'revenue_per_subscriber', 'subscribers_lost',
       'redViews', 'cpm', 'subscriber_increase_rate', 'ad_revenue_rate',
       'playback_rate', 'subscriber_view_time_rate', 'red_revenue_rate',
       'watched_view_rate', 'subscribers_conversion_rate',
       'avg_view_duration_rate', 'averageViewDuration', 'comment_rate',
       'subscriber_decrease_rate', 'SUBSCRIBED', 'subscribers_count',
       'unsubscribed_view_time_rate', 'subscriber_retention_rate',
       'like_to_dislike_ratio', 'ad_impressions', 'subscribed_view_rate'],
      dtype='object')

In [27]:
# 각 모델의 예측 확률을 평균내어 결합
ensemble_probs = [model.predict_proba(merge_df_users_fin[x_col]) for model in models]
ensemble_avg_probs = np.mean(np.array(ensemble_probs), axis=0)

# 최종 확률에 따른 예측값 도출
y_pred_ensemble = np.argmax(ensemble_avg_probs, axis=1)

In [31]:
# 예측 결과 확인
merge_df_users_fin['predict'] = y_pred_ensemble
pred_result_df = merge_df_users_fin.groupby(['youtube_user_id'])['predict'].value_counts().reset_index()
fraud_user_id = list(pred_result_df[pred_result_df['predict']==0][pred_result_df[pred_result_df['predict']==0]['count']>=40]['youtube_user_id'].unique()) ## 이상치라고 생각되는 계정 확인

In [227]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['channel_title'].unique()

array(['임삐나', '0', 'Mind Patting마음토닥', 'MINLEE 민리', '시골낭만아재',
       'OBL - 온라인 농부, 사자가 되다', 'abbapraise 아바프레이즈', '채림처럼firstcherry',
       '모하지연 MOHAJIYEON', 'Jeffreyxking', '콜드쉽 Coldsheep', 'kiu기우쌤',
       '수빙수tv sooBingsoo', '석시원 커플 SeokSiWon Couple', '너굴몬',
       'GMENCY 멘시의 마인크래프트', '앙찡', '코인덕 차트아지', '미니멀영어 Minimal English',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '日本ジヌ【니혼지누】ー韓国に関する全て', '뻘짓연구소',
       '벽돌할아버지 Brick grandpa', '북토크', '나연이즈백 LPGA Na Yeon Choi', '그롬마쉬TV',
       '뷰드름 유튜버 인씨', 'MerryMa 메리마', '쿜쿜쿜', "루다의 댄스 연구소 Ruda's Dance Lab",
       '빅민 GAME', '잼스기타', '축구 읽어주는 여자 쵱내', 'OSSC', '뛰뛰빵빵 김옥순', '돈냄새',
       '키나kkina', '이현우의 MLBTV', 'V I N 빈 ', '평범한 사업가', '하원장 강동현', '주피코',
       '목소리 연기자 유지컬'], dtype=object)

In [36]:
# 피처 중요도 추출
final_importances = xgb_model.feature_importances_
feature_importances_final = pd.DataFrame({'feature': x_col, 'importance': final_importances})
feature_importances_final = feature_importances_final.sort_values(by='importance', ascending=False)

In [37]:
feature_importances_final

Unnamed: 0,feature,importance
3,estimated_revenue,0.55145
7,revenue_per_minute_watched,0.075226
0,subscribers_gained,0.074242
5,positive_engage_rate,0.063512
4,revenue_per_red_view,0.034143
2,estimatedMinutesWatched,0.02323
15,redViews,0.015455
11,monetized_playbacks,0.013301
23,subscribers_conversion_rate,0.010322
30,unsubscribed_view_time_rate,0.009862


## 콘텐츠 데이터 분석

### 데이터 분할

In [50]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:-1]

In [52]:
# 데이터 분할
X = youtube_videos[x_col] ## y값 라벨링에 쓰인 지표 제거
y = youtube_videos['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# del X, y

In [53]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    6513105
0     342503
Name: count, dtype: int64
y_label
1    1627933
0      85970
Name: count, dtype: int64


### 언더샘플링

In [54]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

### 모델 기법 적용

RandomForest

In [55]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# # 랜덤 포레스트 모델 교차 검증
# cv_scores_rf = cross_val_score(rf_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

GradientBoosting

In [56]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled, y_train_resampled)

# # 그라디언트 부스팅 모델 교차 검증
# cv_scores_gb = cross_val_score(gb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

XGBoost

In [57]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# # 그라디언트 부스팅 모델 교차 검증
# cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled[final_selected_features], y_train_resampled, cv=5, scoring='accuracy')
# print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

### 모델 성능 평가

In [58]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test)[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

RandomForest Test Accuracy: 0.98
GradientBoosting Test Accuracy: 0.96
XGBoost Test Accuracy: 0.99
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.74      1.00      0.85     85970
           1       1.00      0.98      0.99   1627933

    accuracy                           0.98   1713903
   macro avg       0.87      0.99      0.92   1713903
weighted avg       0.99      0.98      0.98   1713903

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.98      0.73     85970
           1       1.00      0.96      0.98   1627933

    accuracy                           0.96   1713903
   macro avg       0.79      0.97      0.86   1713903
weighted avg       0.98      0.96      0.97   1713903

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     85970
           1       1.00      0.9

### 모델 성능 개선

언더샘플링 데이터셋별 각 모델 학습

In [59]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np

# 언더샘플링 횟수 설정
n_iterations = 5

# 각 모델의 예측 확률 저장 리스트
rf_probs = []
gb_probs = []
xgb_probs = []
models = []

for i in range(n_iterations):
    # 언더샘플링 적용
    rus = RandomUnderSampler(random_state=i)  # 각기 다른 랜덤 시드 사용
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # 랜덤포레스트 모델 학습
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_resampled, y_train_resampled)
    rf_probs.append(rf_model.predict_proba(X_test))
    models.append(rf_model)

    # 그라디언트 부스팅 모델 학습
    gb_model = GradientBoostingClassifier(random_state=42)
    gb_model.fit(X_train_resampled, y_train_resampled)
    gb_probs.append(gb_model.predict_proba(X_test))
    models.append(gb_model)

    # XGBoost 모델 학습
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_resampled, y_train_resampled)
    xgb_probs.append(xgb_model.predict_proba(X_test))
    models.append(xgb_model)

In [60]:
# 확률 평균을 통한 앙상블
rf_probs_avg = np.mean(np.array(rf_probs), axis=0)
gb_probs_avg = np.mean(np.array(gb_probs), axis=0)
xgb_probs_avg = np.mean(np.array(xgb_probs), axis=0)

# 최종 평균 확률 계산
ensemble_probs_avg = (rf_probs_avg + gb_probs_avg + xgb_probs_avg) / 3

# 평균 확률에 따라 최종 예측
y_pred_ensemble_avg = np.argmax(ensemble_probs_avg, axis=1)

# 성능 평가
from sklearn.metrics import accuracy_score
accuracy_ensemble_avg = accuracy_score(y_test, y_pred_ensemble_avg)
print(f"Ensemble Averaging Accuracy: {accuracy_ensemble_avg:.2f}")


Ensemble Averaging Accuracy: 0.98


In [61]:
from sklearn.metrics import classification_report

# 각 모델의 클래스별 성능 평가
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Class 0', 'Class 1']))

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=['Class 0', 'Class 1']))

print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Class 0', 'Class 1']))

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble_avg, target_names=['Class 0', 'Class 1']))


Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 0       0.74      1.00      0.85     85970
     Class 1       1.00      0.98      0.99   1627933

    accuracy                           0.98   1713903
   macro avg       0.87      0.99      0.92   1713903
weighted avg       0.99      0.98      0.98   1713903


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

     Class 0       0.59      0.98      0.73     85970
     Class 1       1.00      0.96      0.98   1627933

    accuracy                           0.96   1713903
   macro avg       0.79      0.97      0.86   1713903
weighted avg       0.98      0.96      0.97   1713903


XGBoost Classification Report:
              precision    recall  f1-score   support

     Class 0       0.82      1.00      0.90     85970
     Class 1       1.00      0.99      0.99   1627933

    accuracy                           0.99   1713903
   macro avg     

### 실제데이터 결과 확인

In [63]:
# 각 모델의 예측 확률을 평균내어 결합
ensemble_probs = [model.predict_proba(youtube_videos[x_col]) for model in models]
ensemble_avg_probs = np.mean(np.array(ensemble_probs), axis=0)

# 최종 확률에 따른 예측값 도출
y_pred_ensemble = np.argmax(ensemble_avg_probs, axis=1)

# 예측 결과
youtube_videos['predict'] = y_pred_ensemble

In [128]:
# 예측 결과 확인
result_video_df = youtube_videos.groupby('video')['predict'].value_counts().reset_index()

In [129]:
# 콘텐츠 일일 데이터 부족 확인
video_id_shortage = list(result_video_df.groupby('video')['count'].sum()[result_video_df.groupby('video')['count'].sum() < 10].reset_index()['video'].unique())
len(video_id_shortage)

21450

In [142]:
# 콘텐츠 일일 데이터 중 이상치 1개라도 있는 콘텐츠 및 계정 확인
video_id_contains = list(result_video_df[result_video_df['predict']==0]['video'].unique())
user_id_video_contains = list(youtube_videos[youtube_videos['video'].isin(result_video_df[result_video_df['predict']==0]['video'])]['youtube_user_id'].unique())
print(len(video_id_contains))
print(len(user_id_video_contains))

52646
249


In [165]:
# 이상치 콘텐츠 확인 - 콘텐츠 일일 데이터 중 10% 이상 이상치 데이터 확인
result_video_df_outlier = pd.merge(result_video_df[result_video_df['predict']==0].reset_index(drop=True),
                                   result_video_df.groupby('video')['count'].sum().reset_index(),
                                   how='left', on='video')
video_id_outlier = list(result_video_df_outlier[(result_video_df_outlier['count_x'] / result_video_df_outlier['count_y']) > 0.1]['video'].unique())

In [173]:
# 이상치 콘텐츠가 1개라도 있는 계정 확인
len(youtube_videos[youtube_videos['video'].isin(video_id_outlier)]['youtube_user_id'].unique())

242

In [204]:
# 이상치 콘텐츠가 20% 이상 있는 계정 확인
result_user_df_outlier = pd.merge(youtube_videos[youtube_videos['video'].isin(video_id_outlier)].groupby('youtube_user_id')['video'].count().reset_index(),
                                  youtube_videos.groupby(['youtube_user_id'])['video'].count().reset_index(), how='left', on='youtube_user_id')
user_id_outlier = list(result_user_df_outlier[(result_user_df_outlier['video_x'] / result_user_df_outlier['video_y']) > 0.2]['youtube_user_id'].unique())

In [231]:
fraud_video_user_id = list(youtube_videos[youtube_videos['youtube_user_id'].isin(user_id_outlier)]['youtube_user_id'].unique())
len(fraud_video_user_id)

69

## 분석 결과 확인

In [215]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['channel_title'].unique()

array(['임삐나', '0', 'Mind Patting마음토닥', 'MINLEE 민리', '시골낭만아재',
       'OBL - 온라인 농부, 사자가 되다', 'abbapraise 아바프레이즈', '채림처럼firstcherry',
       '모하지연 MOHAJIYEON', 'Jeffreyxking', '콜드쉽 Coldsheep', 'kiu기우쌤',
       '수빙수tv sooBingsoo', '석시원 커플 SeokSiWon Couple', '너굴몬',
       'GMENCY 멘시의 마인크래프트', '앙찡', '코인덕 차트아지', '미니멀영어 Minimal English',
       '프롬수지 fromsuzy', 'fromsuzy 프롬수지', '日本ジヌ【니혼지누】ー韓国に関する全て', '뻘짓연구소',
       '벽돌할아버지 Brick grandpa', '북토크', '나연이즈백 LPGA Na Yeon Choi', '그롬마쉬TV',
       '뷰드름 유튜버 인씨', 'MerryMa 메리마', '쿜쿜쿜', "루다의 댄스 연구소 Ruda's Dance Lab",
       '빅민 GAME', '잼스기타', '축구 읽어주는 여자 쵱내', 'OSSC', '뛰뛰빵빵 김옥순', '돈냄새',
       '키나kkina', '이현우의 MLBTV', 'V I N 빈 ', '평범한 사업가', '하원장 강동현', '주피코',
       '목소리 연기자 유지컬'], dtype=object)

In [234]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_video_user_id)]['channel_title'].unique()

array(['Ella', '0', '찌늉', 'hyeppening 혜프닝', 'KIMBEE 킴비', '지니원장의피부톡톡',
       '흙회장', 'Mind Patting마음토닥', 'MINLEE 민리', '시골낭만아재', '황나겸', '래아TV',
       'Seol-A 라이더 설아', 'SATUR 세터업', '이숲soop', '래띠 LAETI',
       'OBL - 온라인 농부, 사자가 되다', '시리얼 Sireal', 'ORlGN 오리진',
       'abbapraise 아바프레이즈', '잉툰TV- 만화로 쉽게 영어배우자', '모하지연 MOHAJIYEON',
       '부반TV_부에 반하다', '김두부', '라나제이베이킹Lana J', 'Jeffreyxking',
       '바라던 바다 BADACHANNEL', '월텍남 - 월스트리트 테크남', '은는이가', '비됴클래스',
       '어웨이커 | 크리에이터 이코노미', '석시원 커플 SeokSiWon Couple',
       '태권민국_Captain Master', '디지털생활제안', '미니멀영어 Minimal English',
       '청어람ARMC', 'Ood 오드', '황헬린 탈출기', '보미름', '비제TV', '소피요가 Sophie Yoga',
       '정가거부', '중년독수리의 대리여행', '북토크', '나연이즈백 LPGA Na Yeon Choi',
       '니들needle', '그롬마쉬TV', '복지다있소', '세계여행 테리로그 TERRYLOG', 'AllaproTV',
       '스타트업잡스', '청도시네마', '김퍼프PUFF', '쿜쿜쿜', '법무법인 슈가스퀘어', '닷츠 DOTS',
       '돈냄새', '에피코딩', '키나kkina', '이현우의 MLBTV', '슬기런바디 Run Body',
       'Lizzy리지', '자수의숲jasooforest', '평범한 사업가', '하원장 강동현', '주피코'],
      dtype=

In [255]:
# 계정 이상치, 콘텐츠 정상
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique())
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique()))

['임삐나' '0' '채림처럼firstcherry' '콜드쉽 Coldsheep' 'kiu기우쌤' '수빙수tv sooBingsoo'
 '너굴몬' 'GMENCY 멘시의 마인크래프트' '앙찡' '코인덕 차트아지' '프롬수지 fromsuzy' 'fromsuzy 프롬수지'
 '日本ジヌ【니혼지누】ー韓国に関する全て' '뻘짓연구소' '벽돌할아버지 Brick grandpa' '뷰드름 유튜버 인씨'
 'MerryMa 메리마' "루다의 댄스 연구소 Ruda's Dance Lab" '빅민 GAME' '잼스기타'
 '축구 읽어주는 여자 쵱내' 'OSSC' '뛰뛰빵빵 김옥순' 'V I N 빈 ' '목소리 연기자 유지컬']
25


In [256]:
# 계정 이상치, 콘텐츠 이상치
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))]['channel_title'].unique())
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))]['channel_title'].unique()))

['Mind Patting마음토닥' '0' 'MINLEE 민리' '시골낭만아재' 'OBL - 온라인 농부, 사자가 되다'
 'abbapraise 아바프레이즈' '모하지연 MOHAJIYEON' 'Jeffreyxking'
 '석시원 커플 SeokSiWon Couple' '미니멀영어 Minimal English' '북토크'
 '나연이즈백 LPGA Na Yeon Choi' '그롬마쉬TV' '쿜쿜쿜' '돈냄새' '키나kkina' '이현우의 MLBTV'
 '평범한 사업가' '하원장 강동현' '주피코']
20


In [257]:
# 계정 정상, 콘텐츠 정상
print(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id + fraud_user_id))]['channel_title'].unique())
print(len(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id + fraud_user_id))]['channel_title'].unique()))

['성팩 SPAAK' '0' '고도람 Go!doram' '세남자 물고기' '띠혜 ddihye' '소리미의 신화방송'
 '름쿠 ᴘʟᴀʏʟɪꜱᴛ' '달고캠핑' '고군 Gohgoon' '루깬미' '나는 불독' '바른걸음연구소' '임퓨의 비트메이킹 클래스'
 '차박씬' '오늘도희다 HEEDA' '혜성네일_comet' '낭만아저씨코디TV' 'Kevin Factory케빈팩토리'
 'assesta' 'JinBlog 진블로그' '마파TV' '오디디 코미디' '담순언니 Twins Vlog' '유익한 균튜버'
 '차세이CHASEYi' '1분뉴스' '배우GO' 'the sence' '성한준' '다먹어라이언'
 '하부유튜브 Minor / (Lower) YouTube' 'sa lly' '수집의 수집' '굥플레이스 맛집투어'
 '모염 moyeom' '도아이 Doh-I' '서유 SEOYU DANCE' '윈플즈TV' 'DDONIE 또니 / 러브크레센트'
 '슈로시안 SUROSIAN' '채찍단' '원의 독백' '라이프에이치 Life.H' '김밈서'
 '오엔티엘 패션 / ONTL FASHION' '드론브이로그 DroneVlog' '에디레일 Eddy Rails' 'gahyun 가현'
 '시현하다 RECORDERS' '데일리 슬슬' '오토컨테이너 스튜디오' 'Mein 미인' '군대위키' '미디하는남자'
 '고기,요정 MeatPixie' '유경몬' '여정을떠난여정' '찌수연' '인썸니아TV' '팀브라더스' '이고 EGO' '콤므'
 '윤새 Yoonsae' '맛집남자 foodman' '한나임한나Hannaim' '수란쿤' '라이라마' '카이바군'
 '-mentalholder 멘탈홀더 tv' 'Suzevi ASMR' '사부작장제소sabujakfarrier' '지미 geemi.'
 '아무튼 리뷰어즈' '도도' '퓨츠앙' '기자 황덕현 KIJA HWANG' '엔트리뷰 [누구나 재미있는 테크리뷰]' '만능혁키'
 '콜로니' '꽃 읽어주는 남자 kkotnam' '코드맨TV' 'Dalhae달달해' '두남자 토익

In [258]:
# 계정 정상, 콘텐츠 이상치
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique())
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique()))

['Ella' '0' '찌늉' 'hyeppening 혜프닝' 'KIMBEE 킴비' '지니원장의피부톡톡' '흙회장' '황나겸'
 '래아TV' 'Seol-A 라이더 설아' 'SATUR 세터업' '이숲soop' '래띠 LAETI' '시리얼 Sireal'
 'ORlGN 오리진' '잉툰TV- 만화로 쉽게 영어배우자' '부반TV_부에 반하다' '김두부' '라나제이베이킹Lana J'
 '바라던 바다 BADACHANNEL' '월텍남 - 월스트리트 테크남' '은는이가' '비됴클래스' '어웨이커 | 크리에이터 이코노미'
 '태권민국_Captain Master' '디지털생활제안' '청어람ARMC' 'Ood 오드' '황헬린 탈출기' '보미름' '비제TV'
 '소피요가 Sophie Yoga' '정가거부' '중년독수리의 대리여행' '니들needle' '복지다있소'
 '세계여행 테리로그 TERRYLOG' 'AllaproTV' '스타트업잡스' '청도시네마' '김퍼프PUFF' '법무법인 슈가스퀘어'
 '닷츠 DOTS' '에피코딩' '슬기런바디 Run Body' 'Lizzy리지' '자수의숲jasooforest']
47
