In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import os
from dotenv import load_dotenv

## MongoDB 연동

In [2]:
# .env 파일에서 환경 변수 로드
load_dotenv('C:/py_src/awake/env')

# 환경 변수에서 MongoDB 연결 정보 가져오기
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [3]:
# MongoDB 연결 URL
url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# 연결 확인
from pymongo.errors import ServerSelectionTimeoutError

try:
    # 데이터베이스 목록을 가져오는 쿼리
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# Test DB 선택
db = client.Test

# Test DB 컬렉션 확인
collections = db.list_collection_names()

In [6]:
# # 단위 환산
# def convert_bytes(num):
#     for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
#         if num < 1024.0:
#             return f"{num:.2f} {x}"
#         num /= 1024.0

In [7]:
# for collection_name in collections:
#     # 컬렉션 통계 정보 가져오기
#     stats = db.command("collStats", collection_name)

#     # 컬렉션의 크기와 문서 수 출력    
#     print(f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}")
#     print(f"Collection '{collection_name}' document count: {stats['count']}")
#     print(f"Total index size: {convert_bytes(stats['totalIndexSize'])}")
#     print('--------------------------------------------------------------------')

In [8]:
## 인스타 관련 데이터 제외
## 'youtube_videos'/'youtube_datas' 따로 수집
## 'youtube_report_v2'/'youtube_report' 날짜 정보불분명, 다른 테이블 정보와 중복
collections_need = [
    'youtube_users',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    ]

In [9]:
# 데이터 로드
youtube_dict={}
for collection_name in collections_need:

    youtube_dict[collection_name] = pd.DataFrame(list(db[collection_name].find()))
    
    print(collection_name)
    print(youtube_dict[collection_name].columns)
    print(">> Success")
    print("--------------------------------------")
    print("")

youtube_users
Index(['_id', 'country', 'phone_num', 'kakao_nick', 'kakao_account_id',
       'user_kind', 'created_at', '__v', 'channel_title', 'channel_id',
       'thumbnail_url', 'published_at', 'subscriber_count', 'is_rev_saved',
       'is_subs_saved', 'updated_at', 'brandingSettings', 'contentDetails',
       'contentOwnerDetails', 'etag', 'id', 'kind', 'snippet', 'statistics',
       'status', 'topicDetails', 'connected', 'refresh_error', 'localizations',
       'ads_array', 'age', 'gender', 'region_array', 'is_active',
       'category_array', 'account_type', 'children_age_array',
       'is_accept_suggestion', 'is_add_info', 'pet_array', 'user_id',
       'report_user_id'],
      dtype='object')
>> Success
--------------------------------------

youtube_channel_locations
Index(['_id', 'locations', 'youtube_user_id', 'end_date', 'created_at',
       'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_daily_channel_basics
Index(

In [10]:
# 키값 형식 맞추기
for key, df in youtube_dict.items():
    # 각 딕셔너리의 value가 DataFrame인지 확인
    if isinstance(df, pd.DataFrame):
        # '_id' 컬럼이 있는지 확인하고 문자열로 변환
        if '_id' in df.columns:
            df['_id'] = df['_id'].astype(str)
        
        # 'youtube_user_id' 컬럼이 있는지 확인하고 문자열로 변환
        if 'youtube_user_id' in df.columns:
            df['youtube_user_id'] = df['youtube_user_id'].astype(str)

        # 변경된 DataFrame을 다시 딕셔너리에 저장
        youtube_dict[key] = df

In [11]:
del df

In [12]:
youtube_dict.keys()

dict_keys(['youtube_users', 'youtube_channel_locations', 'youtube_daily_channel_basics'])

## 데이터 불러오기

### 계정 데이터

#### youtube_users

In [13]:
youtube_users = youtube_dict['youtube_users']

In [14]:
len(youtube_users['channel_id'].unique())
## 유튜버 계정 : 883개

883

In [15]:
# 필요컬럼추출
youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']] 
## published_at : 유튜브 가입일
## 'statistics' 컬럼의 'subscriberCount' 정보와 'subscriber_count' 컬럼 정보가 다름 --> 구독자 수
## 'channel_id' 컬럼, 'contentDetails' 컬럼의 'uploads' 같은 정보

In [16]:
# cast 하여 최종 데이터셋 생성
youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
youtube_users = youtube_users.dropna(how = 'all')

# null 값 0으로 대체
youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)
youtube_users['viewCount'] = youtube_users['viewCount'].astype(int)
youtube_users['subscriberCount'] = youtube_users['subscriberCount'].astype(int)
youtube_users['videoCount'] = youtube_users['videoCount'].astype(int)

youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

# id가 null인 계정 제거
youtube_users = youtube_users[~youtube_users['channel_id'].isnull()].reset_index(drop=True)

# channel_id별로 그룹화하여 null값을 해당 그룹 내에서 채우기
youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

# 계정 지표 모두 0인 계정 삭제
youtube_users = youtube_users[youtube_users[['viewCount', 'subscriberCount', 'videoCount']].sum(axis=1)!=0].reset_index(drop=True)

  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


#### youtube_channel_locations
- 채널 구독자 위치

In [17]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']
# youtube_channel_locations = pd.read_csv(file_path + 'raw_data/youtube_channel_locations.csv', low_memory=False)

In [18]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [19]:
# 필요컬럼추출
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

In [20]:
# melt, cast 하여 최종데이터셋 생성
youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

# cast 하여 최종 데이터셋 생성
youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations[youtube_channel_locations.columns[3:]].apply(sum,axis=1)!=0] ## 모두 0인 행 제거
youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

In [21]:
# 계정별 일자별 지표 계산
youtube_channel_locations = youtube_channel_locations.groupby(['youtube_user_id', 'end_date']).agg({'views': 'sum',
                                                                                                    'estimatedMinutesWatched': 'sum',
                                                                                                    'averageViewDuration': 'mean',
                                                                                                    'averageViewPercentage': 'mean'}).reset_index()

In [22]:
# 시간 분 단위로 변경
youtube_channel_locations['averageViewDuration'] = youtube_channel_locations['averageViewDuration'] / 60

In [23]:
# 날짜형식 변경
youtube_channel_locations = youtube_channel_locations.rename(columns={'end_date':'date'})
youtube_channel_locations['date'] = youtube_channel_locations['date'].astype(str)

#### youtube_daily_channel_basics

In [24]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [25]:
len(youtube_daily_channel_basics['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [26]:
# 필요컬럼추출
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## 'end_date' --> 'day' 컬럼 활용
youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True) ## melt

In [27]:
# daily_basics 컬럼 형식 맞추기 --> 딕셔너리 값들 cast 형태로 맞추기
youtube_daily_channel_basics_cast = []
for item in youtube_daily_channel_basics['daily_basics']:
    if isinstance(item, list):
        youtube_daily_channel_basics_cast.extend(item)
    else:
        youtube_daily_channel_basics_cast.append(item)

youtube_daily_channel_basics_cast = pd.json_normalize(youtube_daily_channel_basics_cast)

In [28]:
# melt, cast 하여 최종 데이터셋 생성
youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0) ## null값 0으로 대체
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[youtube_daily_channel_basics.columns[3:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)

del youtube_daily_channel_basics_cast

In [29]:
# 날짜형식 변경
youtube_daily_channel_basics = youtube_daily_channel_basics.rename(columns={'day':'date'})
youtube_daily_channel_basics['date'] = youtube_daily_channel_basics['date'].astype(str)

# 시간 분 단위로 변경
youtube_daily_channel_basics['averageViewDuration'] = youtube_daily_channel_basics['averageViewDuration'] / 60

In [30]:
# # 최종데이터셋 기준 필요 계정 수 추출 - report, report_v2 제외
# youtube_user_id_outer = list(set(list(youtube_channel_locations['youtube_user_id']) +
#                                  list(youtube_daily_channel_basics['youtube_user_id'])))
# print('youtube_user_id_outer', len(youtube_user_id_outer))
# ## youtube_user_id 모두 포함 912개

# youtube_user_id_inner = list(set(youtube_channel_locations['youtube_user_id']) &
#                              set(youtube_daily_channel_basics['youtube_user_id']))

# print('youtube_user_id_inner',len(youtube_user_id_inner))
# ## youtube_user_id  공통 포함 250개

#### youtube_datas

In [31]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/youtube_user_id_inner.csv')

In [32]:
collection = db['youtube_datas']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            # "end_date": {
            # "$gte": first_date,
            # "$lte": last_date
            # }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "data_created_at": 1
        }
    },
    {
        "$project": {
            'youtube_user_id' : 1, 
            'data_created_at' : 1, 
            'published_at' : 1, 
            'channel_id' : 1, 
            'channel_title' : 1, 
            'yt_search_keyword' : 1, 
            'subscribed_status' : 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_datas = pd.DataFrame(result)

In [33]:
len(youtube_datas['youtube_user_id'].unique())

249

In [34]:
# 컬럼 순서 정리
need_col = ['youtube_user_id', 'data_created_at', 'published_at', 'channel_id', 'channel_title', 'yt_search_keyword', 'subscribed_status']
youtube_datas = youtube_datas[need_col]

In [35]:
# cast 하여 데이터셋 생성
youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
youtube_datas = youtube_datas.drop(['subscribed_status'],axis=1)

youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0) # null값 0으로 대체
youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0] ## 모두 0인 행 제거

youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

In [36]:
# 날짜 기준 필터링
opt_date = (youtube_datas['data_created_at'] >= datetime(2023, 3, 26)) & (youtube_datas['data_created_at'] <= datetime(2024, 5, 3))
youtube_datas = youtube_datas[opt_date].sort_values(['youtube_user_id', 'data_created_at']).reset_index(drop=True)

In [37]:
# 날짜형식 변경
youtube_datas = youtube_datas.rename(columns={'data_created_at':'date'})
youtube_datas['date'] = youtube_datas['date'].astype(str)

### 콘텐츠 데이터

#### youtube_videos

In [7]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/youtube_user_id_inner.csv')

In [8]:
collection = db['youtube_videos']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            "videos": {"$ne": []},
            "end_date": {
                "$gte": datetime(2023, 3, 26),
                "$lte": datetime(2024, 5, 3)
            }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "end_date": 1
        }
    },
    {
        "$project": {
            "youtube_user_id": 1,
            "end_date": 1,
            "videos": 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_videos = pd.DataFrame(result)

In [10]:
len(youtube_videos['youtube_user_id'].unique())

249

In [47]:
# melt, cast하여 최종데이터셋 생성
# melt
youtube_videos = youtube_videos.explode(['videos']).reset_index(drop=True)

#cast
youtube_videos = pd.concat([youtube_videos, pd.json_normalize(youtube_videos['videos'])],axis=1)

youtube_videos = youtube_videos.drop(['_id','videos'],axis=1)
youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

In [23]:
# # 날짜형식 변경
# youtube_videos = youtube_videos.rename(columns={'end_date':'date'})
# youtube_videos['date'] = youtube_videos['date'].astype(str)

In [97]:
# youtube_videos = pd.read_csv('C:/py_src/awake/data/youtube_videos.csv')

In [98]:
# youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
# youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
# youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

## 데이터 전처리

### 계정 데이터

데이터 통합

In [40]:
merge_df_users_fin = pd.merge(youtube_users,youtube_datas,how='left',on='channel_id')
need_col = ['youtube_user_id', 'date', 'channel_id', 'channel_title_x', 'published_at_x', 'phone_num', 'yt_search_keyword', 'viewCount', 'subscriberCount', 'videoCount','UNSUBSCRIBED', 'SUBSCRIBED']
merge_df_users_fin = merge_df_users_fin[need_col]
merge_df_users_fin = merge_df_users_fin.rename(columns={'channel_title_x':'channel_title','published_at_x':'published_at'})
merge_df_users_fin = merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isnull()].reset_index(drop=True)

merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_channel_locations,how='left',on=['youtube_user_id','date'])
merge_df_users_fin = merge_df_users_fin.drop(['views'],axis=1)

youtube_daily_channel_basics = youtube_daily_channel_basics.drop(['annotationClickThroughRate','annotationCloseRate'],axis=1)
merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_daily_channel_basics,how='left',on=['youtube_user_id','date'])

In [41]:
# null값 대체
merge_df_users_fin['estimatedMinutesWatched_x'] = merge_df_users_fin['estimatedMinutesWatched_x'].fillna(merge_df_users_fin['estimatedMinutesWatched_y'])
merge_df_users_fin['averageViewDuration_x'] = merge_df_users_fin['averageViewDuration_x'].fillna(merge_df_users_fin['averageViewDuration_y'])

merge_df_users_fin = merge_df_users_fin.drop(['estimatedMinutesWatched_y','averageViewDuration_y'],axis=1)
merge_df_users_fin = merge_df_users_fin.rename(columns={'estimatedMinutesWatched_x':'estimatedMinutesWatched','averageViewDuration_x':'averageViewDuration'})

# 영상 시청 시간 합 / 영상 재생 시간 합 = 영상 시청 비율 대체
merge_df_users_fin['averageViewPercentage'] = np.where(merge_df_users_fin['averageViewPercentage'].isnull(), 
                                                       merge_df_users_fin['estimatedMinutesWatched'] / (merge_df_users_fin['averageViewDuration'] * merge_df_users_fin['views']),
                                                       merge_df_users_fin['averageViewPercentage'])
merge_df_users_fin['averageViewPercentage'] = merge_df_users_fin['averageViewPercentage'].fillna(0)

In [42]:
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
merge_df_users_fin[exchange_rate_col] = merge_df_users_fin[exchange_rate_col] * 1322.42

In [43]:
# 버그로 사용된 수치값 대체
merge_df_users_fin['likes'] = np.where(merge_df_users_fin['likes'] < 0, 0, merge_df_users_fin['likes'])
merge_df_users_fin['dislikes'] = np.where(merge_df_users_fin['dislikes'] < 0, 0, merge_df_users_fin['dislikes'])

### 콘텐츠 데이터

In [99]:
# 최종 콘텐츠 분석 데이터셋
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

In [100]:
# 잘못된값 처리
youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                               youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                               youtube_videos['estimatedRevenue'])

In [101]:
# 버그로 사용된 수치값 대체
youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

## 주제1

### 계정 데이터

#### 파생변수

In [47]:
# y값 파생변수
merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribersGained'] - merge_df_users_fin['subscribersLost'] ## 구독자 순증가
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['views'] ## 조회수당 수익
merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['grossRevenue'] / merge_df_users_fin['adImpressions'] ## 1회 광고노출당 총수익

In [48]:
# 파생변수1 - 참여도 관련
merge_df_users_fin['like_rate'] = merge_df_users_fin['likes'] / merge_df_users_fin['views'] ## 좋아요 비율 
merge_df_users_fin['comment_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['views'] ## 댓글 비율
merge_df_users_fin['share_rate'] = merge_df_users_fin['shares'] / merge_df_users_fin['views'] ## 공유 비율  
merge_df_users_fin['dislike_rate'] = merge_df_users_fin['dislikes'] / merge_df_users_fin['views'] ## 싫어요 비율

In [49]:
# 파생변수2 - 구독자 관련
merge_df_users_fin['subscribers_conversion_rate'] = merge_df_users_fin['subscribersGained'] / merge_df_users_fin['views'] ## 구독자 전환율
merge_df_users_fin['subscribed_view_rate'] = merge_df_users_fin['SUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 구독자 조회수 비율

In [50]:
# 파생변수3 - 수익 관련
merge_df_users_fin['revenue_per_subscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 수익
merge_df_users_fin['revenue_per_unsubscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 수익
merge_df_users_fin['revenue_per_red_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['redViews'] ## 프리미엄당 수익
merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'] / merge_df_users_fin['estimatedRevenue'] ## cpm 대비 수익
merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['adImpressions'] ## 광고노출당 수익

In [51]:
# 파생변수4 - 시청 시간 관련
merge_df_users_fin['watched_view_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['views'] ## 조회수당 시청시간
merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 시청시간

In [52]:
# null값 대체
merge_df_users_fin = merge_df_users_fin.fillna(0) ## NaN
merge_df_users_fin = merge_df_users_fin.replace([np.inf, -np.inf], 0) ## inf

#### y값 설정

중요 지표 표준화

In [53]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression'] ## null값은 views가 0인 데이터

In [54]:
# null값 대체
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].fillna(0) ## NaN
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].replace([np.inf, -np.inf], 0) ## inf

In [55]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merge_df_users_fin[y_col])

상관분석

In [151]:
corr_df = merge_df_users_fin[merge_df_users_fin.columns[10:]].corr()

In [97]:
corr_df[y_col]

Unnamed: 0,total_engage_rate,net_subscribers_change,averageViewPercentage,revenue_per_view,gross_revenue_per_ad_impression
UNSUBSCRIBED,0.108787,0.945395,0.037334,-0.007606,-0.004395
SUBSCRIBED,0.11457,0.815132,0.030277,-0.006212,0.00576
estimatedMinutesWatched,0.104821,0.897967,0.023327,0.001644,0.015854
averageViewDuration,-0.011263,-0.016753,0.173696,0.074189,0.082187
averageViewPercentage,0.001931,0.034984,1.0,-0.026598,-0.034708
comments,0.128123,0.480661,0.046363,-0.006906,0.003772
dislikes,0.115068,0.949227,0.03831,-0.007502,-0.003619
likes,0.11191,0.926094,0.024621,-0.005813,-0.002989
shares,0.121115,0.93316,0.026394,-0.005253,0.004396
subscribersGained,0.121558,0.995933,0.036476,-0.006823,-0.004605


다중 지표 결합

In [56]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [57]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [58]:
# 전체 데이터 y값 빈도 확인
merge_df_users_fin['y_label'].value_counts()

y_label
1    77512
0     4080
Name: count, dtype: int64

In [158]:
# 계정별 y값 빈도 데이터
y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,397
1,627cb611aa6f212355e0b617,0,1
2,627f59ccaa39226247c60b01,1,364
3,627f59ccaa39226247c60b01,0,29
4,6287228afb15712a8cb931d7,1,400
...,...,...,...
415,65e7b773d8da110bb072e2b5,1,57
416,65e7b773d8da110bb072e2b5,0,4
417,65f7b17ed8da110bb0733b7b,1,49
418,65fecf7ed8da110bb0736199,1,44


In [159]:
# 계정별 일일데이터의 20% 이상 이상치 데이터인 계정 확인
y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 40]

Unnamed: 0,youtube_user_id,y_label,count
23,62872523fb15712a8cb93479,0,120
55,629f6ca6eaf5732d6df0611e,0,293
64,62a35ce69d41c93ff90b5670,0,162
95,62c4e558507271632b9cc1c7,0,105
115,62d55a5e9900f20e1f259d24,0,69
121,62fb96f62be6ae3ff3672d79,0,49
136,6332f892ef33d840a099abb3,0,45
147,639bb8dcd603b8138e33780b,0,201
175,63d77c9650eb530dfd139f8b,0,63
190,63e9a89eee122e6319921a52,0,62


#### 변수선택

##### 데이터 분할

In [59]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:10]
x_col = merge_df_users_fin.columns[10:-1]

In [60]:
# 데이터 분할
X = merge_df_users_fin[x_col].drop(columns=['total_engage_rate','net_subscribers_change','revenue_per_view','averageViewPercentage', 'gross_revenue_per_ad_impression']) ## y값 라벨링에 쓰인 지표 제거
y = merge_df_users_fin['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    62009
0     3264
Name: count, dtype: int64
y_label
1    15503
0      816
Name: count, dtype: int64


##### 언더샘플링

In [62]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

##### 변수선택

t-test

In [63]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'subscribersGained', 'subscribersLost', 'views', 'redViews', 'estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'monetizedPlaybacks', 'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']


In [64]:
print(len(selected_features_by_ttest))
print(selected_features_by_ttest)

33
['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'subscribersGained', 'subscribersLost', 'views', 'redViews', 'estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'monetizedPlaybacks', 'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']


Lasso

In [166]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]

In [167]:
print(len(selected_features_by_lasso))
print(selected_features_by_lasso)

29
Index(['estimatedMinutesWatched', 'averageViewDuration', 'comments',
       'dislikes', 'likes', 'shares', 'subscribersGained', 'subscribersLost',
       'redViews', 'estimatedRevenue', 'estimatedAdRevenue',
       'estimatedRedPartnerRevenue', 'cpm', 'monetizedPlaybacks',
       'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate',
       'share_rate', 'dislike_rate', 'subscribers_conversion_rate',
       'subscribed_view_rate', 'revenue_per_subscribed_view',
       'revenue_per_unsubscribed_view', 'revenue_per_red_view',
       'cpm_to_revenue_ratio', 'revenue_per_ad_impression',
       'watched_view_rate', 'unsubscribed_view_time_rate'],
      dtype='object')


RandomForest

In [168]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = rf_model.feature_importances_
feature_importance_rf = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_rf = list(feature_importance_rf[feature_importance_rf >= 0.001].keys())

# 중요한 변수 출력
print(f"랜덤 포레스트로 선정된 변수: \n{selected_features_by_rf}")

랜덤 포레스트로 선정된 변수: 
['subscribersGained', 'revenue_per_unsubscribed_view', 'like_rate', 'cpm', 'share_rate', 'comment_rate', 'revenue_per_ad_impression', 'averageViewDuration', 'subscribersLost', 'subscribed_view_rate', 'revenue_per_red_view', 'dislike_rate', 'unsubscribed_view_time_rate', 'estimatedMinutesWatched', 'revenue_per_subscribed_view', 'watched_view_rate', 'playbackBasedCpm', 'subscribers_conversion_rate', 'comments', 'redViews', 'dislikes', 'cpm_to_revenue_ratio', 'likes', 'estimatedRevenue', 'estimatedRedPartnerRevenue', 'monetizedPlaybacks', 'shares', 'adImpressions', 'estimatedAdRevenue']


Gradient Boosting

In [169]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting 모델 학습
gbm_model = GradientBoostingRegressor(random_state=42)
gbm_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = gbm_model.feature_importances_
feature_importance_gbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_gbm = list(feature_importance_gbm[feature_importance_gbm >= 0.001].keys())

# 중요한 변수 출력
print(f"Gradient Boosting으로 선정된 변수: \n{selected_features_by_gbm}")

Gradient Boosting으로 선정된 변수: 
['subscribersGained', 'revenue_per_unsubscribed_view', 'like_rate', 'cpm', 'revenue_per_ad_impression', 'share_rate', 'revenue_per_red_view', 'comment_rate', 'cpm_to_revenue_ratio', 'subscribersLost', 'estimatedRevenue', 'dislike_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'averageViewDuration', 'watched_view_rate', 'shares', 'monetizedPlaybacks', 'playbackBasedCpm', 'estimatedMinutesWatched', 'subscribers_conversion_rate']


LightGBM

In [170]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(f"LightGBM으로 선정된 변수: \n{selected_features_by_lgbm}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 6528, number of used features: 29
[LightGBM] [Info] Start training from score 0.500000
LightGBM으로 선정된 변수: 
['like_rate', 'share_rate', 'averageViewDuration', 'subscribersGained', 'subscribed_view_rate', 'estimatedMinutesWatched', 'unsubscribed_view_time_rate', 'revenue_per_unsubscribed_view', 'cpm', 'comment_rate', 'revenue_per_ad_impression', 'watched_view_rate', 'subscribers_conversion_rate', 'dislike_rate', 'redViews', 'subscribersLost', 'revenue_per_subscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'estimatedRevenue', 'likes', 'monetizedPlaybacks', 'playbackBasedCpm', 'shares', 'estimatedAdRevenue', 'comments', 'estimatedRedPartnerRevenue', 'adImpressions', 'dislikes']


XGBoost

In [171]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

XGBoost로 선정된 변수: 
['revenue_per_unsubscribed_view', 'subscribersGained', 'like_rate', 'cpm', 'share_rate', 'comment_rate', 'adImpressions', 'revenue_per_ad_impression', 'subscribersLost', 'revenue_per_red_view', 'dislikes', 'estimatedRevenue', 'revenue_per_subscribed_view', 'unsubscribed_view_time_rate', 'estimatedAdRevenue', 'subscribed_view_rate', 'redViews', 'comments', 'dislike_rate', 'averageViewDuration', 'cpm_to_revenue_ratio', 'estimatedMinutesWatched', 'likes', 'monetizedPlaybacks', 'watched_view_rate', 'playbackBasedCpm', 'shares', 'subscribers_conversion_rate', 'estimatedRedPartnerRevenue']


In [172]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': selected_features_by_lasso,
    'rf_importance': rf_model.feature_importances_,
    'gbm_importance': gbm_model.feature_importances_,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [173]:
final_selected_features_user = list(importances_df[importances_df['mean_importance']>=0.01]['features'])
final_selected_features_user

['estimatedMinutesWatched',
 'averageViewDuration',
 'subscribersGained',
 'subscribersLost',
 'redViews',
 'cpm',
 'like_rate',
 'comment_rate',
 'share_rate',
 'dislike_rate',
 'subscribers_conversion_rate',
 'subscribed_view_rate',
 'revenue_per_subscribed_view',
 'revenue_per_unsubscribed_view',
 'revenue_per_red_view',
 'cpm_to_revenue_ratio',
 'revenue_per_ad_impression',
 'watched_view_rate',
 'unsubscribed_view_time_rate']

#### 모델링

##### 모델 기법 적용

RandomForest

In [200]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 랜덤 포레스트 모델 교차 검증
cv_scores_rf = cross_val_score(rf_model, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

RandomForest Cross-Validation Accuracy: 0.97


GradientBoosting

In [201]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_gb = cross_val_score(gb_model, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

GradientBoosting Cross-Validation Accuracy: 0.96


XGBoost

In [68]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model_user = XGBClassifier(random_state=42)
xgb_model_user.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model_user, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.97


##### 모델 성능 평가

In [203]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test[final_selected_features_user])
y_pred_gb = gb_model.predict(X_test[final_selected_features_user])
y_pred_xgb = xgb_model.predict(X_test[final_selected_features_user])

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test[final_selected_features_user])[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test[final_selected_features_user])[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features_user])[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

RandomForest Test Accuracy: 0.97
GradientBoosting Test Accuracy: 0.97
XGBoost Test Accuracy: 0.97
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.96      0.73       816
           1       1.00      0.97      0.98     15503

    accuracy                           0.97     16319
   macro avg       0.80      0.96      0.86     16319
weighted avg       0.98      0.97      0.97     16319

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.95      0.74       816
           1       1.00      0.97      0.98     15503

    accuracy                           0.97     16319
   macro avg       0.80      0.96      0.86     16319
weighted avg       0.98      0.97      0.97     16319

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.96      0.78       816
           1       1.00      0.9

### 콘텐츠 데이터

#### 파생변수

In [102]:
# y값 파생변수
youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost'] ## 구독자 순증가
youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views'] ## 조회수당 수익
youtube_videos['grossRevenue_per_ad_impression'] = youtube_videos['grossRevenue'] / youtube_videos['adImpressions'] ## 1회 광고노출당 총수익
youtube_videos['total_card_teaser_click_rate'] = (youtube_videos['cardClicks'] + youtube_videos['cardTeaserClicks']) / (youtube_videos['cardImpressions'] + youtube_videos['cardTeaserImpressions']) ## 카드와 카드 티저의 총 클릭률
youtube_videos['playlist_engagement_rate'] = (youtube_videos['videosAddedToPlaylists'] + youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 참여도

In [103]:
# 파생변수1 - 참여도 관련
youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율

In [104]:
# 파생변수2 - 구독자 관련
youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율

In [105]:
# 파생변수3 - 수익 관련
youtube_videos['revenue_per_red_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['redViews'] ## 프리미엄당 수익
youtube_videos['ad_revenue_rate'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['estimatedRevenue'] ## 광고수익비율
youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
youtube_videos['revenue_per_ad_impression'] = youtube_videos['estimatedRevenue'] / youtube_videos['adImpressions'] ## 광고노출당 수익
youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익

In [106]:
# 파생변수4 - 시청 시간 관련
youtube_videos['avg_view_duration_rate'] = youtube_videos['averageViewDuration'] / youtube_videos['averageViewPercentage'] ## 평균 시청 시간 비율
youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간

In [107]:
# 파생변수5 - 광고 관련
youtube_videos['revenue_per_playback'] = youtube_videos['grossRevenue'] / youtube_videos['monetizedPlaybacks'] ## 1회 광고재생당 수익
youtube_videos['ad_playbacks_per_playlist_add'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 재생 비율

In [109]:
# 파생변수7 - 비디오 관련
youtube_videos['playlist_addition_rate'] = youtube_videos['videosAddedToPlaylists'] / youtube_videos['views'] ## 플레이리스트 추가 비율
youtube_videos['playlist_removal_rate'] = youtube_videos['videosRemovedFromPlaylists'] / youtube_videos['views'] ## 플레이리스트 제거 비율

In [110]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

#### y값 설정

중요 지표 표준화

In [111]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate'] ## null값은 views가 0인 데이터

In [112]:
# null값 대체
youtube_videos[y_col] = youtube_videos[y_col].fillna(0) ## NaN
youtube_videos[y_col] = youtube_videos[y_col].replace([np.inf, -np.inf], 0) ## inf

In [113]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(youtube_videos[y_col])

상관분석

In [57]:
corr_df = youtube_videos[youtube_videos.columns[3:]].corr()
corr_df[y_col]

Unnamed: 0,total_engage_rate,net_subscribers_change,averageViewPercentage,revenue_per_view,grossRevenue_per_ad_impression,total_card_teaser_click_rate,playlist_engagement_rate
views,0.017589,0.697500,0.015059,-0.002192,-0.010244,-0.000535,-0.001516
redViews,0.011168,0.414674,0.021418,-0.002645,-0.012235,-0.000177,-0.002112
comments,0.069468,0.129862,0.004478,-0.000404,-0.002009,0.001460,-0.000395
likes,0.017302,0.568465,0.008680,-0.001439,-0.006794,-0.000431,-0.000900
dislikes,0.016506,0.740122,0.013597,-0.001958,-0.009440,-0.000513,-0.001334
...,...,...,...,...,...,...,...
playlist_removal_rate,0.010308,-0.002620,-0.017461,0.003346,0.011646,0.000332,0.198769
net_playlist_addition_rate,0.004638,-0.000230,-0.001374,-0.000091,0.000168,0.000118,0.967649
playlist_engagement_rate,0.007157,-0.000888,-0.005761,0.000757,0.003108,0.000200,1.000000
playlist_related_revenue_rate,0.000637,0.055131,-0.000781,0.027295,0.031024,0.003774,-0.001662


다중 지표 결합

In [114]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [115]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
youtube_videos['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [116]:
# 전체 데이터 y값 빈도 확인
youtube_videos['y_label'].value_counts()

y_label
1    8136044
0     428213
Name: count, dtype: int64

In [42]:
# 계정별 y값 빈도 데이터
youtube_videos.groupby('youtube_user_id')['y_label'].value_counts().reset_index()

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,15205
1,627cb611aa6f212355e0b617,0,158
2,627f59ccaa39226247c60b01,1,8011
3,627f59ccaa39226247c60b01,0,513
4,6287228afb15712a8cb931d7,1,5117
...,...,...,...
492,65f7b17ed8da110bb0733b7b,0,85
493,65fecf7ed8da110bb0736199,1,8661
494,65fecf7ed8da110bb0736199,0,138
495,66230ee6d8da110bb0744b2d,1,3329


In [48]:
# 콘텐츠별 y값 빈도 데이터
y_result_df = youtube_videos.groupby('video')['y_label'].value_counts().reset_index()

# 콘텐츠별 일일 측정 데이터 수
video_cnt_df = youtube_videos.groupby('video')['end_date'].count().reset_index()

y_result_df = pd.merge(y_result_df,video_cnt_df,how='left',on='video')
y_result_df = y_result_df.rename(columns={'end_date':'total_count'})
y_result_df['standard_cnt'] = round(y_result_df['total_count'] * 0.1) ## 비디오별 일일 전체 데이터 중 이상치 데이터가 10% 이상
y_result_df['standard_cnt'] = y_result_df['standard_cnt'].astype(int)

y_result_df

Unnamed: 0,video,y_label,count,total_count,standard_cnt
0,0,1,8,8,1
1,--0HSDH6J7o,1,15,15,2
2,--0XOlJ3Lw4,1,397,399,40
3,--0XOlJ3Lw4,0,2,399,40
4,--7sZPRc1H4,1,31,32,3
...,...,...,...,...,...
135832,zzlQiqh04eE,1,21,21,2
135833,zzwBOCOq5YI,1,303,342,34
135834,zzwBOCOq5YI,0,39,342,34
135835,zzza6bbJnMI,1,201,203,20


In [49]:
# 계정별 일일데이터의 10% 이상 이상치 데이터
video_outlier_df = y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] > y_result_df[y_result_df['y_label']==0]['standard_cnt']].reset_index(drop=True)
video_outlier_df

Unnamed: 0,video,y_label,count,total_count,standard_cnt
0,-0IyJ6nzgSg,0,17,87,9
1,-0JNe3se7JM,0,37,254,25
2,-0csgxlNcSc,0,22,68,7
3,-0rIxfNpjAc,0,2,13,1
4,-0vVZp3kclU,0,5,25,2
...,...,...,...,...,...
11177,zyjbDedYZB8,0,1,5,0
11178,zypZyGqg9ZY,0,1,2,0
11179,zyyDUlfD4jg,0,6,48,5
11180,zzWuUVocjCI,0,2,11,1


In [50]:
# 이상치 중 데이터 부족으로 판단 어려운 것 제거 후 최종 이상치 콘텐츠
video_outlier_fin = video_outlier_df[video_outlier_df['standard_cnt']!=0]['video'].unique()

In [53]:
# 계정별 이상치 콘텐츠 결과 데이터셋
result_contents_df = pd.merge(youtube_videos.groupby('youtube_user_id')['video'].count().reset_index(),
                              youtube_videos[youtube_videos['video'].isin(video_outlier_fin)].groupby('youtube_user_id')['video'].count().reset_index(),
                              how='left',on='youtube_user_id')
result_contents_df = result_contents_df[~result_contents_df['video_y'].isnull()].reset_index(drop=True)

#### 변수선택

##### 데이터 분할

In [117]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:-1]

In [118]:
# 데이터 분할
X = youtube_videos[x_col].drop(columns=['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate']) ## y값 라벨링에 쓰인 지표 제거
y = youtube_videos['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [119]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    6509053
0     342352
Name: count, dtype: int64
y_label
1    1626991
0      85861
Name: count, dtype: int64


##### 언더샘플링

In [120]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

##### 변수선택

t-test

In [121]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost', 'monetizedPlaybacks', 'adImpressions', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'comment_rate', 'dislike_rate', 'comment_to_like_rate', 'like_to_dislike_ratio', 'subscribers_conversion_rate', 'subscribers_gained_per_card_click', 'subscribers_gained_per_playlist_add', 'card_click_to_subscriber_conversion', 'subscribers_lost_per_playlist_remove', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'playback_based_cpm_rate', 'revenue_per_card_click', 'revenue_per_playlist_

In [122]:
print(len(selected_features_by_ttest))
print(selected_features_by_ttest)

71
['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm', 'subscribersGained', 'subscribersLost', 'monetizedPlaybacks', 'adImpressions', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'comment_rate', 'dislike_rate', 'comment_to_like_rate', 'like_to_dislike_ratio', 'subscribers_conversion_rate', 'subscribers_gained_per_card_click', 'subscribers_gained_per_playlist_add', 'card_click_to_subscriber_conversion', 'subscribers_lost_per_playlist_remove', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'playback_based_cpm_rate', 'revenue_per_card_click', 'revenue_per_playlist_add', 'card_click_to_reven

Lasso

In [123]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]

In [124]:
print(len(selected_features_by_lasso))
print(selected_features_by_lasso)

56
Index(['views', 'redViews', 'comments', 'likes', 'dislikes', 'shares',
       'estimatedRedMinutesWatched', 'averageViewDuration',
       'videosAddedToPlaylists', 'videosRemovedFromPlaylists',
       'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue',
       'estimatedRedPartnerRevenue', 'playbackBasedCpm', 'cpm',
       'subscribersGained', 'subscribersLost', 'monetizedPlaybacks',
       'adImpressions', 'cardClickRate', 'cardTeaserClickRate',
       'cardImpressions', 'cardTeaserImpressions', 'cardTeaserClicks',
       'comment_rate', 'dislike_rate', 'comment_to_like_rate',
       'like_to_dislike_ratio', 'subscribers_conversion_rate',
       'subscribers_gained_per_playlist_add',
       'subscribers_lost_per_playlist_remove', 'revenue_per_red_view',
       'ad_revenue_rate', 'red_revenue_rate', 'cpm_to_revenue_ratio',
       'revenue_per_ad_impression', 'playback_based_cpm_rate',
       'revenue_per_playlist_add', 'card_click_to_revenue_ratio',
       'net_revenue_per_play

LightGBM

In [125]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(f"LightGBM으로 선정된 변수: \n{selected_features_by_lgbm}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12254
[LightGBM] [Info] Number of data points in the train set: 684704, number of used features: 56
[LightGBM] [Info] Start training from score 0.500000
LightGBM으로 선정된 변수: 
['views', 'likes', 'playlist_addition_rate', 'shares', 'playlist_removal_rate', 'cpm', 'avg_view_duration_rate', 'subscribersGained', 'estimatedRevenue', 'averageViewDuration', 'watched_time_rate', 'revenue_per_ad_impression', 'revenue_per_red_view', 'comment_rate', 'subscribers_conversion_rate', 'playbackBasedCpm', 'dislike_rate', 'redViews', 'net_revenue_per_playlist_add', 'videosAddedToPlaylists', 'ad_playbacks_per_playlist_add', 'cpm_to_revenue_ratio', 'watch_time_per_playlist_add', 'estimatedAdRevenue', 'ad_revenue_rate', 'watched_view_red_rate', 'playlist_rela

XGBoost

In [126]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

XGBoost로 선정된 변수: 
['videosAddedToPlaylists', 'revenue_per_playback', 'playlist_removal_rate', 'cpm', 'subscribersGained', 'playlist_addition_rate', 'views', 'ad_playbacks_per_playlist_add', 'shares', 'likes', 'dislike_rate', 'revenue_per_ad_impression', 'comment_rate', 'redViews', 'revenue_per_red_view', 'adImpressions', 'estimatedRevenue', 'averageViewDuration', 'red_revenue_rate', 'estimatedRedMinutesWatched', 'watched_view_red_rate', 'watched_time_rate', 'subscribers_conversion_rate', 'grossRevenue', 'ad_revenue_rate', 'avg_view_duration_rate', 'estimatedAdRevenue', 'card_teaser_click_per_impression_rate', 'net_revenue_per_playlist_add', 'subscribers_gained_per_playlist_add', 'playbackBasedCpm', 'cpm_to_revenue_ratio', 'monetizedPlaybacks', 'cardTeaserClickRate', 'card_click_to_revenue_ratio', 'like_to_dislike_ratio', 'watch_time_per_playlist_add']


In [127]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': selected_features_by_lasso,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [128]:
importances_df.sort_values('mean_importance',ascending=False)

Unnamed: 0,features,lgbm_importance,xgb_importance,mean_importance
8,videosAddedToPlaylists,0.01,0.245445,0.127723
48,revenue_per_playback,0.001667,0.211089,0.106378
0,views,0.119333,0.041026,0.08018
54,playlist_removal_rate,0.059667,0.070688,0.065177
53,playlist_addition_rate,0.086667,0.043025,0.064846
15,cpm,0.057333,0.070209,0.063771
3,likes,0.088333,0.030766,0.05955
5,shares,0.072667,0.031938,0.052303
16,subscribersGained,0.051333,0.050982,0.051158
36,revenue_per_ad_impression,0.034333,0.024827,0.02958


In [129]:
final_selected_features_video = list(importances_df[importances_df['mean_importance']>=0.005]['features'])
final_selected_features_video

['views',
 'redViews',
 'likes',
 'shares',
 'averageViewDuration',
 'videosAddedToPlaylists',
 'estimatedRevenue',
 'estimatedAdRevenue',
 'playbackBasedCpm',
 'cpm',
 'subscribersGained',
 'adImpressions',
 'comment_rate',
 'dislike_rate',
 'subscribers_conversion_rate',
 'revenue_per_red_view',
 'ad_revenue_rate',
 'red_revenue_rate',
 'revenue_per_ad_impression',
 'net_revenue_per_playlist_add',
 'avg_view_duration_rate',
 'watched_time_rate',
 'watched_view_red_rate',
 'revenue_per_playback',
 'ad_playbacks_per_playlist_add',
 'playlist_addition_rate',
 'playlist_removal_rate']

#### 모델링

##### 모델 기법 적용

XGBoost

In [179]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model_video = XGBClassifier(random_state=42)
xgb_model_video.fit(X_train_resampled[final_selected_features_video], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model_video, X_train_resampled[final_selected_features_video], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

KeyError: "['videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'positive_engage_rate', 'ad_revenue_rate', 'avg_view_duration_rate', 'watched_time_rate', 'watch_time_per_playlist_add', 'revenue_per_playback', 'playlist_addition_rate', 'playlist_removal_rate'] not in index"

##### 모델 성능 평가

In [80]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_xgb = xgb_model_video.predict(X_test[final_selected_features_video])

# 모델 정확도 확인
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features_video])[:, 1])
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

XGBoost Test Accuracy: 0.99
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     85861
           1       1.00      0.99      0.99   1626991

    accuracy                           0.99   1712852
   macro avg       0.91      0.99      0.95   1712852
weighted avg       0.99      0.99      0.99   1712852

XGBoost ROC-AUC: 1.00


### 결과 확인

#### 계정 데이터

In [None]:
# 최종 선정 변수
final_selected_features_user = ['estimatedMinutesWatched', 'averageViewDuration', 'subscribersGained', 'subscribersLost', 'redViews', 'cpm', 'like_rate', 'comment_rate', 'share_rate',
                                'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_red_view',
                                'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']

In [None]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model_user = XGBClassifier(random_state=42)
xgb_model_user.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model_user, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.97


In [84]:
# 예측 결과 확인
merge_df_users_fin['predict'] = xgb_model_user.predict(merge_df_users_fin[final_selected_features_user])
pred_result_df = merge_df_users_fin.groupby(['youtube_user_id'])['predict'].value_counts().reset_index()
fraud_user_id = list(pred_result_df[pred_result_df['predict']==0][pred_result_df[pred_result_df['predict']==0]['count']>=40]['youtube_user_id'].unique()) ## 이상치라고 생각되는 계정 확인

In [86]:
print(len(fraud_user_id))
print(fraud_user_id)

44
['627f59ccaa39226247c60b01', '62872523fb15712a8cb93479', '629f6ca6eaf5732d6df0611e', '62a35ce69d41c93ff90b5670', '62a6a1c49d41c93ff90efd6e', '62ad9032423c30268e563375', '62bc1aca507271632b940e2e', '62c4e558507271632b9cc1c7', '62d11e8d0b4c4c7502a5bb11', '62d11f080b4c4c7502a5be3d', '62d11f9f0b4c4c7502a5c1b6', '62d120170b4c4c7502a5c44a', '62d55a5e9900f20e1f259d24', '62fb96f62be6ae3ff3672d79', '631a067c1babf83920070ad7', '6332f892ef33d840a099abb3', '639bb8dcd603b8138e33780b', '63c6a2504238543bcaf03d3e', '63d77c9650eb530dfd139f8b', '63d89df750eb530dfd13a940', '63e9a89eee122e6319921a52', '63eb4f87ee122e631992279f', '63fb5daa2a0144119186eca8', '640001db0abaa11316396d3b', '6400d819d746c60e1271f873', '6401e117d746c60e1271fdef', '64020bf4d746c60e1272055f', '640339ac118c0f5858818694', '640a007613bc6a0e24f95b24', '6417c62789085e280d0e410b', '6427b4a01d589972c84adf22', '642c19651d589972c84b1548', '645da9e6ef566f0e136a83cf', '645ec17eef566f0e136a9880', '6486b00519c22b644dded32c', '64d1c7e51e9bad0

#### 콘텐츠 데이터

In [130]:
final_selected_features_video = ['views', 'redViews', 'likes', 'shares', 'averageViewDuration', 'videosAddedToPlaylists', 'estimatedRevenue', 'estimatedAdRevenue', 'playbackBasedCpm', 'cpm',
                                 'subscribersGained', 'adImpressions', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate',
                                 'revenue_per_ad_impression', 'net_revenue_per_playlist_add', 'avg_view_duration_rate', 'watched_time_rate', 'watched_view_red_rate', 'revenue_per_playback',
                                 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']

In [131]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model_video = XGBClassifier(random_state=42)
xgb_model_video.fit(X_train_resampled[final_selected_features_video], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model_video, X_train_resampled[final_selected_features_video], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.99


In [132]:
# 예측 결과 확인
youtube_videos['predict'] = xgb_model_video.predict(youtube_videos[final_selected_features_video])
result_video_df = youtube_videos.groupby('video')['predict'].value_counts().reset_index()

In [133]:
# 이상치 콘텐츠 확인 - 콘텐츠 일일 데이터 중 10% 이상 이상치 데이터 확인
result_video_df_outlier = pd.merge(result_video_df[result_video_df['predict']==0].reset_index(drop=True), ## video별 이상치 테이블
                                   result_video_df.groupby('video')['count'].sum().reset_index(), ## 전체 video 테이블
                                   how='left', on='video')
video_id_outlier = list(result_video_df_outlier[(result_video_df_outlier['count_x'] / result_video_df_outlier['count_y']) > 0.1]['video'].unique()) ## video별 이상치 비율

In [134]:
# 이상치 콘텐츠가 20% 이상 있는 계정 확인
result_user_df_outlier = pd.merge(youtube_videos[youtube_videos['video'].isin(video_id_outlier)].groupby('youtube_user_id')['video'].count().reset_index(),
                                  youtube_videos.groupby(['youtube_user_id'])['video'].count().reset_index(), how='left', on='youtube_user_id')
fraud_video_user_id = list(result_user_df_outlier[(result_user_df_outlier['video_x'] / result_user_df_outlier['video_y']) > 0.2]['youtube_user_id'].unique())

In [135]:
print(len(fraud_video_user_id))
print(fraud_video_user_id)

68
['62872317fb15712a8cb932e9', '62872370fb15712a8cb93337', '6287239cfb15712a8cb93368', '62873efffb15712a8cb941a2', '629c6ab9eaf5732d6deb9186', '629f6ca6eaf5732d6df0611e', '62a35ce69d41c93ff90b5670', '62a6a1c49d41c93ff90efd6e', '62a890be9d41c93ff9129a22', '62ad9032423c30268e563375', '62ae2750423c30268e56edab', '62b024a0423c30268e58b611', '62bc283e507271632b9418ab', '62c2e7d5507271632b9b61e9', '62c4e558507271632b9cc1c7', '62c5af1e507271632b9e1b43', '62d11e8d0b4c4c7502a5bb11', '62d11f080b4c4c7502a5be3d', '62d11fbc9900f20e1f2121d8', '62d55a5e9900f20e1f259d24', '631a067c1babf83920070ad7', '6332f892ef33d840a099abb3', '6350be0a5a3ac10b5fe82028', '635676965a3ac10b5fe8508f', '63622fb65a3ac10b5fe8ba7a', '639bb8dcd603b8138e33780b', '63c919d450eb530dfd134718', '63ca5c4b50eb530dfd134f67', '63d0c34350eb530dfd1370ee', '63d89df750eb530dfd13a940', '63e9a89eee122e6319921a52', '63fab4682a0144119186e8a3', '63fae32c2a0144119186e923', '63fb5daa2a0144119186eca8', '63fb8d492a0144119186eead', '64013d16d746c60

In [141]:
total_user_id = list(merge_df_users_fin['youtube_user_id'].unique())

In [157]:
# 계정 이상치, 콘텐츠 정상 - 잠재적 가능성이 있는 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique()))
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique())

19
['도아이 Doh-I' '뚜니랑' '벽돌할아버지 Brick grandpa' '앙찡' 'DDONIE 또니 / 러브크레센트'
 'fromsuzy 프롬수지' '슈로시안 SUROSIAN' '키키낙낙' '수빙수tv sooBingsoo' 'kiu기우쌤' '김우다'
 '임삐나' 'GMENCY 멘시의 마인크래프트' '채림처럼firstcherry' "루다의 댄스 연구소 Ruda's Dance Lab"
 '잼스기타' '너굴몬' '고도람 Go!doram' '코인덕 차트아지']


In [161]:
# 계정 이상치, 콘텐츠 이상치 - 영향력이 적은 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique()))
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique())

82
['자수의숲jasooforest' '도아이 Doh-I' '스타트업잡스' '시골낭만아재' 'OBL - 온라인 농부, 사자가 되다'
 'Jeffreyxking' '복지다있소' '세계여행 테리로그 TERRYLOG' '래띠 LAETI'
 '석시원 커플 SeokSiWon Couple' '법무법인 슈가스퀘어' '뚜니랑' '어웨이커 | 크리에이터 이코노미'
 '벽돌할아버지 Brick grandpa' 'Ella' '잉툰TV- 만화로 쉽게 영어배우자' '앙찡' '이현우의 MLBTV'
 '북토크' '시리얼 Sireal' 'AllaproTV' 'DDONIE 또니 / 러브크레센트' 'fromsuzy 프롬수지'
 'KIMBEE 킴비' '슈로시안 SUROSIAN' 'hyeppening 혜프닝' '키키낙낙' '은는이가' '니들needle'
 'ORlGN 오리진' '이숲soop' '닷츠 DOTS' '수빙수tv sooBingsoo' 'kiu기우쌤' 'Lizzy리지'
 '비됴클래스' '김우다' '쿜쿜쿜' '래아TV' '슬기런바디 Run Body' '임삐나' '지니원장의피부톡톡' '흙회장'
 '태권민국_Captain Master' '라나제이베이킹Lana J' '보미름' '에피코딩' '주피코' '비제TV'
 '월텍남 - 월스트리트 테크남' '소피요가 Sophie Yoga' 'Mind Patting마음토닥' '부반TV_부에 반하다'
 '황나겸' '돈냄새' 'MINLEE 민리' 'GMENCY 멘시의 마인크래프트' '나연이즈백 LPGA Na Yeon Choi'
 '청어람ARMC' '김두부' 'abbapraise 아바프레이즈' '그롬마쉬TV' '모하지연 MOHAJIYEON'
 '채림처럼firstcherry' '미니멀영어 Minimal English' 'SATUR 세터업' 'Dalhae달달해'
 "루다의 댄스 연구소 Ruda's Dance Lab" '동아일보' '키나kkina' '바라던 바다 BADACHANNEL'
 '잼스기타' 'Seol-A 라이더 설아' '중년독수리의 대리여행' '평범한 사업가'

In [162]:
# 계정 정상, 콘텐츠 정상 - 영향력이 큰 계정
print(len(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique()))
print(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique())

152
['담비' '임영곤 게임방송' '모리녀' '정가거부' '콜로니' '미키버그 VR게임' '-mentalholder 멘탈홀더 tv'
 '빅민 GAME' '군대위키' '드론브이로그 DroneVlog' '키메키친 Kime_kitchen' '후니트립 hoony_trip'
 '채채ChaeChae' '로컬필름 LOCAL FILM' 'OSSC' '모염 moyeom' '소리미의 신화방송'
 '두꼽이Challenge' 'WORKS.D PLAYLIST' '수집의 수집' '막셋의 종합게임' '에디레일 Eddy Rails'
 '윤새 Yoonsae' '탬니몰리' '여행윤Tripyun' '고기,요정 MeatPixie' '히스커버리 역사채널' '자린이 조피디'
 '강포동하우스' '시현하다 RECORDERS' '라이라마' 'sa lly' '마파TV' '콜드쉽 Coldsheep'
 '띠혜 ddihye' '담순언니 Twins Vlog' 'BUNNY' '김밈서' '카이바군' '구봉바다낚시 뽀식이'
 '정케빈 KEVIN' '오디디 코미디' '축구 읽어주는 여자 쵱내' 'the sence' '목소리 연기자 유지컬' '팀브라더스'
 '핸슥슥' '코딩국수' '전또' 'JN테크리뷰' '고군 Gohgoon' '하부유튜브 Minor / (Lower) YouTube'
 '오늘도희다 HEEDA' '뻘짓연구소' '유익한 균튜버' '꿈꾼 배기' '용싸부 yongssaboo' '한나임한나Hannaim'
 '만능혁키' '오엔티엘 패션 / ONTL FASHION' '요니의 응원 yoni' '나나무비' '유경몬' '뛰뛰빵빵 김옥순'
 '일렉트릭 차이나' '루깬미' '두남자 토익TV' "IT가 쉬워지는 '백개의 리뷰'" '채찍단' '찌늉' '름쿠 ᴘʟᴀʏʟɪꜱᴛ'
 '너드 슬로리 SloLee' '수란쿤' '배우GO' '나는미도' '주당 김자케' 'Mein 미인' '지미 geemi.'
 '데일리 슬슬' '김퍼프PUFF' 'assesta' '오토컨테이너 스튜디오' '집구석구석꿀팁, 집꿀' '낭만아저씨코디TV'
 'Yer

In [163]:
# 계정 정상, 콘텐츠 이상치 - 이상치 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique()))
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique())

40
['자수의숲jasooforest' '스타트업잡스' '복지다있소' '세계여행 테리로그 TERRYLOG' '래띠 LAETI'
 '어웨이커 | 크리에이터 이코노미' 'Ella' '시리얼 Sireal' 'AllaproTV' 'KIMBEE 킴비'
 'hyeppening 혜프닝' '은는이가' '니들needle' '이숲soop' '닷츠 DOTS' 'Lizzy리지'
 '슬기런바디 Run Body' '지니원장의피부톡톡' '흙회장' '태권민국_Captain Master' '라나제이베이킹Lana J'
 '보미름' '에피코딩' '비제TV' '월텍남 - 월스트리트 테크남' '소피요가 Sophie Yoga' '황나겸' '돈냄새'
 '나연이즈백 LPGA Na Yeon Choi' '김두부' 'SATUR 세터업' 'Dalhae달달해' '동아일보'
 '바라던 바다 BADACHANNEL' 'Seol-A 라이더 설아' '중년독수리의 대리여행' 'Ood 오드' '디지털생활제안'
 '하원장 강동현' '황헬린 탈출기']
