In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import os
from dotenv import load_dotenv

## MongoDB 연동

In [2]:
# .env 파일에서 환경 변수 로드
load_dotenv('C:/py_src/awake/.env')

# 환경 변수에서 MongoDB 연결 정보 가져오기
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [3]:
# MongoDB 연결 URL
url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# 연결 확인
from pymongo.errors import ServerSelectionTimeoutError

try:
    # 데이터베이스 목록을 가져오는 쿼리
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# Test DB 선택
db = client.Test

# Test DB 컬렉션 확인
collections = db.list_collection_names()

In [67]:
# # 단위 환산
# def convert_bytes(num):
#     for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
#         if num < 1024.0:
#             return f"{num:.2f} {x}"
#         num /= 1024.0

In [7]:
# for collection_name in collections:
#     # 컬렉션 통계 정보 가져오기
#     stats = db.command("collStats", collection_name)

#     # 컬렉션의 크기와 문서 수 출력    
#     print(f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}")
#     print(f"Collection '{collection_name}' document count: {stats['count']}")
#     print(f"Total index size: {convert_bytes(stats['totalIndexSize'])}")
#     print('--------------------------------------------------------------------')

In [6]:
## 인스타 관련 데이터 제외
## 'youtube_videos'/'youtube_datas' 따로 수집
## 'youtube_report_v2'/'youtube_report' 날짜 정보불분명, 다른 테이블 정보와 중복
collections_need = [
    'youtube_users',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    ]

In [7]:
# 데이터 로드
youtube_dict={}
for collection_name in collections_need:

    youtube_dict[collection_name] = pd.DataFrame(list(db[collection_name].find()))
    
    print(collection_name)
    print(youtube_dict[collection_name].columns)
    print(">> Success")
    print("--------------------------------------")
    print("")

youtube_users
Index(['_id', 'country', 'phone_num', 'kakao_nick', 'kakao_account_id',
       'user_kind', 'created_at', '__v', 'channel_title', 'channel_id',
       'thumbnail_url', 'published_at', 'subscriber_count', 'is_rev_saved',
       'is_subs_saved', 'updated_at', 'brandingSettings', 'contentDetails',
       'contentOwnerDetails', 'etag', 'id', 'kind', 'snippet', 'statistics',
       'status', 'topicDetails', 'connected', 'refresh_error', 'localizations',
       'ads_array', 'age', 'gender', 'region_array', 'is_active',
       'category_array', 'account_type', 'children_age_array',
       'is_accept_suggestion', 'is_add_info', 'pet_array', 'user_id',
       'report_user_id'],
      dtype='object')
>> Success
--------------------------------------



In [8]:
# 키값 형식 맞추기
for key, df in youtube_dict.items():
    # 각 딕셔너리의 value가 DataFrame인지 확인
    if isinstance(df, pd.DataFrame):
        # '_id' 컬럼이 있는지 확인하고 문자열로 변환
        if '_id' in df.columns:
            df['_id'] = df['_id'].astype(str)
        
        # 'youtube_user_id' 컬럼이 있는지 확인하고 문자열로 변환
        if 'youtube_user_id' in df.columns:
            df['youtube_user_id'] = df['youtube_user_id'].astype(str)

        # 변경된 DataFrame을 다시 딕셔너리에 저장
        youtube_dict[key] = df

In [9]:
del df

In [10]:
youtube_dict.keys()

dict_keys(['youtube_users'])

## 데이터 불러오기

### 계정 데이터

#### youtube_users

In [11]:
youtube_users = youtube_dict['youtube_users']

In [12]:
len(youtube_users['channel_id'].unique())
## 유튜버 계정 : 883개

883

In [13]:
# 필요컬럼추출
youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']] 
## published_at : 유튜브 가입일
## 'statistics' 컬럼의 'subscriberCount' 정보와 'subscriber_count' 컬럼 정보가 다름 --> 구독자 수
## 'channel_id' 컬럼, 'contentDetails' 컬럼의 'uploads' 같은 정보

In [14]:
# cast 하여 최종 데이터셋 생성
youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
youtube_users = youtube_users.dropna(how = 'all')

# null 값 0으로 대체
youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)
youtube_users['viewCount'] = youtube_users['viewCount'].astype(int)
youtube_users['subscriberCount'] = youtube_users['subscriberCount'].astype(int)
youtube_users['videoCount'] = youtube_users['videoCount'].astype(int)

youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

# id가 null인 계정 제거
youtube_users = youtube_users[~youtube_users['channel_id'].isnull()].reset_index(drop=True)

# channel_id별로 그룹화하여 null값을 해당 그룹 내에서 채우기
youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

# 계정 지표 모두 0인 계정 삭제
youtube_users = youtube_users[youtube_users[['viewCount', 'subscriberCount', 'videoCount']].sum(axis=1)!=0].reset_index(drop=True)

  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


#### youtube_channel_locations
- 채널 구독자 위치

In [14]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']
# youtube_channel_locations = pd.read_csv(file_path + 'raw_data/youtube_channel_locations.csv', low_memory=False)

In [15]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [16]:
# 필요컬럼추출
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

In [17]:
# melt, cast 하여 최종데이터셋 생성
youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

# cast 하여 최종 데이터셋 생성
youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations[youtube_channel_locations.columns[3:]].apply(sum,axis=1)!=0] ## 모두 0인 행 제거
youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

In [18]:
# 계정별 일자별 지표 계산
youtube_channel_locations = youtube_channel_locations.groupby(['youtube_user_id', 'end_date']).agg({'views': 'sum',
                                                                                                    'estimatedMinutesWatched': 'sum',
                                                                                                    'averageViewDuration': 'mean',
                                                                                                    'averageViewPercentage': 'mean'}).reset_index()

In [19]:
# 시간 분 단위로 변경
youtube_channel_locations['averageViewDuration'] = youtube_channel_locations['averageViewDuration'] / 60

In [20]:
# 날짜형식 변경
youtube_channel_locations = youtube_channel_locations.rename(columns={'end_date':'date'})
youtube_channel_locations['date'] = youtube_channel_locations['date'].astype(str)

#### youtube_daily_channel_basics

In [21]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [22]:
len(youtube_daily_channel_basics['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [23]:
# 필요컬럼추출
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## 'end_date' --> 'day' 컬럼 활용
youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True) ## melt

In [24]:
# daily_basics 컬럼 형식 맞추기 --> 딕셔너리 값들 cast 형태로 맞추기
youtube_daily_channel_basics_cast = []
for item in youtube_daily_channel_basics['daily_basics']:
    if isinstance(item, list):
        youtube_daily_channel_basics_cast.extend(item)
    else:
        youtube_daily_channel_basics_cast.append(item)

youtube_daily_channel_basics_cast = pd.json_normalize(youtube_daily_channel_basics_cast)

In [25]:
# melt, cast 하여 최종 데이터셋 생성
youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0) ## null값 0으로 대체
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[youtube_daily_channel_basics.columns[3:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)

del youtube_daily_channel_basics_cast

In [26]:
# 날짜형식 변경
youtube_daily_channel_basics = youtube_daily_channel_basics.rename(columns={'day':'date'})
youtube_daily_channel_basics['date'] = youtube_daily_channel_basics['date'].astype(str)

# 시간 분 단위로 변경
youtube_daily_channel_basics['averageViewDuration'] = youtube_daily_channel_basics['averageViewDuration'] / 60

In [27]:
# # 최종데이터셋 기준 필요 계정 수 추출 - report, report_v2 제외
# youtube_user_id_outer = list(set(list(youtube_channel_locations['youtube_user_id']) +
#                                  list(youtube_daily_channel_basics['youtube_user_id'])))
# print('youtube_user_id_outer', len(youtube_user_id_outer))
# ## youtube_user_id 모두 포함 912개

# youtube_user_id_inner = list(set(youtube_channel_locations['youtube_user_id']) &
#                              set(youtube_daily_channel_basics['youtube_user_id']))

# print('youtube_user_id_inner',len(youtube_user_id_inner))
# ## youtube_user_id  공통 포함 250개

#### youtube_datas

In [15]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/youtube_user_id_inner.csv')

In [16]:
collection = db['youtube_datas']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            # "end_date": {
            # "$gte": first_date,
            # "$lte": last_date
            # }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "data_created_at": 1
        }
    },
    {
        "$project": {
            'youtube_user_id' : 1, 
            'data_created_at' : 1, 
            'published_at' : 1, 
            'channel_id' : 1, 
            'channel_title' : 1, 
            'yt_search_keyword' : 1, 
            'subscribed_status' : 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_datas = pd.DataFrame(result)

In [17]:
len(youtube_datas['youtube_user_id'].unique())

249

In [18]:
# 컬럼 순서 정리
need_col = ['youtube_user_id', 'data_created_at', 'published_at', 'channel_id', 'channel_title', 'yt_search_keyword', 'subscribed_status']
youtube_datas = youtube_datas[need_col]

In [19]:
# cast 하여 데이터셋 생성
youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
youtube_datas = youtube_datas.drop(['subscribed_status'],axis=1)

youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0) # null값 0으로 대체
youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0] ## 모두 0인 행 제거

youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

In [20]:
# 날짜 기준 필터링
opt_date = (youtube_datas['data_created_at'] >= datetime(2023, 3, 26)) & (youtube_datas['data_created_at'] <= datetime(2024, 5, 3))
youtube_datas = youtube_datas[opt_date].sort_values(['youtube_user_id', 'data_created_at']).reset_index(drop=True)

In [21]:
# 날짜형식 변경
youtube_datas = youtube_datas.rename(columns={'data_created_at':'date'})
youtube_datas['date'] = youtube_datas['date'].astype(str)

### 콘텐츠 데이터

#### youtube_videos

In [35]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/youtube_user_id_inner.csv')

In [39]:
collection = db['youtube_videos']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            "videos": {"$ne": []},
            "end_date": {
                "$gte": datetime(2023, 3, 26),
                "$lte": datetime(2024, 5, 3)
            }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "end_date": 1
        }
    },
    {
        "$project": {
            "youtube_user_id": 1,
            "end_date": 1,
            "videos": 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_videos = pd.DataFrame(result)

In [40]:
len(youtube_videos['youtube_user_id'].unique())

249

In [41]:
# melt, cast하여 최종데이터셋 생성
# melt
youtube_videos = youtube_videos.explode(['videos']).reset_index(drop=True)

#cast
youtube_videos = pd.concat([youtube_videos, pd.json_normalize(youtube_videos['videos'])],axis=1)

youtube_videos = youtube_videos.drop(['_id','videos'],axis=1)
youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

In [23]:
# # 날짜형식 변경
# youtube_videos = youtube_videos.rename(columns={'end_date':'date'})
# youtube_videos['date'] = youtube_videos['date'].astype(str)

In [2]:
# youtube_videos = pd.read_csv('C:/py_src/awake/data/youtube_videos.csv')

In [3]:
# youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
# youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
# youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

## 데이터 전처리

### 계정 데이터

데이터 통합

In [39]:
merge_df_users_fin = pd.merge(youtube_users,youtube_datas,how='left',on='channel_id')
need_col = ['youtube_user_id', 'date', 'channel_id', 'channel_title_x', 'published_at_x', 'phone_num', 'yt_search_keyword', 'viewCount', 'subscriberCount', 'videoCount','UNSUBSCRIBED', 'SUBSCRIBED']
merge_df_users_fin = merge_df_users_fin[need_col]
merge_df_users_fin = merge_df_users_fin.rename(columns={'channel_title_x':'channel_title','published_at_x':'published_at'})
merge_df_users_fin = merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isnull()].reset_index(drop=True)

merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_channel_locations,how='left',on=['youtube_user_id','date'])
merge_df_users_fin = merge_df_users_fin.drop(['views'],axis=1)

youtube_daily_channel_basics = youtube_daily_channel_basics.drop(['annotationClickThroughRate','annotationCloseRate'],axis=1)
merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_daily_channel_basics,how='left',on=['youtube_user_id','date'])

In [40]:
# null값 대체
merge_df_users_fin['estimatedMinutesWatched_x'] = merge_df_users_fin['estimatedMinutesWatched_x'].fillna(merge_df_users_fin['estimatedMinutesWatched_y'])
merge_df_users_fin['averageViewDuration_x'] = merge_df_users_fin['averageViewDuration_x'].fillna(merge_df_users_fin['averageViewDuration_y'])

merge_df_users_fin = merge_df_users_fin.drop(['estimatedMinutesWatched_y','averageViewDuration_y'],axis=1)
merge_df_users_fin = merge_df_users_fin.rename(columns={'estimatedMinutesWatched_x':'estimatedMinutesWatched','averageViewDuration_x':'averageViewDuration'})

# 영상 시청 시간 합 / 영상 재생 시간 합 = 영상 시청 비율 대체
merge_df_users_fin['averageViewPercentage'] = np.where(merge_df_users_fin['averageViewPercentage'].isnull(), 
                                                       merge_df_users_fin['estimatedMinutesWatched'] / (merge_df_users_fin['averageViewDuration'] * merge_df_users_fin['views']),
                                                       merge_df_users_fin['averageViewPercentage'])
merge_df_users_fin['averageViewPercentage'] = merge_df_users_fin['averageViewPercentage'].fillna(0)

In [41]:
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
merge_df_users_fin[exchange_rate_col] = merge_df_users_fin[exchange_rate_col] * 1322.42

In [42]:
# 버그로 사용된 수치값 대체
merge_df_users_fin['likes'] = np.where(merge_df_users_fin['likes'] < 0, 0, merge_df_users_fin['likes'])
merge_df_users_fin['dislikes'] = np.where(merge_df_users_fin['dislikes'] < 0, 0, merge_df_users_fin['dislikes'])

### 콘텐츠 데이터

In [4]:
# 최종 콘텐츠 분석 데이터셋
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

In [5]:
# 잘못된값 처리
youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                               youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                               youtube_videos['estimatedRevenue'])

In [6]:
# 버그로 사용된 수치값 대체
youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

## 주제1

### 계정 데이터

#### 파생변수

In [49]:
# y값 파생변수
merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribersGained'] - merge_df_users_fin['subscribersLost'] ## 구독자 순증가
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['views'] ## 조회수당 수익
merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['grossRevenue'] / merge_df_users_fin['adImpressions'] ## 1회 광고노출당 총수익

In [50]:
# 파생변수1 - 참여도 관련
merge_df_users_fin['like_rate'] = merge_df_users_fin['likes'] / merge_df_users_fin['views'] ## 좋아요 비율 
merge_df_users_fin['comment_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['views'] ## 댓글 비율
merge_df_users_fin['share_rate'] = merge_df_users_fin['shares'] / merge_df_users_fin['views'] ## 공유 비율  
merge_df_users_fin['dislike_rate'] = merge_df_users_fin['dislikes'] / merge_df_users_fin['views'] ## 싫어요 비율

In [51]:
# 파생변수2 - 구독자 관련
merge_df_users_fin['subscribers_conversion_rate'] = merge_df_users_fin['subscribersGained'] / merge_df_users_fin['views'] ## 구독자 전환율
merge_df_users_fin['subscribed_view_rate'] = merge_df_users_fin['SUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 구독자 조회수 비율

In [52]:
# 파생변수3 - 수익 관련
merge_df_users_fin['revenue_per_subscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 수익
merge_df_users_fin['revenue_per_unsubscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 수익
merge_df_users_fin['revenue_per_red_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['redViews'] ## 프리미엄당 수익
merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'] / merge_df_users_fin['estimatedRevenue'] ## cpm 대비 수익
merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['adImpressions'] ## 광고노출당 수익

In [53]:
# 파생변수4 - 시청 시간 관련
merge_df_users_fin['watched_view_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['views'] ## 조회수당 시청시간
merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 시청시간

In [54]:
# null값 대체
merge_df_users_fin = merge_df_users_fin.fillna(0) ## NaN
merge_df_users_fin = merge_df_users_fin.replace([np.inf, -np.inf], 0) ## inf

#### y값 설정

중요 지표 표준화

In [55]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression'] ## null값은 views가 0인 데이터

In [56]:
# null값 대체
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].fillna(0) ## NaN
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].replace([np.inf, -np.inf], 0) ## inf

상관분석

In [184]:
# y값 파생변수와 원변수 간 상관분석
corr_df = merge_df_users_fin[['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 
                              'gross_revenue_per_ad_impression','likes','comments','shares','dislikes','views',
                              'subscribersGained','subscribersLost','estimatedRevenue','grossRevenue','adImpressions']].corr()

In [185]:
corr_df

Unnamed: 0,total_engage_rate,net_subscribers_change,averageViewPercentage,revenue_per_view,gross_revenue_per_ad_impression,likes,comments,shares,dislikes,views,subscribersGained,subscribersLost,estimatedRevenue,grossRevenue,adImpressions
total_engage_rate,1.0,0.120854,0.001931,0.013802,0.065892,0.11191,0.128123,0.121115,0.115068,0.110325,0.121558,0.118482,0.084117,0.066926,0.055054
net_subscribers_change,0.120854,1.0,0.034984,-0.006529,-0.003437,0.926094,0.480661,0.93316,0.949227,0.93513,0.995933,0.944209,0.56922,0.426375,0.418125
averageViewPercentage,0.001931,0.034984,1.0,-0.026598,-0.065428,0.024621,0.046363,0.026394,0.03831,0.036678,0.036476,0.039008,0.028707,-0.004945,0.001695
revenue_per_view,0.013802,-0.006529,-0.026598,1.0,0.069972,-0.005813,-0.006906,-0.005253,-0.007502,-0.007478,-0.006823,-0.007338,0.032,0.015746,0.008562
gross_revenue_per_ad_impression,0.065892,-0.003437,-0.065428,0.069972,1.0,-0.000309,-0.01002,0.006384,-0.005595,-0.005649,-0.004959,-0.008395,0.022216,0.043926,0.015187
likes,0.11191,0.926094,0.024621,-0.005813,-0.000309,1.0,0.507602,0.98637,0.984268,0.992798,0.944179,0.954301,0.688372,0.516061,0.49752
comments,0.128123,0.480661,0.046363,-0.006906,-0.01002,0.507602,1.0,0.504435,0.521587,0.512667,0.498199,0.525097,0.436524,0.335045,0.333592
shares,0.121115,0.93316,0.026394,-0.005253,0.006384,0.98637,0.504435,1.0,0.974669,0.981701,0.948497,0.951034,0.692005,0.53768,0.516643
dislikes,0.115068,0.949227,0.03831,-0.007502,-0.005595,0.984268,0.521587,0.974669,1.0,0.989836,0.96505,0.968221,0.696631,0.5161,0.502159
views,0.110325,0.93513,0.036678,-0.007478,-0.005649,0.992798,0.512667,0.981701,0.989836,1.0,0.953049,0.962362,0.710533,0.540257,0.527911


다중 지표 결합

In [182]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merge_df_users_fin[y_col])

In [186]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [187]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [188]:
# 전체 데이터 y값 빈도 확인
merge_df_users_fin['y_label'].value_counts()

y_label
1    77512
0     4080
Name: count, dtype: int64

In [83]:
# 계정별 y값 빈도 데이터
y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,397
1,627cb611aa6f212355e0b617,0,1
2,627f59ccaa39226247c60b01,1,374
3,627f59ccaa39226247c60b01,0,19
4,6287228afb15712a8cb931d7,1,400
...,...,...,...
360,65e7b773d8da110bb072e2b5,1,57
361,65e7b773d8da110bb072e2b5,0,4
362,65f7b17ed8da110bb0733b7b,1,49
363,65fecf7ed8da110bb0736199,1,44


In [85]:
# 계정별 일일데이터의 20% 이상 이상치 데이터인 계정 확인
y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 40]

Unnamed: 0,youtube_user_id,y_label,count
17,62872523fb15712a8cb93479,0,154
43,629f6ca6eaf5732d6df0611e,0,330
50,62a35ce69d41c93ff90b5670,0,90
76,62c4e558507271632b9cc1c7,0,99
82,62d11f080b4c4c7502a5be3d,0,398
92,62d55a5e9900f20e1f259d24,0,91
109,6332f892ef33d840a099abb3,0,42
119,639bb8dcd603b8138e33780b,0,181
142,63d77c9650eb530dfd139f8b,0,120
156,63eb4f87ee122e631992279f,0,68


#### 변수선택

##### 데이터 분할

In [189]:
# 컬럼 정리
unique_col = merge_df_users_fin.columns[:10]
x_col = merge_df_users_fin.columns[10:-1]

In [191]:
# 데이터 분할
X = merge_df_users_fin[x_col].drop(columns=['total_engage_rate','net_subscribers_change','revenue_per_view','averageViewPercentage', 'gross_revenue_per_ad_impression','subscribersGained', 'subscribersLost']) ## y값, 원변수 제거
y = merge_df_users_fin['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [192]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    62009
0     3264
Name: count, dtype: int64
y_label
1    15503
0      816
Name: count, dtype: int64


##### 언더샘플링

In [193]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

##### 변수선택

t-test

In [194]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]
print("Selected Features by t-test:", selected_features_by_ttest)

Selected Features by t-test: ['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'views', 'redViews', 'estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'monetizedPlaybacks', 'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']


In [195]:
print(len(selected_features_by_ttest))
print(selected_features_by_ttest)

31
['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'views', 'redViews', 'estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'monetizedPlaybacks', 'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_subscribed_view', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']


Lasso

In [196]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]

In [197]:
print(len(selected_features_by_lasso))
print(selected_features_by_lasso)

29
Index(['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched',
       'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares',
       'redViews', 'estimatedRevenue', 'estimatedAdRevenue',
       'estimatedRedPartnerRevenue', 'cpm', 'monetizedPlaybacks',
       'adImpressions', 'playbackBasedCpm', 'like_rate', 'comment_rate',
       'share_rate', 'dislike_rate', 'subscribers_conversion_rate',
       'subscribed_view_rate', 'revenue_per_subscribed_view',
       'revenue_per_unsubscribed_view', 'revenue_per_red_view',
       'cpm_to_revenue_ratio', 'revenue_per_ad_impression',
       'watched_view_rate', 'unsubscribed_view_time_rate'],
      dtype='object')


RandomForest

In [222]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = rf_model.feature_importances_
feature_importance_rf = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_rf = list(feature_importance_rf[feature_importance_rf >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_rf))
print(selected_features_by_rf)

29
['revenue_per_unsubscribed_view', 'like_rate', 'shares', 'cpm', 'share_rate', 'UNSUBSCRIBED', 'subscribers_conversion_rate', 'likes', 'comment_rate', 'dislikes', 'revenue_per_ad_impression', 'averageViewDuration', 'subscribed_view_rate', 'playbackBasedCpm', 'unsubscribed_view_time_rate', 'revenue_per_red_view', 'redViews', 'dislike_rate', 'estimatedMinutesWatched', 'watched_view_rate', 'comments', 'cpm_to_revenue_ratio', 'revenue_per_subscribed_view', 'SUBSCRIBED', 'estimatedRedPartnerRevenue', 'monetizedPlaybacks', 'estimatedRevenue', 'estimatedAdRevenue', 'adImpressions']


Gradient Boosting

In [224]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting 모델 학습
gbm_model = GradientBoostingRegressor(random_state=42)
gbm_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = gbm_model.feature_importances_
feature_importance_gbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_gbm = list(feature_importance_gbm[feature_importance_gbm >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_gbm))
print(selected_features_by_gbm)

28
['like_rate', 'revenue_per_unsubscribed_view', 'revenue_per_ad_impression', 'likes', 'shares', 'cpm', 'share_rate', 'revenue_per_red_view', 'estimatedMinutesWatched', 'estimatedRevenue', 'subscribers_conversion_rate', 'cpm_to_revenue_ratio', 'comment_rate', 'dislikes', 'watched_view_rate', 'comments', 'UNSUBSCRIBED', 'revenue_per_subscribed_view', 'averageViewDuration', 'dislike_rate', 'estimatedRedPartnerRevenue', 'SUBSCRIBED', 'redViews', 'subscribed_view_rate', 'estimatedAdRevenue', 'playbackBasedCpm', 'unsubscribed_view_time_rate', 'adImpressions']


LightGBM

In [225]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(len(selected_features_by_lgbm))
print(selected_features_by_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 6528, number of used features: 29
[LightGBM] [Info] Start training from score 0.500000
29
['subscribers_conversion_rate', 'like_rate', 'averageViewDuration', 'share_rate', 'subscribed_view_rate', 'revenue_per_unsubscribed_view', 'comment_rate', 'estimatedMinutesWatched', 'unsubscribed_view_time_rate', 'UNSUBSCRIBED', 'cpm', 'dislike_rate', 'revenue_per_ad_impression', 'watched_view_rate', 'shares', 'redViews', 'SUBSCRIBED', 'estimatedRevenue', 'revenue_per_red_view', 'likes', 'revenue_per_subscribed_view', 'cpm_to_revenue_ratio', 'playbackBasedCpm', 'comments', 'dislikes', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'monetizedPlaybacks', 'adImpressions']


XGBoost

In [226]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_xgb))
print(selected_features_by_xgb)

29
['revenue_per_unsubscribed_view', 'like_rate', 'shares', 'cpm', 'share_rate', 'likes', 'comment_rate', 'subscribers_conversion_rate', 'comments', 'dislikes', 'revenue_per_ad_impression', 'UNSUBSCRIBED', 'playbackBasedCpm', 'cpm_to_revenue_ratio', 'revenue_per_red_view', 'unsubscribed_view_time_rate', 'dislike_rate', 'subscribed_view_rate', 'estimatedRevenue', 'watched_view_rate', 'averageViewDuration', 'adImpressions', 'SUBSCRIBED', 'revenue_per_subscribed_view', 'redViews', 'estimatedRedPartnerRevenue', 'estimatedMinutesWatched', 'estimatedAdRevenue', 'monetizedPlaybacks']


In [227]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': selected_features_by_lasso,
    'rf_importance': rf_model.feature_importances_,
    'gbm_importance': gbm_model.feature_importances_,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['rf_importance', 'gbm_importance', 'lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [228]:
final_selected_features_user = list(importances_df[importances_df['mean_importance']>=0.01]['features'])
print(len(final_selected_features_user))
print(final_selected_features_user)

24
['UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'redViews', 'estimatedRevenue', 'cpm', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 'unsubscribed_view_time_rate']


#### 모델링

##### 모델 기법 적용

RandomForest

In [231]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 모델 학습
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 랜덤 포레스트 모델 교차 검증
cv_scores_rf = cross_val_score(rf_model, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"RandomForest Cross-Validation Accuracy: {cv_scores_rf.mean():.2f}")

RandomForest Cross-Validation Accuracy: 0.96


GradientBoosting

In [232]:
from sklearn.ensemble import GradientBoostingClassifier

# 그라디언트 부스팅 모델 학습
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_gb = cross_val_score(gb_model, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"GradientBoosting Cross-Validation Accuracy: {cv_scores_gb.mean():.2f}")

GradientBoosting Cross-Validation Accuracy: 0.96


XGBoost

In [238]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled[final_selected_features_user], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled[final_selected_features_user], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.96


##### 모델 성능 평가

In [239]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_rf = rf_model.predict(X_test[final_selected_features_user])
y_pred_gb = gb_model.predict(X_test[final_selected_features_user])
y_pred_xgb = xgb_model.predict(X_test[final_selected_features_user])

# 모델 정확도 확인
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"RandomForest Test Accuracy: {accuracy_rf:.2f}")
print(f"GradientBoosting Test Accuracy: {accuracy_gb:.2f}")
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

# 정밀도, 재현율, F1-score 출력
print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("GradientBoosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test[final_selected_features_user])[:, 1])
roc_auc_gb = roc_auc_score(y_test, gb_model.predict_proba(X_test[final_selected_features_user])[:, 1])
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features_user])[:, 1])

print(f"RandomForest ROC-AUC: {roc_auc_rf:.2f}")
print(f"GradientBoosting ROC-AUC: {roc_auc_gb:.2f}")
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

RandomForest Test Accuracy: 0.96
GradientBoosting Test Accuracy: 0.96
XGBoost Test Accuracy: 0.97
RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.96      0.72       816
           1       1.00      0.96      0.98     15503

    accuracy                           0.96     16319
   macro avg       0.79      0.96      0.85     16319
weighted avg       0.98      0.96      0.97     16319

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.95      0.69       816
           1       1.00      0.96      0.98     15503

    accuracy                           0.96     16319
   macro avg       0.77      0.96      0.84     16319
weighted avg       0.97      0.96      0.96     16319

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.96      0.76       816
           1       1.00      0.9

### 콘텐츠 데이터

#### 파생변수

In [10]:
# y값 파생변수
youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost'] ## 구독자 순증가
youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views'] ## 조회수당 수익
youtube_videos['grossRevenue_per_ad_impression'] = youtube_videos['grossRevenue'] / youtube_videos['adImpressions'] ## 1회 광고노출당 총수익
youtube_videos['total_card_teaser_click_rate'] = (youtube_videos['cardClicks'] + youtube_videos['cardTeaserClicks']) / (youtube_videos['cardImpressions'] + youtube_videos['cardTeaserImpressions']) ## 카드와 카드 티저의 총 클릭률
youtube_videos['playlist_engagement_rate'] = (youtube_videos['videosAddedToPlaylists'] + youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 참여도

In [11]:
# 파생변수1 - 참여도 관련
youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율

In [12]:
# 파생변수2 - 구독자 관련
youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율

In [13]:
# 파생변수3 - 수익 관련
youtube_videos['revenue_per_red_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['redViews'] ## 프리미엄당 수익
youtube_videos['ad_revenue_rate'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['estimatedRevenue'] ## 광고수익비율
youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
youtube_videos['revenue_per_ad_impression'] = youtube_videos['estimatedRevenue'] / youtube_videos['adImpressions'] ## 광고노출당 수익
youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익

In [14]:
# 파생변수4 - 시청 시간 관련
youtube_videos['avg_view_duration_rate'] = youtube_videos['averageViewDuration'] / youtube_videos['averageViewPercentage'] ## 평균 시청 시간 비율
youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간

In [15]:
# 파생변수5 - 광고 관련
youtube_videos['revenue_per_playback'] = youtube_videos['grossRevenue'] / youtube_videos['monetizedPlaybacks'] ## 1회 광고재생당 수익
youtube_videos['ad_playbacks_per_playlist_add'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 재생 비율

In [16]:
# 파생변수7 - 비디오 관련
youtube_videos['playlist_addition_rate'] = youtube_videos['videosAddedToPlaylists'] / youtube_videos['views'] ## 플레이리스트 추가 비율
youtube_videos['playlist_removal_rate'] = youtube_videos['videosRemovedFromPlaylists'] / youtube_videos['views'] ## 플레이리스트 제거 비율

In [17]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

#### y값 설정

중요 지표 표준화

In [65]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate'] ## null값은 views가 0인 데이터

In [66]:
# null값 대체
youtube_videos[y_col] = youtube_videos[y_col].fillna(0) ## NaN
youtube_videos[y_col] = youtube_videos[y_col].replace([np.inf, -np.inf], 0) ## inf

상관분석

In [251]:
# y값 파생변수와 원변수 간 상관분석
corr_df = youtube_videos[['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate',
                          'likes', 'comments', 'shares', 'dislikes', 'views', 'subscribersGained', 'subscribersLost', 'estimatedRevenue', 'grossRevenue', 'adImpressions', 'cardClicks', 'cardTeaserClicks',
                          'cardImpressions', 'cardTeaserImpressions', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists']].corr()
corr_df

Unnamed: 0,total_engage_rate,net_subscribers_change,averageViewPercentage,revenue_per_view,grossRevenue_per_ad_impression,total_card_teaser_click_rate,playlist_engagement_rate,likes,comments,shares,...,subscribersLost,estimatedRevenue,grossRevenue,adImpressions,cardClicks,cardTeaserClicks,cardImpressions,cardTeaserImpressions,videosAddedToPlaylists,videosRemovedFromPlaylists
total_engage_rate,1.0,0.013301,0.023578,0.002411,0.000792,0.000532,0.007157159,0.017302,0.069468,0.022786,...,0.008605,0.005173,0.000212,-0.000431,0.00119,0.001036528,0.000717,0.000463,0.01134,0.006416
net_subscribers_change,0.013301,1.0,0.008422,-0.001132,-0.004823,0.000145,-0.0008879934,0.568465,0.129862,0.585073,...,0.421466,0.206679,0.075275,0.07619,0.003628,0.004960686,0.004387,0.00813,0.319687,0.172606
averageViewPercentage,0.023578,0.008422,1.0,-0.006761,-0.043556,-0.003153,-0.005760457,0.00868,0.004478,0.00895,...,0.003788,-0.001668,-0.008316,-0.007609,-0.001703,-0.001446204,-0.001423,-0.001224,-0.000322,-0.008052
revenue_per_view,0.002411,-0.001132,-0.006761,1.0,0.166622,0.000819,0.0007567005,-0.001439,-0.000404,-0.00114,...,-0.000522,0.02563,0.005968,0.003266,0.000545,0.0005510907,0.00047,0.000371,-7.7e-05,0.001709
grossRevenue_per_ad_impression,0.000792,-0.004823,-0.043556,0.166622,1.0,0.009461,0.003108002,-0.006794,-0.002009,-0.004577,...,-0.002943,0.019918,0.027791,0.015778,0.00618,0.005507015,0.004763,0.003745,0.003443,0.015927
total_card_teaser_click_rate,0.000532,0.000145,-0.003153,0.000819,0.009461,1.0,0.0002001429,-0.000431,0.00146,0.000107,...,-0.000124,0.002249,0.003029,0.002558,0.260639,0.2062068,0.139693,0.013874,0.001603,0.003881
playlist_engagement_rate,0.007157,-0.000888,-0.00576,0.000757,0.003108,0.0002,1.0,-0.0009,-0.000395,-0.000894,...,-0.000481,-0.000952,-0.000304,-0.000305,7e-05,1.307216e-07,4.7e-05,-0.00013,0.061359,0.007399
likes,0.017302,0.568465,0.00868,-0.001439,-0.006794,-0.000431,-0.0009004789,1.0,0.175911,0.857807,...,0.465297,0.24478,0.011773,0.01249,0.000388,0.0007143847,0.00059,0.001365,0.568199,0.213555
comments,0.069468,0.129862,0.004478,-0.000404,-0.002009,0.00146,-0.0003949894,0.175911,1.0,0.169732,...,0.094826,0.104338,0.049918,0.052914,0.017999,0.0323097,0.025878,0.038874,0.130411,0.086937
shares,0.022786,0.585073,0.00895,-0.00114,-0.004577,0.000107,-0.0008937038,0.857807,0.169732,1.0,...,0.431097,0.268022,0.073256,0.072496,0.004277,0.005070999,0.004397,0.010757,0.574019,0.275618


다중 지표 결합

In [253]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(youtube_videos[y_col])

In [254]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [255]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
youtube_videos['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [256]:
# 전체 데이터 y값 빈도 확인
youtube_videos['y_label'].value_counts()

y_label
1    8136599
0     427744
Name: count, dtype: int64

In [104]:
# 계정별 y값 빈도 데이터
youtube_videos.groupby('youtube_user_id')['y_label'].value_counts().reset_index()

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,14802
1,627cb611aa6f212355e0b617,0,561
2,627f59ccaa39226247c60b01,1,8487
3,627f59ccaa39226247c60b01,0,37
4,6287228afb15712a8cb931d7,1,5152
...,...,...,...
491,65f7b17ed8da110bb0733b7b,0,14
492,65fecf7ed8da110bb0736199,1,8626
493,65fecf7ed8da110bb0736199,0,173
494,66230ee6d8da110bb0744b2d,1,3275


In [105]:
# 콘텐츠별 y값 빈도 데이터
y_result_df = youtube_videos.groupby('video')['y_label'].value_counts().reset_index()

# 콘텐츠별 일일 측정 데이터 수
video_cnt_df = youtube_videos.groupby('video')['end_date'].count().reset_index()

y_result_df = pd.merge(y_result_df,video_cnt_df,how='left',on='video')
y_result_df = y_result_df.rename(columns={'end_date':'total_count'})
y_result_df['standard_cnt'] = round(y_result_df['total_count'] * 0.1) ## 비디오별 일일 전체 데이터 중 이상치 데이터가 10% 이상
y_result_df['standard_cnt'] = y_result_df['standard_cnt'].astype(int)

y_result_df

Unnamed: 0,video,y_label,count,total_count,standard_cnt
0,,1,8,8,1
1,--0HSDH6J7o,1,15,15,2
2,--0XOlJ3Lw4,1,395,399,40
3,--0XOlJ3Lw4,0,4,399,40
4,--7sZPRc1H4,1,29,32,3
...,...,...,...,...,...
120126,zzamnOdv5BM,1,7,7,1
120127,zzjDvCasA4Q,1,342,342,34
120128,zzlQiqh04eE,1,21,21,2
120129,zzwBOCOq5YI,1,342,342,34


In [106]:
# 계정별 일일데이터의 10% 이상 이상치 데이터
video_outlier_df = y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] > y_result_df[y_result_df['y_label']==0]['standard_cnt']].reset_index(drop=True)
video_outlier_df

Unnamed: 0,video,y_label,count,total_count,standard_cnt
0,--T14bWvRFg,0,9,24,2
1,--mDukleraA,0,25,178,18
2,--stoDeuI_4,0,4,8,1
3,--vBR7LAnYs,0,31,47,5
4,-08UP7FVd68,0,2,2,0
...,...,...,...,...,...
9339,zyNd-y7gw3o,0,30,87,9
9340,zyRJyjYppbM,0,4,15,2
9341,zyRZIo2q9Oc,0,73,324,32
9342,zyjbDedYZB8,0,5,5,0


In [107]:
# 이상치 중 데이터 부족으로 판단 어려운 것 제거 후 최종 이상치 콘텐츠
video_outlier_fin = video_outlier_df[video_outlier_df['standard_cnt']!=0]['video'].unique()

In [108]:
# 계정별 이상치 콘텐츠 결과 데이터셋
result_contents_df = pd.merge(youtube_videos.groupby('youtube_user_id')['video'].count().reset_index(),
                              youtube_videos[youtube_videos['video'].isin(video_outlier_fin)].groupby('youtube_user_id')['video'].count().reset_index(),
                              how='left',on='youtube_user_id')
result_contents_df = result_contents_df[~result_contents_df['video_y'].isnull()].reset_index(drop=True)

In [114]:
result_contents_df

Unnamed: 0,youtube_user_id,video_x,video_y
0,627cb611aa6f212355e0b617,15363,651.0
1,6287229efb15712a8cb93225,79776,2151.0
2,628722c8fb15712a8cb9326e,34639,392.0
3,62872317fb15712a8cb932e9,43204,21.0
4,62872370fb15712a8cb93337,36418,47.0
...,...,...,...
191,65cc401305bf1c0baa425146,12038,5607.0
192,65e7b773d8da110bb072e2b5,6191,216.0
193,65f7b17ed8da110bb0733b7b,2524,41.0
194,65fecf7ed8da110bb0736199,8799,195.0


#### 변수선택

##### 데이터 분할

In [257]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:-1]

In [258]:
# 데이터 분할
X = youtube_videos[x_col].drop(columns=['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression','total_card_teaser_click_rate','playlist_engagement_rate','subscribersGained']) ## y값, 원변수 제거
y = youtube_videos['y_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [119]:
# 불균형 확인
print(y_train.value_counts())
print(y_test.value_counts())

y_label
1    6509053
0     342352
Name: count, dtype: int64
y_label
1    1626991
0      85861
Name: count, dtype: int64


##### 언더샘플링

In [259]:
# 언더샘플링
rus = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

##### 변수선택

t-test

In [260]:
from scipy.stats import ttest_ind

# y=0과 y=1 그룹으로 데이터 나누기
group_0 = X_train_resampled[y_train_resampled == 0]
group_1 = X_train_resampled[y_train_resampled == 1]

# t-test 수행
p_values = []
for col in X_train_resampled.columns:
    t_stat, p_val = ttest_ind(group_0[col], group_1[col], equal_var=False)  # Welch's t-test
    p_values.append((col, p_val))

# p-value < 0.05인 변수 선택
selected_features_by_ttest = [col for col, p_val in p_values if p_val < 0.05]

print(len(selected_features_by_ttest))
print(selected_features_by_ttest)

Selected Features by t-test: ['views', 'redViews', 'comments', 'likes', 'dislikes', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'subscribersLost', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'monetizedPlaybacks', 'playbackBasedCpm', 'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression', 'net_revenue_per_playlist_add', 'avg_view_duration_rate', 'watched_time_rate', 'watched_view_red_rate', 'revenue_per_playback', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']


In [261]:
print(len(selected_features_by_ttest))
print(selected_features_by_ttest)

41
['views', 'redViews', 'comments', 'likes', 'dislikes', 'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'shares', 'estimatedMinutesWatched', 'estimatedRedMinutesWatched', 'averageViewDuration', 'cardClickRate', 'cardTeaserClickRate', 'cardImpressions', 'cardTeaserImpressions', 'cardClicks', 'cardTeaserClicks', 'subscribersLost', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue', 'monetizedPlaybacks', 'playbackBasedCpm', 'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression', 'net_revenue_per_playlist_add', 'avg_view_duration_rate', 'watched_time_rate', 'watched_view_red_rate', 'revenue_per_playback', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']


Lasso

In [262]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# t-test로 선택된 변수로 데이터프레임 구성
X_train_ttest_selected = X_train_resampled[selected_features_by_ttest]

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ttest_selected)

# LassoCV 모델 설정: 반복 횟수 증가, alpha 범위 조정
lasso = LassoCV(cv=5, random_state=42, max_iter=10000, alphas=[0.1, 0.05, 0.01, 0.005, 0.001]).fit(X_train_scaled, y_train_resampled)

# 선택된 변수 확인 (회귀 계수가 0이 아닌 변수들)
selected_features_by_lasso = X_train_ttest_selected.columns[(lasso.coef_ != 0)]

print(len(selected_features_by_lasso))
print(selected_features_by_lasso)

33
Index(['views', 'redViews', 'comments', 'likes', 'dislikes',
       'videosAddedToPlaylists', 'videosRemovedFromPlaylists', 'shares',
       'estimatedRedMinutesWatched', 'averageViewDuration', 'cardClickRate',
       'cardTeaserClickRate', 'cardImpressions', 'estimatedRevenue',
       'estimatedAdRevenue', 'grossRevenue', 'estimatedRedPartnerRevenue',
       'monetizedPlaybacks', 'adImpressions', 'cpm', 'comment_rate',
       'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view',
       'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression',
       'net_revenue_per_playlist_add', 'watched_time_rate',
       'watched_view_red_rate', 'ad_playbacks_per_playlist_add',
       'playlist_addition_rate', 'playlist_removal_rate'],
      dtype='object')


LightGBM

In [268]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(len(selected_features_by_lgbm))
print(selected_features_by_lgbm)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7763
[LightGBM] [Info] Number of data points in the train set: 684684, number of used features: 33
[LightGBM] [Info] Start training from score 0.500000
23
['views', 'likes', 'cpm', 'playlist_addition_rate', 'watched_time_rate', 'shares', 'subscribers_conversion_rate', 'estimatedRevenue', 'playlist_removal_rate', 'averageViewDuration', 'revenue_per_ad_impression', 'comment_rate', 'revenue_per_red_view', 'videosAddedToPlaylists', 'estimatedAdRevenue', 'dislike_rate', 'net_revenue_per_playlist_add', 'redViews', 'adImpressions', 'grossRevenue', 'watched_view_red_rate', 'ad_revenue_rate', 'red_revenue_rate']


XGBoost

In [269]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_resampled[selected_features_by_lasso], y_train_resampled)

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=selected_features_by_lasso).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_xgb))
print(selected_features_by_xgb)

29
['videosAddedToPlaylists', 'cpm', 'playlist_removal_rate', 'shares', 'likes', 'views', 'dislike_rate', 'revenue_per_ad_impression', 'comment_rate', 'playlist_addition_rate', 'redViews', 'estimatedRevenue', 'subscribers_conversion_rate', 'watched_time_rate', 'ad_playbacks_per_playlist_add', 'watched_view_red_rate', 'red_revenue_rate', 'adImpressions', 'averageViewDuration', 'estimatedAdRevenue', 'revenue_per_red_view', 'comments', 'estimatedRedMinutesWatched', 'dislikes', 'monetizedPlaybacks', 'ad_revenue_rate', 'grossRevenue', 'net_revenue_per_playlist_add', 'videosRemovedFromPlaylists']


In [270]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': selected_features_by_lasso,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_    
})
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [271]:
importances_df.sort_values('mean_importance',ascending=False)

Unnamed: 0,features,lgbm_importance,xgb_importance,mean_importance
5,videosAddedToPlaylists,0.016,0.326459,0.17123
0,views,0.148333,0.053886,0.10111
19,cpm,0.079,0.100828,0.089914
3,likes,0.092667,0.058364,0.075515
32,playlist_removal_rate,0.055667,0.093912,0.074789
7,shares,0.075333,0.071198,0.073266
31,playlist_addition_rate,0.077667,0.033067,0.055367
28,watched_time_rate,0.077333,0.011751,0.044542
13,estimatedRevenue,0.068667,0.016634,0.04265
22,subscribers_conversion_rate,0.071667,0.011951,0.041809


In [273]:
final_selected_features_video = list(importances_df[importances_df['mean_importance']>=0.005]['features'])
print(len(final_selected_features_video))
print(final_selected_features_video)

24
['views', 'redViews', 'likes', 'videosAddedToPlaylists', 'shares', 'averageViewDuration', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression', 'net_revenue_per_playlist_add', 'watched_time_rate', 'watched_view_red_rate', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']


#### 모델링

##### 모델 기법 적용

XGBoost

In [276]:
from xgboost import XGBClassifier

# XGBoost를 사용한 모델 학습
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled[final_selected_features_video], y_train_resampled)

# 그라디언트 부스팅 모델 교차 검증
cv_scores_xgb = cross_val_score(xgb_model, X_train_resampled[final_selected_features_video], y_train_resampled, cv=5, scoring='accuracy')
print(f"XGBoost Cross-Validation Accuracy: {cv_scores_xgb.mean():.2f}")

XGBoost Cross-Validation Accuracy: 0.99


##### 모델 성능 평가

In [277]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 예측 결과 생성
y_pred_xgb = xgb_model.predict(X_test[final_selected_features_video])

# 모델 정확도 확인
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

# ROC-AUC 확인
roc_auc_xgb = roc_auc_score(y_test, xgb_model.predict_proba(X_test[final_selected_features_video])[:, 1])
print(f"XGBoost ROC-AUC: {roc_auc_xgb:.2f}")

XGBoost Test Accuracy: 0.98
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.99      0.87     85402
           1       1.00      0.98      0.99   1627467

    accuracy                           0.98   1712869
   macro avg       0.88      0.99      0.93   1712869
weighted avg       0.99      0.98      0.99   1712869

XGBoost ROC-AUC: 1.00


### 결과 확인

#### 계정 데이터

In [67]:
final_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression','UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 
             'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'redViews', 'estimatedRevenue', 'cpm', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 
             'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 
             'unsubscribed_view_time_rate']

다중 지표 결합

In [69]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merge_df_users_fin[final_col])

In [70]:
# Isolation Forest 모델 학습
iso_forest_user = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest_user.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest_user.decision_function(scaled_features)

In [71]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [72]:
# 계정별 y값 빈도 데이터
y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,397
1,627cb611aa6f212355e0b617,0,1
2,627f59ccaa39226247c60b01,1,384
3,627f59ccaa39226247c60b01,0,9
4,6287228afb15712a8cb931d7,1,400
...,...,...,...
346,65cc401305bf1c0baa425146,1,1
347,65e7b773d8da110bb072e2b5,1,61
348,65f7b17ed8da110bb0733b7b,1,49
349,65fecf7ed8da110bb0736199,1,44


In [141]:
# 계정별 일일데이터의 10% 이상 이상치 데이터인 계정 확인
fraud_user_id = list(y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 30]['youtube_user_id'])

In [142]:
print(len(fraud_user_id))
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_user_id)]['channel_title'].unique())

23
['OBL - 온라인 농부, 사자가 되다' 'Jeffreyxking' '벽돌할아버지 Brick grandpa'
 '잉툰TV- 만화로 쉽게 영어배우자' '앙찡' '북토크' '수빙수tv sooBingsoo' 'kiu기우쌤' '임삐나' '주피코'
 'Mind Patting마음토닥' '부반TV_부에 반하다' 'MINLEE 민리' 'abbapraise 아바프레이즈' '그롬마쉬TV'
 '채림처럼firstcherry' '미니멀영어 Minimal English' '日本ジヌ【니혼지누】ー韓国に関する全て' '키나kkina'
 '잼스기타' '뷰드름 유튜버 인씨' '너굴몬' '코인덕 차트아지']


In [171]:
# 각 집단 통계수치 비교
merge_df_users_fin.groupby('y_label')[['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression']].describe().T

Unnamed: 0,y_label,0,1
revenue_per_view,count,4080.0,77512.0
revenue_per_view,mean,10.530775,1.270002
revenue_per_view,std,60.362506,5.555334
revenue_per_view,min,0.0,-1344.523306
revenue_per_view,25%,0.218283,0.203928
revenue_per_view,50%,1.502663,0.897934
revenue_per_view,75%,8.624007,1.763227
revenue_per_view,max,1891.0606,317.601203
gross_revenue_per_ad_impression,count,4080.0,77512.0
gross_revenue_per_ad_impression,mean,8.277194,4.071427


#### 콘텐츠 데이터

In [77]:
final_video_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression', 'total_card_teaser_click_rate', 
                   'playlist_engagement_rate','views', 'redViews', 'likes', 'videosAddedToPlaylists', 'shares', 'averageViewDuration', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 
                   'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression',
                   'net_revenue_per_playlist_add', 'watched_time_rate', 'watched_view_red_rate', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']

다중 지표 결합

In [78]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(youtube_videos[final_video_col])

In [79]:
# Isolation Forest 모델 학습
iso_forest_video = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest_video.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest_video.decision_function(scaled_features)

In [80]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
youtube_videos['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [81]:
# 이상치 콘텐츠 확인 - 콘텐츠 일일 데이터 중 10% 이상 이상치 데이터 확인
result_video_df = youtube_videos.groupby('video')['y_label'].value_counts().reset_index()

result_video_df_outlier = pd.merge(result_video_df[result_video_df['y_label']==0].reset_index(drop=True), ## video별 이상치 테이블
                                   result_video_df.groupby('video')['count'].sum().reset_index(), ## 전체 video 테이블
                                   how='left', on='video')
video_id_outlier = list(result_video_df_outlier[(result_video_df_outlier['count_x'] / result_video_df_outlier['count_y']) >= 0.1]['video'].unique()) ## video별 이상치 비율

In [143]:
# 이상치 콘텐츠가 20% 이상 있는 계정 확인
result_user_df_outlier = pd.merge(youtube_videos[youtube_videos['video'].isin(video_id_outlier)].groupby('youtube_user_id')['video'].count().reset_index(),
                                  youtube_videos.groupby(['youtube_user_id'])['video'].count().reset_index(), how='left', on='youtube_user_id')
fraud_video_user_id = list(result_user_df_outlier[(result_user_df_outlier['video_x'] / result_user_df_outlier['video_y']) >= 0.15]['youtube_user_id'].unique())

In [144]:
print(len(fraud_video_user_id))
print(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(fraud_video_user_id)]['channel_title'].unique())

40
['정가거부' 'Jeffreyxking' '래띠 LAETI' 'OSSC' 'WORKS.D PLAYLIST' '히스커버리 역사채널'
 '앙찡' '이현우의 MLBTV' '북토크' '축구 읽어주는 여자 쵱내' '수빙수tv sooBingsoo' '팀브라더스'
 'kiu기우쌤' '비됴클래스' '쿜쿜쿜' '뻘짓연구소' '나나무비' '주피코' '름쿠 ᴘʟᴀʏʟɪꜱᴛ'
 '월텍남 - 월스트리트 테크남' 'Mind Patting마음토닥' '황나겸' '돈냄새' 'Suzevi ASMR'
 'MINLEE 민리' '나연이즈백 LPGA Na Yeon Choi' '청어람ARMC' 'abbapraise 아바프레이즈'
 '그롬마쉬TV' '미니멀영어 Minimal English' '日本ジヌ【니혼지누】ー韓国に関する全て' '잼스기타'
 '뷰드름 유튜버 인씨' '너굴몬' 'MerryMa 메리마' '와뷰티TV | Wow Beauty ASMR']


In [173]:
youtube_videos.groupby('y_label')[['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 
                                   'grossRevenue_per_ad_impression', 'total_card_teaser_click_rate', 'playlist_engagement_rate']].describe().T

Unnamed: 0,y_label,0,1
grossRevenue_per_ad_impression,count,428218.0,8136125.0
grossRevenue_per_ad_impression,mean,10.282877,1.861948
grossRevenue_per_ad_impression,std,35.43299,3.460786
grossRevenue_per_ad_impression,min,0.0,0.0
grossRevenue_per_ad_impression,25%,0.0,0.0
grossRevenue_per_ad_impression,50%,4.78887,0.0
grossRevenue_per_ad_impression,75%,8.395355,2.975445
grossRevenue_per_ad_impression,max,5280.42306,113.7281
total_card_teaser_click_rate,count,428218.0,8136125.0
total_card_teaser_click_rate,mean,0.000584,0.0001346088


계정, 콘텐츠 이상치별 결과 확인

In [149]:
# 계정 이상치, 콘텐츠 이상치 - 영향력이 큰 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))]['channel_title'].unique()))
print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))]['channel_title'].unique()))

15
['Jeffreyxking', '앙찡', '북토크', '수빙수tv sooBingsoo', 'kiu기우쌤', '주피코', 'Mind Patting마음토닥', 'MINLEE 민리', 'abbapraise 아바프레이즈', '그롬마쉬TV', '미니멀영어 Minimal English', '日本ジヌ【니혼지누】ー韓国に関する全て', '잼스기타', '뷰드름 유튜버 인씨', '너굴몬']


In [150]:
# 계정 정상, 콘텐츠 정상 - 영향력이 작거나 측정값이 부족
print(len(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique()))
print(list(merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isin(set(list(fraud_user_id) + list(fraud_video_user_id)))]['channel_title'].unique()))

190
['담비', '임영곤 게임방송', '모리녀', '자수의숲jasooforest', '도아이 Doh-I', '스타트업잡스', '콜로니', '미키버그 VR게임', '시골낭만아재', '-mentalholder 멘탈홀더 tv', '빅민 GAME', '군대위키', '드론브이로그 DroneVlog', '키메키친 Kime_kitchen', '후니트립 hoony_trip', '채채ChaeChae', '복지다있소', '세계여행 테리로그 TERRYLOG', '로컬필름 LOCAL FILM', '모염 moyeom', '석시원 커플 SeokSiWon Couple', '법무법인 슈가스퀘어', '소리미의 신화방송', '두꼽이Challenge', '수집의 수집', '막셋의 종합게임', '뚜니랑', '어웨이커 | 크리에이터 이코노미', '에디레일 Eddy Rails', '윤새 Yoonsae', '탬니몰리', '여행윤Tripyun', '고기,요정 MeatPixie', '자린이 조피디', '강포동하우스', '시현하다 RECORDERS', 'Ella', '라이라마', 'sa lly', '마파TV', '콜드쉽 Coldsheep', '띠혜 ddihye', '담순언니 Twins Vlog', 'BUNNY', '김밈서', '시리얼 Sireal', 'AllaproTV', '카이바군', '구봉바다낚시 뽀식이', 'DDONIE 또니 / 러브크레센트', '정케빈 KEVIN', '오디디 코미디', 'fromsuzy 프롬수지', 'KIMBEE 킴비', '슈로시안 SUROSIAN', 'hyeppening 혜프닝', '키키낙낙', '은는이가', '니들needle', 'ORlGN 오리진', 'the sence', '이숲soop', '목소리 연기자 유지컬', '닷츠 DOTS', 'Lizzy리지', '핸슥슥', '김우다', '코딩국수', '전또', 'JN테크리뷰', '래아TV', '고군 Gohgoon', '하부유튜브 Minor / (Lower) YouTube', '오늘도희다 HEEDA', '슬기런바디 Run Body'

In [151]:
# 계정 정상, 콘텐츠 이상치 - 잠재적 영향력이 있는 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique()))
print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_video_user_id) - set(fraud_user_id))]['channel_title'].unique()))

21
['정가거부', '래띠 LAETI', 'OSSC', 'WORKS.D PLAYLIST', '히스커버리 역사채널', '이현우의 MLBTV', '축구 읽어주는 여자 쵱내', '팀브라더스', '비됴클래스', '쿜쿜쿜', '뻘짓연구소', '나나무비', '름쿠 ᴘʟᴀʏʟɪꜱᴛ', '월텍남 - 월스트리트 테크남', '황나겸', '돈냄새', 'Suzevi ASMR', '나연이즈백 LPGA Na Yeon Choi', '청어람ARMC', 'MerryMa 메리마', '와뷰티TV | Wow Beauty ASMR']


In [152]:
# 계정 이상치, 콘텐츠 정상 - 가짜 영향력 계정
print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique()))
print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) - set(fraud_video_user_id))]['channel_title'].unique()))

8
['OBL - 온라인 농부, 사자가 되다', '벽돌할아버지 Brick grandpa', '잉툰TV- 만화로 쉽게 영어배우자', '임삐나', '부반TV_부에 반하다', '채림처럼firstcherry', '키나kkina', '코인덕 차트아지']


## 주제2

### 콘텐츠 데이터 분석

#### 파생변수

In [None]:
# y값 파생변수 - 구독자 순증가
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']

In [36]:
# 파생변수1 - 참여도 관련
youtube_videos['share_rate'] = youtube_videos['shares'] / youtube_videos['views'] ## 공유 비율  
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율

In [37]:
# 파생변수2 - 구독자 관련
youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율
youtube_videos['subscribers_gained_per_playlist_add'] = youtube_videos['subscribersGained'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 구독자 증가
youtube_videos['subscribers_lost_per_playlist_remove'] = youtube_videos['subscribersLost'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 구독자 손실

In [38]:
# 파생변수4 - 시청 시간 관련
youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간

In [39]:
# 파생변수7 - 비디오 관련
youtube_videos['net_playlist_addition_rate'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 순추가 비율

In [40]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

#### 기간별 피처 생성

In [41]:
# 주별, 월별, 분기별 변수 생성

# 조회수 관련
youtube_videos['weekly_views'] = youtube_videos['views'].rolling(window=7).sum()
youtube_videos['monthly_views'] = youtube_videos['views'].rolling(window=30).sum()
youtube_videos['quarterly_views'] = youtube_videos['views'].rolling(window=90).sum()

# 시청 시간 관련
youtube_videos['weekly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=7).sum()
youtube_videos['monthly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=30).sum()

# 참여도 관련
youtube_videos['weekly_total_engagement'] = (youtube_videos['likes'].rolling(window=7).sum() +
                                 youtube_videos['dislikes'].rolling(window=7).sum() +
                                 youtube_videos['comments'].rolling(window=7).sum() +
                                 youtube_videos['shares'].rolling(window=7).sum())
youtube_videos['monthly_total_engagement'] = (youtube_videos['likes'].rolling(window=30).sum() +
                                  youtube_videos['dislikes'].rolling(window=30).sum() +
                                  youtube_videos['comments'].rolling(window=30).sum() +
                                  youtube_videos['shares'].rolling(window=30).sum())
youtube_videos['quarterly_total_engagement'] = (youtube_videos['likes'].rolling(window=90).sum() +
                                    youtube_videos['dislikes'].rolling(window=90).sum() +
                                    youtube_videos['comments'].rolling(window=90).sum() +
                                    youtube_videos['shares'].rolling(window=90).sum())
youtube_videos['weekly_engagement_rate'] = youtube_videos['weekly_total_engagement'] / (youtube_videos['weekly_views'] + 1)
youtube_videos['weekly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=7).sum()

In [42]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

#### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [43]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:].drop(['net_subscribers_change','subscribersGained', 'subscribersLost']) ## y값 및 원변수 제거

In [44]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [45]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(6533916, 49)
(2030341, 49)


### 변수선택

상관분석

In [20]:
# 상관계수 절대값이 0.3 이상
corr_df = train_data[['net_subscribers_change'] + list(x_col)].corr()
selected_features_by_corr = list(corr_df['net_subscribers_change'][abs(corr_df['net_subscribers_change']) >= 0.3].keys())
selected_features_by_corr.remove('net_subscribers_change')
print(selected_features_by_corr)
print(len(selected_features_by_corr))

['views', 'redViews', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'weekly_views', 'monthly_views', 'quarterly_views', 'weekly_watch_time', 'monthly_watch_time', 'weekly_total_engagement', 'monthly_total_engagement', 'quarterly_total_engagement']
14


LightGBM

In [21]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(train_data[x_col], train_data['net_subscribers_change'])

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(len(selected_features_by_lgbm))
print(selected_features_by_lgbm)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.211521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26161
[LightGBM] [Info] Number of data points in the train set: 6533916, number of used features: 128
[LightGBM] [Info] Start training from score 0.848527
31
['subscribers_conversion_rate', 'dislikes', 'dislike_rate', 'shares', 'subscribers_gained_per_playlist_add', 'views', 'videosAddedToPlaylists', 'subscribers_lost_per_playlist_remove', 'likes', 'share_rate', 'watched_view_rate', 'comments', 'averageViewPercentage', 'videosRemovedFromPlaylists', 'total_engage_rate', 'watch_time_per_playlist_add', 'watch_time_loss_per_playlist_remove', 'like_to_dislike_ratio', 'averageViewDuration', 'monthly_videos_removed', 'redViews', 'monthly_avg_view_duration', 'estimatedRedMinutesWatched', 'comment_rate', 'weekly_engagement_rate', 'monthly_avg_v

XGBoost

In [22]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[x_col], train_data['net_subscribers_change'])

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_xgb))
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

51
XGBoost로 선정된 변수: 
['dislikes', 'shares', 'weekly_engagement_rate', 'subscribers_conversion_rate', 'net_playlist_addition_rate', 'videosAddedToPlaylists', 'weekly_videos_added', 'subscribers_gained_per_playlist_add', 'likes', 'videosRemovedFromPlaylists', 'estimatedRedMinutesWatched', 'weekly_watch_time', 'ad_impressions_per_playlist_add', 'like_rate', 'monthly_views', 'views', 'monthly_videos_added', 'monthly_avg_view_duration', 'quarterly_revenue_per_ad_impression', 'comments', 'quarterly_videos_added', 'comment_rate', 'estimatedRedPartnerRevenue', 'watched_view_rate', 'quarterly_avg_view_duration', 'weekly_playlist_change_rate', 'quarterly_playlist_change_rate', 'like_to_dislike_ratio', 'weekly_videos_removed', 'dislike_rate', 'total_engage_rate', 'averageViewPercentage', 'weekly_views', 'subscribers_lost_per_playlist_remove', 'playlist_engagement_rate', 'watch_time_loss_per_playlist_remove', 'quarterly_views', 'adImpressions', 'estimatedMinutesWatched', 'red_revenue_rate', 'share

In [23]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': x_col,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_
    })
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [24]:
importances_df.sort_values(['mean_importance'],ascending=False).iloc[:40]

Unnamed: 0,features,lgbm_importance,xgb_importance,mean_importance
4,dislikes,0.108667,0.277542,0.193104
34,subscribers_conversion_rate,0.184,0.062526,0.123263
5,shares,0.068,0.163281,0.115641
122,weekly_engagement_rate,0.006667,0.097462,0.052064
10,videosAddedToPlaylists,0.051,0.0449,0.04795
36,subscribers_gained_per_playlist_add,0.065,0.024105,0.044552
29,dislike_rate,0.084667,0.003644,0.044155
0,views,0.055333,0.007759,0.031546
74,net_playlist_addition_rate,0.003,0.058954,0.030977
3,likes,0.035667,0.021837,0.028752


In [43]:
# 최종 변수 선택
final_selected_features = list(set(list(importances_df[importances_df['mean_importance'] >= 0.01]['features']) + selected_features_by_corr))
print(len(final_selected_features))
print(final_selected_features)

27
['watched_view_rate', 'estimatedMinutesWatched', 'weekly_engagement_rate', 'monthly_views', 'estimatedRedMinutesWatched', 'redViews', 'weekly_videos_added', 'views', 'share_rate', 'weekly_total_engagement', 'dislike_rate', 'weekly_watch_time', 'videosRemovedFromPlaylists', 'subscribers_conversion_rate', 'subscribers_gained_per_playlist_add', 'net_playlist_addition_rate', 'shares', 'quarterly_total_engagement', 'dislikes', 'quarterly_views', 'comments', 'likes', 'subscribers_lost_per_playlist_remove', 'monthly_watch_time', 'videosAddedToPlaylists', 'monthly_total_engagement', 'weekly_views']


In [48]:
final_selected_features = ['watched_view_red_rate', 'estimatedMinutesWatched', 'weekly_engagement_rate', 'monthly_views', 'estimatedRedMinutesWatched', 'redViews', 'weekly_videos_added', 'views', 'share_rate', 'weekly_total_engagement', 'dislike_rate', 'weekly_watch_time', 'videosRemovedFromPlaylists', 'subscribers_conversion_rate', 'subscribers_gained_per_playlist_add', 'net_playlist_addition_rate', 'shares', 'quarterly_total_engagement', 'dislikes', 'quarterly_views', 'comments', 'likes', 'subscribers_lost_per_playlist_remove', 'monthly_watch_time', 'videosAddedToPlaylists', 'monthly_total_engagement', 'weekly_views']

### 모델링

#### 모델 적용

XGBoost

In [50]:
# 콘텐츠별 구독자 순증가 평균
youtube_videos.groupby('video')['net_subscribers_change'].sum().mean()

np.float64(91.76659115215216)

In [51]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 모델 정의 및 학습
# XGBoost를 사용한 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[final_selected_features], train_data['net_subscribers_change'])

# 예측
y_pred = xgb_model.predict(test_data[final_selected_features])

# MSE 계산
mse = mean_squared_error(test_data['net_subscribers_change'], y_pred)

# RMSE 계산
rmse = np.sqrt(mse)

# R² 값 계산
r2 = r2_score(test_data['net_subscribers_change'], y_pred)

# Adjusted R² 계산
n = len(test_data)  # 샘플 수
p = test_data.shape[1]  # 독립 변수(특성) 수
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# 결과 출력
print('XGBoost')
print(f"R² 값: {r2:.4f}")
print(f"Adjusted R² 값: {adjusted_r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print("----------------------------------------")
print("")

XGBoost
R² 값: 0.7233
Adjusted R² 값: 0.7233
MSE: 188.7693
RMSE: 13.7393
----------------------------------------



### 결과 확인

In [52]:
# 예측 결과 확인
youtube_videos['predict'] = xgb_model.predict(youtube_videos[final_selected_features])

In [53]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'net_subscribers_change', 'predict']]

# 계정별 콘텐츠의 구독자 순증감 1일 합계
result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
    'net_subscribers_change': 'sum',
    'predict': 'sum'
})

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값 계산
result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

# Shift와 Rolling 연산을 위한 그룹별 처리
result_contents_df['1_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-30).rolling(window=30).sum())
result_contents_df['3_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-90).rolling(window=90).sum())
result_contents_df['6_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-180).rolling(window=180).sum())
result_contents_df['12_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-365).rolling(window=365).sum())

# 계정별로 최종 평균값을 계산
result_contents_df_final = result_contents_df.groupby('youtube_user_id').agg({
    'net_subscribers_change': 'mean',
    'predict': 'mean',
    '1_month_future_predict': 'mean',
    '3_month_future_predict': 'mean',
    '6_month_future_predict': 'mean',
    '12_month_future_predict': 'mean'
}).reset_index()

In [56]:
# 계정 확인 테이블
user_info_df = merge_df_users_fin[['youtube_user_id','channel_title','published_at']].drop_duplicates().reset_index(drop=True)
user_info_df = user_info_df[user_info_df['channel_title']!='0'].drop_duplicates().reset_index(drop=True)
user_info_df = user_info_df[~user_info_df['channel_title'].isnull()].reset_index(drop=True)
user_info_df = user_info_df.drop_duplicates(['youtube_user_id']).reset_index(drop=True)
user_info_df = user_info_df.drop_duplicates(['channel_title']).reset_index(drop=True)

1개월 결과 확인

In [149]:
# 기초통계 확인
result_contents_df_final['1_month_future_predict'].describe()

count       239.000000
mean       3734.765172
std       26370.672442
min          -0.174987
25%          45.014520
50%         216.290368
75%        1138.661205
max      370256.660160
Name: 1_month_future_predict, dtype: float64

In [58]:
# 사분위수 기준 구간 나누기
# Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
Q1 = result_contents_df_final['1_month_future_predict'].quantile(0.25)
# Q2 = result_contents_df_final['1_month_future_predict'].quantile(0.5)
Q3 = result_contents_df_final['1_month_future_predict'].quantile(0.75)

# Q1, Q2, Q3에 기반하여 구간 나누기
result_contents_df_final['1_month_future_predict_result'] = pd.cut(result_contents_df_final['1_month_future_predict'],
                                                                   bins=[-float('inf'), Q1, Q3, float('inf')],
                                                                   labels=['Low', 'Medium', 'High'])

In [62]:
# 유지 및 감소 계정 - Low
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Low']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Low']['youtube_user_id'])]['channel_title'].unique())

57
['자수의숲jasooforest' '도아이 Doh-I' '콜로니' '-mentalholder 멘탈홀더 tv'
 'OBL - 온라인 농부, 사자가 되다' '드론브이로그 DroneVlog' '세계여행 테리로그 TERRYLOG'
 '로컬필름 LOCAL FILM' '법무법인 슈가스퀘어' '두꼽이Challenge' 'WORKS.D PLAYLIST' '강포동하우스'
 'Ella' 'sa lly' '마파TV' '담순언니 Twins Vlog' '카이바군' 'DDONIE 또니 / 러브크레센트'
 'fromsuzy 프롬수지' '키키낙낙' '은는이가' 'ORlGN 오리진' 'the sence' '이숲soop'
 '고군 Gohgoon' '임삐나' '유익한 균튜버' '꿈꾼 배기' '한나임한나Hannaim' '보미름' '나나무비' '루깬미'
 '름쿠 ᴘʟᴀʏʟɪꜱᴛ' '배우GO' '부반TV_부에 반하다' '오토컨테이너 스튜디오' '연디아 채널 Yeondia Channel'
 '청어람ARMC' '김두부' 'SBM&E Official' '홈바부부_HOMBA BOOBOO' '캠핑 릴리아빠' '달고캠핑'
 '세남자 물고기' 'MORE김모어' '성한준' 'gahyun 가현' '이고 EGO' '기자 황덕현 KIJA HWANG' '모리캠핑'
 '키나kkina' '나는 불독' '마미라(마이미니라이프)' '임퓨의 비트메이킹 클래스' '고도람 Go!doram' '김희영'
 '혜성네일_comet']


In [63]:
# 증가 계정 - Medium
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Medium']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Medium']['youtube_user_id'])]['channel_title'].unique())

114
['담비' '임영곤 게임방송' '모리녀' '스타트업잡스' '미키버그 VR게임' '시골낭만아재' '빅민 GAME' '군대위키'
 '키메키친 Kime_kitchen' '후니트립 hoony_trip' '채채ChaeChae' '복지다있소' '소리미의 신화방송'
 '수집의 수집' '막셋의 종합게임' '어웨이커 | 크리에이터 이코노미' '에디레일 Eddy Rails' '윤새 Yoonsae'
 '탬니몰리' '여행윤Tripyun' '히스커버리 역사채널' '자린이 조피디' '시현하다 RECORDERS'
 '잉툰TV- 만화로 쉽게 영어배우자' '라이라마' '이현우의 MLBTV' 'BUNNY' '김밈서' '시리얼 Sireal'
 'AllaproTV' '구봉바다낚시 뽀식이' '정케빈 KEVIN' 'KIMBEE 킴비' 'hyeppening 혜프닝'
 '닷츠 DOTS' '팀브라더스' 'Lizzy리지' '핸슥슥' '김우다' '전또' '래아TV' '오늘도희다 HEEDA'
 '슬기런바디 Run Body' '지니원장의피부톡톡' '용싸부 yongssaboo' '흙회장' '태권민국_Captain Master'
 '라나제이베이킹Lana J' '요니의 응원 yoni' '유경몬' '에피코딩' '일렉트릭 차이나' '두남자 토익TV' '찌늉'
 '비제TV' '너드 슬로리 SloLee' '수란쿤' 'Mein 미인' '지미 geemi.' '데일리 슬슬' '김퍼프PUFF'
 'assesta' '낭만아저씨코디TV' '돈냄새' 'Suzevi ASMR' 'GMENCY 멘시의 마인크래프트'
 'JinBlog 진블로그' '믿식당' '굥플레이스 맛집투어' '다먹어라이언' '콤므' '인썸니아TV' '퓨츠앙' '차박씬'
 '사부작장제소sabujakfarrier' '약사 이진수💊' '꾸앤끄' 'D_tail_디테'
 "파파스캠핑 papa's camp | a korean camper" '그롬마쉬TV' '모하지연 MOHAJIYEON'
 'TJ 영상채널' '윤순의 평범치 않은 생활' '단곰' '태다린tae_darin' 'SA

In [64]:
# 큰 증가 계정 - High
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='High']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='High']['youtube_user_id'])]['channel_title'].unique())

58
['정가거부' 'Jeffreyxking' '래띠 LAETI' 'OSSC' '모염 moyeom'
 '석시원 커플 SeokSiWon Couple' '뚜니랑' '벽돌할아버지 Brick grandpa' '고기,요정 MeatPixie'
 '앙찡' '콜드쉽 Coldsheep' '띠혜 ddihye' '북토크' '오디디 코미디' '슈로시안 SUROSIAN'
 '축구 읽어주는 여자 쵱내' '니들needle' '수빙수tv sooBingsoo' 'kiu기우쌤' '비됴클래스' '코딩국수'
 '쿜쿜쿜' '뻘짓연구소' '만능혁키' '오엔티엘 패션 / ONTL FASHION' '뛰뛰빵빵 김옥순' '주피코' '채찍단'
 '월텍남 - 월스트리트 테크남' '소피요가 Sophie Yoga' 'Mind Patting마음토닥' '나는미도' '주당 김자케'
 '황나겸' '집구석구석꿀팁, 집꿀' 'MINLEE 민리' '나연이즈백 LPGA Na Yeon Choi'
 'abbapraise 아바프레이즈' '맛집남자 foodman' '밖비타' '여리여리YeoriYeori'
 '채림처럼firstcherry' '미니멀영어 Minimal English' '동도니TV DongDo2TV'
 '日本ジヌ【니혼지누】ー韓国に関する全て' '동아일보' '주대성 JooDaesung' '유네린NERIN' '잼스기타'
 '꽃 읽어주는 남자 kkotnam' '중년독수리의 대리여행' '뷰드름 유튜버 인씨' '하원장 강동현' '너굴몬'
 'MerryMa 메리마' '와뷰티TV | Wow Beauty ASMR' 'V I N 빈 ' '코인덕 차트아지']


3개월 결과 확인

In [86]:
# 기초통계 확인
# result_contents_df_final['3_month_future_predict'].describe()

# 소수점 이하 자릿수 설정 (예: 소수점 이하 2자리)
pd.options.display.float_format = '{:.2f}'.format
# 결과 확인
result_contents_df_final['3_month_future_predict'].describe()

# 표현법 초기화
# pd.reset_option('display.float_format')

count       221.00
mean      11729.85
std      111553.43
min          -0.89
25%         121.91
50%         585.39
75%        3557.71
max     1656497.70
Name: 3_month_future_predict, dtype: float64

In [87]:
# 사분위수 기준 구간 나누기
# Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
Q1 = result_contents_df_final['3_month_future_predict'].quantile(0.25)
# Q2 = result_contents_df_final['3_month_future_predict'].quantile(0.5)
Q3 = result_contents_df_final['3_month_future_predict'].quantile(0.75)

# Q1, Q2, Q3에 기반하여 구간 나누기
result_contents_df_final['3_month_future_predict_result'] = pd.cut(result_contents_df_final['3_month_future_predict'],
                                                                   bins=[-float('inf'), Q1, Q3, float('inf')],
                                                                   labels=['Low', 'Medium', 'High'])

In [88]:
# 유지 및 감소 계정 - Low
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Low']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Low']['youtube_user_id'])]['channel_title'].unique())

53
['도아이 Doh-I' '콜로니' '-mentalholder 멘탈홀더 tv' 'OBL - 온라인 농부, 사자가 되다'
 '세계여행 테리로그 TERRYLOG' '로컬필름 LOCAL FILM' '법무법인 슈가스퀘어' '두꼽이Challenge'
 'WORKS.D PLAYLIST' '강포동하우스' 'Ella' 'sa lly' '마파TV' '담순언니 Twins Vlog'
 '김밈서' 'DDONIE 또니 / 러브크레센트' 'fromsuzy 프롬수지' '키키낙낙' '은는이가' 'ORlGN 오리진'
 'the sence' '이숲soop' '고군 Gohgoon' '임삐나' '유익한 균튜버' '꿈꾼 배기' '한나임한나Hannaim'
 '보미름' '나나무비' '름쿠 ᴘʟᴀʏʟɪꜱᴛ' '배우GO' '부반TV_부에 반하다' '오토컨테이너 스튜디오'
 '연디아 채널 Yeondia Channel' '청어람ARMC' '김두부' '홈바부부_HOMBA BOOBOO' '캠핑 릴리아빠'
 '달고캠핑' '세남자 물고기' '꾸앤끄' 'MORE김모어' '성한준' 'gahyun 가현' '이고 EGO'
 '기자 황덕현 KIJA HWANG' '모리캠핑' '키나kkina' '서유 SEOYU DANCE' '나는 불독'
 '임퓨의 비트메이킹 클래스' '고도람 Go!doram' '김희영']


In [89]:
# 증가 계정 - Medium
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Medium']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Medium']['youtube_user_id'])]['channel_title'].unique())

107
['담비' '임영곤 게임방송' '스타트업잡스' '미키버그 VR게임' '시골낭만아재' '빅민 GAME' '군대위키'
 '드론브이로그 DroneVlog' '키메키친 Kime_kitchen' '후니트립 hoony_trip' '채채ChaeChae'
 '복지다있소' '소리미의 신화방송' '수집의 수집' '막셋의 종합게임' '에디레일 Eddy Rails' '윤새 Yoonsae'
 '탬니몰리' '여행윤Tripyun' '시현하다 RECORDERS' '잉툰TV- 만화로 쉽게 영어배우자' '라이라마'
 '이현우의 MLBTV' 'BUNNY' '시리얼 Sireal' 'AllaproTV' '카이바군' '구봉바다낚시 뽀식이'
 '정케빈 KEVIN' 'KIMBEE 킴비' 'hyeppening 혜프닝' '닷츠 DOTS' '팀브라더스' '핸슥슥' '김우다'
 '전또' '래아TV' '오늘도희다 HEEDA' '지니원장의피부톡톡' '용싸부 yongssaboo' '흙회장'
 '태권민국_Captain Master' '라나제이베이킹Lana J' '요니의 응원 yoni' '유경몬' '에피코딩'
 '두남자 토익TV' '찌늉' '비제TV' '너드 슬로리 SloLee' '수란쿤' '소피요가 Sophie Yoga' 'Mein 미인'
 '지미 geemi.' '데일리 슬슬' '김퍼프PUFF' 'assesta' '낭만아저씨코디TV' '돈냄새' 'Suzevi ASMR'
 'GMENCY 멘시의 마인크래프트' 'JinBlog 진블로그' '믿식당' '굥플레이스 맛집투어' '다먹어라이언' '콤므'
 '인썸니아TV' '퓨츠앙' '차박씬' '사부작장제소sabujakfarrier' '약사 이진수💊' 'D_tail_디테'
 "파파스캠핑 papa's camp | a korean camper" '그롬마쉬TV' '모하지연 MOHAJIYEON'
 'TJ 영상채널' '윤순의 평범치 않은 생활' '단곰' '태다린tae_darin' 'SATUR 세터업' '윈플즈TV'
 'Dalhae달달해' "루다의 댄스 연구소 Ruda's Dance 

In [90]:
# 큰 증가 계정 - High
print(len(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='High']['youtube_user_id'])]['channel_title'].unique()))
print(user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='High']['youtube_user_id'])]['channel_title'].unique())

54
['정가거부' 'Jeffreyxking' '래띠 LAETI' 'OSSC' '모염 moyeom'
 '석시원 커플 SeokSiWon Couple' '뚜니랑' '벽돌할아버지 Brick grandpa' '고기,요정 MeatPixie'
 '히스커버리 역사채널' '앙찡' '콜드쉽 Coldsheep' '띠혜 ddihye' '북토크' '오디디 코미디'
 '슈로시안 SUROSIAN' '축구 읽어주는 여자 쵱내' '니들needle' '수빙수tv sooBingsoo' 'kiu기우쌤'
 '비됴클래스' '코딩국수' '쿜쿜쿜' '뻘짓연구소' '만능혁키' '오엔티엘 패션 / ONTL FASHION' '뛰뛰빵빵 김옥순'
 '채찍단' '월텍남 - 월스트리트 테크남' 'Mind Patting마음토닥' '나는미도' '주당 김자케' '황나겸'
 '집구석구석꿀팁, 집꿀' 'MINLEE 민리' '나연이즈백 LPGA Na Yeon Choi' 'abbapraise 아바프레이즈'
 '맛집남자 foodman' '밖비타' '채림처럼firstcherry' '미니멀영어 Minimal English'
 '동도니TV DongDo2TV' '日本ジヌ【니혼지누】ー韓国に関する全て' '동아일보' '주대성 JooDaesung'
 '유네린NERIN' '잼스기타' '중년독수리의 대리여행' '뷰드름 유튜버 인씨' '너굴몬' 'MerryMa 메리마'
 '와뷰티TV | Wow Beauty ASMR' 'V I N 빈 ' '코인덕 차트아지']


1개월 -> 3개월 변화 결과 확인

In [99]:
# 1개월 Low, 3개월 Medium
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='Low') & (result_contents_df_final['3_month_future_predict_result']=='Medium')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at
15,63062b8c85e48e6e40225679,드론브이로그 DroneVlog,2019-09-01 00:45:47
59,63f074f8efd51c165b4419c3,카이바군,2012-06-19 16:33:55
205,64acc2b5616bd20e303698b7,마미라(마이미니라이프),2016-03-02 12:03:11


In [106]:
# 1개월 Low, 3개월 High
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='Low') & (result_contents_df_final['3_month_future_predict_result']=='High')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at


In [105]:
# 1개월 Medium, 3개월 Low
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='Medium') & (result_contents_df_final['3_month_future_predict_result']=='Low')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at
56,62dac066ce4fcc731dac964f,김밈서,2020-09-01 18:23:15.930
166,6405a51c118c0f58588199da,꾸앤끄,2023-01-08 08:51:33.734
193,62c74450507271632b9f76f8,서유 SEOYU DANCE,2017-09-28 19:04:17.000


In [107]:
# 1개월 Medium, 3개월 High
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='Medium') & (result_contents_df_final['3_month_future_predict_result']=='High')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at
40,6401da7ad746c60e1271fdd6,히스커버리 역사채널,2019-08-09 15:32:44


In [108]:
# 1개월 High, 3개월 Low
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='High') & (result_contents_df_final['3_month_future_predict_result']=='Low')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at


In [109]:
# 1개월 High, 3개월 Medium
user_info_df[user_info_df['youtube_user_id'].isin(result_contents_df_final[(result_contents_df_final['1_month_future_predict_result']=='High') & (result_contents_df_final['3_month_future_predict_result']=='Medium')]['youtube_user_id'])]

Unnamed: 0,youtube_user_id,channel_title,published_at
124,64489f342aff1641e3f3b42b,소피요가 Sophie Yoga,2012-03-19 07:27:17.000
199,63f96f352a0144119186e0e4,꽃 읽어주는 남자 kkotnam,2021-04-22 06:15:00.467


## 주제3

### 콘텐츠 데이터 분석

#### 파생변수

In [7]:
# 파생변수1 - 참여도 관련
youtube_videos['like_rate'] = youtube_videos['likes'] / youtube_videos['views'] ## 좋아요 비율 
youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
youtube_videos['share_rate'] = youtube_videos['shares'] / youtube_videos['views'] ## 공유 비율  
youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율
youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
youtube_videos['positive_engage_rate'] = (youtube_videos['likes'] + youtube_videos['shares']) / youtube_videos['views'] ## 긍정적 참여율
youtube_videos['comment_to_like_rate'] = youtube_videos['comments'] / youtube_videos['likes'] ## 댓글/좋아요 비율
youtube_videos['like_to_dislike_ratio'] = youtube_videos['likes'] / (youtube_videos['dislikes']) ## 좋아요/싫어요 비율

In [8]:
# 파생변수2 - 구독자 관련
youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost'] ## 구독자 순증가
youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율
youtube_videos['subscribers_gained_per_card_click'] = youtube_videos['subscribersGained'] / youtube_videos['cardClicks'] ## 카드 클릭당 구독자 증가
youtube_videos['subscribers_gained_per_playlist_add'] = youtube_videos['subscribersGained'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 구독자 증가
youtube_videos['card_click_to_subscriber_conversion'] = youtube_videos['cardClickRate'] / youtube_videos['subscribersGained'] ## 카드 클릭률 대비 구독자 전환율
youtube_videos['subscribers_lost_per_playlist_remove'] = youtube_videos['subscribersLost'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 구독자 손실

In [9]:
# 파생변수3 - 수익 관련
youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views'] ## 조회수당 수익
youtube_videos['revenue_per_red_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['redViews'] ## 프리미엄당 수익
youtube_videos['ad_revenue_rate'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['estimatedRevenue'] ## 광고수익비율
youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
youtube_videos['cpm_to_revenue_ratio'] = youtube_videos['cpm'] / youtube_videos['estimatedRevenue'] ## cpm 대비 수익
youtube_videos['revenue_per_ad_impression'] = youtube_videos['estimatedRevenue'] / youtube_videos['adImpressions'] ## 광고노출당 수익
youtube_videos['playback_based_cpm_rate'] = youtube_videos['playbackBasedCpm'] / youtube_videos['cpm'] ## 재생 기반 수익
youtube_videos['revenue_per_card_click'] = youtube_videos['estimatedRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 수익
youtube_videos['revenue_per_playlist_add'] = youtube_videos['estimatedRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 수익
youtube_videos['card_click_to_revenue_ratio'] = youtube_videos['cardClickRate'] / youtube_videos['estimatedRevenue'] ## 카드 클릭률 대비 수익 비율
youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익
youtube_videos['ad_revenue_per_card_click'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 광고 수익

In [10]:
# 파생변수4 - 시청 시간 관련
youtube_videos['revenue_per_minute_watched'] = youtube_videos['estimatedRevenue'] / youtube_videos['estimatedMinutesWatched'] ## 시청 시간 당 수익
youtube_videos['revenue_per_minute_watched'] = youtube_videos['estimatedRevenue'] / youtube_videos['estimatedRedMinutesWatched'] ## 프리미엄 이용자 시청 시간 당 수익
youtube_videos['avg_view_duration_rate'] = youtube_videos['averageViewDuration'] / youtube_videos['averageViewPercentage'] ## 평균 시청 시간 비율
youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
youtube_videos['watched_time_red_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedRedMinutesWatched'] ## 재생 비율 대비 프리미엄 이용자 시청 시간
youtube_videos['watched_view_rate'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['views'] ## 조회수당 시청시간
youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간
youtube_videos['watch_time_per_card_click'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['cardClicks'] ## 카드 클릭당 시청 시간
youtube_videos['watch_time_per_playlist_add'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 시청 시간
youtube_videos['avg_view_duration_per_card_click'] = youtube_videos['averageViewDuration'] / youtube_videos['cardClicks'] ## 카드 클릭 대비 평균 재생 시간 비율
youtube_videos['watch_time_loss_per_playlist_remove'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 시청 시간 손실


In [11]:
# 파생변수5 - 광고 관련
youtube_videos['revenue_per_playback'] = youtube_videos['grossRevenue'] / youtube_videos['monetizedPlaybacks'] ## 1회 광고재생당 수익
youtube_videos['grossRevenue_per_ad_impression'] = youtube_videos['grossRevenue'] / youtube_videos['adImpressions'] ## 1회 광고노출당 총수익
youtube_videos['playback_rate'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['adImpressions'] ## 광고노출 대비 재생율
youtube_videos['unplayback_rate'] = (youtube_videos['adImpressions'] - youtube_videos['monetizedPlaybacks']) / youtube_videos['adImpressions'] ## 광고노출 대비 비재생율
youtube_videos['revenue_per_card_click'] = youtube_videos['grossRevenue'] / youtube_videos['cardClicks'] ## 카드 클릭당 수익
youtube_videos['revenue_per_playlist_add'] = youtube_videos['grossRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 수익
youtube_videos['card_click_to_revenue_ratio'] = youtube_videos['cardClickRate'] / youtube_videos['grossRevenue'] ## 카드 클릭률 대비 수익 비율
youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['grossRevenue'] ## 플레이리스트 순추가당 수익
youtube_videos['ad_impressions_per_card_click'] = youtube_videos['adImpressions'] / youtube_videos['cardClicks'] ## 카드 클릭당 광고 노출
youtube_videos['ad_impressions_per_playlist_add'] = youtube_videos['adImpressions'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 노출
youtube_videos['ad_playbacks_per_card_click'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['cardClicks'] ## 카드 클릭 대비 광고 재생 비율
youtube_videos['ad_playbacks_per_playlist_add'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 재생 비율

In [12]:
# 파생변수6 - 카드 관련
youtube_videos['card_to_teaser_click_rate'] = youtube_videos['cardClickRate'] / youtube_videos['cardTeaserClickRate'] ## 카드 티저 클릭률 대비 카드 클릭률
youtube_videos['card_click_per_impression_rate'] = youtube_videos['cardClicks'] / youtube_videos['cardImpressions'] ## 카드 노출당 클릭 비율
youtube_videos['card_teaser_click_per_impression_rate'] = youtube_videos['cardTeaserClicks'] / youtube_videos['cardTeaserImpressions'] ## 카드 티저 노출당 카드 티저 클릭 비율
youtube_videos['total_card_teaser_click_rate'] = (youtube_videos['cardClicks'] + youtube_videos['cardTeaserClicks']) / (youtube_videos['cardImpressions'] + youtube_videos['cardTeaserImpressions']) ## 카드와 카드 티저의 총 클릭률
youtube_videos['card_conversion_rate'] = youtube_videos['cardClicks'] / youtube_videos['cardTeaserClicks'] ## 카드 클릭 전환율

In [13]:
# 파생변수7 - 비디오 관련
youtube_videos['playlist_addition_rate'] = youtube_videos['videosAddedToPlaylists'] / youtube_videos['views'] ## 플레이리스트 추가 비율
youtube_videos['playlist_removal_rate'] = youtube_videos['videosRemovedFromPlaylists'] / youtube_videos['views'] ## 플레이리스트 제거 비율
youtube_videos['net_playlist_addition_rate'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 순추가 비율
youtube_videos['playlist_engagement_rate'] = (youtube_videos['videosAddedToPlaylists'] + youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 참여도
youtube_videos['playlist_related_revenue_rate'] = youtube_videos['estimatedRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가 당 순수익

In [14]:
# null값 대체
youtube_videos = youtube_videos.fillna(0) ## NaN
youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

#### 기간별 피처 생성

In [15]:
# 주별, 월별, 분기별 변수 생성

# 조회수 및 시청 시간 관련
youtube_videos['weekly_views'] = youtube_videos['views'].rolling(window=7).sum()
youtube_videos['monthly_views'] = youtube_videos['views'].rolling(window=30).sum()
youtube_videos['quarterly_views'] = youtube_videos['views'].rolling(window=90).sum()

youtube_videos['weekly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=7).sum()
youtube_videos['monthly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=30).sum()
youtube_videos['quarterly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=90).sum()

youtube_videos['weekly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=7).mean()
youtube_videos['monthly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=30).mean()
youtube_videos['quarterly_avg_view_duration'] = youtube_videos['averageViewDuration'].rolling(window=90).mean()

youtube_videos['weekly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=7).mean()
youtube_videos['monthly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=30).mean()
youtube_videos['quarterly_avg_view_percentage'] = youtube_videos['averageViewPercentage'].rolling(window=90).mean()


# 참여도 관련
youtube_videos['weekly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=7).sum()
youtube_videos['monthly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=30).sum()
youtube_videos['quarterly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=90).sum()

youtube_videos['weekly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=7).sum()
youtube_videos['monthly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=30).sum()
youtube_videos['quarterly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=90).sum()

# 수익 및 광고 관련
youtube_videos['weekly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=7).sum()
youtube_videos['monthly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=30).sum()
youtube_videos['quarterly_estimated_revenue'] = youtube_videos['estimatedRevenue'].rolling(window=90).sum()

youtube_videos['weekly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=7).sum()
youtube_videos['monthly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=30).sum()
youtube_videos['quarterly_estimated_ad_revenue'] = youtube_videos['estimatedAdRevenue'].rolling(window=90).sum()

youtube_videos['weekly_revenue_per_ad_impression'] = youtube_videos['weekly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=7).sum() + 1)
youtube_videos['monthly_revenue_per_ad_impression'] = youtube_videos['monthly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=30).sum() + 1)
youtube_videos['quarterly_revenue_per_ad_impression'] = youtube_videos['quarterly_estimated_ad_revenue'] / (youtube_videos['adImpressions'].rolling(window=90).sum() + 1)

youtube_videos['weekly_ad_impressions'] = youtube_videos['adImpressions'].rolling(window=7).sum()
youtube_videos['monthly_ad_impressions'] = youtube_videos['adImpressions'].rolling(window=30).sum()
youtube_videos['quarterly_ad_impressions'] = youtube_videos['adImpressions'].rolling(window=90).sum()

# 카드 및 티저 관련
youtube_videos['weekly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=7).mean()
youtube_videos['monthly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=30).mean()
youtube_videos['quarterly_card_click_rate'] = youtube_videos['cardClickRate'].rolling(window=90).mean()

youtube_videos['weekly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=7).mean()
youtube_videos['monthly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=30).mean()
youtube_videos['quarterly_card_teaser_click_rate'] = youtube_videos['cardTeaserClickRate'].rolling(window=90).mean()

youtube_videos['weekly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=7).sum()
youtube_videos['monthly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=30).sum()
youtube_videos['quarterly_card_clicks'] = youtube_videos['cardClicks'].rolling(window=90).sum()

youtube_videos['weekly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=7).sum()
youtube_videos['monthly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=30).sum()
youtube_videos['quarterly_card_teaser_clicks'] = youtube_videos['cardTeaserClicks'].rolling(window=90).sum()

# 참여도 관련
youtube_videos['weekly_total_engagement'] = (youtube_videos['likes'].rolling(window=7).sum() +
                                 youtube_videos['dislikes'].rolling(window=7).sum() +
                                 youtube_videos['comments'].rolling(window=7).sum() +
                                 youtube_videos['shares'].rolling(window=7).sum())

youtube_videos['monthly_total_engagement'] = (youtube_videos['likes'].rolling(window=30).sum() +
                                  youtube_videos['dislikes'].rolling(window=30).sum() +
                                  youtube_videos['comments'].rolling(window=30).sum() +
                                  youtube_videos['shares'].rolling(window=30).sum())

youtube_videos['quarterly_total_engagement'] = (youtube_videos['likes'].rolling(window=90).sum() +
                                    youtube_videos['dislikes'].rolling(window=90).sum() +
                                    youtube_videos['comments'].rolling(window=90).sum() +
                                    youtube_videos['shares'].rolling(window=90).sum())

youtube_videos['weekly_engagement_rate'] = youtube_videos['weekly_total_engagement'] / (youtube_videos['weekly_views'] + 1)
youtube_videos['monthly_engagement_rate'] = youtube_videos['monthly_total_engagement'] / (youtube_videos['monthly_views'] + 1)
youtube_videos['quarterly_engagement_rate'] = youtube_videos['quarterly_total_engagement'] / (youtube_videos['quarterly_views'] + 1)

youtube_videos['weekly_playlist_change_rate'] = (youtube_videos['weekly_videos_added'] - youtube_videos['weekly_videos_removed']) / (youtube_videos['weekly_videos_added'] + 1)
youtube_videos['monthly_playlist_change_rate'] = (youtube_videos['monthly_videos_added'] - youtube_videos['monthly_videos_removed']) / (youtube_videos['monthly_videos_added'] + 1)
youtube_videos['quarterly_playlist_change_rate'] = (youtube_videos['quarterly_videos_added'] - youtube_videos['quarterly_videos_removed']) / (youtube_videos['quarterly_videos_added'] + 1)

In [16]:
# null값 대체
# youtube_videos = youtube_videos.fillna(0) ## NaN
# youtube_videos = youtube_videos.replace([np.inf, -np.inf], 0) ## inf

youtube_videos.fillna(0, inplace=True)
youtube_videos.replace([np.inf, -np.inf], 0, inplace=True)

#### 데이터 분할
- 미래를 예측하기 위한 모델이므로 시간순으로 데이터 분할
- 시작날짜 : 2023-03-26
- 종료날짜 : 2024-05-06
- 전체 기간의 80% 날짜 : 2024-02-11

In [None]:
# 컬럼 정리
unique_col = youtube_videos.columns[:3]
x_col = youtube_videos.columns[3:].drop(['estimatedAdRevenue']) ## y값 및 원변수 제거

In [None]:
# 데이터 분할
train_data = youtube_videos[youtube_videos['end_date'] <= '2024-02-11']
test_data = youtube_videos[youtube_videos['end_date'] > '2024-02-11']

In [None]:
# 불균형 확인
print(train_data.shape)
print(test_data.shape)

(6533916, 134)
(2030341, 134)


### 변수선택

상관분석

In [None]:
# 상관계수 절대값이 0.3 이상
corr_df = train_data[['estimatedAdRevenue'] + list(x_col)].corr()
selected_features_by_corr = list(corr_df['estimatedAdRevenue'][abs(corr_df['estimatedAdRevenue']) >= 0.3].keys())
selected_features_by_corr.remove('estimatedAdRevenue')
print(selected_features_by_corr)
print(len(selected_features_by_corr))

['views', 'redViews', 'likes', 'dislikes', 'shares', 'estimatedMinutesWatched', 'weekly_views', 'monthly_views', 'quarterly_views', 'weekly_watch_time', 'monthly_watch_time', 'weekly_total_engagement', 'monthly_total_engagement', 'quarterly_total_engagement']
14


LightGBM

In [None]:
import lightgbm as lgb

# LightGBM 모델 학습
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(train_data[x_col], train_data['estimatedAdRevenue'])

# 변수 중요도 추출
importances = lgb_model.feature_importances_
feature_importance_lgbm = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
feature_importance_lgbm = feature_importance_lgbm / feature_importance_lgbm.sum() ## 중요도 정규화
selected_features_by_lgbm = list(feature_importance_lgbm[feature_importance_lgbm >= 0.005].keys())

# 중요한 변수 출력
print(len(selected_features_by_lgbm))
print(selected_features_by_lgbm)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.211521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26161
[LightGBM] [Info] Number of data points in the train set: 6533916, number of used features: 128
[LightGBM] [Info] Start training from score 0.848527
31
['subscribers_conversion_rate', 'dislikes', 'dislike_rate', 'shares', 'subscribers_gained_per_playlist_add', 'views', 'videosAddedToPlaylists', 'subscribers_lost_per_playlist_remove', 'likes', 'share_rate', 'watched_view_rate', 'comments', 'averageViewPercentage', 'videosRemovedFromPlaylists', 'total_engage_rate', 'watch_time_per_playlist_add', 'watch_time_loss_per_playlist_remove', 'like_to_dislike_ratio', 'averageViewDuration', 'monthly_videos_removed', 'redViews', 'monthly_avg_view_duration', 'estimatedRedMinutesWatched', 'comment_rate', 'weekly_engagement_rate', 'monthly_avg_v

XGBoost

In [None]:
import xgboost as xgb

# XGBoost 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[x_col], train_data['estimatedAdRevenue'])

# 변수 중요도 추출
importances = xgb_model.feature_importances_
feature_importance_xgb = pd.Series(importances, index=x_col).sort_values(ascending=False)

# 중요도 0.001 이상 변수 선정
selected_features_by_xgb = list(feature_importance_xgb[feature_importance_xgb >= 0.001].keys())

# 중요한 변수 출력
print(len(selected_features_by_xgb))
print(f"XGBoost로 선정된 변수: \n{selected_features_by_xgb}")

51
XGBoost로 선정된 변수: 
['dislikes', 'shares', 'weekly_engagement_rate', 'subscribers_conversion_rate', 'net_playlist_addition_rate', 'videosAddedToPlaylists', 'weekly_videos_added', 'subscribers_gained_per_playlist_add', 'likes', 'videosRemovedFromPlaylists', 'estimatedRedMinutesWatched', 'weekly_watch_time', 'ad_impressions_per_playlist_add', 'like_rate', 'monthly_views', 'views', 'monthly_videos_added', 'monthly_avg_view_duration', 'quarterly_revenue_per_ad_impression', 'comments', 'quarterly_videos_added', 'comment_rate', 'estimatedRedPartnerRevenue', 'watched_view_rate', 'quarterly_avg_view_duration', 'weekly_playlist_change_rate', 'quarterly_playlist_change_rate', 'like_to_dislike_ratio', 'weekly_videos_removed', 'dislike_rate', 'total_engage_rate', 'averageViewPercentage', 'weekly_views', 'subscribers_lost_per_playlist_remove', 'playlist_engagement_rate', 'watch_time_loss_per_playlist_remove', 'quarterly_views', 'adImpressions', 'estimatedMinutesWatched', 'red_revenue_rate', 'share

In [None]:
# 비선형모델 활용 변수 선정
importances_df = pd.DataFrame({
    'features': x_col,
    'lgbm_importance': lgb_model.feature_importances_,
    'xgb_importance': xgb_model.feature_importances_
    })
importances_df['lgbm_importance'] = importances_df['lgbm_importance'] / importances_df['lgbm_importance'].sum() ## 다른 모델과 중요도 단위 다르기 때문에 정규화
importances_df['mean_importance'] = importances_df[['lgbm_importance', 'xgb_importance']].mean(axis=1) ## 각 변수별 모델 중요도 평균

In [None]:
importances_df.sort_values(['mean_importance'],ascending=False).iloc[:40]

Unnamed: 0,features,lgbm_importance,xgb_importance,mean_importance
4,dislikes,0.108667,0.277542,0.193104
34,subscribers_conversion_rate,0.184,0.062526,0.123263
5,shares,0.068,0.163281,0.115641
122,weekly_engagement_rate,0.006667,0.097462,0.052064
10,videosAddedToPlaylists,0.051,0.0449,0.04795
36,subscribers_gained_per_playlist_add,0.065,0.024105,0.044552
29,dislike_rate,0.084667,0.003644,0.044155
0,views,0.055333,0.007759,0.031546
74,net_playlist_addition_rate,0.003,0.058954,0.030977
3,likes,0.035667,0.021837,0.028752


In [None]:
# 최종 변수 선택
final_selected_features = list(set(list(importances_df[importances_df['mean_importance'] >= 0.01]['features']) + selected_features_by_corr))
print(len(final_selected_features))
print(final_selected_features)

27
['watched_view_rate', 'estimatedMinutesWatched', 'weekly_engagement_rate', 'monthly_views', 'estimatedRedMinutesWatched', 'redViews', 'weekly_videos_added', 'views', 'share_rate', 'weekly_total_engagement', 'dislike_rate', 'weekly_watch_time', 'videosRemovedFromPlaylists', 'subscribers_conversion_rate', 'subscribers_gained_per_playlist_add', 'net_playlist_addition_rate', 'shares', 'quarterly_total_engagement', 'dislikes', 'quarterly_views', 'comments', 'likes', 'subscribers_lost_per_playlist_remove', 'monthly_watch_time', 'videosAddedToPlaylists', 'monthly_total_engagement', 'weekly_views']


### 모델링

#### 모델 적용

XGBoost

In [None]:
# 콘텐츠별 구독자 순증가 평균
youtube_videos.groupby('video')['estimatedAdRevenue'].sum().mean()

np.float64(91.76659115215216)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 모델 정의 및 학습
# XGBoost를 사용한 모델 학습
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(train_data[final_selected_features], train_data['estimatedAdRevenue'])

# 예측
y_pred = xgb_model.predict(test_data[final_selected_features])

# MSE 계산
mse = mean_squared_error(test_data['estimatedAdRevenue'], y_pred)

# RMSE 계산
rmse = np.sqrt(mse)

# R² 값 계산
r2 = r2_score(test_data['estimatedAdRevenue'], y_pred)

# Adjusted R² 계산
n = len(test_data)  # 샘플 수
p = test_data.shape[1]  # 독립 변수(특성) 수
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# 결과 출력
print('XGBoost')
print(f"R² 값: {r2:.4f}")
print(f"Adjusted R² 값: {adjusted_r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print("----------------------------------------")
print("")

XGBoost
R² 값: 0.7233
MSE: 188.7693
RMSE: 13.7393
----------------------------------------



### 실제데이터 결과 확인

In [None]:
# 예측 결과 확인
youtube_videos['predict'] = xgb_model.predict(youtube_videos[final_selected_features])

In [None]:
# 계정별 구독자수 평균, 구독자 예측수 평균 비교
result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'estimatedAdRevenue', 'predict']]

# 계정별 콘텐츠의 구독자 순증감 1일 합계
result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
    'estimatedAdRevenue': 'sum',
    'predict': 'sum'
})

# 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값 계산
result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

# Shift와 Rolling 연산을 위한 그룹별 처리
result_contents_df['1_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-30).rolling(window=30).sum())
result_contents_df['3_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-90).rolling(window=90).sum())
result_contents_df['6_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-180).rolling(window=180).sum())
result_contents_df['12_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-365).rolling(window=365).sum())

# 계정별로 최종 평균값을 계산
result_contents_df_final = result_contents_df.groupby('youtube_user_id').agg({
    'estimatedAdRevenue': 'mean',
    'predict': 'mean',
    '1_month_future_predict': 'mean',
    '3_month_future_predict': 'mean',
    '6_month_future_predict': 'mean',
    '12_month_future_predict': 'mean'
}).reset_index()

In [58]:
result_contents_df_final[result_contents_df_final['1_month_future_predict'] ==result_contents_df_final['1_month_future_predict'].max()]

Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict
84,639bb8dcd603b8138e33780b,10702.618076,10771.09668,370256.66016,1656498.0,,


In [None]:
merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(final_result_df.sort_values(['6_month_future_ratio'],ascending=False)['youtube_user_id'].iloc[:20])]['channel_title'].unique()

array(['오디디 코미디', '0', '황나겸', '다먹어라이언', '래아TV', '슈로시안 SUROSIAN',
       '채림처럼firstcherry', '드론브이로그 DroneVlog', '군대위키', '월텍남 - 월스트리트 테크남',
       '콤므', '맛집남자 foodman', '엔트리뷰 [누구나 재미있는 테크리뷰]', 'Dalhae달달해', '단곰',
       '너굴몬', 'GMENCY 멘시의 마인크래프트', '코인덕 차트아지', '日本ジヌ【니혼지누】ー韓国に関する全て',
       "파파스캠핑 papa's camp | a korean camper", '중년독수리의 대리여행'], dtype=object)

In [None]:
(set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(final_result_df.sort_values(['1_month_future_ratio'],ascending=False)['youtube_user_id'].iloc[:50])]['channel_title'].unique()) & 
 set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(final_result_df.sort_values(['3_month_future_ratio'],ascending=False)['youtube_user_id'].iloc[:50])]['channel_title'].unique()) & 
 set(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(final_result_df.sort_values(['6_month_future_ratio'],ascending=False)['youtube_user_id'].iloc[:50])]['channel_title'].unique()))

{'0',
 'GMENCY 멘시의 마인크래프트',
 '日本ジヌ【니혼지누】ー韓国に関する全て',
 '군대위키',
 '너굴몬',
 '다먹어라이언',
 '단곰',
 '래아TV',
 '맛집남자 foodman',
 '슈로시안 SUROSIAN',
 '오늘도희다 HEEDA',
 '월텍남 - 월스트리트 테크남',
 '중년독수리의 대리여행',
 '채림처럼firstcherry',
 '채찍단',
 '코인덕 차트아지',
 '콤므',
 '탬니몰리',
 "파파스캠핑 papa's camp | a korean camper",
 '황나겸'}