In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import os
from dotenv import load_dotenv

## MongoDB 연동

In [2]:
# .env 파일에서 환경 변수 로드
load_dotenv('C:/py_src/awake/env')

# 환경 변수에서 MongoDB 연결 정보 가져오기
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [3]:
# MongoDB 연결 URL
url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# 연결 확인
from pymongo.errors import ServerSelectionTimeoutError

try:
    # 데이터베이스 목록을 가져오는 쿼리
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# Test DB 선택
db = client.Test

# Test DB 컬렉션 확인
collections = db.list_collection_names()

In [6]:
# # 단위 환산
# def convert_bytes(num):
#     for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
#         if num < 1024.0:
#             return f"{num:.2f} {x}"
#         num /= 1024.0

In [7]:
# for collection_name in collections:
#     # 컬렉션 통계 정보 가져오기
#     stats = db.command("collStats", collection_name)

#     # 컬렉션의 크기와 문서 수 출력    
#     print(f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}")
#     print(f"Collection '{collection_name}' document count: {stats['count']}")
#     print(f"Total index size: {convert_bytes(stats['totalIndexSize'])}")
#     print('--------------------------------------------------------------------')

In [8]:
## 인스타 관련 데이터 제외
## 'youtube_videos'/'youtube_datas' 따로 수집
## 'youtube_report_v2'/'youtube_report' 날짜 정보불분명, 다른 테이블 정보와 중복
collections_need = [
    'youtube_users',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    ]

In [9]:
# 데이터 로드
youtube_dict={}
for collection_name in collections_need:

    youtube_dict[collection_name] = pd.DataFrame(list(db[collection_name].find()))
    
    print(collection_name)
    print(youtube_dict[collection_name].columns)
    print(">> Success")
    print("--------------------------------------")
    print("")

youtube_users
Index(['_id', 'country', 'phone_num', 'kakao_nick', 'kakao_account_id',
       'user_kind', 'created_at', '__v', 'channel_title', 'channel_id',
       'thumbnail_url', 'published_at', 'subscriber_count', 'is_rev_saved',
       'is_subs_saved', 'updated_at', 'brandingSettings', 'contentDetails',
       'contentOwnerDetails', 'etag', 'id', 'kind', 'snippet', 'statistics',
       'status', 'topicDetails', 'connected', 'refresh_error', 'localizations',
       'ads_array', 'age', 'gender', 'region_array', 'is_active',
       'category_array', 'account_type', 'children_age_array',
       'is_accept_suggestion', 'is_add_info', 'pet_array', 'user_id',
       'report_user_id'],
      dtype='object')
>> Success
--------------------------------------

youtube_channel_locations
Index(['_id', 'locations', 'youtube_user_id', 'end_date', 'created_at',
       'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_daily_channel_basics
Index(

In [10]:
# 키값 형식 맞추기
for key, df in youtube_dict.items():
    # 각 딕셔너리의 value가 DataFrame인지 확인
    if isinstance(df, pd.DataFrame):
        # '_id' 컬럼이 있는지 확인하고 문자열로 변환
        if '_id' in df.columns:
            df['_id'] = df['_id'].astype(str)
        
        # 'youtube_user_id' 컬럼이 있는지 확인하고 문자열로 변환
        if 'youtube_user_id' in df.columns:
            df['youtube_user_id'] = df['youtube_user_id'].astype(str)

        # 변경된 DataFrame을 다시 딕셔너리에 저장
        youtube_dict[key] = df

In [11]:
del df

In [12]:
youtube_dict.keys()

dict_keys(['youtube_users', 'youtube_channel_locations', 'youtube_daily_channel_basics'])

## 데이터 불러오기

In [13]:
# file_path = 'C:/py_src/awake/data/'

### 계정 데이터

#### youtube_users

In [14]:
youtube_users = youtube_dict['youtube_users']

In [15]:
len(youtube_users['channel_id'].unique())
## 유튜버 계정 : 883개

883

In [16]:
# 필요컬럼추출
youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']] 
## published_at : 유튜브 가입일
## 'statistics' 컬럼의 'subscriberCount' 정보와 'subscriber_count' 컬럼 정보가 다름 --> 구독자 수
## 'channel_id' 컬럼, 'contentDetails' 컬럼의 'uploads' 같은 정보

In [17]:
# cast 하여 최종 데이터셋 생성
youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
youtube_users = youtube_users.dropna(how = 'all')

# null 값 0으로 대체
youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)
youtube_users['viewCount'] = youtube_users['viewCount'].astype(int)
youtube_users['subscriberCount'] = youtube_users['subscriberCount'].astype(int)
youtube_users['videoCount'] = youtube_users['videoCount'].astype(int)

youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

# id가 null인 계정 제거
youtube_users = youtube_users[~youtube_users['channel_id'].isnull()].reset_index(drop=True)

# channel_id별로 그룹화하여 null값을 해당 그룹 내에서 채우기
youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

# 계정 지표 모두 0인 계정 삭제
youtube_users = youtube_users[youtube_users[['viewCount', 'subscriberCount', 'videoCount']].sum(axis=1)!=0].reset_index(drop=True)

  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


In [18]:
youtube_users

Unnamed: 0,channel_id,channel_title,phone_num,report_user_id,published_at,viewCount,subscriberCount,videoCount
0,UC-4K3WFY7e_TzaqjXgwxRoA,대신 밍튜브,,,2013-08-25 04:05:50.000,0,496,0
1,UC-Hk-8YkjlFeEmtdESSbFbA,VoidNeverstop,01026248499,,2014-01-19 06:21:57.000,196505,1160,225
2,UC-R60VevGuKc0NcMkW4owWg,Eunyoung Im,,63c545e04238543bcaf030d4,2016-10-05 12:37:49.000,119,1,1
3,UC-REbxnsUAMlTczWO-2VFwA,푸,01096020425,,2022-02-12 05:02:14.799,23780,97,12
4,UC-VwBnob-SqSdyxIshilfWg,월간 미니츄[Monthly Minichew],01043539304,,2018-11-20 07:00:35.000,6962,33,21
...,...,...,...,...,...,...,...,...
736,UCz60pfE3dOBsTGC2FeUuERg,휘쿡 Hwi Cook,,,2018-11-19 18:01:36.000,82955,265,66
737,UCz8fQdHImw3Ikm_KnxOanrA,1분뉴스,01098384125,,2020-05-18 01:04:35.239,31325716,17500,1123
738,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],,63f7726d55baf50e2df73caf,2020-03-21 13:05:43.289,1767452,5090,152
739,UCzNsDl-geB-n6MFFkqKDftA,맛있는부산 쥰맛지도,01043209436,,2011-11-22 07:34:54.000,651042,579,94


### youtube_channel_locations
- 채널 구독자 위치

In [19]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']
# youtube_channel_locations = pd.read_csv(file_path + 'raw_data/youtube_channel_locations.csv', low_memory=False)

In [20]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [21]:
# 필요컬럼추출
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

In [22]:
# melt, cast 하여 최종데이터셋 생성
youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

# cast 하여 최종 데이터셋 생성
youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations[youtube_channel_locations.columns[3:]].apply(sum,axis=1)!=0] ## 모두 0인 행 제거
youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

In [23]:
# 계정별 일자별 지표 계산
youtube_channel_locations = youtube_channel_locations.groupby(['youtube_user_id', 'end_date']).agg({'views': 'sum',
                                                                                                    'estimatedMinutesWatched': 'sum',
                                                                                                    'averageViewDuration': 'mean',
                                                                                                    'averageViewPercentage': 'mean'}).reset_index()

In [24]:
# 시간 분 단위로 변경
youtube_channel_locations['averageViewDuration'] = youtube_channel_locations['averageViewDuration'] / 60

In [25]:
# 날짜형식 변경
youtube_channel_locations = youtube_channel_locations.rename(columns={'end_date':'date'})
youtube_channel_locations['date'] = youtube_channel_locations['date'].astype(str)

In [26]:
youtube_channel_locations

Unnamed: 0,youtube_user_id,date,views,estimatedMinutesWatched,averageViewDuration,averageViewPercentage
0,627cb611aa6f212355e0b617,2023-03-26,3941,18721,2.905208,26.291250
1,627cb611aa6f212355e0b617,2023-03-27,3118,15679,3.087931,27.480000
2,627cb611aa6f212355e0b617,2023-03-28,3549,17190,2.937963,25.808333
3,627cb611aa6f212355e0b617,2023-03-29,4037,18420,3.032828,26.713030
4,627cb611aa6f212355e0b617,2023-03-30,4228,18914,2.725980,24.471471
...,...,...,...,...,...,...
139179,66230ee6d8da110bb0744b2d,2024-04-29,138716,98396,0.685714,88.166286
139180,66230ee6d8da110bb0744b2d,2024-04-30,171182,120358,0.776961,102.200882
139181,66230ee6d8da110bb0744b2d,2024-05-01,149877,105717,0.745588,96.980000
139182,66230ee6d8da110bb0744b2d,2024-05-02,143236,100363,0.680556,88.562333


### youtube_daily_channel_basics

In [180]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [181]:
len(youtube_daily_channel_basics['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [182]:
# 필요컬럼추출
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## 'end_date' --> 'day' 컬럼 활용
youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True) ## melt

In [183]:
# daily_basics 컬럼 형식 맞추기 --> 딕셔너리 값들 cast 형태로 맞추기
youtube_daily_channel_basics_cast = []
for item in youtube_daily_channel_basics['daily_basics']:
    if isinstance(item, list):
        youtube_daily_channel_basics_cast.extend(item)
    else:
        youtube_daily_channel_basics_cast.append(item)

youtube_daily_channel_basics_cast = pd.json_normalize(youtube_daily_channel_basics_cast)

In [184]:
# melt, cast 하여 최종 데이터셋 생성
youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0) ## null값 0으로 대체
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[youtube_daily_channel_basics.columns[3:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)

del youtube_daily_channel_basics_cast

In [185]:
# 날짜형식 변경
youtube_daily_channel_basics = youtube_daily_channel_basics.rename(columns={'day':'date'})
youtube_daily_channel_basics['date'] = youtube_daily_channel_basics['date'].astype(str)

# 시간 분 단위로 변경
youtube_daily_channel_basics['averageViewDuration'] = youtube_daily_channel_basics['averageViewDuration'] / 60

In [41]:
# # 최종데이터셋 기준 필요 계정 수 추출 - report, report_v2 제외
# youtube_user_id_outer = list(set(list(youtube_channel_locations['youtube_user_id']) +
#                                  list(youtube_daily_channel_basics['youtube_user_id'])))
# print('youtube_user_id_outer', len(youtube_user_id_outer))
# ## youtube_user_id 모두 포함 912개

# youtube_user_id_inner = list(set(youtube_channel_locations['youtube_user_id']) &
#                              set(youtube_daily_channel_basics['youtube_user_id']))

# print('youtube_user_id_inner',len(youtube_user_id_inner))
# ## youtube_user_id  공통 포함 250개

youtube_user_id_outer 838
youtube_user_id_inner 644


#### youtube_datas

In [35]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/raw_data/youtube_user_id_inner.csv')

In [36]:
collection = db['youtube_datas']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            # "end_date": {
            # "$gte": first_date,
            # "$lte": last_date
            # }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "data_created_at": 1
        }
    },
    {
        "$project": {
            'youtube_user_id' : 1, 
            'data_created_at' : 1, 
            'published_at' : 1, 
            'channel_id' : 1, 
            'channel_title' : 1, 
            'yt_search_keyword' : 1, 
            'subscribed_status' : 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_datas = pd.DataFrame(result)

In [37]:
len(youtube_datas['youtube_user_id'].unique())

249

In [38]:
# 컬럼 순서 정리
need_col = ['youtube_user_id', 'data_created_at', 'published_at', 'channel_id', 'channel_title', 'yt_search_keyword', 'subscribed_status']
youtube_datas = youtube_datas[need_col]

In [39]:
# cast 하여 데이터셋 생성
youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
youtube_datas = youtube_datas.drop(['subscribed_status'],axis=1)

youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0) # null값 0으로 대체
youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0] ## 모두 0인 행 제거

youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

In [40]:
# 날짜 기준 필터링
opt_date = (youtube_datas['data_created_at'] >= datetime(2023, 3, 26)) & (youtube_datas['data_created_at'] <= datetime(2024, 5, 3))
youtube_datas = youtube_datas[opt_date].sort_values(['youtube_user_id', 'data_created_at']).reset_index(drop=True)

In [42]:
# 날짜형식 변경
youtube_datas = youtube_datas.rename(columns={'data_created_at':'date'})
youtube_datas['date'] = youtube_datas['date'].astype(str)

In [43]:
youtube_datas

Unnamed: 0,youtube_user_id,date,published_at,channel_id,channel_title,yt_search_keyword,UNSUBSCRIBED,SUBSCRIBED
0,627cb611aa6f212355e0b617,2023-03-26,NaT,UCxuEudcvmg4zMQhW7isWi-w,,"{'성팩': 40, '빙수빙': 28, '원칩챌린지 먹방': 14, '수빙수': 1...",3356.0,879.0
1,627cb611aa6f212355e0b617,2023-03-27,NaT,UCxuEudcvmg4zMQhW7isWi-w,,"{'원칩챌린지 먹방': 19, '빙수빙': 18, '성팩': 18, 'one chi...",2714.0,676.0
2,627cb611aa6f212355e0b617,2023-03-28,NaT,UCxuEudcvmg4zMQhW7isWi-w,,"{'수빙수': 18, '빙수빙': 17, '성팩': 17, '원칩챌린지 먹방': 1...",3060.0,735.0
3,627cb611aa6f212355e0b617,2023-03-29,NaT,UCxuEudcvmg4zMQhW7isWi-w,,"{'성팩': 45, '빙수빙': 23, '수빙수': 21, '원칩': 13, '원칩...",3504.0,818.0
4,627cb611aa6f212355e0b617,2023-03-30,NaT,UCxuEudcvmg4zMQhW7isWi-w,,"{'원칩챌린지 먹방': 29, '빙수빙': 28, '원칩': 22, '성팩': 16...",3829.0,744.0
...,...,...,...,...,...,...,...,...
82133,66230ee6d8da110bb0744b2d,2024-04-29,NaT,UCIPg5mOIS-KlyxYei_EI50A,,"{'김승연 회장': 290, '쇼츠': 164, '무타구치 렌야': 155, '나훈...",132857.0,6164.0
82134,66230ee6d8da110bb0744b2d,2024-04-30,NaT,UCIPg5mOIS-KlyxYei_EI50A,,"{'쇼츠': 220, '무타구치 렌야': 178, '김승연 회장': 176, '장미...",165145.0,6267.0
82135,66230ee6d8da110bb0744b2d,2024-05-01,NaT,UCIPg5mOIS-KlyxYei_EI50A,,"{'나훈아': 152, '나훈아콘서트': 145, '쇼츠': 136, '김승연 회장...",145112.0,5248.0
82136,66230ee6d8da110bb0744b2d,2024-05-02,NaT,UCIPg5mOIS-KlyxYei_EI50A,,"{'나훈아': 358, '쇼츠': 126, '김승연 회장': 109, '무타구치 렌...",137492.0,6019.0


### 콘텐츠 데이터

#### youtube_videos

In [44]:
youtube_user_id_inner = pd.read_csv('C:/py_src/awake/data/raw_data/youtube_user_id_inner.csv')

In [45]:
collection = db['youtube_videos']  # 컬렉션 선택

# 파이프라인 정의
pipeline = [
    {
        "$match": {
            "youtube_user_id": {
                "$in": list(youtube_user_id_inner['youtube_user_id'])
            },
            "videos": {"$ne": []},
            "end_date": {
                "$gte": datetime(2023, 3, 26),
                "$lte": datetime(2024, 5, 3)
            }
        }
    },
    {
        "$sort": {
            "youtube_user_id": 1,
            "end_date": 1
        }
    },
    {
        "$project": {
            "youtube_user_id": 1,
            "end_date": 1,
            "videos": 1
        }
    }
]

# 파이프라인 실행
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# 결과를 Pandas 데이터프레임으로 변환
youtube_videos = pd.DataFrame(result)

In [46]:
len(youtube_videos['youtube_user_id'].unique())

249

In [47]:
# melt, cast하여 최종데이터셋 생성
# melt
youtube_videos = youtube_videos.explode(['videos']).reset_index(drop=True)

#cast
youtube_videos = pd.concat([youtube_videos, pd.json_normalize(youtube_videos['videos'])],axis=1)

youtube_videos = youtube_videos.drop(['_id','videos'],axis=1)
youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

In [49]:
# 날짜형식 변경
youtube_videos = youtube_videos.rename(columns={'end_date':'date'})
youtube_videos['date'] = youtube_videos['date'].astype(str)

In [None]:
# 내보내기
# youtube_videos.to_csv('C:/py_src/awake/data/youtube_videos.csv',encoding='utf-8-sig',index=False)

In [None]:
# 내보내기
# df.to_csv('')

## 데이터 전처리

### 계정 데이터

데이터 통합

In [215]:
merge_df_users_fin = pd.merge(youtube_users,youtube_datas,how='left',on='channel_id')
need_col = ['youtube_user_id', 'date', 'channel_id', 'channel_title_x', 'published_at_x', 'phone_num', 'yt_search_keyword', 'viewCount', 'subscriberCount', 'videoCount','UNSUBSCRIBED', 'SUBSCRIBED']
merge_df_users_fin = merge_df_users_fin[need_col]
merge_df_users_fin = merge_df_users_fin.rename(columns={'channel_title_x':'channel_title','published_at_x':'published_at'})
merge_df_users_fin = merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isnull()].reset_index(drop=True)

merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_channel_locations,how='left',on=['youtube_user_id','date'])
merge_df_users_fin = merge_df_users_fin.drop(['views'],axis=1)

youtube_daily_channel_basics = youtube_daily_channel_basics.drop(['annotationClickThroughRate','annotationCloseRate'],axis=1)
merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_daily_channel_basics,how='left',on=['youtube_user_id','date'])

In [218]:
# null값 대체
merge_df_users_fin['estimatedMinutesWatched_x'] = merge_df_users_fin['estimatedMinutesWatched_x'].fillna(merge_df_users_fin['estimatedMinutesWatched_y'])
merge_df_users_fin['averageViewDuration_x'] = merge_df_users_fin['averageViewDuration_x'].fillna(merge_df_users_fin['averageViewDuration_y'])

merge_df_users_fin = merge_df_users_fin.drop(['estimatedMinutesWatched_y','averageViewDuration_y'],axis=1)
merge_df_users_fin = merge_df_users_fin.rename(columns={'estimatedMinutesWatched_x':'estimatedMinutesWatched','averageViewDuration_x':'averageViewDuration'})

# 영상 시청 시간 합 / 영상 재생 시간 합 = 영상 시청 비율 대체
merge_df_users_fin['averageViewPercentage'] = np.where(merge_df_users_fin['averageViewPercentage'].isnull(), 
                                                       merge_df_users_fin['estimatedMinutesWatched'] / (merge_df_users_fin['averageViewDuration'] * merge_df_users_fin['views']),
                                                       merge_df_users_fin['averageViewPercentage'])
merge_df_users_fin['averageViewPercentage'] = merge_df_users_fin['averageViewPercentage'].fillna(0)

In [225]:
# 환율 적용 - 해당기간 평균환율 : 1322.42
exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
merge_df_users_fin[exchange_rate_col] = merge_df_users_fin[exchange_rate_col] * 1322.42

## 주제1

### 파생변수

In [233]:
merge_df_users_fin.columns

Index(['youtube_user_id', 'date', 'channel_id', 'channel_title',
       'published_at', 'phone_num', 'yt_search_keyword', 'viewCount',
       'subscriberCount', 'videoCount', 'UNSUBSCRIBED', 'SUBSCRIBED',
       'estimatedMinutesWatched', 'averageViewDuration',
       'averageViewPercentage', 'comments', 'dislikes', 'likes', 'shares',
       'subscribersGained', 'subscribersLost', 'views', 'redViews',
       'estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue',
       'grossRevenue', 'cpm', 'monetizedPlaybacks', 'adImpressions',
       'playbackBasedCpm'],
      dtype='object')

In [234]:
# y값 파생변수
merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribersGained'] - merge_df_users_fin['subscribersLost'] ## 구독자 순증가
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['views'] ## 조회수당 수익
merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['grossRevenue'] / merge_df_users_fin['adImpressions'] ## 1회 광고노출당 총수익

y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression'] ## null값은 views가 0인 데이터

In [239]:
(merge_df_users_fin['likes'] < 0).sum()

np.int64(5611)

In [235]:
merge_df_users_fin

Unnamed: 0,youtube_user_id,date,channel_id,channel_title,published_at,phone_num,yt_search_keyword,viewCount,subscriberCount,videoCount,...,estimatedRedPartnerRevenue,grossRevenue,cpm,monetizedPlaybacks,adImpressions,playbackBasedCpm,total_engage_rate,net_subscribers_change,revenue_per_view,gross_revenue_per_ad_impression
0,6455d84dd88e4c67532fcdc4,2023-05-04,UC-oyaiXwIMrvI3k7eIqZh-g,담비,2013-03-02 09:19:46.000,,"{'역도': 89, '복압잡는법': 45, '역도 스쿼트': 26, '마티아스 슈타...",12591273,65700,175,...,666.49968,2404.15956,3171.16316,590.0,758.0,4074.37602,0.017563,13,0.240885,3.171714
1,6455d84dd88e4c67532fcdc4,2023-05-05,UC-oyaiXwIMrvI3k7eIqZh-g,담비,2013-03-02 09:19:46.000,,"{'역도': 88, '복압잡는법': 37, '김담비': 26, '역도 스쿼트': 1...",12591273,65700,175,...,640.05128,2052.39584,3704.09842,437.0,554.0,4695.91342,0.014127,29,0.778834,3.704686
2,6455d84dd88e4c67532fcdc4,2023-05-06,UC-oyaiXwIMrvI3k7eIqZh-g,담비,2013-03-02 09:19:46.000,,"{'역도': 88, '역도 스쿼트': 32, '담비': 24, '복압잡는법': 23...",12591273,65700,175,...,592.44416,1942.63498,3532.18382,431.0,550.0,4506.80736,0.015248,12,0.240136,3.532064
3,6455d84dd88e4c67532fcdc4,2023-05-07,UC-oyaiXwIMrvI3k7eIqZh-g,담비,2013-03-02 09:19:46.000,,"{'역도': 67, '담비': 36, '복압잡는법': 28, '김담비': 22, '...",12591273,65700,175,...,1163.72960,7158.25946,3292.82580,1759.0,2174.0,4069.08634,0.021709,-20,0.783993,3.292668
4,6455d84dd88e4c67532fcdc4,2023-05-08,UC-oyaiXwIMrvI3k7eIqZh-g,담비,2013-03-02 09:19:46.000,,"{'역도': 56, '복압잡는법': 27, '기절': 19, '복압': 16, '담...",12591273,65700,175,...,522.35590,1798.49120,3343.07776,450.0,538.0,3996.35324,0.017057,3,0.779095,3.342920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81587,63f7726d55baf50e2df73cb2,2024-04-29,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],2020-03-21 13:05:43.289,,"{'플스5 프로': 20, '갤럭시탭s10': 16, '갤럭시s25': 13, 's...",1767452,5090,152,...,62.15374,1457.30684,6746.98684,150.0,216.0,9715.81974,-0.002457,0,2.150963,6.746791
81588,63f7726d55baf50e2df73cb2,2024-04-30,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],2020-03-21 13:05:43.289,,"{'갤럭시s25': 22, 's25': 18, '갤럭시탭s10': 18, '갤럭시탭...",1767452,5090,152,...,78.02278,1282.74740,4858.57108,192.0,264.0,6680.86584,0.004494,3,1.780066,4.858892
81589,63f7726d55baf50e2df73cb2,2024-05-01,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],2020-03-21 13:05:43.289,,"{'플스5 프로': 18, 's25울트라': 11, 's25': 10, '갤럭시s2...",1767452,5090,152,...,59.50890,1704.59938,7819.46946,169.0,218.0,10086.09734,0.004525,0,2.276836,7.819263
81590,63f7726d55baf50e2df73cb2,2024-05-02,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],2020-03-21 13:05:43.289,,"{'갤럭시탭s10': 19, '갤럭시s25': 17, 's25': 16, '플스5 ...",1767452,5090,152,...,91.24698,1068.51536,4813.60880,173.0,222.0,6177.02382,0.009456,1,1.628796,4.813132


In [None]:
# 파생변수1 - 참여도 관련
merge_df_users_fin['like_rate'] = merge_df_users_fin['likes'] / merge_df_users_fin['views'] ## 좋아요 비율 
merge_df_users_fin['comment_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['views'] ## 댓글 비율
merge_df_users_fin['share_rate'] = merge_df_users_fin['shares'] / merge_df_users_fin['views'] ## 공유 비율  
merge_df_users_fin['dislike_rate'] = merge_df_users_fin['dislikes'] / merge_df_users_fin['views'] ## 싫어요 비율
merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
merge_df_users_fin['positive_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['shares']) / merge_df_users_fin['views'] ## 긍정적 참여율
merge_df_users_fin['comment_to_like_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['likes'] ## 댓글/좋아요 비율
merge_df_users_fin['like_to_dislike_ratio'] = merge_df_users_fin['likes'] / (merge_df_users_fin['dislikes']) ## 좋아요/싫어요 비율

In [None]:
# 파생변수2 - 구독자 관련
merge_df_users_fin['subscriber_increase_rate'] = merge_df_users_fin['subscribers_gained'] / merge_df_users_fin['subscribers_count'] ## 구독자 증가율
merge_df_users_fin['subscriber_decrease_rate'] = merge_df_users_fin['subscribers_lost'] / merge_df_users_fin['subscribers_count'] ## 구독자 감소율
merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribers_gained'] - merge_df_users_fin['subscribers_lost'] ## 구독자 순증가
merge_df_users_fin['subscribers_conversion_rate'] = merge_df_users_fin['subscribers_gained'] / merge_df_users_fin['views'] ## 구독자 전환율
merge_df_users_fin['subscriber_retention_rate'] = (merge_df_users_fin['SUBSCRIBED'] - merge_df_users_fin['UNSUBSCRIBED']) / merge_df_users_fin['subscribers_count'] ## 구독자 유지율
merge_df_users_fin['subscribed_view_rate'] = merge_df_users_fin['SUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 구독자 조회수 비율
merge_df_users_fin['unsubscribed_view_rate'] = merge_df_users_fin['UNSUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 비구독자 조회수 비율

In [None]:
# 파생변수3 - 수익 관련
merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['views'] ## 조회수당 수익
merge_df_users_fin['revenue_per_subscribed_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 수익
merge_df_users_fin['revenue_per_unsubscribed_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 수익
merge_df_users_fin['revenue_per_subscriber'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['subscribers_count'] ## 구독자당 수익
merge_df_users_fin['revenue_per_red_view'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['redViews'] ## 프리미엄당 수익
merge_df_users_fin['ad_revenue_rate'] = merge_df_users_fin['estimated_ad_revenue'] / merge_df_users_fin['estimated_revenue'] ## 광고수익비율
merge_df_users_fin['red_revenue_rate'] = merge_df_users_fin['estimated_red_partner_revenue'] / merge_df_users_fin['estimated_revenue'] ## 프리미엄수익비율
merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'] / merge_df_users_fin['estimated_revenue'] ## cpm 대비 수익
merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['ad_impressions'] ## 광고노출당 수익
merge_df_users_fin['playback_based_cpm_rate'] = merge_df_users_fin['playback_based_cpm'] / merge_df_users_fin['cpm'] ## 재생 기반 수익

In [None]:
# 파생변수4 - 시청 시간 관련
merge_df_users_fin['revenue_per_minute_watched'] = merge_df_users_fin['estimated_revenue'] / merge_df_users_fin['estimatedMinutesWatched'] ## 시청 시간 당 수익
merge_df_users_fin['avg_view_duration_rate'] = merge_df_users_fin['averageViewDuration'] / merge_df_users_fin['averageViewPercentage'] ## 평균 시청 시간 비율
merge_df_users_fin['watched_time_rate'] = merge_df_users_fin['averageViewPercentage'] * merge_df_users_fin['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
merge_df_users_fin['watched_view_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['views'] ## 조회수당 시청시간
merge_df_users_fin['subscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 시청시간
merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 시청시간
merge_df_users_fin['subscriber_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['subscribers_count'] ## 구독자당 시청시간

In [None]:
# 파생변수5 - 광고 관련
merge_df_users_fin['revenue_per_playback'] = merge_df_users_fin['gross_revenue'] / merge_df_users_fin['monetized_playbacks'] ## 1회 광고재생당 수익
merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['gross_revenue'] / merge_df_users_fin['ad_impressions'] ## 1회 광고노출당 총수익
merge_df_users_fin['playback_rate'] = merge_df_users_fin['monetized_playbacks'] / merge_df_users_fin['ad_impressions'] ## 광고노출 대비 재생율
merge_df_users_fin['unplayback_rate'] = (merge_df_users_fin['ad_impressions'] - merge_df_users_fin['monetized_playbacks']) / merge_df_users_fin['ad_impressions'] ## 광고노출 대비 비재생율

In [None]:
# 파생변수6 - 연령 및 성별 관련
female_col = [col_nm for col_nm in merge_df_users_fin.columns if 'female' in col_nm ]
male_col = [col_nm for col_nm in merge_df_users_fin.columns if ('male' in col_nm) and ('female' not in col_nm)]

youth_col = [col_nm for col_nm in merge_df_users_fin.columns if '13-17' in col_nm]
adult_col = [col_nm for col_nm in merge_df_users_fin.columns if ('18-24' in col_nm) or ('25-34' in col_nm) or ('35-44' in col_nm) or ('45-54' in col_nm)]
older_col = [col_nm for col_nm in merge_df_users_fin.columns if ('55-64' in col_nm) or ('65' in col_nm)]

merge_df_users_fin['female_viewer_rate'] = merge_df_users_fin[female_col].sum(axis=1) ## 여성 시청자 비율
merge_df_users_fin['male_viewer_rate'] = merge_df_users_fin[male_col].sum(axis=1) ## 남성 시청자 비율

merge_df_users_fin['youth_viewer_rate'] = merge_df_users_fin[youth_col].sum(axis=1) ## 청소년 시청자 비율
merge_df_users_fin['adult_viewer_rate'] = merge_df_users_fin[adult_col].sum(axis=1) ## 성인 시청자 비율
merge_df_users_fin['older_viewer_rate'] = merge_df_users_fin[older_col].sum(axis=1) ## 노인 시청자 비율

In [None]:
# null값 대체
merge_df_users_fin = merge_df_users_fin.fillna(0) ## NaN
merge_df_users_fin = merge_df_users_fin.replace([np.inf, -np.inf], 0) ## inf

### y값 설정

#### 중요 지표 표준화

In [None]:
# y값 활용 지표
y_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression'] ## null값은 views가 0인 데이터

In [None]:
# null값 대체
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].fillna(0) ## NaN
merge_df_users_fin[y_col] = merge_df_users_fin[y_col].replace([np.inf, -np.inf], 0) ## inf

In [None]:
# 표준화 (Standard Scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merge_df_users_fin[y_col])

#### 다중 지표 결합

In [None]:
# Isolation Forest 모델 학습
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
iso_forest.fit(scaled_features)

# 이상치 점수 계산
anomaly_scores = iso_forest.decision_function(scaled_features)

In [None]:
# 임계값 설정
threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

# y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

In [None]:
# 전체 데이터 y값 빈도 확인
merge_df_users_fin['y_label'].value_counts()

y_label
1    100398
0      5285
Name: count, dtype: int64

In [None]:
# merge_df_users_fin.to_csv('C:/py_src/awake/data/merge_df_users_fin_eda.csv', encoding='utf-8-sig', index=False)

In [None]:
# 계정별 y값 빈도 데이터
y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()
y_result_df

Unnamed: 0,youtube_user_id,y_label,count
0,627cb611aa6f212355e0b617,1,405
1,627f59ccaa39226247c60b01,1,402
2,627f59ccaa39226247c60b01,0,3
3,6287228afb15712a8cb931d7,1,405
4,6287229efb15712a8cb93225,1,398
...,...,...,...
394,65e7b773d8da110bb072e2b5,0,1
395,65f7b17ed8da110bb0733b7b,1,405
396,65fecf7ed8da110bb0736199,1,405
397,66230ee6d8da110bb0744b2d,1,373


In [None]:
# 계정별 일일데이터의 20% 이상 이상치 데이터인 계정 확인
y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 40]

Unnamed: 0,youtube_user_id,y_label,count
17,62872523fb15712a8cb93479,0,90
54,62a35ce69d41c93ff90b5670,0,110
78,62c4e558507271632b9cc1c7,0,50
85,62d11f080b4c4c7502a5be3d,0,400
131,639bb8dcd603b8138e33780b,0,187
139,63c9075250eb530dfd1346bd,0,46
156,63d77c9650eb530dfd139f8b,0,335
171,63eb4f87ee122e631992279f,0,302
205,640001db0abaa11316396d3b,0,223
218,64020bf4d746c60e1272055f,0,155


In [None]:
# 주제1 최종 변수
'subscribers_gained',
 'likes',
 'estimatedMinutesWatched',
 'estimated_revenue',
 'revenue_per_red_view',
 'positive_engage_rate',
 'estimated_red_partner_revenue',
 'revenue_per_minute_watched',
 'comments',
 'shares',
 'like_rate',
 'monetized_playbacks',
 'gross_revenue',
 'revenue_per_subscriber',
 'subscribers_lost',
 'redViews',
 'cpm',
 'subscriber_increase_rate',
 'ad_revenue_rate',
 'playback_rate',
 'subscriber_view_time_rate',
 'red_revenue_rate',
 'watched_view_rate',
 'subscribers_conversion_rate',
 'avg_view_duration_rate',
 'averageViewDuration',
 'comment_rate',
 'subscriber_decrease_rate',
 'SUBSCRIBED',
 'subscribers_count',
 'unsubscribed_view_time_rate',
 'subscriber_retention_rate',
 'like_to_dislike_ratio',
 'ad_impressions',
 'subscribed_view_rate'