In [1]:
import pymongo
import pandas as pd
import numpy as np
import math
from datetime import datetime

# import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

# from sklearn.linear_model import LinearRegression ## likes, dislikes 대체
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

import os
from dotenv import load_dotenv

## MongoDB 연동

In [2]:
# .env 파일에서 환경 변수 로드
load_dotenv('C:/py_src/awake/.env')

# 환경 변수에서 MongoDB 연결 정보 가져오기
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [3]:
def mongodb_connection(user, password):
    # 환경 변수에서 MongoDB 연결 정보 가져오기
    mongo_password = password
    mongo_user = user

    # MongoDB 연결 URL
    url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
    client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

    # Test DB 선택
    database = client.Test

    # # Test DB 컬렉션 확인
    # collections_name = db.list_collection_names()

    return database

In [4]:
db = mongodb_connection(mongo_user,mongo_password)

In [7]:
# 해당 데이터 경로
user_info_df_path = 'C:/py_src/awake/data/user_info_df.csv'
youtube_videos_path = 'C:/py_src/awake/data/youtube_videos.csv'
features_weight_path = 'C:/py_src/awake/data/features_weight.csv'

## 데이터 준비

### 계정 데이터

In [8]:
def data_preparation_user(databases):
    # youtube_users
    youtube_users = pd.DataFrame(list(databases['youtube_users'].find()))
    
    # 필요컬럼추출
    youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']] 

    # cast 하여 최종 데이터셋 생성
    youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
    youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
    youtube_users = youtube_users.dropna(how = 'all')

    # null 값 0으로 대체
    youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)
    youtube_users['viewCount'] = youtube_users['viewCount'].astype(int)
    youtube_users['subscriberCount'] = youtube_users['subscriberCount'].astype(int)
    youtube_users['videoCount'] = youtube_users['videoCount'].astype(int)

    youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

    # id가 null인 계정 제거
    youtube_users = youtube_users[~youtube_users['channel_id'].isnull()].reset_index(drop=True)

    # channel_id별로 그룹화하여 null값을 해당 그룹 내에서 채우기
    youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

    # 계정 지표 모두 0인 계정 삭제
    youtube_users = youtube_users[youtube_users[['viewCount', 'subscriberCount', 'videoCount']].sum(axis=1)!=0].reset_index(drop=True)
    
    ##########################################################################################################################################################
    # youtube_channel_locations
    youtube_channel_locations = pd.DataFrame(list(databases['youtube_channel_locations'].find()))

    # 필요컬럼추출
    youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
    youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

    # melt, cast 하여 최종데이터셋 생성
    youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

    # cast 하여 최종 데이터셋 생성
    youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
    youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
    youtube_channel_locations = youtube_channel_locations[youtube_channel_locations[youtube_channel_locations.columns[3:]].apply(sum,axis=1)!=0] ## 모두 0인 행 제거
    youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

    # 계정별 일자별 지표 계산
    youtube_channel_locations = youtube_channel_locations.groupby(['youtube_user_id', 'end_date']).agg({'views': 'sum',
                                                                                                        'estimatedMinutesWatched': 'sum',
                                                                                                        'averageViewDuration': 'mean',
                                                                                                        'averageViewPercentage': 'mean'}).reset_index()
    
    # 시간 분 단위로 변경
    youtube_channel_locations['averageViewDuration'] = youtube_channel_locations['averageViewDuration'] / 60

    # 날짜형식 변경
    youtube_channel_locations = youtube_channel_locations.rename(columns={'end_date':'date'})
    youtube_channel_locations['date'] = youtube_channel_locations['date'].astype(str)

    ##########################################################################################################################################################
    # youtube_daily_channel_basics
    youtube_daily_channel_basics = pd.DataFrame(list(databases['youtube_daily_channel_basics'].find()))

    # 필요컬럼추출
    youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
    youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## 'end_date' --> 'day' 컬럼 활용
    youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True) ## melt

    # daily_basics 컬럼 형식 맞추기 --> 딕셔너리 값들 cast 형태로 맞추기
    youtube_daily_channel_basics_cast = []
    for item in youtube_daily_channel_basics['daily_basics']:
        if isinstance(item, list):
            youtube_daily_channel_basics_cast.extend(item)
        else:
            youtube_daily_channel_basics_cast.append(item)

    youtube_daily_channel_basics_cast = pd.json_normalize(youtube_daily_channel_basics_cast)

    # melt, cast 하여 최종 데이터셋 생성
    youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
    youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
    youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0) ## null값 0으로 대체
    youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[youtube_daily_channel_basics.columns[3:]].sum(axis=1)!=0] ## 모두 0인 행 제거
    youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)

    del youtube_daily_channel_basics_cast

    # 날짜형식 변경
    youtube_daily_channel_basics = youtube_daily_channel_basics.rename(columns={'day':'date'})
    youtube_daily_channel_basics['date'] = youtube_daily_channel_basics['date'].astype(str)

    # 시간 분 단위로 변경
    youtube_daily_channel_basics['averageViewDuration'] = youtube_daily_channel_basics['averageViewDuration'] / 60

    ##########################################################################################################################################################
    # youtube_datas
    youtube_datas_collection = databases['youtube_datas']

    # 파이프라인 정의
    pipeline = [
        {
            "$sort": {
                "youtube_user_id": 1,
                "data_created_at": 1
            }
        },
        {
            "$project": {
                'youtube_user_id' : 1, 
                'data_created_at' : 1, 
                'published_at' : 1, 
                'channel_id' : 1, 
                'channel_title' : 1, 
                'yt_search_keyword' : 1, 
                'subscribed_status' : 1
            }
        }
    ]

    # 파이프라인 실행
    result = list(youtube_datas_collection.aggregate(pipeline, allowDiskUse=True))

    # 결과를 Pandas 데이터프레임으로 변환
    youtube_datas = pd.DataFrame(result)

    # 컬럼 순서 정리
    need_col = ['youtube_user_id', 'data_created_at', 'published_at', 'channel_id', 'channel_title', 'yt_search_keyword', 'subscribed_status']
    youtube_datas = youtube_datas[need_col]

    # cast 하여 데이터셋 생성
    youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
    youtube_datas = youtube_datas.drop(['subscribed_status'],axis=1)

    youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0) # null값 0으로 대체
    youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0] ## 모두 0인 행 제거

    youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

    # 날짜형식 변경
    youtube_datas = youtube_datas.rename(columns={'data_created_at':'date'})
    youtube_datas['date'] = youtube_datas['date'].astype(str)

    ##########################################################################################################################################################
    # 데이터 전처리
    # 데이터통합
    merge_df_users_fin = pd.merge(youtube_users,youtube_datas,how='left',on='channel_id')
    need_col = ['youtube_user_id', 'date', 'channel_id', 'channel_title_x', 'published_at_x', 'phone_num', 'yt_search_keyword', 'viewCount', 'subscriberCount', 'videoCount','UNSUBSCRIBED', 'SUBSCRIBED']
    merge_df_users_fin = merge_df_users_fin[need_col]
    merge_df_users_fin = merge_df_users_fin.rename(columns={'channel_title_x':'channel_title','published_at_x':'published_at'})
    merge_df_users_fin = merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isnull()].reset_index(drop=True)

    merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_channel_locations,how='left',on=['youtube_user_id','date'])
    merge_df_users_fin = merge_df_users_fin.drop(['views'],axis=1)

    youtube_daily_channel_basics = youtube_daily_channel_basics.drop(['annotationClickThroughRate','annotationCloseRate'],axis=1)
    merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_daily_channel_basics,how='left',on=['youtube_user_id','date'])

    # null값 대체
    merge_df_users_fin['estimatedMinutesWatched_x'] = merge_df_users_fin['estimatedMinutesWatched_x'].fillna(merge_df_users_fin['estimatedMinutesWatched_y'])
    merge_df_users_fin['averageViewDuration_x'] = merge_df_users_fin['averageViewDuration_x'].fillna(merge_df_users_fin['averageViewDuration_y'])

    merge_df_users_fin = merge_df_users_fin.drop(['estimatedMinutesWatched_y','averageViewDuration_y'],axis=1)
    merge_df_users_fin = merge_df_users_fin.rename(columns={'estimatedMinutesWatched_x':'estimatedMinutesWatched','averageViewDuration_x':'averageViewDuration'})

    # 영상 시청 시간 합 / 영상 재생 시간 합 = 영상 시청 비율 대체
    merge_df_users_fin['averageViewPercentage'] = np.where(merge_df_users_fin['averageViewPercentage'].isnull(), 
                                                        merge_df_users_fin['estimatedMinutesWatched'] / (merge_df_users_fin['averageViewDuration'] * merge_df_users_fin['views']),
                                                        merge_df_users_fin['averageViewPercentage'])
    merge_df_users_fin['averageViewPercentage'] = merge_df_users_fin['averageViewPercentage'].fillna(0)

    # 환율 적용 - 해당기간 평균환율 : 1322.42
    exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
    merge_df_users_fin[exchange_rate_col] = merge_df_users_fin[exchange_rate_col] * 1322.42

    # 버그로 사용된 수치값 대체
    merge_df_users_fin['likes'] = np.where(merge_df_users_fin['likes'] < 0, 0, merge_df_users_fin['likes'])
    merge_df_users_fin['dislikes'] = np.where(merge_df_users_fin['dislikes'] < 0, 0, merge_df_users_fin['dislikes'])

    return merge_df_users_fin

In [9]:
merge_df_users_fin = data_preparation_user(db)

  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


### 콘텐츠 데이터

In [10]:
# 새로운 입력 데이터 확인
user_info_df = pd.read_csv(user_info_df_path)
user_info_df

Unnamed: 0,youtube_user_id,channel_title,phone_num,published_at,viewCount,subscriberCount,videoCount
0,64a41a0c9247f326464d1ec1,대신 밍튜브,,2013-08-25 04:05:50.000,0,496,0
1,62850de04e1c157f51ac7550,VoidNeverstop,1.026248e+09,2014-01-19 06:21:57.000,196505,1160,225
2,62873100fb15712a8cb936b0,푸,1.096020e+09,2022-02-12 05:02:14.799,23780,97,12
3,62b86400507271632b906ecb,월간 미니츄[Monthly Minichew],1.043539e+09,2018-11-20 07:00:35.000,6962,33,21
4,62dcd06919284d77beba310c,루리,1.047646e+09,2014-12-20 05:38:12.000,36342,54,109
...,...,...,...,...,...,...,...
645,647b05e019c22b644dddd844,휘쿡 Hwi Cook,,2018-11-19 18:01:36.000,82955,265,66
646,62a00f3beaf5732d6df064cc,1분뉴스,1.098384e+09,2020-05-18 01:04:35.239,31325716,17500,1123
647,63f7726d55baf50e2df73cb2,엔트리뷰 [누구나 재미있는 테크리뷰],,2020-03-21 13:05:43.289,1767452,5090,152
648,62d11eae0b4c4c7502a5bc0a,맛있는부산 쥰맛지도,1.043209e+09,2011-11-22 07:34:54.000,651042,579,94


In [11]:
user_id = list(user_info_df['youtube_user_id'].iloc[:10])
user_id

['64a41a0c9247f326464d1ec1',
 '62850de04e1c157f51ac7550',
 '62873100fb15712a8cb936b0',
 '62b86400507271632b906ecb',
 '62dcd06919284d77beba310c',
 '6455d84dd88e4c67532fcdc4',
 '632717701babf8392007a899',
 '63fba59f2a0144119186eeef',
 '654c4f871120b40b442a931f',
 '62873c84fb15712a8cb93837']

In [12]:
def data_preparation_user(databases, user_id, youtube_videos_path):
    collection = databases['youtube_videos']  # 컬렉션 선택

    # 파이프라인 정의
    pipeline = [
        {
            "$match": {
                "youtube_user_id": {
                    "$in": user_id
                },
                "videos": {"$ne": []},
                # "end_date": {
                #     "$gte": datetime(2023, 3, 26),
                #     "$lte": datetime(2024, 5, 3)
                # }
            }
        },
        {
            "$sort": {
                "youtube_user_id": 1,
                "end_date": 1
            }
        },
        {
            "$project": {
                "youtube_user_id": 1,
                "end_date": 1,
                "videos": 1
            }
        }
    ]

    # 파이프라인 실행
    result = list(collection.aggregate(pipeline, allowDiskUse=True))

    # 결과를 Pandas 데이터프레임으로 변환
    youtube_videos = pd.DataFrame(result)

    # melt, cast하여 최종데이터셋 생성
    # melt
    youtube_videos = youtube_videos.explode(['videos']).reset_index(drop=True)

    #cast
    youtube_videos = pd.concat([youtube_videos, pd.json_normalize(youtube_videos['videos'])],axis=1)

    youtube_videos = youtube_videos.drop(['_id','videos'],axis=1)

    youtube_videos_temp = pd.read_csv(youtube_videos_path) ## 학습데이터
    youtube_videos = pd.concat([youtube_videos_temp,youtube_videos],axis=0).drop_duplicates().reset_index(drop=True)

    youtube_videos = youtube_videos.fillna(0) ## null값 0으로 대체
    youtube_videos = youtube_videos[youtube_videos[youtube_videos.columns[4:]].sum(axis=1)!=0] ## 모두 0인 행 제거
    youtube_videos = youtube_videos.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

    # 최종 콘텐츠 분석 데이터셋
    # 환율 적용 - 해당기간 평균환율 : 1322.42
    exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
    youtube_videos[exchange_rate_col] = youtube_videos[exchange_rate_col] * 1322.42

    # 잘못된값 처리
    youtube_videos['estimatedRevenue'] = np.where(youtube_videos['estimatedRevenue'] < 0,
                                                youtube_videos['estimatedAdRevenue'] + youtube_videos['estimatedRedPartnerRevenue'],
                                                youtube_videos['estimatedRevenue'])

    # 버그로 사용된 수치값 대체
    youtube_videos['likes'] = np.where(youtube_videos['likes'] < 0, 0, youtube_videos['likes'])
    youtube_videos['dislikes'] = np.where(youtube_videos['dislikes'] < 0, 0, youtube_videos['dislikes'])

    return youtube_videos

In [13]:
youtube_videos = data_preparation_user(db,user_id,youtube_videos_path)

## 주제1

In [14]:
def result_topic1(merge_df_users_fin, youtube_videos, user_id):
    # 계정데이터
    # y값 파생변수
    merge_df_users_fin['total_engage_rate'] = (merge_df_users_fin['likes'] + merge_df_users_fin['comments'] + merge_df_users_fin['shares'] + merge_df_users_fin['dislikes']) / merge_df_users_fin['views'] ## 총 참여율
    merge_df_users_fin['net_subscribers_change'] = merge_df_users_fin['subscribersGained'] - merge_df_users_fin['subscribersLost'] ## 구독자 순증가
    merge_df_users_fin['revenue_per_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['views'] ## 조회수당 수익
    merge_df_users_fin['gross_revenue_per_ad_impression'] = merge_df_users_fin['grossRevenue'] / merge_df_users_fin['adImpressions'] ## 1회 광고노출당 총수익

    # 파생변수1 - 참여도 관련
    merge_df_users_fin['like_rate'] = merge_df_users_fin['likes'] / merge_df_users_fin['views'] ## 좋아요 비율 
    merge_df_users_fin['comment_rate'] = merge_df_users_fin['comments'] / merge_df_users_fin['views'] ## 댓글 비율
    merge_df_users_fin['share_rate'] = merge_df_users_fin['shares'] / merge_df_users_fin['views'] ## 공유 비율  
    merge_df_users_fin['dislike_rate'] = merge_df_users_fin['dislikes'] / merge_df_users_fin['views'] ## 싫어요 비율

    # 파생변수2 - 구독자 관련
    merge_df_users_fin['subscribers_conversion_rate'] = merge_df_users_fin['subscribersGained'] / merge_df_users_fin['views'] ## 구독자 전환율
    merge_df_users_fin['subscribed_view_rate'] = merge_df_users_fin['SUBSCRIBED'] / (merge_df_users_fin['SUBSCRIBED'] + merge_df_users_fin['UNSUBSCRIBED']) ## 구독자 조회수 비율

    # 파생변수3 - 수익 관련
    merge_df_users_fin['revenue_per_subscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['SUBSCRIBED'] ## 구독자조회수당 수익
    merge_df_users_fin['revenue_per_unsubscribed_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 수익
    merge_df_users_fin['revenue_per_red_view'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['redViews'] ## 프리미엄당 수익
    merge_df_users_fin['cpm_to_revenue_ratio'] = merge_df_users_fin['cpm'] / merge_df_users_fin['estimatedRevenue'] ## cpm 대비 수익
    merge_df_users_fin['revenue_per_ad_impression'] = merge_df_users_fin['estimatedRevenue'] / merge_df_users_fin['adImpressions'] ## 광고노출당 수익

    # 파생변수4 - 시청 시간 관련
    merge_df_users_fin['watched_view_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['views'] ## 조회수당 시청시간
    merge_df_users_fin['unsubscribed_view_time_rate'] = merge_df_users_fin['estimatedMinutesWatched'] / merge_df_users_fin['UNSUBSCRIBED'] ## 비구독자조회수당 시청시간

    # null값 대체
    merge_df_users_fin.fillna(0, inplace=True) ## NaN
    merge_df_users_fin.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    final_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression','UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 
                'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'redViews', 'estimatedRevenue', 'cpm', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 
                'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 
                'unsubscribed_view_time_rate']

    # 표준화 (Standard Scaling)
    scaler = StandardScaler()
    scaled_features_user = scaler.fit_transform(merge_df_users_fin[final_col])

    # Isolation Forest 모델 학습
    iso_forest_user = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
    iso_forest_user.fit(scaled_features_user)

    # 이상치 점수 계산
    anomaly_scores = iso_forest_user.decision_function(scaled_features_user)

    # 임계값 설정
    threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

    # y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
    merge_df_users_fin['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

    # 계정별 y값 빈도 데이터
    y_result_df = merge_df_users_fin.groupby('youtube_user_id')['y_label'].value_counts().reset_index()

    # 계정별 일일데이터의 10% 이상 이상치 데이터인 계정 확인
    fraud_user_id = list(y_result_df[y_result_df['y_label']==0][y_result_df[y_result_df['y_label']==0]['count'] >= 40]['youtube_user_id'])

    ##########################################################################################################################################################
    # 콘텐츠 데이터
    # y값 파생변수
    youtube_videos['total_engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares'] + youtube_videos['dislikes']) / youtube_videos['views'] ## 총 참여율
    youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost'] ## 구독자 순증가
    youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views'] ## 조회수당 수익
    youtube_videos['grossRevenue_per_ad_impression'] = youtube_videos['grossRevenue'] / youtube_videos['adImpressions'] ## 1회 광고노출당 총수익
    youtube_videos['total_card_teaser_click_rate'] = (youtube_videos['cardClicks'] + youtube_videos['cardTeaserClicks']) / (youtube_videos['cardImpressions'] + youtube_videos['cardTeaserImpressions']) ## 카드와 카드 티저의 총 클릭률
    youtube_videos['playlist_engagement_rate'] = (youtube_videos['videosAddedToPlaylists'] + youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 참여도

    # 파생변수1 - 참여도 관련
    youtube_videos['comment_rate'] = youtube_videos['comments'] / youtube_videos['views'] ## 댓글 비율
    youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율

    # 파생변수2 - 구독자 관련
    youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율

    # 파생변수3 - 수익 관련
    youtube_videos['revenue_per_red_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['redViews'] ## 프리미엄당 수익
    youtube_videos['ad_revenue_rate'] = youtube_videos['estimatedAdRevenue'] / youtube_videos['estimatedRevenue'] ## 광고수익비율
    youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
    youtube_videos['revenue_per_ad_impression'] = youtube_videos['estimatedRevenue'] / youtube_videos['adImpressions'] ## 광고노출당 수익
    youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익

    # 파생변수4 - 시청 시간 관련
    youtube_videos['avg_view_duration_rate'] = youtube_videos['averageViewDuration'] / youtube_videos['averageViewPercentage'] ## 평균 시청 시간 비율
    youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 시청 시간
    youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간

    # 파생변수5 - 광고 관련
    youtube_videos['revenue_per_playback'] = youtube_videos['grossRevenue'] / youtube_videos['monetizedPlaybacks'] ## 1회 광고재생당 수익
    youtube_videos['ad_playbacks_per_playlist_add'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 재생 비율

    # 파생변수6 - 비디오 관련
    youtube_videos['playlist_addition_rate'] = youtube_videos['videosAddedToPlaylists'] / youtube_videos['views'] ## 플레이리스트 추가 비율
    youtube_videos['playlist_removal_rate'] = youtube_videos['videosRemovedFromPlaylists'] / youtube_videos['views'] ## 플레이리스트 제거 비율

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    final_video_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression', 'total_card_teaser_click_rate', 
                    'playlist_engagement_rate','views', 'redViews', 'likes', 'videosAddedToPlaylists', 'shares', 'averageViewDuration', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 
                    'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression',
                    'net_revenue_per_playlist_add', 'watched_time_rate', 'watched_view_red_rate', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']

    # 표준화 (Standard Scaling)
    scaler = StandardScaler()
    scaled_features_video = scaler.fit_transform(youtube_videos[final_video_col])

    # Isolation Forest 모델 학습
    iso_forest_video = IsolationForest(contamination=0.05, random_state=42)  # 이상치 비율 설정
    iso_forest_video.fit(scaled_features_video)

    # 이상치 점수 계산
    anomaly_scores = iso_forest_video.decision_function(scaled_features_video)

    # 임계값 설정
    threshold = np.percentile(anomaly_scores, 5) ## 이상치 점수가 작을수록 이상치에 가깝다고 판단하는데, 5번째 백분위수가 (-)로 바뀌는 포인트

    # y값 라벨링 (이상치이면 0, 그렇지 않으면 1)
    youtube_videos['y_label'] = np.where(anomaly_scores < threshold, 0, 1)

    # 이상치 콘텐츠 확인 - 콘텐츠 일일 데이터 중 10% 이상 이상치 데이터 확인
    result_video_df = youtube_videos.groupby('video')['y_label'].value_counts().reset_index()

    result_video_df_outlier = pd.merge(result_video_df[result_video_df['y_label']==0].reset_index(drop=True), ## video별 이상치 테이블
                                    result_video_df.groupby('video')['count'].sum().reset_index(), ## 전체 video 테이블
                                    how='left', on='video')
    video_id_outlier = list(result_video_df_outlier[(result_video_df_outlier['count_x'] / result_video_df_outlier['count_y']) >= 0.1]['video'].unique()) ## video별 이상치 비율

    # 이상치 콘텐츠가 20% 이상 있는 계정 확인
    result_user_df_outlier = pd.merge(youtube_videos[youtube_videos['video'].isin(video_id_outlier)].groupby('youtube_user_id')['video'].count().reset_index(),
                                    youtube_videos.groupby(['youtube_user_id'])['video'].count().reset_index(), how='left', on='youtube_user_id')
    fraud_video_user_id = list(result_user_df_outlier[(result_user_df_outlier['video_x'] / result_user_df_outlier['video_y']) >= 0.15]['youtube_user_id'].unique())

    # 계정 이상치, 콘텐츠 이상치 - 영향력이 큰 계정
    user_id1 = (set(fraud_user_id) & set(fraud_video_user_id)) & set(user_id)
    print('큰 영향력 계정')
    print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id1)]['channel_title'].unique()))
    print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id1)]['channel_title'].unique()))
    print('---------------------------------------------------------------------')
    print('')

    # 계정 정상, 콘텐츠 정상 - 영향력이 작거나 측정값이 부족
    set(merge_df_users_fin[(~merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))) & (merge_df_users_fin['youtube_user_id'].isin(set(user_id)))]['youtube_user_id'].unique())
    user_id2 = set(merge_df_users_fin[(~merge_df_users_fin['youtube_user_id'].isin(set(fraud_user_id) & set(fraud_video_user_id))) & (merge_df_users_fin['youtube_user_id'].isin(set(user_id)))]['youtube_user_id'].unique())
    print('작은 영향력 계정')
    print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id2)]['channel_title'].unique()))
    print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id2)]['channel_title'].unique()))
    print('---------------------------------------------------------------------')
    print('')

    # 계정 정상, 콘텐츠 이상치 - 잠재적 영향력이 있는 계정
    user_id3 = set(set(fraud_video_user_id) - set(fraud_user_id)) & set(user_id)
    print('잠재적 영향력 계정')
    print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id3)]['channel_title'].unique()))
    print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id3)]['channel_title'].unique()))
    print('---------------------------------------------------------------------')
    print('')

    # 계정 이상치, 콘텐츠 정상 - 가짜 영향력 계정
    user_id4 = set(set(fraud_user_id) - set(fraud_video_user_id)) & set(user_id)
    print('가짜 영향력 계정')
    print(len(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id4)]['channel_title'].unique()))
    print(list(merge_df_users_fin[merge_df_users_fin['youtube_user_id'].isin(user_id4)]['channel_title'].unique()))
    print('---------------------------------------------------------------------')
    print('')

In [15]:
result_topic1(merge_df_users_fin, youtube_videos, user_id)

큰 영향력 계정
0
[]
---------------------------------------------------------------------

작은 영향력 계정
10
['대신 밍튜브', 'VoidNeverstop', '푸', '월간 미니츄[Monthly Minichew]', '루리', '담비', '쵸파춥스 Chopa Chups', '임영곤 게임방송', '모리녀', '미노피']
---------------------------------------------------------------------

잠재적 영향력 계정
0
[]
---------------------------------------------------------------------

가짜 영향력 계정
0
[]
---------------------------------------------------------------------



## 주제2

In [17]:
def result_topic2(youtube_videos, user_id):
    # y값 파생변수 - 구독자 순증가
    youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']

    # 파생변수1 - 참여도 관련
    youtube_videos['share_rate'] = youtube_videos['shares'] / youtube_videos['views'] ## 공유 비율  
    youtube_videos['dislike_rate'] = youtube_videos['dislikes'] / youtube_videos['views'] ## 싫어요 비율

    # 파생변수2 - 구독자 관련
    youtube_videos['subscribers_conversion_rate'] = youtube_videos['subscribersGained'] / youtube_videos['views'] ## 구독자 전환율
    youtube_videos['subscribers_gained_per_playlist_add'] = youtube_videos['subscribersGained'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 구독자 증가
    youtube_videos['subscribers_lost_per_playlist_remove'] = youtube_videos['subscribersLost'] / youtube_videos['videosRemovedFromPlaylists'] ## 플레이리스트 제거당 구독자 손실

    # 파생변수4 - 시청 시간 관련
    youtube_videos['watched_view_red_rate'] = youtube_videos['estimatedRedMinutesWatched'] / youtube_videos['views'] ## 조회수당 프리미엄 이용자 시청시간

    # 파생변수7 - 비디오 관련
    youtube_videos['net_playlist_addition_rate'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['views'] ## 플레이리스트 순추가 비율

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    ##########################################################################################################################################################
    # 주별, 월별, 분기별 변수 생성

    # 조회수 관련
    youtube_videos['weekly_views'] = youtube_videos['views'].rolling(window=7).sum()
    youtube_videos['monthly_views'] = youtube_videos['views'].rolling(window=30).sum()
    youtube_videos['quarterly_views'] = youtube_videos['views'].rolling(window=90).sum()

    # 시청 시간 관련
    youtube_videos['weekly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=7).sum()
    youtube_videos['monthly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=30).sum()

    # 참여도 관련
    youtube_videos['weekly_total_engagement'] = (youtube_videos['likes'].rolling(window=7).sum() +
                                    youtube_videos['dislikes'].rolling(window=7).sum() +
                                    youtube_videos['comments'].rolling(window=7).sum() +
                                    youtube_videos['shares'].rolling(window=7).sum())
    youtube_videos['monthly_total_engagement'] = (youtube_videos['likes'].rolling(window=30).sum() +
                                    youtube_videos['dislikes'].rolling(window=30).sum() +
                                    youtube_videos['comments'].rolling(window=30).sum() +
                                    youtube_videos['shares'].rolling(window=30).sum())
    youtube_videos['quarterly_total_engagement'] = (youtube_videos['likes'].rolling(window=90).sum() +
                                        youtube_videos['dislikes'].rolling(window=90).sum() +
                                        youtube_videos['comments'].rolling(window=90).sum() +
                                        youtube_videos['shares'].rolling(window=90).sum())
    youtube_videos['weekly_engagement_rate'] = youtube_videos['weekly_total_engagement'] / (youtube_videos['weekly_views'] + 1)
    youtube_videos['weekly_videos_added'] = youtube_videos['videosAddedToPlaylists'].rolling(window=7).sum()

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    ##########################################################################################################################################################
    # 모델 적용
    final_selected_features = ['watched_view_red_rate', 'estimatedMinutesWatched', 'weekly_engagement_rate', 'monthly_views', 'estimatedRedMinutesWatched', 'redViews', 'weekly_videos_added', 
                               'views', 'share_rate', 'weekly_total_engagement', 'dislike_rate', 'weekly_watch_time', 'videosRemovedFromPlaylists', 'subscribers_conversion_rate', 
                               'subscribers_gained_per_playlist_add', 'net_playlist_addition_rate', 'shares', 'quarterly_total_engagement', 'dislikes', 'quarterly_views', 'comments', 'likes', 
                               'subscribers_lost_per_playlist_remove', 'monthly_watch_time', 'videosAddedToPlaylists', 'monthly_total_engagement', 'weekly_views']

    # 모델 정의 및 학습
    # XGBoost를 사용한 모델 학습
    xgb_model = xgb.XGBRegressor(random_state=42)
    xgb_model.fit(youtube_videos[final_selected_features], youtube_videos['net_subscribers_change'])

    # 예측 결과 확인
    youtube_videos['predict'] = xgb_model.predict(youtube_videos[final_selected_features])

    # 계정별 구독자수 평균, 구독자 예측수 평균 비교
    result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'net_subscribers_change', 'predict']]

    # 계정별 콘텐츠의 구독자 순증감 1일 합계
    result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
        'net_subscribers_change': 'sum',
        'predict': 'sum'
    })

    # 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값 계산
    result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

    # Shift와 Rolling 연산을 위한 그룹별 처리
    result_contents_df['1_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-30).rolling(window=30).sum())
    result_contents_df['3_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-90).rolling(window=90).sum())
    result_contents_df['6_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-180).rolling(window=180).sum())
    result_contents_df['12_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-365).rolling(window=365).sum())

    # 계정별로 최종 평균값을 계산
    result_contents_df_fin = result_contents_df.groupby('youtube_user_id').agg({
        'net_subscribers_change': 'mean',
        'predict': 'mean',
        '1_month_future_predict': 'mean',
        '3_month_future_predict': 'mean',
        '6_month_future_predict': 'mean',
        '12_month_future_predict': 'mean'
    }).reset_index()

    # 1개월 결과 - 사분위수 기준 구간 나누기
    # Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
    Q1_1month = result_contents_df_fin['1_month_future_predict'].quantile(0.25)
    Q3_1month = result_contents_df_fin['1_month_future_predict'].quantile(0.75)

    # Q1, Q2, Q3에 기반하여 구간 나누기
    result_contents_df_fin['1_month_future_predict_result'] = pd.cut(result_contents_df_fin['1_month_future_predict'],
                                                                    bins=[-float('inf'), Q1_1month, Q3_1month, float('inf')],
                                                                    labels=['Low', 'Medium', 'High'])
    
    # 3개월 결과 - 사분위수 기준 구간 나누기
    # Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
    Q1_3month = result_contents_df_fin['3_month_future_predict'].quantile(0.25)
    Q3_3month = result_contents_df_fin['3_month_future_predict'].quantile(0.75)

    # Q1, Q2, Q3에 기반하여 구간 나누기
    result_contents_df_fin['3_month_future_predict_result'] = pd.cut(result_contents_df_fin['3_month_future_predict'],
                                                                    bins=[-float('inf'), Q1_3month, Q3_3month, float('inf')],
                                                                    labels=['Low', 'Medium', 'High'])
    
    result_contents_df_final = result_contents_df_fin[result_contents_df_fin['youtube_user_id'].isin(user_id)].reset_index(drop=True)

    # 1개월 후 결과
    # 유지 및 감소 계정 - Low
    low_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Low']['youtube_user_id'])
    print('1개월 후 유지 및 감소 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 증가 계정 - Medium
    medium_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Medium']['youtube_user_id'])
    print('1개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 큰 증가 계정 - High
    high_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='High']['youtube_user_id'])
    print('1개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 3개월 후 결과
    # 유지 및 감소 계정 - Low
    low_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Low']['youtube_user_id'])
    print('3개월 후 유지 및 감소 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 증가 계정 - Medium
    medium_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Medium']['youtube_user_id'])
    print('3개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 큰 증가 계정 - High
    high_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='High']['youtube_user_id'])
    print('3개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')
    
    return result_contents_df_final

In [18]:
result_topic2(youtube_videos, user_id)

1개월 후 유지 및 감소 예상 계정
5
['대신 밍튜브' 'VoidNeverstop' '푸' '월간 미니츄[Monthly Minichew]' '루리']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
3
['담비' '임영곤 게임방송' '모리녀']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
0
[]
---------------------------------------------------------------------

3개월 후 유지 및 감소 예상 계정
2
['VoidNeverstop' '루리']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
3
['담비' '임영곤 게임방송' '모리녀']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
0
[]
---------------------------------------------------------------------



Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict,1_month_future_predict_result,3_month_future_predict_result
0,62850de04e1c157f51ac7550,0.070175,0.079192,2.5829,6.67218,11.249593,,Low,Low
1,62873100fb15712a8cb936b0,0.016529,0.012903,0.462947,,,,Low,
2,62873c84fb15712a8cb93837,0.038462,0.036724,,,,,,
3,62b86400507271632b906ecb,0.0,-0.002229,-0.061259,,,,Low,
4,62dcd06919284d77beba310c,0.060302,0.069588,1.86216,4.162778,6.457463,,Low,Low
5,632717701babf8392007a899,0.0,-0.001208,,,,,,
6,63fba59f2a0144119186eeef,3.728411,3.767629,95.284017,294.187542,651.929372,1275.552858,Medium,Medium
7,6455d84dd88e4c67532fcdc4,40.674003,39.719566,1107.372952,3195.344832,7246.757797,,Medium,Medium
8,64a41a0c9247f326464d1ec1,0.026667,0.023797,-0.0657,,,,Low,
9,654c4f871120b40b442a931f,3.75766,3.772498,108.474346,338.898192,,,Medium,Medium


## 주제3

In [19]:
def result_topic3(youtube_videos, user_id):
    # 파생변수1 - 수익 관련
    youtube_videos['net_revenue_per_playlist_add'] = (youtube_videos['videosAddedToPlaylists'] - youtube_videos['videosRemovedFromPlaylists']) / youtube_videos['estimatedRevenue'] ## 플레이리스트 순추가당 수익
    youtube_videos['revenue_per_playlist_add'] = youtube_videos['estimatedRevenue'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 수익
    youtube_videos['red_revenue_rate'] = youtube_videos['estimatedRedPartnerRevenue'] / youtube_videos['estimatedRevenue'] ## 프리미엄수익비율
    youtube_videos['playback_based_cpm_rate'] = youtube_videos['playbackBasedCpm'] / youtube_videos['cpm'] ## 재생 기반 수익
    youtube_videos['cpm_to_revenue_ratio'] = youtube_videos['cpm'] / youtube_videos['estimatedRevenue'] ## cpm 대비 수익

    # 파생변수2 - 시청 시간 관련
    youtube_videos['watched_time_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedMinutesWatched'] ## 재생 비율 대비 이용자 시청 시간
    youtube_videos['watched_time_red_rate'] = youtube_videos['averageViewPercentage'] * youtube_videos['estimatedRedMinutesWatched'] ## 재생 비율 대비 프리미엄 이용자 시청 시간

    # 파생변수3 - 광고 관련
    youtube_videos['ad_impressions_per_playlist_add'] = youtube_videos['adImpressions'] / youtube_videos['videosAddedToPlaylists'] ## 플레이리스트 추가당 광고 노출

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    ##########################################################################################################################################################
    # 주별, 월별, 분기별 변수 생성

    # 조회수 및 시청 시간 관련
    youtube_videos['weekly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=7).sum()
    youtube_videos['monthly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=30).sum()
    youtube_videos['quarterly_watch_time'] = youtube_videos['estimatedMinutesWatched'].rolling(window=90).sum()

    # 참여도 관련
    youtube_videos['weekly_videos_removed'] = youtube_videos['videosRemovedFromPlaylists'].rolling(window=7).sum()

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    ##########################################################################################################################################################
    # 모델 적용
    final_selected_features = ['net_revenue_per_playlist_add', 'revenue_per_playlist_add', 'likes', 'videosRemovedFromPlaylists', 'estimatedMinutesWatched', 'playbackBasedCpm', 'cpm', 'dislikes', 
                            'watched_time_rate', 'watched_time_red_rate', 'shares', 'monthly_watch_time', 'weekly_watch_time', 'red_revenue_rate', 'estimatedRedMinutesWatched', 'subscribersLost', 
                            'ad_impressions_per_playlist_add', 'videosAddedToPlaylists', 'redViews', 'quarterly_watch_time', 'views', 'playback_based_cpm_rate', 'cpm_to_revenue_ratio', 'weekly_videos_removed']

    # 모델 정의 및 학습
    # XGBoost를 사용한 모델 학습
    xgb_model = xgb.XGBRegressor(random_state=42)
    xgb_model.fit(youtube_videos[final_selected_features], youtube_videos['estimatedAdRevenue'])

    # 예측 결과 확인
    youtube_videos['predict'] = xgb_model.predict(youtube_videos[final_selected_features])

    # 계정별 구독자수 평균, 구독자 예측수 평균 비교
    result_contents_df = youtube_videos[['youtube_user_id', 'video', 'end_date', 'estimatedAdRevenue', 'predict']]

    # 계정별 콘텐츠의 구독자 순증감 1일 합계
    result_contents_df = result_contents_df.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
        'estimatedAdRevenue': 'sum',
        'predict': 'sum'
    })

    # 1개월, 3개월, 6개월, 1년 뒤 구독자수 예측값 계산
    result_contents_df['end_date'] = pd.to_datetime(result_contents_df['end_date'])

    # Shift와 Rolling 연산을 위한 그룹별 처리
    result_contents_df['1_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-30).rolling(window=30).sum())
    result_contents_df['3_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-90).rolling(window=90).sum())
    result_contents_df['6_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-180).rolling(window=180).sum())
    result_contents_df['12_month_future_predict'] = result_contents_df.groupby('youtube_user_id')['predict'].transform(lambda x: x.shift(-365).rolling(window=365).sum())

    # 계정별로 최종 평균값을 계산
    result_contents_df_fin = result_contents_df.groupby('youtube_user_id').agg({
        'estimatedAdRevenue': 'mean',
        'predict': 'mean',
        '1_month_future_predict': 'mean',
        '3_month_future_predict': 'mean',
        '6_month_future_predict': 'mean',
        '12_month_future_predict': 'mean'
    }).reset_index()

    # 1개월 결과 - 사분위수 기준 구간 나누기
    # Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
    Q1_1month = result_contents_df_fin['1_month_future_predict'].quantile(0.25)
    Q3_1month = result_contents_df_fin['1_month_future_predict'].quantile(0.75)

    # Q1, Q2, Q3에 기반하여 구간 나누기
    result_contents_df_fin['1_month_future_predict_result'] = pd.cut(result_contents_df_fin['1_month_future_predict'],
                                                                    bins=[-float('inf'), Q1_1month, Q3_1month, float('inf')],
                                                                    labels=['Low', 'Medium', 'High'])
    
    # 3개월 결과 - 사분위수 기준 구간 나누기
    # Q1 (25%), Q2 (중위수, 50%), Q3 (75%)를 구함
    Q1_3month = result_contents_df_fin['3_month_future_predict'].quantile(0.25)
    Q3_3month = result_contents_df_fin['3_month_future_predict'].quantile(0.75)

    # Q1, Q2, Q3에 기반하여 구간 나누기
    result_contents_df_fin['3_month_future_predict_result'] = pd.cut(result_contents_df_fin['3_month_future_predict'],
                                                                    bins=[-float('inf'), Q1_3month, Q3_3month, float('inf')],
                                                                    labels=['Low', 'Medium', 'High'])
    
    result_contents_df_final = result_contents_df_fin[result_contents_df_fin['youtube_user_id'].isin(user_id)].reset_index(drop=True)

    # 1개월 후 결과
    # 유지 및 감소 계정 - Low
    low_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Low']['youtube_user_id'])
    print('1개월 후 유지 및 감소 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 증가 계정 - Medium
    medium_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='Medium']['youtube_user_id'])
    print('1개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 큰 증가 계정 - High
    high_1month = list(result_contents_df_final[result_contents_df_final['1_month_future_predict_result']=='High']['youtube_user_id'])
    print('1개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 3개월 후 결과
    # 유지 및 감소 계정 - Low
    low_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Low']['youtube_user_id'])
    print('3개월 후 유지 및 감소 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(low_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 증가 계정 - Medium
    medium_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='Medium']['youtube_user_id'])
    print('3개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(medium_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')

    # 큰 증가 계정 - High
    high_1month = list(result_contents_df_final[result_contents_df_final['3_month_future_predict_result']=='High']['youtube_user_id'])
    print('3개월 후 증가 예상 계정')
    print(len(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique()))
    print(user_info_df[user_info_df['youtube_user_id'].isin(high_1month)]['channel_title'].unique())
    print('---------------------------------------------------------------------')
    print('')
    
    return result_contents_df_final

In [20]:
result_topic3(youtube_videos, user_id)

1개월 후 유지 및 감소 예상 계정
6
['대신 밍튜브' 'VoidNeverstop' '푸' '월간 미니츄[Monthly Minichew]' '루리' '모리녀']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
2
['담비' '임영곤 게임방송']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
0
[]
---------------------------------------------------------------------

3개월 후 유지 및 감소 예상 계정
3
['VoidNeverstop' '루리' '모리녀']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
2
['담비' '임영곤 게임방송']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
0
[]
---------------------------------------------------------------------



Unnamed: 0,youtube_user_id,estimatedAdRevenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict,1_month_future_predict_result,3_month_future_predict_result
0,62850de04e1c157f51ac7550,0.0,-3.085256,-80.615569,-228.171074,-447.3453,,Low,Low
1,62873100fb15712a8cb936b0,0.0,-0.272655,-7.914874,,,,Low,
2,62873c84fb15712a8cb93837,0.0,-0.188522,,,,,,
3,62b86400507271632b906ecb,0.0,-0.091421,2.779001,,,,Low,
4,62dcd06919284d77beba310c,0.0,-0.937933,-26.53599,-58.154149,-119.661,,Low,Low
5,632717701babf8392007a899,0.0,0.439288,,,,,,
6,63fba59f2a0144119186eeef,5900.037241,5887.48584,153570.139195,462504.760776,1017487.0,2087781.0,Medium,Medium
7,6455d84dd88e4c67532fcdc4,4101.975853,4003.440918,114187.320641,339547.887733,740485.2,,Medium,Medium
8,64a41a0c9247f326464d1ec1,0.0,-0.966231,-11.275959,,,,Low,
9,654c4f871120b40b442a931f,0.0,11.416277,344.076431,1023.491894,,,Low,Low


## 주제4

In [29]:
def result_topic4(youtube_videos, features_weight_path, user_id):
    # y값 설정
    # youtube_videos['estimatedRevenue'], youtube_videos['net_subscribers_change'], youtube_videos['engage_rate']
    youtube_videos['net_subscribers_change'] = youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']
    youtube_videos['engage_rate'] = (youtube_videos['likes'] + youtube_videos['comments'] + youtube_videos['shares']) / youtube_videos['views']

    # 조회수당 수익
    youtube_videos['revenue_per_view'] = youtube_videos['estimatedRevenue'] / youtube_videos['views']
    # 구독자당 수익
    youtube_videos['revenue_per_Subscriber'] = youtube_videos['estimatedRevenue'] / youtube_videos['subscribersGained']
    # YouTube Premium 수익
    youtube_videos['estimatedRedPartnerRevenue']
    # 수익 다변화 비율
    youtube_videos['revenue_diversification_ratio'] = (youtube_videos['grossRevenue'] - youtube_videos['estimatedRevenue']) / youtube_videos['estimatedRevenue']
    # 구독자 증가율
    youtube_videos['subscriber_growth_rate'] = youtube_videos['subscribersGained'] / (youtube_videos['subscribersGained'] + youtube_videos['subscribersLost'])
    # 구독자 감소율
    youtube_videos['subscriber_loss_rate'] = youtube_videos['subscribersLost'] / (youtube_videos['subscribersGained'] + youtube_videos['subscribersLost'])
    # 구독자 유지율
    youtube_videos['subscriber_retention_rate'] = (youtube_videos['subscribersGained'] - youtube_videos['subscribersLost']) / youtube_videos['subscribersGained']
    # 콘텐츠당 구독자 증가율
    youtube_videos['subscriber_gain_per_content'] = youtube_videos['subscribersGained']/ youtube_videos['videosAddedToPlaylists']
    # 구독자당 시청 시간
    youtube_videos['watch_time_per_subscriber'] = youtube_videos['estimatedMinutesWatched'] / youtube_videos['subscribersGained']
    # 광고재생률
    youtube_videos['ad_playback_rate'] = youtube_videos['monetizedPlaybacks'] / youtube_videos['adImpressions']

    ##########################################################################################################################################################
    # 변동계수 생성
    # 월별 수익 다변화 비율 변동계수
    youtube_videos['monthly_revenue_diversification_ratio_trd'] = youtube_videos.groupby('video')['revenue_diversification_ratio'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
    # 월별 구독자당 시청 시간 변동계수
    youtube_videos['monthly_watch_time_per_subscriber_std'] = youtube_videos.groupby('video')['watch_time_per_subscriber'].transform(lambda x: x.rolling(window=30).std())
    # 월별 YouTube Premium 수익 변동계수
    youtube_videos['monthly_estimatedRedPartnerRevenue_std'] = youtube_videos.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: x.rolling(window=30).std())

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    ##########################################################################################################################################################
    # 평가요소별 가중치 적용
    final_col = ['dislikes', 'likes', 'shares', 'comments', 'redViews', 'estimatedRedPartnerRevenue', 'monthly_estimatedRedPartnerRevenue_std', 'revenue_per_view', 'subscriber_gain_per_content',
                'videosRemovedFromPlaylists', 'monthly_watch_time_per_subscriber_std', 'watch_time_per_subscriber', 'revenue_diversification_ratio', 'monthly_revenue_diversification_ratio_trd',
                'playbackBasedCpm', 'monetizedPlaybacks', 'adImpressions', 'averageViewDuration', 'averageViewPercentage', 'estimatedMinutesWatched']
    
    importances_df_final = pd.read_csv(features_weight_path)

    # 상환이력
    eval_col1 = ['dislikes','likes','shares','comments','redViews']
    importances_df_final[importances_df_final['features'].isin(eval_col1)]['weight'].sum()
    # 부채수준
    eval_col2 = ['estimatedRedPartnerRevenue','monthly_estimatedRedPartnerRevenue_std','revenue_per_view']
    importances_df_final[importances_df_final['features'].isin(eval_col2)]['weight'].sum()
    # 신용거래기간
    eval_col3 = ['subscriber_gain_per_content','videosRemovedFromPlaylists','monthly_watch_time_per_subscriber_std','watch_time_per_subscriber']
    importances_df_final[importances_df_final['features'].isin(eval_col3)]['weight'].sum()
    # 신용형태
    eval_col4 = ['revenue_diversification_ratio','monthly_revenue_diversification_ratio_trd','playbackBasedCpm','monetizedPlaybacks','adImpressions']
    importances_df_final[importances_df_final['features'].isin(eval_col4)]['weight'].sum()
    # 비금융/마이데이터
    eval_col5 = ['averageViewDuration','averageViewPercentage','estimatedMinutesWatched']
    importances_df_final[importances_df_final['features'].isin(eval_col5)]['weight'].sum()

    ##########################################################################################################################################################
    # 가중치 실제값 적용
    # 상환이력
    youtube_videos['score1'] = (youtube_videos[eval_col1] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col1)]['weight'])).sum(axis=1)
    # 부채수준
    youtube_videos['score2'] = (youtube_videos[eval_col2] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col2)]['weight'])).sum(axis=1)
    # 신용거래기간
    youtube_videos['score3'] = (youtube_videos[eval_col3] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col3)]['weight'])).sum(axis=1)
    # 신용형태
    youtube_videos['score4'] = (youtube_videos[eval_col4] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col4)]['weight'])).sum(axis=1)
    # 비금융/마이데이터
    youtube_videos['score5'] = (youtube_videos[eval_col5] * np.array(importances_df_final[importances_df_final['features'].isin(eval_col5)]['weight'])).sum(axis=1)

    ##########################################################################################################################################################
    # 계정별 신용점수 확인 - 계정별 평가요소 합계
    credit_df = youtube_videos.groupby('youtube_user_id')[['score1','score2','score3','score4','score5']].sum().reset_index()

    # 불필요 및 이상치 제거
    credit_df = credit_df[~credit_df['youtube_user_id'].isin(['639bb8dcd603b8138e33780b'])].reset_index(drop=True)
    # '639bb8dcd603b8138e33780b' ## 없는 계정 및 이상치

    from sklearn.preprocessing import MinMaxScaler

    # 1. MinMaxScaler 적용
    scaler = MinMaxScaler()
    # credit_df['score_scale'] = scaler.fit_transform(credit_df[['score']])
    credit_df['score1_scale'] = scaler.fit_transform(credit_df[['score1']])
    credit_df['score2_scale'] = scaler.fit_transform(credit_df[['score2']])
    credit_df['score3_scale'] = scaler.fit_transform(credit_df[['score3']])
    credit_df['score4_scale'] = scaler.fit_transform(credit_df[['score4']])
    credit_df['score5_scale'] = scaler.fit_transform(credit_df[['score5']])

    credit_df['score1_final'] = scaler.fit_transform(credit_df[['score1_scale']]) * 430
    credit_df['score2_final'] = scaler.fit_transform(credit_df[['score2_scale']]) * 410
    credit_df['score3_final'] = scaler.fit_transform(credit_df[['score3_scale']]) * 50
    credit_df['score4_final'] = scaler.fit_transform(credit_df[['score4_scale']]) * 60
    credit_df['score5_final'] = scaler.fit_transform(credit_df[['score5_scale']]) * 50

    # 평가요소 분류 활용 스코어링
    credit_df['credit_score'] = credit_df[['score1_final','score2_final','score3_final','score4_final','score5_final']].sum(axis=1)

    # 신용평가점수 테이블
    credit_df_fin = pd.merge(credit_df[['youtube_user_id','credit_score']],user_info_df,how='left',on='youtube_user_id')
    credit_df_fin = credit_df_fin[['youtube_user_id','channel_title','credit_score']]
    credit_df_fin = credit_df_fin[~credit_df_fin['channel_title'].isnull()].reset_index(drop=True)

    ##########################################################################################################################################################
    # 변동계수
    # estimatedRevenue
    youtube_videos['weekly_estimatedRevenue_cv'] = youtube_videos.groupby('video')['estimatedRevenue'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
    youtube_videos['monthly_estimatedRevenue_cv'] = youtube_videos.groupby('video')['estimatedRevenue'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

    # net_subscribers_change
    youtube_videos['weekly_net_subscribers_change_cv'] = youtube_videos[youtube_videos['net_subscribers_change']!=0].groupby('video')['net_subscribers_change'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
    youtube_videos['monthly_net_subscribers_change_cv'] = youtube_videos[youtube_videos['net_subscribers_change']!=0].groupby('video')['net_subscribers_change'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

    # engage_rate
    youtube_videos['weekly_engage_rate_cv'] = youtube_videos[youtube_videos['engage_rate']!=0].groupby('video')['engage_rate'].transform(lambda x: x.rolling(window=7).std() / x.rolling(window=7).mean())
    youtube_videos['monthly_engage_rate_cv'] = youtube_videos[youtube_videos['engage_rate']!=0].groupby('video')['engage_rate'].transform(lambda x: x.rolling(window=30).std() / x.rolling(window=30).mean())

    cv_col = ['weekly_estimatedRevenue_cv','monthly_estimatedRevenue_cv','weekly_net_subscribers_change_cv','monthly_net_subscribers_change_cv','weekly_engage_rate_cv','monthly_engage_rate_cv']

    # null값 대체
    youtube_videos.fillna(0, inplace=True) ## NaN
    youtube_videos.replace([np.inf, -np.inf], 0, inplace=True) ## inf

    # 계정별 변동계수 평균
    coefvar_df = youtube_videos.groupby('youtube_user_id')[cv_col].mean().reset_index()

    # 최종 신용평가 테이블
    credit_coef_df = pd.merge(credit_df_fin, coefvar_df, how='left', on='youtube_user_id')
    
    # 가중치 역으로 활용하여 최종 변동 계수 도출
    # (1 - 0.36) + (1 - 0.3) + (1 - 0.34) 활용 스케일링
    # 0.32 / 0.35 / 0.33 가중치 적용
    credit_coef_df['cv_score'] = (
                                (((credit_coef_df['weekly_estimatedRevenue_cv'] + credit_coef_df['monthly_estimatedRevenue_cv']) / 2) * 0.32) + 
                                (((credit_coef_df['weekly_net_subscribers_change_cv'] + credit_coef_df['monthly_net_subscribers_change_cv']) / 2) * 0.35) + 
                                (((credit_coef_df['weekly_engage_rate_cv'] + credit_coef_df['monthly_engage_rate_cv']) / 2) * 0.33)
                                )
    # 최종 데이터셋 확인
    credit_df_final = credit_coef_df[['youtube_user_id','channel_title','credit_score','cv_score']]

    # 신용점수별 구간 나누기
    credit_df_final['credit_score_result'] = pd.cut(credit_df_final['credit_score'],
                                                    bins=[-float('inf'), 200, 400, 600, 800, float('inf')],
                                                    labels=['매우위험','위험','보통','안전','매우안전'])
    # 변동계수 구간 나누기
    credit_df_final['cv_score_result'] = pd.cut(credit_df_final['cv_score'],
                                                    bins=[-float('inf'), 0.1, 0.3, 0.5, float('inf')],
                                                    labels=['매우안전','안전','위험','매우위험'])
    
    print(credit_df_final[credit_df_final['youtube_user_id'].isin(user_id)].reset_index(drop=True).sort_values('credit_score',ascending=False))
    return credit_df_final

### 실제데이터 결과 확인

In [101]:
# 신용점수별 구간 나누기
credit_df_final['credit_score_result'] = pd.cut(credit_df_final['credit_score'],
                                                bins=[-float('inf'), 200, 400, 600, 800, float('inf')],
                                                labels=['매우위험','위험','보통','안전','매우안전'])
# 변동계수 구간 나누기
credit_df_final['cv_score_result'] = pd.cut(credit_df_final['cv_score'],
                                                bins=[-float('inf'), 0.1, 0.3, 0.5, float('inf')],
                                                labels=['매우안전','안전','위험','매우위험'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_df_final['credit_score_result'] = pd.cut(credit_df_final['credit_score'],


In [119]:
# 안전
credit_df_final[(credit_df_final['credit_score_result'].isin(['매우안전','안전','보통'])) & (credit_df_final['cv_score_result'].isin(['매우안전','안전']))]['youtube_user_id']

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score,credit_score_result,cv_score_result
90,63d77c9650eb530dfd139f8b,kiu기우쌤,808.882446,0.273353,매우안전,안전
98,63eb4f87ee122e631992279f,수빙수tv sooBingsoo,605.135672,0.236631,안전,안전
128,6401e117d746c60e1271fdef,앙찡,495.411207,0.277777,보통,안전


In [120]:
# 위험
credit_df_final[(credit_df_final['credit_score_result'].isin(['위험','매우위험'])) & (credit_df_final['cv_score_result'].isin(['위험','매우위험']))]['youtube_user_id']

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score,credit_score_result,cv_score_result
4,628722c8fb15712a8cb9326e,소리미의 신화방송,4.969442,0.452355,매우위험,위험
5,62872317fb15712a8cb932e9,Ella,7.245852,0.414489,매우위험,위험
6,6287231efb15712a8cb932f2,찌늉,9.037498,0.397066,매우위험,위험
7,62872370fb15712a8cb93337,hyeppening 혜프닝,6.696743,0.407804,매우위험,위험
8,6287239cfb15712a8cb93368,KIMBEE 킴비,5.384523,0.368944,매우위험,위험
...,...,...,...,...,...,...
214,6537f4381120b40b4429df06,이현우의 MLBTV,54.102551,0.330029,매우위험,위험
216,653b4ff61120b40b442a0807,코딩국수,9.480579,0.319355,매우위험,위험
219,654ce7b71120b40b442a988e,슬기런바디 Run Body,2.150600,0.392718,매우위험,위험
220,654f30971120b40b442aa90c,Lizzy리지,8.721679,0.332033,매우위험,위험


In [125]:
# 보통
aa = set(credit_df_final['youtube_user_id']) - set(list(credit_df_final[(credit_df_final['credit_score_result'].isin(['매우안전','안전','보통'])) & (credit_df_final['cv_score_result'].isin(['매우안전','안전']))]['youtube_user_id']) + list(credit_df_final[(credit_df_final['credit_score_result'].isin(['위험','매우위험'])) & (credit_df_final['cv_score_result'].isin(['위험','매우위험']))]['youtube_user_id']))
credit_df_final[credit_df_final['youtube_user_id'].isin(aa)]

Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score,credit_score_result,cv_score_result
0,627cb611aa6f212355e0b617,성팩 SPAAK,13.018805,0.219686,매우위험,안전
1,627f59ccaa39226247c60b01,고도람 Go!doram,0.472203,0.259715,매우위험,안전
2,6287228afb15712a8cb931d7,세남자 물고기,0.518941,0.123913,매우위험,안전
3,6287229efb15712a8cb93225,띠혜 ddihye,79.937059,0.175295,매우위험,안전
9,628723b7fb15712a8cb9337c,름쿠 ᴘʟᴀʏʟɪꜱᴛ,2.091439,0.031660,매우위험,매우안전
...,...,...,...,...,...,...
228,65cc401305bf1c0baa425146,주피코,133.657913,0.249367,매우위험,안전
229,65e7b773d8da110bb072e2b5,신크TV,1.773555,0.166368,매우위험,안전
230,65f7b17ed8da110bb0733b7b,Yerendipity예렌디피티,0.228189,0.122814,매우위험,안전
231,65fecf7ed8da110bb0736199,JN테크리뷰,6.571291,0.149326,매우위험,안전
