In [1]:
import pymongo
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

import xgboost as xgb

import os
from dotenv import load_dotenv

In [146]:
import joblib

## MongoDB 연동

In [2]:
# Load environment variables from the project's .env file.
# NOTE(review): the absolute Windows path ties this notebook to one machine.
load_dotenv('C:/py_src/awake/.env')

# Read the MongoDB credentials from the environment (None when a key is absent).
mongo_user = os.environ.get('MONGO_USER')
mongo_password = os.environ.get('MONGO_PASSWORD')

In [154]:
def mongodb_connection(user, password):
    """Connect to the meercat-external cluster and return its ``Test`` database.

    Args:
        user: MongoDB user name.
        password: MongoDB password.

    Returns:
        The ``Test`` database handle of the created ``pymongo.MongoClient``.

    Raises:
        ValueError: if ``user`` or ``password`` is ``None`` or empty (for example
            when the .env file was not found), instead of silently building a
            ``None:None@...`` connection string.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    if not user or not password:
        raise ValueError(
            "MongoDB credentials are missing: check MONGO_USER / MONGO_PASSWORD in the .env file"
        )

    def _escape(credential):
        """Percent-encode a credential unless it already looks escaped."""
        credential = str(credential)
        # A value that already contains '%' or '+' is assumed to be pre-escaped and is
        # passed through untouched so previously working configurations keep working.
        if '%' in credential or '+' in credential:
            return credential
        # Reserved characters such as '@', ':' or '/' would otherwise corrupt the URI.
        return quote_plus(credential)

    mongo_id = _escape(user)
    mongo_pw = _escape(password)

    # MongoDB connection URL
    url = f"mongodb+srv://{mongo_id}:{mongo_pw}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
    client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

    # Select the "Test" database
    database = client.Test

    return database

In [4]:
# Handle to the "Test" database, opened with the credentials read from the environment above.
db = mongodb_connection(mongo_user,mongo_password)

In [5]:
# # 필요 데이터 경로
# # 파일명 그대로 사용!!
# user_info_df_path = 'C:/py_src/awake/data/user_info_df.csv'
# trained_data_path = 'C:/py_src/awake/data/trained_data.csv'
# features_weight_path = 'C:/py_src/awake/data/features_weight.csv'

## 필요데이터 준비

In [16]:
# Directory holding the prerequisite CSV files read by data_preparation_need().
# NOTE(review): absolute local path — adjust when running on another machine.
need_data_path = 'C:/py_src/awake/data/need/'

In [17]:
def data_preparation_need(file_path):
    """Load the four prerequisite CSV files from a directory.

    The file names are fixed: ``user_info_df.csv``, ``trained_data.csv``,
    ``features_weight.csv`` and ``input_data.csv``.

    Args:
        file_path: Directory containing the CSV files. A trailing path
            separator is optional.

    Returns:
        A 4-tuple of DataFrames in the order
        ``(user_info_df, trained_data, features_weight, input_data)``.
    """
    file_names = (
        'user_info_df.csv',
        'trained_data.csv',
        'features_weight.csv',
        'input_data.csv',
    )

    # os.path.join inserts a separator only when needed, so both
    # ".../need" and ".../need/" resolve to the same files.
    return tuple(pd.read_csv(os.path.join(file_path, name)) for name in file_names)

In [18]:
# Load the prerequisite frames; data_preparation_contents() and result_topic1() read these as globals.
user_info_df, trained_data, features_weight, input_data = data_preparation_need(need_data_path)

## 데이터 준비

### 계정 데이터

In [11]:
def data_preparation_user(databases):
    """Build the per-account, per-day analysis dataset from four MongoDB collections.

    Collections read from ``databases``:
      * ``youtube_users``                - channel master data and channel statistics
      * ``youtube_channel_locations``    - per-location metrics, aggregated per account/date
      * ``youtube_daily_channel_basics`` - daily channel metrics
      * ``youtube_datas``                - per-record subscribed / unsubscribed counts

    The frames are left-joined (users -> datas on ``channel_id``; then locations and
    daily basics on ``youtube_user_id`` + ``date``), gaps are filled from the daily
    basics, revenue columns are multiplied by a fixed exchange rate and negative
    like/dislike counts are clamped to 0.

    Args:
        databases: Object indexable by collection name whose collections provide
            ``find()`` and ``aggregate()`` (e.g. a pymongo database).

    Returns:
        pandas.DataFrame with one row per matched (account, date) record.
    """
    # ------------------------------------------------------------------
    # youtube_users
    youtube_users = pd.DataFrame(list(databases['youtube_users'].find()))

    # Keep only the required columns
    youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']]

    # Expand the nested "statistics" dict into flat columns
    youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
    youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
    youtube_users = youtube_users.dropna(how = 'all')

    # Missing counters become 0, then cast to int
    youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)
    youtube_users['viewCount'] = youtube_users['viewCount'].astype(int)
    youtube_users['subscriberCount'] = youtube_users['subscriberCount'].astype(int)
    youtube_users['videoCount'] = youtube_users['videoCount'].astype(int)

    youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

    # Drop accounts whose channel_id is null
    youtube_users = youtube_users[~youtube_users['channel_id'].isnull()].reset_index(drop=True)

    # Fill a missing phone number from other rows of the same channel.
    # Series.ffill()/bfill() replace the deprecated fillna(method=...) form.
    youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.ffill().bfill())

    # Drop accounts whose three channel counters are all 0
    youtube_users = youtube_users[youtube_users[['viewCount', 'subscriberCount', 'videoCount']].sum(axis=1)!=0].reset_index(drop=True)

    # ------------------------------------------------------------------
    # youtube_channel_locations
    youtube_channel_locations = pd.DataFrame(list(databases['youtube_channel_locations'].find()))

    # Keep documents with at least one location entry, and only the required columns
    youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
    youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

    # One row per location entry
    youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

    # Expand each location dict into flat columns
    youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
    youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
    # Remove rows whose columns from position 3 onward sum to 0.
    # NOTE(review): positional slice - depends on the key order inside the location dicts.
    youtube_channel_locations = youtube_channel_locations[youtube_channel_locations[youtube_channel_locations.columns[3:]].apply(sum,axis=1)!=0]
    youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

    # Aggregate the metrics per account and date
    youtube_channel_locations = youtube_channel_locations.groupby(['youtube_user_id', 'end_date']).agg({'views': 'sum',
                                                                                                        'estimatedMinutesWatched': 'sum',
                                                                                                        'averageViewDuration': 'mean',
                                                                                                        'averageViewPercentage': 'mean'}).reset_index()

    # Divide by 60 (comment in the original: convert the duration to minutes)
    youtube_channel_locations['averageViewDuration'] = youtube_channel_locations['averageViewDuration'] / 60

    # Use a string "date" column as the join key
    youtube_channel_locations = youtube_channel_locations.rename(columns={'end_date':'date'})
    youtube_channel_locations['date'] = youtube_channel_locations['date'].astype(str)

    # ------------------------------------------------------------------
    # youtube_daily_channel_basics
    youtube_daily_channel_basics = pd.DataFrame(list(databases['youtube_daily_channel_basics'].find()))

    # Keep documents with at least one daily entry; the per-entry 'day' field is used as the date
    youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
    youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']]
    youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True)

    # Flatten the daily entries (an entry may itself be a list of dicts) before normalising
    youtube_daily_channel_basics_cast = []
    for item in youtube_daily_channel_basics['daily_basics']:
        if isinstance(item, list):
            youtube_daily_channel_basics_cast.extend(item)
        else:
            youtube_daily_channel_basics_cast.append(item)

    youtube_daily_channel_basics_cast = pd.json_normalize(youtube_daily_channel_basics_cast)

    # Attach the flattened metrics to the account ids
    youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
    youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
    youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0)  # nulls -> 0
    # Remove rows whose columns from position 3 onward sum to 0 (positional slice, see note above)
    youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[youtube_daily_channel_basics.columns[3:]].sum(axis=1)!=0]
    youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)

    del youtube_daily_channel_basics_cast

    # Use a string "date" column as the join key
    youtube_daily_channel_basics = youtube_daily_channel_basics.rename(columns={'day':'date'})
    youtube_daily_channel_basics['date'] = youtube_daily_channel_basics['date'].astype(str)

    # Divide by 60 (comment in the original: convert the duration to minutes)
    youtube_daily_channel_basics['averageViewDuration'] = youtube_daily_channel_basics['averageViewDuration'] / 60

    # ------------------------------------------------------------------
    # youtube_datas
    youtube_datas_collection = databases['youtube_datas']

    # Aggregation pipeline: sort by account and creation time, project the needed fields
    pipeline = [
        {
            "$sort": {
                "youtube_user_id": 1,
                "data_created_at": 1
            }
        },
        {
            "$project": {
                'youtube_user_id' : 1, 
                'data_created_at' : 1, 
                'published_at' : 1, 
                'channel_id' : 1, 
                'channel_title' : 1, 
                'yt_search_keyword' : 1, 
                'subscribed_status' : 1
            }
        }
    ]

    # Run the pipeline
    result = list(youtube_datas_collection.aggregate(pipeline, allowDiskUse=True))

    # Convert the result to a DataFrame
    youtube_datas = pd.DataFrame(result)

    # Fix the column order (this also drops '_id')
    need_col = ['youtube_user_id', 'data_created_at', 'published_at', 'channel_id', 'channel_title', 'yt_search_keyword', 'subscribed_status']
    youtube_datas = youtube_datas[need_col]

    # Expand the nested "subscribed_status" dict into flat columns
    youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
    youtube_datas = youtube_datas.drop(['subscribed_status'],axis=1)

    # Columns from position 6 onward are the expanded status counters: nulls -> 0, drop all-zero rows
    youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0)
    youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0]

    youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

    # Use a string "date" column as the join key
    youtube_datas = youtube_datas.rename(columns={'data_created_at':'date'})
    youtube_datas['date'] = youtube_datas['date'].astype(str)

    # ------------------------------------------------------------------
    # Merge everything into one frame
    merge_df_users_fin = pd.merge(youtube_users,youtube_datas,how='left',on='channel_id')
    # channel_title / published_at exist on both sides; keep the youtube_users versions (_x)
    need_col = ['youtube_user_id', 'date', 'channel_id', 'channel_title_x', 'published_at_x', 'phone_num', 'yt_search_keyword', 'viewCount', 'subscriberCount', 'videoCount','UNSUBSCRIBED', 'SUBSCRIBED']
    merge_df_users_fin = merge_df_users_fin[need_col]
    merge_df_users_fin = merge_df_users_fin.rename(columns={'channel_title_x':'channel_title','published_at_x':'published_at'})
    # Users without any youtube_datas record have no youtube_user_id after the left join
    merge_df_users_fin = merge_df_users_fin[~merge_df_users_fin['youtube_user_id'].isnull()].reset_index(drop=True)

    merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_channel_locations,how='left',on=['youtube_user_id','date'])
    # 'views' is taken from the daily basics instead, so drop the location-based one
    merge_df_users_fin = merge_df_users_fin.drop(['views'],axis=1)

    youtube_daily_channel_basics = youtube_daily_channel_basics.drop(['annotationClickThroughRate','annotationCloseRate'],axis=1)
    merge_df_users_fin = pd.merge(merge_df_users_fin,youtube_daily_channel_basics,how='left',on=['youtube_user_id','date'])

    # Fill location-based gaps (_x) with the daily-basics values (_y)
    merge_df_users_fin['estimatedMinutesWatched_x'] = merge_df_users_fin['estimatedMinutesWatched_x'].fillna(merge_df_users_fin['estimatedMinutesWatched_y'])
    merge_df_users_fin['averageViewDuration_x'] = merge_df_users_fin['averageViewDuration_x'].fillna(merge_df_users_fin['averageViewDuration_y'])

    merge_df_users_fin = merge_df_users_fin.drop(['estimatedMinutesWatched_y','averageViewDuration_y'],axis=1)
    merge_df_users_fin = merge_df_users_fin.rename(columns={'estimatedMinutesWatched_x':'estimatedMinutesWatched','averageViewDuration_x':'averageViewDuration'})

    # Where averageViewPercentage is missing, substitute
    # estimatedMinutesWatched / (averageViewDuration * views); remaining NaN -> 0
    merge_df_users_fin['averageViewPercentage'] = np.where(merge_df_users_fin['averageViewPercentage'].isnull(), 
                                                        merge_df_users_fin['estimatedMinutesWatched'] / (merge_df_users_fin['averageViewDuration'] * merge_df_users_fin['views']),
                                                        merge_df_users_fin['averageViewPercentage'])
    merge_df_users_fin['averageViewPercentage'] = merge_df_users_fin['averageViewPercentage'].fillna(0)

    # Apply the exchange rate (comment in the original: period average rate 1322.42)
    exchange_rate_col = ['estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue', 'grossRevenue', 'cpm', 'playbackBasedCpm']
    merge_df_users_fin[exchange_rate_col] = merge_df_users_fin[exchange_rate_col] * 1322.42

    # Negative like/dislike counts are treated as bad data and replaced with 0
    merge_df_users_fin['likes'] = np.where(merge_df_users_fin['likes'] < 0, 0, merge_df_users_fin['likes'])
    merge_df_users_fin['dislikes'] = np.where(merge_df_users_fin['dislikes'] < 0, 0, merge_df_users_fin['dislikes'])

    return merge_df_users_fin

In [12]:
# Account-level dataset; result_topic1() later adds derived columns to this frame in place.
youtube_user_df = data_preparation_user(db)

  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  youtube_users['phone_num'] = youtube_users.groupby('channel_id')['phone_num'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


### 콘텐츠 데이터

In [23]:
def data_preparation_contents(databases):
    """Build the per-video analysis dataset for the accounts listed in ``input_data``.

    Reads the ``youtube_videos`` collection for every ``youtube_user_id`` found in the
    module-level ``input_data`` frame, flattens the nested ``videos`` entries, stacks
    them under the module-level ``trained_data`` frame and cleans the result.

    Args:
        databases: Object indexable by collection name whose collection provides
            ``aggregate()`` (e.g. a pymongo database).

    Returns:
        pandas.DataFrame sorted by ``youtube_user_id`` and ``end_date``.
    """
    videos_collection = databases['youtube_videos']

    # Accounts present in the new input data
    target_user_ids = input_data['youtube_user_id'].tolist()

    # Aggregation stages: match -> sort -> project.
    # (An "end_date" range filter existed in the original only as commented-out code.)
    match_stage = {
        "$match": {
            "youtube_user_id": {"$in": target_user_ids},
            "videos": {"$ne": []},
        }
    }
    sort_stage = {"$sort": {"youtube_user_id": 1, "end_date": 1}}
    project_stage = {"$project": {"youtube_user_id": 1, "end_date": 1, "videos": 1}}

    documents = list(
        videos_collection.aggregate([match_stage, sort_stage, project_stage], allowDiskUse=True)
    )

    # One row per video entry, then expand each video dict into flat columns
    exploded = pd.DataFrame(documents).explode(['videos']).reset_index(drop=True)
    video_metrics = pd.json_normalize(exploded['videos'])
    fetched_rows = pd.concat([exploded, video_metrics], axis=1).drop(['_id', 'videos'], axis=1)

    # Stack the freshly fetched rows under the previously trained data
    contents = pd.concat([trained_data, fetched_rows], axis=0).drop_duplicates().reset_index(drop=True)

    # Nulls -> 0, then drop rows whose columns from position 4 onward sum to 0
    contents = contents.fillna(0)
    metric_columns = contents.columns[4:]
    contents = contents[contents[metric_columns].sum(axis=1) != 0]
    contents = contents.drop_duplicates().sort_values(['youtube_user_id', 'end_date']).reset_index(drop=True)

    # Apply the exchange rate (comment in the original: period average rate 1322.42)
    for money_col in ('estimatedRevenue', 'estimatedAdRevenue', 'estimatedRedPartnerRevenue',
                      'grossRevenue', 'cpm', 'playbackBasedCpm'):
        contents[money_col] = contents[money_col] * 1322.42

    # A negative estimatedRevenue is replaced by ad revenue + premium (Red) partner revenue
    revenue = contents['estimatedRevenue']
    rebuilt_revenue = contents['estimatedAdRevenue'] + contents['estimatedRedPartnerRevenue']
    contents['estimatedRevenue'] = np.where(revenue < 0, rebuilt_revenue, revenue)

    # Negative like/dislike counts are treated as bad data and replaced with 0
    for count_col in ('likes', 'dislikes'):
        contents[count_col] = np.where(contents[count_col] < 0, 0, contents[count_col])

    return contents

In [24]:
# Video-level dataset; result_topic1() later adds derived columns to this frame in place.
youtube_contents_df = data_preparation_contents(db)

## 주제1

In [104]:
def result_topic1():
    """Flag outlier accounts and outlier contents, then classify the input accounts.

    Works on the module-level frames ``youtube_user_df`` and ``youtube_contents_df``
    (derived feature columns and a ``y_label`` column are added to both in place),
    and on ``user_info_df`` / ``input_data`` for the final report.

    Steps:
      1. Derive ratio features for the account data, label each row with an
         Isolation Forest (0 = outlier, 1 = normal) and collect the accounts that
         have at least 40 outlier rows.
      2. Do the same for the content data; a video is an outlier video when at least
         10% of its rows are outliers, and an account is flagged when at least 15%
         of its content rows belong to outlier videos.
      3. Print the channel titles of the four (account flag x content flag) groups.

    Returns:
        pandas.DataFrame with columns ``youtube_user_id``, ``channel_title``,
        ``fraud_user`` and ``fraud_contents`` for the accounts in ``input_data``.
    """
    # Aliases only - every column assignment below mutates the module-level frames.
    users = youtube_user_df
    contents = youtube_contents_df

    def _add_ratios(frame, specs):
        """Add ``frame[name] = frame[numerator] / frame[denominator]`` for each spec, in order."""
        for name, numerator, denominator in specs:
            frame[name] = frame[numerator] / frame[denominator]

    def _zero_out_invalid(frame):
        """Replace NaN and +/-inf (from divisions by zero) with 0, in place."""
        frame.fillna(0, inplace=True)
        frame.replace([np.inf, -np.inf], 0, inplace=True)

    def _label_outliers(frame, feature_cols):
        """Return 0 for rows scoring below the 5th percentile of the Isolation Forest scores, else 1."""
        scaled = StandardScaler().fit_transform(frame[feature_cols])
        forest = IsolationForest(contamination=0.05, random_state=42)
        forest.fit(scaled)
        scores = forest.decision_function(scaled)
        # Lower score = more anomalous; the cut-off is the 5th percentile of the scores
        cutoff = np.percentile(scores, 5)
        return np.where(scores < cutoff, 0, 1)

    # ------------------------------------------------------------------
    # Account data
    # Target-like derived variables
    users['total_engage_rate'] = (users['likes'] + users['comments'] + users['shares'] + users['dislikes']) / users['views']  # total engagement rate
    users['net_subscribers_change'] = users['subscribersGained'] - users['subscribersLost']  # net subscriber change
    _add_ratios(users, [
        ('revenue_per_view', 'estimatedRevenue', 'views'),                    # revenue per view
        ('gross_revenue_per_ad_impression', 'grossRevenue', 'adImpressions'), # gross revenue per ad impression
        # engagement
        ('like_rate', 'likes', 'views'),
        ('comment_rate', 'comments', 'views'),
        ('share_rate', 'shares', 'views'),
        ('dislike_rate', 'dislikes', 'views'),
        # subscribers
        ('subscribers_conversion_rate', 'subscribersGained', 'views'),
    ])
    users['subscribed_view_rate'] = users['SUBSCRIBED'] / (users['SUBSCRIBED'] + users['UNSUBSCRIBED'])  # share of subscriber views
    _add_ratios(users, [
        # revenue
        ('revenue_per_subscribed_view', 'estimatedRevenue', 'SUBSCRIBED'),
        ('revenue_per_unsubscribed_view', 'estimatedRevenue', 'UNSUBSCRIBED'),
        ('revenue_per_red_view', 'estimatedRevenue', 'redViews'),
        ('cpm_to_revenue_ratio', 'cpm', 'estimatedRevenue'),
        ('revenue_per_ad_impression', 'estimatedRevenue', 'adImpressions'),
        # watch time
        ('watched_view_rate', 'estimatedMinutesWatched', 'views'),
        ('unsubscribed_view_time_rate', 'estimatedMinutesWatched', 'UNSUBSCRIBED'),
    ])

    _zero_out_invalid(users)

    # Feature order is kept exactly: it is the column order fed to the model
    final_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'gross_revenue_per_ad_impression','UNSUBSCRIBED', 'SUBSCRIBED', 'estimatedMinutesWatched', 
                'averageViewDuration', 'comments', 'dislikes', 'likes', 'shares', 'redViews', 'estimatedRevenue', 'cpm', 'playbackBasedCpm', 'like_rate', 'comment_rate', 'share_rate', 'dislike_rate', 
                'subscribers_conversion_rate', 'subscribed_view_rate', 'revenue_per_unsubscribed_view', 'revenue_per_red_view', 'cpm_to_revenue_ratio', 'revenue_per_ad_impression', 'watched_view_rate', 
                'unsubscribed_view_time_rate']

    users['y_label'] = _label_outliers(users, final_col)

    # Per-account frequency of each label
    result_user_df = users.groupby('youtube_user_id')['y_label'].value_counts().reset_index()

    # Accounts with at least 40 outlier rows (an absolute count, not a ratio)
    outlier_counts = result_user_df[result_user_df['y_label'] == 0]
    fraud_user_id = list(outlier_counts[outlier_counts['count'] >= 40]['youtube_user_id'])

    # ------------------------------------------------------------------
    # Content data
    # Target-like derived variables
    contents['total_engage_rate'] = (contents['likes'] + contents['comments'] + contents['shares'] + contents['dislikes']) / contents['views']  # total engagement rate
    contents['net_subscribers_change'] = contents['subscribersGained'] - contents['subscribersLost']  # net subscriber change
    _add_ratios(contents, [
        ('revenue_per_view', 'estimatedRevenue', 'views'),
        ('grossRevenue_per_ad_impression', 'grossRevenue', 'adImpressions'),
    ])
    contents['total_card_teaser_click_rate'] = (contents['cardClicks'] + contents['cardTeaserClicks']) / (contents['cardImpressions'] + contents['cardTeaserImpressions'])  # combined card + teaser click rate
    contents['playlist_engagement_rate'] = (contents['videosAddedToPlaylists'] + contents['videosRemovedFromPlaylists']) / contents['views']  # playlist engagement
    _add_ratios(contents, [
        # engagement
        ('comment_rate', 'comments', 'views'),
        ('dislike_rate', 'dislikes', 'views'),
        # subscribers
        ('subscribers_conversion_rate', 'subscribersGained', 'views'),
        # revenue
        ('revenue_per_red_view', 'estimatedRevenue', 'redViews'),
        ('ad_revenue_rate', 'estimatedAdRevenue', 'estimatedRevenue'),
        ('red_revenue_rate', 'estimatedRedPartnerRevenue', 'estimatedRevenue'),
        ('revenue_per_ad_impression', 'estimatedRevenue', 'adImpressions'),
    ])
    contents['net_revenue_per_playlist_add'] = (contents['videosAddedToPlaylists'] - contents['videosRemovedFromPlaylists']) / contents['estimatedRevenue']
    # watch time
    _add_ratios(contents, [
        ('avg_view_duration_rate', 'averageViewDuration', 'averageViewPercentage'),
    ])
    contents['watched_time_rate'] = contents['averageViewPercentage'] * contents['estimatedMinutesWatched']
    _add_ratios(contents, [
        ('watched_view_red_rate', 'estimatedRedMinutesWatched', 'views'),
        # ads
        ('revenue_per_playback', 'grossRevenue', 'monetizedPlaybacks'),
        ('ad_playbacks_per_playlist_add', 'monetizedPlaybacks', 'videosAddedToPlaylists'),
        # playlists
        ('playlist_addition_rate', 'videosAddedToPlaylists', 'views'),
        ('playlist_removal_rate', 'videosRemovedFromPlaylists', 'views'),
    ])

    _zero_out_invalid(contents)

    # Feature order is kept exactly: it is the column order fed to the model
    final_video_col = ['total_engage_rate', 'net_subscribers_change', 'averageViewPercentage', 'revenue_per_view', 'grossRevenue_per_ad_impression', 'total_card_teaser_click_rate', 
                    'playlist_engagement_rate','views', 'redViews', 'likes', 'videosAddedToPlaylists', 'shares', 'averageViewDuration', 'estimatedRevenue', 'estimatedAdRevenue', 'grossRevenue', 
                    'adImpressions', 'cpm', 'comment_rate', 'dislike_rate', 'subscribers_conversion_rate', 'revenue_per_red_view', 'ad_revenue_rate', 'red_revenue_rate', 'revenue_per_ad_impression',
                    'net_revenue_per_playlist_add', 'watched_time_rate', 'watched_view_red_rate', 'ad_playbacks_per_playlist_add', 'playlist_addition_rate', 'playlist_removal_rate']

    contents['y_label'] = _label_outliers(contents, final_video_col)

    # Per-video frequency of each label
    result_video_df = contents.groupby('video')['y_label'].value_counts().reset_index()

    # Outlier rows per video (count_x) next to all rows per video (count_y)
    video_outlier_rows = result_video_df[result_video_df['y_label'] == 0].reset_index(drop=True)
    video_total_rows = result_video_df.groupby('video')['count'].sum().reset_index()
    result_video_df_outlier = pd.merge(video_outlier_rows, video_total_rows, how='left', on='video')
    video_outlier_share = result_video_df_outlier['count_x'] / result_video_df_outlier['count_y']
    # Videos with at least 10% outlier rows
    video_id_outlier = list(result_video_df_outlier[video_outlier_share >= 0.1]['video'].unique())

    # Per account: rows belonging to outlier videos (video_x) next to all rows (video_y)
    account_outlier_rows = contents[contents['video'].isin(video_id_outlier)].groupby('youtube_user_id')['video'].count().reset_index()
    account_total_rows = contents.groupby(['youtube_user_id'])['video'].count().reset_index()
    result_user_df_outlier = pd.merge(account_outlier_rows, account_total_rows, how='left', on='youtube_user_id')
    account_outlier_share = result_user_df_outlier['video_x'] / result_user_df_outlier['video_y']
    # Accounts where that share is at least 0.15
    fraud_video_user_id = list(result_user_df_outlier[account_outlier_share >= 0.15]['youtube_user_id'].unique())

    # ------------------------------------------------------------------
    # Classify the accounts of the new input data
    is_input_account = user_info_df['youtube_user_id'].isin(input_data['youtube_user_id'])
    fraud_result_df = user_info_df[is_input_account][['youtube_user_id','channel_title']].reset_index(drop=True)
    fraud_result_df['fraud_user'] = np.where(fraud_result_df['youtube_user_id'].isin(fraud_user_id),'이상치','정상')
    fraud_result_df['fraud_contents'] = np.where(fraud_result_df['youtube_user_id'].isin(fraud_video_user_id),'이상치','정상')

    # (heading, account flag, content flag) - printed in this order:
    #   outlier/outlier -> high influence, normal/normal -> low influence or too little data,
    #   normal/outlier  -> potential influence, outlier/normal -> fake influence
    report_groups = (
        ('큰 영향력 계정', '이상치', '이상치'),
        ('작은 영향력 계정', '정상', '정상'),
        ('잠재적 영향력 계정', '정상', '이상치'),
        ('가짜 영향력 계정', '이상치', '정상'),
    )
    for heading, user_flag, contents_flag in report_groups:
        in_group = (fraud_result_df['fraud_user']==user_flag) & (fraud_result_df['fraud_contents']==contents_flag)
        channel_titles = fraud_result_df[in_group]['channel_title'].tolist()
        print(heading)
        print(len(channel_titles))
        print(list(channel_titles))
        print('---------------------------------------------------------------------')
        print('')

    return fraud_result_df

In [105]:
# Run the topic-1 analysis: prints the four account groups; the returned per-account flag table is the cell output.
result_topic1()

큰 영향력 계정
0
[]
---------------------------------------------------------------------

작은 영향력 계정
29
['난리범석', 'AllaproTV', '띵크박스 ThinkBox Korea', '로켓런처 - 한계를 돌파하는 힘이 되는 채널', '혜옥메이크업', 'RightHere', '에이든 우지 Aiden uzi', '카이바군', '코그룸', '이서방', '구봉바다낚시 뽀식이', '박진서', 'DDONIE 또니 / 러브크레센트', '링링언니', '떤두', "밍의 하루_Ming's day", '정케빈 KEVIN', '우아린WOOARIN', 'Lee Stave', '룩앳댓태리 Look at that taeri', '황지수', '끄루끄루뽜끄루', '유걸YU-Girl TV', '커피수혈', '하윤 Hayoon', 'fromsuzy 프롬수지', 'yeahs', '내맘대로 1시간', '내가 니 앱이다']
---------------------------------------------------------------------

잠재적 영향력 계정
0
[]
---------------------------------------------------------------------

가짜 영향력 계정
1
['오디디 코미디']
---------------------------------------------------------------------



Unnamed: 0,youtube_user_id,channel_title,fraud_user,fraud_contents
0,647ac28f19c22b644dddd4e8,난리범석,정상,정상
1,64a524ff9247f326464d2d44,AllaproTV,정상,정상
2,62872297fb15712a8cb93218,띵크박스 ThinkBox Korea,정상,정상
3,64b0a896616bd20e3036ddb2,로켓런처 - 한계를 돌파하는 힘이 되는 채널,정상,정상
4,643553f1659261656b3e9b66,혜옥메이크업,정상,정상
5,63c9f07a50eb530dfd134e39,RightHere,정상,정상
6,6471ded7699a835a90fbfa19,에이든 우지 Aiden uzi,정상,정상
7,63f074f8efd51c165b4419c3,카이바군,정상,정상
8,65e96cfad8da110bb072ea82,코그룸,정상,정상
9,630be35e85e48e6e4022a05d,이서방,정상,정상


## 주제2

In [106]:
def result_topic2():
    """Forecast each account's net subscriber change and print growth tiers.

    Reads the module-level ``youtube_contents_df``, ``input_data`` and
    ``user_info_df`` frames. ``youtube_contents_df`` is modified in place:
    the derived ratio columns, the rolling aggregates and a ``predict`` column
    are added, and every NaN / +-inf in the frame is replaced with 0.

    Steps:
      1. Build per-row ratio features and 7/30/90-row rolling sums.
      2. Fit an ``XGBRegressor`` on ``net_subscribers_change`` and predict on
         the very same rows (in-sample predictions).
      3. Sum actual and predicted values per account and ``end_date``; then,
         per account, sum the predictions of the following 30/90/180/365 rows.
      4. Average those values per account and bucket the 1- and 3-month
         averages into Low / Medium / High at the first and third quartiles.
      5. Keep the accounts listed in ``input_data`` and print, per horizon and
         tier, the number and the titles of the matching channels.

    Returns:
        pandas.DataFrame: the per-account summary rows whose
        ``youtube_user_id`` appears in ``input_data`` (mean actual value, mean
        prediction, the four forward-sum averages and the two tier columns).
    """
    # Alias of the shared frame; every assignment below mutates it in place.
    contents = youtube_contents_df

    def _zero_fill_invalid():
        # Replace NaN and +-inf (e.g. produced by division by zero) with 0 across the whole frame.
        contents.fillna(0, inplace=True)
        contents.replace([np.inf, -np.inf], 0, inplace=True)

    def _rolling_engagement(window):
        # Sum of the rolling likes, dislikes, comments and shares totals for one window size.
        return (contents['likes'].rolling(window=window).sum() +
                contents['dislikes'].rolling(window=window).sum() +
                contents['comments'].rolling(window=window).sum() +
                contents['shares'].rolling(window=window).sum())

    # Target variable: net subscriber change (gained minus lost)
    contents['net_subscribers_change'] = contents['subscribersGained'] - contents['subscribersLost']

    # Engagement ratios, per view
    contents['share_rate'] = contents['shares'] / contents['views']
    contents['dislike_rate'] = contents['dislikes'] / contents['views']

    # Subscriber ratios
    contents['subscribers_conversion_rate'] = contents['subscribersGained'] / contents['views']
    contents['subscribers_gained_per_playlist_add'] = contents['subscribersGained'] / contents['videosAddedToPlaylists']
    contents['subscribers_lost_per_playlist_remove'] = contents['subscribersLost'] / contents['videosRemovedFromPlaylists']

    # Premium (Red) watch minutes per view
    contents['watched_view_red_rate'] = contents['estimatedRedMinutesWatched'] / contents['views']

    # Net playlist additions per view
    contents['net_playlist_addition_rate'] = (contents['videosAddedToPlaylists'] - contents['videosRemovedFromPlaylists']) / contents['views']

    _zero_fill_invalid()

    ##########################################################################################################################################################
    # Weekly / monthly / quarterly aggregates (7 / 30 / 90 row windows).
    # NOTE(review): these windows run over the frame's current row order and are
    # not grouped per video or account, so a window can span several videos -
    # confirm this is intended before changing it (the feature list below was
    # presumably selected with this exact computation).

    # Views
    contents['weekly_views'] = contents['views'].rolling(window=7).sum()
    contents['monthly_views'] = contents['views'].rolling(window=30).sum()
    contents['quarterly_views'] = contents['views'].rolling(window=90).sum()

    # Watch time
    contents['weekly_watch_time'] = contents['estimatedMinutesWatched'].rolling(window=7).sum()
    contents['monthly_watch_time'] = contents['estimatedMinutesWatched'].rolling(window=30).sum()

    # Engagement
    contents['weekly_total_engagement'] = _rolling_engagement(7)
    contents['monthly_total_engagement'] = _rolling_engagement(30)
    contents['quarterly_total_engagement'] = _rolling_engagement(90)
    # "+ 1" in the denominator avoids dividing by zero when the weekly view total is 0
    contents['weekly_engagement_rate'] = contents['weekly_total_engagement'] / (contents['weekly_views'] + 1)
    contents['weekly_videos_added'] = contents['videosAddedToPlaylists'].rolling(window=7).sum()

    # The first window-1 rows of every rolling column are NaN; they become 0 here.
    _zero_fill_invalid()

    ##########################################################################################################################################################
    # Model
    final_selected_features = ['watched_view_red_rate', 'estimatedMinutesWatched', 'weekly_engagement_rate', 'monthly_views', 'estimatedRedMinutesWatched', 'redViews', 'weekly_videos_added',
                               'views', 'share_rate', 'weekly_total_engagement', 'dislike_rate', 'weekly_watch_time', 'videosRemovedFromPlaylists', 'subscribers_conversion_rate',
                               'subscribers_gained_per_playlist_add', 'net_playlist_addition_rate', 'shares', 'quarterly_total_engagement', 'dislikes', 'quarterly_views', 'comments', 'likes',
                               'subscribers_lost_per_playlist_remove', 'monthly_watch_time', 'videosAddedToPlaylists', 'monthly_total_engagement', 'weekly_views']

    # Fit XGBoost on the full frame and predict on the same rows.
    xgb_model = xgb.XGBRegressor(random_state=42)
    xgb_model.fit(contents[final_selected_features], contents['net_subscribers_change'])
    contents['predict'] = xgb_model.predict(contents[final_selected_features])

    # Actual vs. predicted net subscriber change, summed over all videos per account and end_date.
    per_video = contents[['youtube_user_id', 'video', 'end_date', 'net_subscribers_change', 'predict']]
    daily_totals = per_video.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
        'net_subscribers_change': 'sum',
        'predict': 'sum'
    })
    daily_totals['end_date'] = pd.to_datetime(daily_totals['end_date'])

    # Forward-looking sums per account: shift(-w).rolling(w).sum() at row i is the
    # sum of the predictions in rows i+1 .. i+w; rows without a full window are NaN.
    # The month names assume one row per calendar day per account - TODO confirm.
    forecast_windows = (('1_month', 30), ('3_month', 90), ('6_month', 180), ('12_month', 365))
    for horizon_name, window in forecast_windows:
        daily_totals[f'{horizon_name}_future_predict'] = daily_totals.groupby('youtube_user_id')['predict'].transform(
            lambda series, w=window: series.shift(-w).rolling(window=w).sum())

    # Per-account averages of the actual value, the prediction and each forward sum.
    account_summary = daily_totals.groupby('youtube_user_id').agg({
        'net_subscribers_change': 'mean',
        'predict': 'mean',
        '1_month_future_predict': 'mean',
        '3_month_future_predict': 'mean',
        '6_month_future_predict': 'mean',
        '12_month_future_predict': 'mean'
    }).reset_index()

    # Bucket the 1- and 3-month averages: <= Q1 is Low, (Q1, Q3] is Medium, > Q3 is High.
    # Accounts whose average is NaN get no tier.
    # NOTE(review): pd.cut raises if Q1 == Q3 because bin edges must be unique.
    for months in (1, 3):
        forecast_column = f'{months}_month_future_predict'
        lower_quartile = account_summary[forecast_column].quantile(0.25)
        upper_quartile = account_summary[forecast_column].quantile(0.75)
        account_summary[f'{forecast_column}_result'] = pd.cut(account_summary[forecast_column],
                                                              bins=[-float('inf'), lower_quartile, upper_quartile, float('inf')],
                                                              labels=['Low', 'Medium', 'High'])

    ##########################################################################################################################################################
    # Restrict the summary to the accounts present in the new input data.
    user_id = input_data['youtube_user_id'].tolist()
    result_contents_df_final = account_summary[account_summary['youtube_user_id'].isin(user_id)].reset_index(drop=True)

    # Print, for each horizon and tier, how many channels fall in it and their titles.
    tier_headings = (('Low', '유지 및 감소 예상 계정'),
                     ('Medium', '증가 예상 계정'),
                     ('High', '크게 증가 예상 계정'))
    for months in (1, 3):
        tier_column = f'{months}_month_future_predict_result'
        for tier, heading in tier_headings:
            tier_user_ids = list(result_contents_df_final[result_contents_df_final[tier_column] == tier]['youtube_user_id'])
            channel_titles = user_info_df[user_info_df['youtube_user_id'].isin(tier_user_ids)]['channel_title'].unique()
            print(f'{months}개월 후 {heading}')
            print(len(channel_titles))
            print(channel_titles)
            print('---------------------------------------------------------------------')
            print('')

    return result_contents_df_final

In [107]:
# Run the topic-2 report: it prints the tiered channel lists and, as the last
# expression of the cell, the returned per-account summary is shown as output.
result_topic2()

1개월 후 유지 및 감소 예상 계정
20
['난리범석' '띵크박스 ThinkBox Korea' '로켓런처 - 한계를 돌파하는 힘이 되는 채널' '혜옥메이크업'
 'RightHere' '에이든 우지 Aiden uzi' '이서방' 'DDONIE 또니 / 러브크레센트' '떤두'
 '우아린WOOARIN' 'Lee Stave' '룩앳댓태리 Look at that taeri' '황지수' '끄루끄루뽜끄루'
 '유걸YU-Girl TV' '커피수혈' '하윤 Hayoon' 'yeahs' '내맘대로 1시간' '내가 니 앱이다']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
6
['AllaproTV' '카이바군' '구봉바다낚시 뽀식이' '링링언니' '정케빈 KEVIN' 'fromsuzy 프롬수지']
---------------------------------------------------------------------

1개월 후 크게 증가 예상 계정
1
['오디디 코미디']
---------------------------------------------------------------------

3개월 후 유지 및 감소 예상 계정
15
['난리범석' '띵크박스 ThinkBox Korea' '로켓런처 - 한계를 돌파하는 힘이 되는 채널' '혜옥메이크업'
 'RightHere' 'DDONIE 또니 / 러브크레센트' '우아린WOOARIN' 'Lee Stave'
 '룩앳댓태리 Look at that taeri' '끄루끄루뽜끄루' '유걸YU-Girl TV' '커피수혈' 'yeahs'
 '내맘대로 1시간' '내가 니 앱이다']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
6
['AllaproTV' '카이바군' '구봉바다낚시 뽀식이' '링링언니' '정케빈 KEVIN' '

Unnamed: 0,youtube_user_id,net_subscribers_change,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict,1_month_future_predict_result,3_month_future_predict_result
0,62810d9eaa39226247c994f2,0.00905,0.011442,0.470283,2.390716,,,Low,Low
1,62872297fb15712a8cb93218,0.027439,0.03173,1.028341,3.613875,,,Low,Low
2,628722cefb15712a8cb93281,0.021672,0.023823,0.552903,2.977331,,,Low,Low
3,628722f1fb15712a8cb932b4,0.002985,9.8e-05,0.002476,0.006813,,,Low,Low
4,62875913fb15712a8cb9cb28,0.0,-0.001134,-0.066252,,,,Low,
5,6294ab84fe241a32a48ada00,144.992491,150.080154,4497.363178,14740.644804,27802.495489,54577.023617,High,High
6,62a38fd29d41c93ff90b576f,0.0,-0.020114,,,,,,
7,62ac8133423c30268e55801a,0.028205,0.03449,1.033837,4.457181,9.901888,,Low,Low
8,62d11f9f0b4c4c7502a5c1b6,0.007566,0.007748,0.240729,0.397517,1.255576,3.068025,Low,Low
9,62d790c1ce4fcc731da68115,0.008969,0.008062,0.105241,-0.171341,,,Low,Low


## 주제3

In [108]:
def result_topic3():
    """Forecast each account's estimated ad revenue and print growth tiers.

    Reads the module-level ``youtube_contents_df``, ``input_data`` and
    ``user_info_df`` frames. ``youtube_contents_df`` is modified in place:
    the derived revenue / watch-time / ad columns, the rolling aggregates and a
    ``predict`` column are added (overwriting columns of the same name), and
    every NaN / +-inf in the frame is replaced with 0.

    Steps:
      1. Build per-row derived features and 7/30/90-row rolling sums.
      2. Fit an ``XGBRegressor`` on ``estimatedAdRevenue`` and predict on the
         very same rows (in-sample predictions).
      3. Sum actual and predicted values per account and ``end_date``; then,
         per account, sum the predictions of the following 30/90/180/365 rows.
      4. Average those values per account and bucket the 1- and 3-month
         averages into Low / Medium / High at the first and third quartiles.
      5. Keep the accounts listed in ``input_data`` and print, per horizon and
         tier, the number and the titles of the matching channels.

    Returns:
        pandas.DataFrame: the per-account summary rows whose
        ``youtube_user_id`` appears in ``input_data`` (mean actual value, mean
        prediction, the four forward-sum averages and the two tier columns).
    """
    # Alias of the shared frame; every assignment below mutates it in place.
    contents = youtube_contents_df

    def _zero_fill_invalid():
        # Replace NaN and +-inf (e.g. produced by division by zero) with 0 across the whole frame.
        contents.fillna(0, inplace=True)
        contents.replace([np.inf, -np.inf], 0, inplace=True)

    # Revenue features
    # NOTE(review): despite its name, this is net playlist additions divided by
    # revenue (not revenue per net addition) - kept as-is because the model's
    # feature list depends on it; confirm the intended direction.
    contents['net_revenue_per_playlist_add'] = (contents['videosAddedToPlaylists'] - contents['videosRemovedFromPlaylists']) / contents['estimatedRevenue']
    contents['revenue_per_playlist_add'] = contents['estimatedRevenue'] / contents['videosAddedToPlaylists']  # revenue per playlist addition
    contents['red_revenue_rate'] = contents['estimatedRedPartnerRevenue'] / contents['estimatedRevenue']  # premium share of revenue
    contents['playback_based_cpm_rate'] = contents['playbackBasedCpm'] / contents['cpm']  # playback-based CPM relative to CPM
    contents['cpm_to_revenue_ratio'] = contents['cpm'] / contents['estimatedRevenue']  # CPM relative to revenue

    # Watch-time features: average view percentage multiplied by (premium) minutes watched
    contents['watched_time_rate'] = contents['averageViewPercentage'] * contents['estimatedMinutesWatched']
    contents['watched_time_red_rate'] = contents['averageViewPercentage'] * contents['estimatedRedMinutesWatched']

    # Ad feature: ad impressions per playlist addition
    contents['ad_impressions_per_playlist_add'] = contents['adImpressions'] / contents['videosAddedToPlaylists']

    _zero_fill_invalid()

    ##########################################################################################################################################################
    # Weekly / monthly / quarterly aggregates (7 / 30 / 90 row windows).
    # NOTE(review): these windows run over the frame's current row order and are
    # not grouped per video or account, so a window can span several videos -
    # confirm this is intended before changing it.

    # Watch time
    contents['weekly_watch_time'] = contents['estimatedMinutesWatched'].rolling(window=7).sum()
    contents['monthly_watch_time'] = contents['estimatedMinutesWatched'].rolling(window=30).sum()
    contents['quarterly_watch_time'] = contents['estimatedMinutesWatched'].rolling(window=90).sum()

    # Playlist removals
    contents['weekly_videos_removed'] = contents['videosRemovedFromPlaylists'].rolling(window=7).sum()

    # The first window-1 rows of every rolling column are NaN; they become 0 here.
    _zero_fill_invalid()

    ##########################################################################################################################################################
    # Model
    final_selected_features = ['net_revenue_per_playlist_add', 'revenue_per_playlist_add', 'likes', 'videosRemovedFromPlaylists', 'estimatedMinutesWatched', 'playbackBasedCpm', 'cpm', 'dislikes',
                               'watched_time_rate', 'watched_time_red_rate', 'shares', 'monthly_watch_time', 'weekly_watch_time', 'red_revenue_rate', 'estimatedRedMinutesWatched', 'subscribersLost',
                               'ad_impressions_per_playlist_add', 'videosAddedToPlaylists', 'redViews', 'quarterly_watch_time', 'views', 'playback_based_cpm_rate', 'cpm_to_revenue_ratio', 'weekly_videos_removed']

    # Fit XGBoost on the full frame and predict on the same rows.
    xgb_model = xgb.XGBRegressor(random_state=42)
    xgb_model.fit(contents[final_selected_features], contents['estimatedAdRevenue'])
    contents['predict'] = xgb_model.predict(contents[final_selected_features])

    # Actual vs. predicted ad revenue, summed over all videos per account and end_date.
    per_video = contents[['youtube_user_id', 'video', 'end_date', 'estimatedAdRevenue', 'predict']]
    daily_totals = per_video.groupby(['youtube_user_id', 'end_date'], as_index=False).agg({
        'estimatedAdRevenue': 'sum',
        'predict': 'sum'
    })
    daily_totals['end_date'] = pd.to_datetime(daily_totals['end_date'])

    # Forward-looking sums per account: shift(-w).rolling(w).sum() at row i is the
    # sum of the predictions in rows i+1 .. i+w; rows without a full window are NaN.
    # The month names assume one row per calendar day per account - TODO confirm.
    forecast_windows = (('1_month', 30), ('3_month', 90), ('6_month', 180), ('12_month', 365))
    for horizon_name, window in forecast_windows:
        daily_totals[f'{horizon_name}_future_predict'] = daily_totals.groupby('youtube_user_id')['predict'].transform(
            lambda series, w=window: series.shift(-w).rolling(window=w).sum())

    # Per-account averages of the actual value, the prediction and each forward sum.
    account_summary = daily_totals.groupby('youtube_user_id').agg({
        'estimatedAdRevenue': 'mean',
        'predict': 'mean',
        '1_month_future_predict': 'mean',
        '3_month_future_predict': 'mean',
        '6_month_future_predict': 'mean',
        '12_month_future_predict': 'mean'
    }).reset_index()

    # Bucket the 1- and 3-month averages: <= Q1 is Low, (Q1, Q3] is Medium, > Q3 is High.
    # Accounts whose average is NaN get no tier.
    # NOTE(review): pd.cut raises if Q1 == Q3 because bin edges must be unique.
    for months in (1, 3):
        forecast_column = f'{months}_month_future_predict'
        lower_quartile = account_summary[forecast_column].quantile(0.25)
        upper_quartile = account_summary[forecast_column].quantile(0.75)
        account_summary[f'{forecast_column}_result'] = pd.cut(account_summary[forecast_column],
                                                              bins=[-float('inf'), lower_quartile, upper_quartile, float('inf')],
                                                              labels=['Low', 'Medium', 'High'])

    ##########################################################################################################################################################
    # Restrict the summary to the accounts present in the new input data.
    user_id = input_data['youtube_user_id'].tolist()
    result_contents_df_final = account_summary[account_summary['youtube_user_id'].isin(user_id)].reset_index(drop=True)

    # Print, for each horizon and tier, how many channels fall in it and their titles.
    # The High heading previously repeated the Medium text, which made the two
    # sections indistinguishable in the printed report; it now has its own text.
    tier_headings = (('Low', '유지 및 감소 예상 계정'),
                     ('Medium', '증가 예상 계정'),
                     ('High', '크게 증가 예상 계정'))
    for months in (1, 3):
        tier_column = f'{months}_month_future_predict_result'
        for tier, heading in tier_headings:
            tier_user_ids = list(result_contents_df_final[result_contents_df_final[tier_column] == tier]['youtube_user_id'])
            channel_titles = user_info_df[user_info_df['youtube_user_id'].isin(tier_user_ids)]['channel_title'].unique()
            print(f'{months}개월 후 {heading}')
            print(len(channel_titles))
            print(channel_titles)
            print('---------------------------------------------------------------------')
            print('')

    return result_contents_df_final

In [109]:
# Run the topic-3 report: it prints the tiered channel lists and, as the last
# expression of the cell, the returned per-account summary is shown as output.
result_topic3()

1개월 후 유지 및 감소 예상 계정
21
['난리범석' '띵크박스 ThinkBox Korea' '로켓런처 - 한계를 돌파하는 힘이 되는 채널' '혜옥메이크업'
 'RightHere' '에이든 우지 Aiden uzi' '이서방' 'DDONIE 또니 / 러브크레센트' '링링언니' '떤두'
 '우아린WOOARIN' 'Lee Stave' '룩앳댓태리 Look at that taeri' '황지수' '끄루끄루뽜끄루'
 '유걸YU-Girl TV' '커피수혈' '하윤 Hayoon' 'yeahs' '내맘대로 1시간' '내가 니 앱이다']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
4
['카이바군' '구봉바다낚시 뽀식이' '정케빈 KEVIN' 'fromsuzy 프롬수지']
---------------------------------------------------------------------

1개월 후 증가 예상 계정
2
['AllaproTV' '오디디 코미디']
---------------------------------------------------------------------

3개월 후 유지 및 감소 예상 계정
16
['난리범석' '띵크박스 ThinkBox Korea' '로켓런처 - 한계를 돌파하는 힘이 되는 채널' '혜옥메이크업'
 'RightHere' 'DDONIE 또니 / 러브크레센트' '링링언니' '우아린WOOARIN' 'Lee Stave'
 '룩앳댓태리 Look at that taeri' '끄루끄루뽜끄루' '유걸YU-Girl TV' '커피수혈' 'yeahs'
 '내맘대로 1시간' '내가 니 앱이다']
---------------------------------------------------------------------

3개월 후 증가 예상 계정
5
['AllaproTV' '카이바군' '구봉바다낚시 뽀식이' '정케빈 KEVIN' 'fro

Unnamed: 0,youtube_user_id,estimatedAdRevenue,predict,1_month_future_predict,3_month_future_predict,6_month_future_predict,12_month_future_predict,1_month_future_predict_result,3_month_future_predict_result
0,62810d9eaa39226247c994f2,0.0,0.161146,5.282463,12.95773,,,Low,Low
1,62872297fb15712a8cb93218,0.0,-2.084649,-79.974892,-442.914482,,,Low,Low
2,628722cefb15712a8cb93281,0.0,0.030948,0.145475,-2.247396,,,Low,Low
3,628722f1fb15712a8cb932b4,0.0,0.066488,1.912188,5.723985,,,Low,Low
4,62875913fb15712a8cb9cb28,0.0,0.17036,7.469201,,,,Low,
5,6294ab84fe241a32a48ada00,7466.773922,7422.478027,215429.857551,718324.781842,1369476.0,2669152.0,High,High
6,62a38fd29d41c93ff90b576f,0.0,0.071202,,,,,,
7,62ac8133423c30268e55801a,0.0,1.198686,40.890234,177.931934,431.877,,Low,Low
8,62d11f9f0b4c4c7502a5c1b6,0.0,0.245732,7.030602,14.89583,41.36144,88.95092,Low,Low
9,62d790c1ce4fcc731da68115,0.0,-0.031614,-8.771471,-45.156607,,,Low,Low


## 주제4

In [144]:
def result_topic4():
    """Grade each account's default risk from a credit-style score and a volatility score.

    The function takes no arguments and works on module-level data: it reads
    ``features_weight``, ``user_info_df`` and ``input_data``, and it modifies
    ``youtube_contents_df`` IN PLACE (adds the derived columns listed below and
    replaces every NaN / +-inf in the frame with 0).

    Pipeline
    --------
    1. Derive per-row ratio features (revenue per view, subscriber ratios, ...)
       plus per-video 30-row trend / rolling-std features.
    2. Multiply the features of each of five evaluation factors by their weight
       from ``features_weight`` and sum them into ``score1`` .. ``score5``.
    3. Sum the factor scores per ``youtube_user_id``, min-max scale each factor
       across accounts and multiply by its maximum points (430 / 410 / 50 / 60 /
       50, i.e. 1000 in total) to obtain ``credit_score``.
    4. Compute per-video rolling coefficients of variation (std / mean over 7
       and 30 rows, named weekly / monthly) for revenue, net subscriber change
       and engagement rate, average them per account and blend them into
       ``cv_score`` with weights 0.32 / 0.35 / 0.33.
    5. Bin both scores into labels, keep only the accounts present in
       ``input_data`` and assign the final ``degree``:
       '위험' if either label contains '위험'; otherwise '안전' if both labels
       are safe and at least one of them is '매우안전'; otherwise '보통'.

    The number and the channel titles of the accounts in each grade are printed.

    Returns
    -------
    pandas.DataFrame
        Columns ``youtube_user_id``, ``channel_title``, ``credit_score``,
        ``cv_score``, ``credit_score_result``, ``cv_score_result``, ``degree``;
        one row per scored account that appears in ``input_data``.

    Raises
    ------
    KeyError
        If a feature used by a factor has no row in ``features_weight`` or a
        required column is missing from ``youtube_contents_df``.
    """
    from sklearn.preprocessing import MinMaxScaler

    # Alias only - this is the same object, so every assignment below still
    # updates the module-level frame exactly as before.
    contents = youtube_contents_df

    # ---- Target-like helper columns --------------------------------------
    contents['net_subscribers_change'] = contents['subscribersGained'] - contents['subscribersLost']
    contents['engage_rate'] = (contents['likes'] + contents['comments'] + contents['shares']) / contents['views']

    # ---- Ratio features (division by zero is neutralised further down) ----
    # Revenue per view
    contents['revenue_per_view'] = contents['estimatedRevenue'] / contents['views']
    # Revenue per gained subscriber
    contents['revenue_per_Subscriber'] = contents['estimatedRevenue'] / contents['subscribersGained']
    # Revenue diversification ratio
    contents['revenue_diversification_ratio'] = (contents['grossRevenue'] - contents['estimatedRevenue']) / contents['estimatedRevenue']
    # Share of gained / lost subscribers among all subscriber movements
    contents['subscriber_growth_rate'] = contents['subscribersGained'] / (contents['subscribersGained'] + contents['subscribersLost'])
    contents['subscriber_loss_rate'] = contents['subscribersLost'] / (contents['subscribersGained'] + contents['subscribersLost'])
    # Subscriber retention rate
    contents['subscriber_retention_rate'] = (contents['subscribersGained'] - contents['subscribersLost']) / contents['subscribersGained']
    # Subscribers gained per video added to playlists
    contents['subscriber_gain_per_content'] = contents['subscribersGained'] / contents['videosAddedToPlaylists']
    # Watch time per gained subscriber
    contents['watch_time_per_subscriber'] = contents['estimatedMinutesWatched'] / contents['subscribersGained']
    # Ad playback rate
    contents['ad_playback_rate'] = contents['monetizedPlaybacks'] / contents['adImpressions']

    # ---- Per-video 30-row trend / volatility features ---------------------
    # Relative change of the diversification ratio versus 30 rows earlier
    contents['monthly_revenue_diversification_ratio_trd'] = contents.groupby('video')['revenue_diversification_ratio'].transform(lambda x: (x - x.shift(30)) / x.shift(30))
    # Rolling 30-row standard deviations
    contents['monthly_watch_time_per_subscriber_std'] = contents.groupby('video')['watch_time_per_subscriber'].transform(lambda x: x.rolling(window=30).std())
    contents['monthly_estimatedRedPartnerRevenue_std'] = contents.groupby('video')['estimatedRedPartnerRevenue'].transform(lambda x: x.rolling(window=30).std())

    # Replace NaN first, then +-inf produced by the divisions above.
    contents.fillna(0, inplace=True)
    contents.replace([np.inf, -np.inf], 0, inplace=True)

    # ---- Weighted factor scores -------------------------------------------
    # (score column, features of the factor, maximum points of the factor)
    factors = [
        # Repayment history
        ('score1', ['dislikes', 'likes', 'shares', 'comments', 'redViews'], 430),
        # Debt level
        ('score2', ['estimatedRedPartnerRevenue', 'monthly_estimatedRedPartnerRevenue_std', 'revenue_per_view'], 410),
        # Length of credit history
        ('score3', ['subscriber_gain_per_content', 'videosRemovedFromPlaylists', 'monthly_watch_time_per_subscriber_std', 'watch_time_per_subscriber'], 50),
        # Credit type
        ('score4', ['revenue_diversification_ratio', 'monthly_revenue_diversification_ratio_trd', 'playbackBasedCpm', 'monetizedPlaybacks', 'adImpressions'], 60),
        # Non-financial / MyData
        ('score5', ['averageViewDuration', 'averageViewPercentage', 'estimatedMinutesWatched'], 50),
    ]

    # Look weights up BY FEATURE NAME. Selecting them with isin() returns them in
    # features_weight row order, which is only correct when that order happens
    # to match the column lists above.
    weight_by_feature = features_weight.set_index('features')['weight']
    for score_name, columns, _ in factors:
        weights = weight_by_feature.loc[columns].to_numpy()
        contents[score_name] = (contents[columns] * weights).sum(axis=1)

    # ---- Credit score per account -----------------------------------------
    score_cols = [score_name for score_name, _, _ in factors]
    credit_df = contents.groupby('youtube_user_id')[score_cols].sum().reset_index()

    # Drop the account flagged as non-existent / outlier.
    credit_df = credit_df[~credit_df['youtube_user_id'].isin(['639bb8dcd603b8138e33780b'])].reset_index(drop=True)

    # One min-max pass per factor is enough: re-scaling values that already
    # span [0, 1] does not change them.
    scaler = MinMaxScaler()
    final_cols = []
    for score_name, _, max_points in factors:
        final_name = f'{score_name}_final'
        credit_df[final_name] = scaler.fit_transform(credit_df[[score_name]]).ravel() * max_points
        final_cols.append(final_name)
    credit_df['credit_score'] = credit_df[final_cols].sum(axis=1)

    # Attach channel titles and keep only accounts that have one.
    credit_df_fin = pd.merge(credit_df[['youtube_user_id', 'credit_score']], user_info_df, how='left', on='youtube_user_id')
    credit_df_fin = credit_df_fin[['youtube_user_id', 'channel_title', 'credit_score']]
    credit_df_fin = credit_df_fin[~credit_df_fin['channel_title'].isnull()].reset_index(drop=True)

    # ---- Coefficients of variation ----------------------------------------
    def rolling_cv(frame, column, window):
        """Per-video rolling std / mean of ``column`` over ``window`` rows."""
        return frame.groupby('video')[column].transform(
            lambda x: x.rolling(window=window).std() / x.rolling(window=window).mean()
        )

    contents['weekly_estimatedRevenue_cv'] = rolling_cv(contents, 'estimatedRevenue', 7)
    contents['monthly_estimatedRevenue_cv'] = rolling_cv(contents, 'estimatedRevenue', 30)
    # For these two metrics only non-zero rows take part; the assignment aligns
    # on the index, so the excluded rows become NaN and are zero-filled below.
    for column in ('net_subscribers_change', 'engage_rate'):
        nonzero_rows = contents[contents[column] != 0]
        contents[f'weekly_{column}_cv'] = rolling_cv(nonzero_rows, column, 7)
        contents[f'monthly_{column}_cv'] = rolling_cv(nonzero_rows, column, 30)

    cv_col = ['weekly_estimatedRevenue_cv', 'monthly_estimatedRevenue_cv',
              'weekly_net_subscribers_change_cv', 'monthly_net_subscribers_change_cv',
              'weekly_engage_rate_cv', 'monthly_engage_rate_cv']

    contents.fillna(0, inplace=True)
    contents.replace([np.inf, -np.inf], 0, inplace=True)

    # Average CV per account, joined onto the credit table.
    coefvar_df = contents.groupby('youtube_user_id')[cv_col].mean().reset_index()
    credit_coef_df = pd.merge(credit_df_fin, coefvar_df, how='left', on='youtube_user_id')

    # Blend the three CVs. The weights are the inverted weights
    # (1 - 0.36), (1 - 0.30), (1 - 0.34) rescaled to sum to 1 -> 0.32 / 0.35 / 0.33.
    credit_coef_df['cv_score'] = (
        (((credit_coef_df['weekly_estimatedRevenue_cv'] + credit_coef_df['monthly_estimatedRevenue_cv']) / 2) * 0.32)
        + (((credit_coef_df['weekly_net_subscribers_change_cv'] + credit_coef_df['monthly_net_subscribers_change_cv']) / 2) * 0.35)
        + (((credit_coef_df['weekly_engage_rate_cv'] + credit_coef_df['monthly_engage_rate_cv']) / 2) * 0.33)
    )

    # Explicit copy: new columns are added next, and assigning into a column
    # slice of credit_coef_df raises SettingWithCopyWarning.
    credit_df_final = credit_coef_df[['youtube_user_id', 'channel_title', 'credit_score', 'cv_score']].copy()

    # Credit score bands (right-closed): <=200, <=400, <=600, <=800, >800.
    credit_df_final['credit_score_result'] = pd.cut(credit_df_final['credit_score'],
                                                    bins=[-float('inf'), 200, 400, 600, 800, float('inf')],
                                                    labels=['매우위험', '위험', '보통', '안전', '매우안전'])
    # Volatility bands (right-closed): <=0.1, <=0.3, <=0.5, >0.5 - lower is safer.
    credit_df_final['cv_score_result'] = pd.cut(credit_df_final['cv_score'],
                                                bins=[-float('inf'), 0.1, 0.3, 0.5, float('inf')],
                                                labels=['매우안전', '안전', '위험', '매우위험'])

    # ---- Restrict to the requested accounts and grade them ----------------
    requested_ids = input_data['youtube_user_id'].tolist()
    credit_df_final = credit_df_final[credit_df_final['youtube_user_id'].isin(requested_ids)].reset_index(drop=True)

    credit_label = credit_df_final['credit_score_result']
    cv_label = credit_df_final['cv_score_result']

    is_safe = (
        ((credit_label == '매우안전') & (cv_label == '안전'))
        | ((credit_label == '안전') & (cv_label == '매우안전'))
        | ((credit_label == '매우안전') & (cv_label == '매우안전'))
    )
    is_risky = credit_label.str.contains('위험') | cv_label.str.contains('위험')

    credit_df_final['degree'] = '보통'
    credit_df_final['degree'] = np.where(is_safe, '안전', credit_df_final['degree'])
    # Applied last so that any risky label overrides the grades above.
    credit_df_final['degree'] = np.where(is_risky, '위험', credit_df_final['degree'])

    # ---- Report: safe, normal, risky --------------------------------------
    for grade in ('안전', '보통', '위험'):
        titles = credit_df_final[credit_df_final['degree'] == grade]['channel_title']
        print(f'부도 가능성 {grade}')
        print(len(titles))
        print(list(titles))
        print('---------------------------------------------------------------------')
        print('')

    return credit_df_final

In [145]:
# Run the topic 4 analysis; as the cell's last expression, its return value is displayed below.
result_topic4()

부도 가능성 안전
0
Series([], Name: channel_title, dtype: object)
---------------------------------------------------------------------

부도 가능성 보통
0
Series([], Name: channel_title, dtype: object)
---------------------------------------------------------------------

부도 가능성 위험
30
0                 유걸YU-Girl TV
1          띵크박스 ThinkBox Korea
2                         커피수혈
3                   우아린WOOARIN
4                    하윤 Hayoon
5                      오디디 코미디
6             밍의 하루_Ming's day
7                        yeahs
8           DDONIE 또니 / 러브크레센트
9                     내맘대로 1시간
10                         이서방
11                         박진서
12                    내가 니 앱이다
13                         황지수
14                        링링언니
15                   RightHere
16                   Lee Stave
17                        카이바군
18                     끄루끄루뽜끄루
19               fromsuzy 프롬수지
20                      혜옥메이크업
21            에이든 우지 Aiden uzi
22                   정케빈 KEVIN
23            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_df_final['credit_score_result'] = pd.cut(credit_df_final['credit_score'],
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  credit_df_final['cv_score_result'] = pd.cut(credit_df_final['cv_score'],


Unnamed: 0,youtube_user_id,channel_title,credit_score,cv_score,credit_score_result,cv_score_result,degree
0,62810d9eaa39226247c994f2,유걸YU-Girl TV,0.005735,0.0,매우위험,매우안전,위험
1,62872297fb15712a8cb93218,띵크박스 ThinkBox Korea,0.021053,0.000246,매우위험,매우안전,위험
2,628722cefb15712a8cb93281,커피수혈,0.012165,0.000494,매우위험,매우안전,위험
3,628722f1fb15712a8cb932b4,우아린WOOARIN,0.006107,0.001666,매우위험,매우안전,위험
4,62875913fb15712a8cb9cb28,하윤 Hayoon,0.002219,0.0,매우위험,매우안전,위험
5,6294ab84fe241a32a48ada00,오디디 코미디,234.425159,0.272144,위험,안전,위험
6,62a38fd29d41c93ff90b576f,밍의 하루_Ming's day,0.000885,0.0,매우위험,매우안전,위험
7,62ac8133423c30268e55801a,yeahs,0.087162,0.006425,매우위험,매우안전,위험
8,62d11f9f0b4c4c7502a5c1b6,DDONIE 또니 / 러브크레센트,0.049634,0.002318,매우위험,매우안전,위험
9,62d790c1ce4fcc731da68115,내맘대로 1시간,0.028594,0.0,매우위험,매우안전,위험


In [159]:
# Registry of the notebook's pipeline functions, keyed by each function's own name.
youtube_analysis_dict = dict(
    mongodb_connection=mongodb_connection,
    data_preparation_need=data_preparation_need,
    data_preparation_user=data_preparation_user,
    data_preparation_contents=data_preparation_contents,
    result_topic1=result_topic1,
    result_topic2=result_topic2,
    result_topic3=result_topic3,
    result_topic4=result_topic4,
)

In [160]:
import joblib
# Serialize the function registry to disk; the cell output is dump()'s return value.
# NOTE(review): pickle stores plain functions by qualified name, not by their code, so
# loading this file presumably requires the same functions to be defined in the
# loading session - confirm before relying on it from a fresh kernel.
joblib.dump(youtube_analysis_dict, 'youtube_analysis_dict.joblib')

['youtube_analysis_dict.joblib']

In [161]:
# Load the function dictionary back with joblib.
# joblib.load unpickles the file, so only load files from a trusted source.
youtube_analysis_dict = joblib.load('youtube_analysis_dict.joblib')