In [1]:
import pymongo
import pandas as pd
import math
from datetime import datetime

import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

import os
from dotenv import load_dotenv

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load environment variables from the dotenv file at this path.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
load_dotenv('C:/py_src/awake/env')

# Read the MongoDB credentials from the environment (os.getenv returns None when a variable is unset)
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [3]:
# MongoDB connection URL
from urllib.parse import quote_plus

# Fail early with a clear message instead of building a "None:None@..." URI.
if not mongo_user or not mongo_password:
    raise RuntimeError("MONGO_USER / MONGO_PASSWORD are not set in the environment")

# Credentials are percent-escaped so reserved characters such as '@', ':' or '/'
# in the user name or password cannot corrupt the URI.
url = (
    f"mongodb+srv://{quote_plus(mongo_user)}:{quote_plus(mongo_password)}"
    "@meercat-external.udyfs.mongodb.net/"
    "?retryWrites=true&w=majority&appName=meercat-external"
)
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# Verify the connection
from pymongo.errors import ServerSelectionTimeoutError

try:
    # Listing the database names forces a round trip to the server
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    # Only server-selection timeouts are handled here; any other error propagates
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# 데이터베이스 확인
client.list_database_names()

['Test', 'admin', 'config', 'local']

In [6]:
# Select the 'Test' database
db = client.Test

# List the collection names in the 'Test' database
collections = db.list_collection_names()

In [7]:
len(collections)

21

In [8]:
# Unit conversion
def convert_bytes(num):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string such as "13.39 MB". Values of 1024 TB or more are reported
        in TB (e.g. "2048.00 TB") instead of falling through and returning None.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for x in units[:-1]:
        if num < 1024.0:
            return f"{num:.2f} {x}"
        num /= 1024.0
    # Largest unit: always return a value, even when num >= 1024.
    return f"{num:.2f} {units[-1]}"

In [9]:
# Report data size, document count and total index size for every collection
for collection_name in collections:
    # collStats returns the statistics document for one collection
    stats = db.command("collStats", collection_name)

    # One print call, one line per argument
    print(
        f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}",
        f"Collection '{collection_name}' document count: {stats['count']}",
        f"Total index size: {convert_bytes(stats['totalIndexSize'])}",
        '--------------------------------------------------------------------',
        sep="\n",
    )

Collection 'hashtag_hashtag' size: 13.39 MB
Collection 'hashtag_hashtag' document count: 3052
Total index size: 172.00 KB
--------------------------------------------------------------------
Collection 'youtube_revenue' size: 259.47 MB
Collection 'youtube_revenue' document count: 811499
Total index size: 46.44 MB
--------------------------------------------------------------------
Collection 'youtube_report_v2' size: 4.60 MB
Collection 'youtube_report_v2' document count: 5303
Total index size: 484.00 KB
--------------------------------------------------------------------
Collection 'influencer_datas' size: 7.78 GB
Collection 'influencer_datas' document count: 3340624
Total index size: 246.63 MB
--------------------------------------------------------------------
Collection 'youtube_channel_demographics' size: 59.67 MB
Collection 'youtube_channel_demographics' document count: 297683
Total index size: 4.69 MB
--------------------------------------------------------------------
Collection

In [10]:
# Collections to load; Instagram-related collections are excluded (commented out).
# Original note: 'youtube_videos', 'youtube_datas', 'youtube_daily_channel_basics' and
# 'youtube_subscriber' are sampled at 10%.
# NOTE(review): the loading cell samples only 'youtube_videos' and 'youtube_datas',
# with a 0.05 fraction — confirm which is intended.
collections_need = [
    # 'hashtag_hashtag',
    'youtube_revenue',
    'youtube_report_v2',
    # 'influencer_datas',
    'youtube_channel_demographics',
    # 'user_aggregations',
    # 'campaign_bookmarks',
    'youtube_videos',
    # 'hashtag_hashtaglog',
    'youtube_report',
    'youtube_users',
    # 'alpha_tests',
    # 'influencer_media_datas',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    # 'influencer_media_comments',
    # 'user_bookmarks',
    'youtube_subscriber',
    # 'campaigns',
    'youtube_datas',
    # 'instagram_stories'
    ]

In [11]:
# Collections that are loaded only partially
tables_to_sample = ['youtube_videos', 'youtube_datas']# , 'youtube_daily_channel_basics', 'youtube_subscriber']
# tables_to_sample = collections_need.copy()

# Fraction of documents loaded for the collections above.
# NOTE(review): earlier comments said 10%, but the code has always used 0.05;
# the value is kept so results do not change — confirm the intended fraction.
sample_fraction = 0.05
# 1-based page to read; page 1 means "skip nothing".
page_number = 1

youtube_dict={}
for collection_name in collections_need:
    cursor = db[collection_name].find()

    if collection_name in tables_to_sample:
        # Total number of documents in the collection
        total_documents = db[collection_name].count_documents({})
        sample_size = math.ceil(total_documents * sample_fraction)

        # This is a contiguous page (the first `sample_size` documents in
        # cursor order), not a random sample.
        cursor = cursor.skip((page_number - 1) * sample_size).limit(sample_size)

    # Materialise the cursor and convert the documents to a DataFrame
    youtube_dict[collection_name] = pd.DataFrame(list(cursor))

    print(collection_name)
    print(youtube_dict[collection_name].columns)
    print(">> Success")
    print("--------------------------------------")
    print("")

youtube_revenue
Index(['_id', 'estimated_revenue', 'estimated_ad_revenue',
       'estimated_red_partner_revenue', 'gross_revenue', 'cpm',
       'ad_impressions', 'monetized_playbacks', 'playback_based_cpm',
       'youtube_user_id', 'data_created_at', '__v', 'created_at',
       'updated_at'],
      dtype='object')
>> Success
--------------------------------------

youtube_report_v2
Index(['_id', 'requested', 'youtube_user_id', 'content', 'phone_number',
       'template_code', 'created_at', 'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_channel_demographics
Index(['_id', 'youtube_user_id', 'end_date', 'created_at', 'updated_at', '__v',
       'demographics'],
      dtype='object')
>> Success
--------------------------------------

youtube_videos
Index(['_id', 'videos', 'youtube_user_id', 'end_date', 'created_at',
       'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_report

In [12]:
# Bring the key columns to a common (string) type in every loaded frame
key_columns = ('_id', 'youtube_user_id')

for key, df in youtube_dict.items():
    # Only DataFrame values are touched
    if not isinstance(df, pd.DataFrame):
        continue

    # Cast each key column that is present to str
    for column in key_columns:
        if column in df.columns:
            df[column] = df[column].astype(str)

    # Store the frame back under the same key
    youtube_dict[key] = df

In [13]:
youtube_dict.keys()

dict_keys(['youtube_revenue', 'youtube_report_v2', 'youtube_channel_demographics', 'youtube_videos', 'youtube_report', 'youtube_users', 'youtube_channel_locations', 'youtube_daily_channel_basics', 'youtube_subscriber', 'youtube_datas'])

In [14]:
# Union of youtube_user_id over the loaded collections.
# 'youtube_users' is left out, as in the original cell; 'youtube_revenue' was
# listed twice there, which had no effect on a set union.
outer_sources = [
    'youtube_revenue',
    'youtube_report_v2',
    'youtube_channel_demographics',
    'youtube_report',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    'youtube_subscriber',
]
youtube_user_id_outer = list(
    set().union(*(youtube_dict[name]['youtube_user_id'] for name in outer_sources))
)
len(youtube_user_id_outer)

# Not included:
# youtube_dict['youtube_videos']['youtube_user_id']
# youtube_dict['youtube_datas']['youtube_user_id']

## Recorded result: 996 distinct youtube_user_id values in total

996

In [15]:
# Intersection of youtube_user_id over the loaded collections.
# 'youtube_users' is left out, as in the original cell; 'youtube_revenue' was
# listed twice there, which had no effect on a set intersection.
inner_sources = [
    'youtube_revenue',
    'youtube_report_v2',
    'youtube_channel_demographics',
    'youtube_report',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    'youtube_subscriber',
]
youtube_user_id_inner = list(
    set.intersection(*(set(youtube_dict[name]['youtube_user_id']) for name in inner_sources))
)

len(youtube_user_id_inner)
# Not included:
# youtube_dict['youtube_videos']['youtube_user_id']
# youtube_dict['youtube_datas']['youtube_user_id']

## Recorded result: 30 youtube_user_id values are present in every collection

30

### youtube_revenue

In [136]:
youtube_revenue = youtube_dict['youtube_revenue']

In [137]:
len(youtube_revenue['youtube_user_id'].unique())
## 유튜버 계정 : 265개

265

In [138]:
# Total estimated revenue
# Numeric metric columns, excluding '__v' by name. The previous positional
# `[:-1]` only worked on the raw frame: re-running this cell after the groupby
# cell (where '__v' is already gone) silently dropped 'playback_based_cpm'.
num_col = youtube_revenue.select_dtypes(['int64','float64']).columns.drop('__v', errors='ignore')
youtube_revenue[num_col].sum()
## Original observation: ad revenue is the largest
## Original observation: premium-subscriber (red partner) revenue is the smallest

estimated_revenue                1.168341e+10
estimated_ad_revenue             8.573537e+09
estimated_red_partner_revenue    1.408400e+09
gross_revenue                    1.358574e+10
cpm                              3.294414e+09
ad_impressions                   1.686921e+09
monetized_playbacks              2.539314e+09
playback_based_cpm               2.182599e+09
dtype: float64

In [139]:
# 계정별 일일 수익
youtube_revenue = youtube_revenue.groupby(['youtube_user_id','data_created_at'])[num_col].sum().reset_index()

In [140]:
youtube_revenue

Unnamed: 0,youtube_user_id,data_created_at,estimated_revenue,estimated_ad_revenue,estimated_red_partner_revenue,gross_revenue,cpm,ad_impressions,monetized_playbacks,playback_based_cpm
0,627cb611aa6f212355e0b617,2013-07-31,0.000,0.0,0.000,0.0,0,0.0,0,0.0
1,627cb611aa6f212355e0b617,2013-08-01,0.000,0.0,0.000,0.0,0,0.0,0,0.0
2,627cb611aa6f212355e0b617,2013-08-02,0.000,0.0,0.000,0.0,0,0.0,0,0.0
3,627cb611aa6f212355e0b617,2013-08-03,0.000,0.0,0.000,0.0,0,0.0,0,0.0
4,627cb611aa6f212355e0b617,2013-08-04,0.000,0.0,0.000,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
507258,66230ee6d8da110bb0744b2d,2024-04-29,32797.134,0.0,2522.759,0.0,0,0.0,0,0.0
507259,66230ee6d8da110bb0744b2d,2024-04-30,40680.753,0.0,2770.255,0.0,0,0.0,0,0.0
507260,66230ee6d8da110bb0744b2d,2024-05-01,35864.687,0.0,2579.477,0.0,0,0.0,0,0.0
507261,66230ee6d8da110bb0744b2d,2024-05-02,38089.134,0.0,2414.442,0.0,0,0.0,0,0.0


### youtube_report_v2

In [141]:
youtube_report_v2 = youtube_dict['youtube_report_v2']

In [142]:
len(youtube_report_v2['youtube_user_id'].unique())
## 유튜버 계정 : 276개

276

In [143]:
# 필요 컬럼 추출
youtube_report_v2 = youtube_report_v2[['youtube_user_id','content','phone_number','requested']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)

In [144]:
# Convert the report date text to datetime
# The year is not part of the extracted text; 2024 is assumed, as before.
report_year = 2024

# Take the third line of 'content', keep what follows the first ':' and join
# its first two space-separated tokens (month and day).
youtube_report_v2['date'] = youtube_report_v2['content'].str.split('\n').str[2].str.split(':').str[1].str.strip().str.split(' ').str[:2].apply(''.join)

# Parse with the year included. Parsing '%m월%d일' alone uses the default year
# 1900 (not a leap year), so '02월29일' raised ValueError before the year
# could be replaced.
youtube_report_v2['date'] = youtube_report_v2['date'].apply(lambda x : datetime.strptime(f'{report_year}년{x}', '%Y년%m월%d일'))

In [145]:
# Pull the needed report lines out of 'content'
# Split once and reuse; each target column takes one fixed line of the message.
content_lines = youtube_report_v2['content'].str.split('\n')
line_by_column = (
    ('subscriber_count', 5),
    ('views', 7),
    ('watched_sum', 9),
    ('watched_avg', 11),
)
for column, line_no in line_by_column:
    youtube_report_v2[column] = content_lines.str[line_no]

# The raw message is no longer needed
youtube_report_v2 = youtube_report_v2.drop('content',axis=1)

In [146]:
# 최종 데이터셋 생성
youtube_report_v2 = youtube_report_v2[['youtube_user_id', 'date', 'phone_number', 'subscriber_count', 'views', 'watched_sum', 'watched_avg', 'requested']]
youtube_report_v2 = youtube_report_v2.sort_values(['youtube_user_id','date']).drop_duplicates().reset_index(drop=True)

In [147]:
youtube_report_v2

Unnamed: 0,youtube_user_id,date,phone_number,subscriber_count,views,watched_sum,watched_avg,requested
0,627cb611aa6f212355e0b617,2024-02-22,01039069284,👥구독자 : 121834명,👀조회수 : 8549회,🎥시청 : 879시 56분,💻평균 시청 지속 시간 : 6분 10초,True
1,627cb611aa6f212355e0b617,2024-02-23,01039069284,👥구독자 : 121833명,👀조회수 : 7074회,🎥시청 : 716시 57분,💻평균 시청 지속 시간 : 6분 4초,True
2,627cb611aa6f212355e0b617,2024-02-24,01039069284,👥구독자 : 121893명,👀조회수 : 12437회,🎥시청 : 1235시 46분,💻평균 시청 지속 시간 : 5분 57초,False
3,627cb611aa6f212355e0b617,2024-02-25,01039069284,👥구독자 : 121917명,👀조회수 : 9762회,🎥시청 : 894시 26분,💻평균 시청 지속 시간 : 5분 29초,False
4,627cb611aa6f212355e0b617,2024-02-26,01039069284,👥구독자 : 121937명,👀조회수 : 8773회,🎥시청 : 760시 4분,💻평균 시청 지속 시간 : 5분 11초,False
...,...,...,...,...,...,...,...,...
5102,641ea50289085e280d0e6bf2,2024-03-24,01042556134,👥구독자 : 11명,👀조회수 : 0회,🎥시청 : 0시 0분,💻평균 시청 지속 시간 : 0분 0초,False
5103,641ea50289085e280d0e6bf2,2024-03-25,01042556134,👥구독자 : 11명,👀조회수 : 0회,🎥시청 : 0시 0분,💻평균 시청 지속 시간 : 0분 0초,False
5104,641ea50289085e280d0e6bf2,2024-03-26,01042556134,👥구독자 : 11명,👀조회수 : 0회,🎥시청 : 0시 0분,💻평균 시청 지속 시간 : 0분 0초,False
5105,641ea50289085e280d0e6bf2,2024-03-27,01042556134,👥구독자 : 11명,👀조회수 : 0회,🎥시청 : 0시 0분,💻평균 시청 지속 시간 : 0분 0초,False


### youtube_channel_demographics
- 유튜버계정 일일 연령대, 성별 분포
- demographics 컬럼 분포 합 100%

In [148]:
youtube_channel_demographics = youtube_dict['youtube_channel_demographics']

In [149]:
len(youtube_channel_demographics['youtube_user_id'].unique())
## 유튜버 계정 : 905개

905

In [150]:
# 필요 컬럼 추출
youtube_channel_demographics = youtube_channel_demographics[~youtube_channel_demographics['demographics'].isnull()].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_channel_demographics = youtube_channel_demographics[['youtube_user_id', 'end_date', 'demographics']]

In [151]:
# Build the final dataset
# Flatten the nested 'demographics' values into columns and append them side by side
youtube_channel_demographics = pd.concat([youtube_channel_demographics, pd.json_normalize(youtube_channel_demographics['demographics'])],axis=1)
youtube_channel_demographics = youtube_channel_demographics.drop('demographics',axis=1)
# Keep the first 16 columns by position.
# NOTE(review): presumably the 2 key columns plus the 14 age/gender buckets shown in the output — confirm the column order is stable.
youtube_channel_demographics = youtube_channel_demographics[youtube_channel_demographics.columns[:16]]
youtube_channel_demographics = youtube_channel_demographics.sort_values(['youtube_user_id', 'end_date']).drop_duplicates().reset_index(drop=True)

In [152]:
youtube_channel_demographics

Unnamed: 0,youtube_user_id,end_date,age13-17.female,age13-17.male,age18-24.female,age18-24.male,age25-34.female,age25-34.male,age35-44.female,age35-44.male,age45-54.female,age45-54.male,age55-64.female,age55-64.male,age65-.female,age65-.male
0,627cb611aa6f212355e0b617,2023-03-26,3.6,2.8,23.4,13.7,14.9,13.8,7.7,10.0,4.4,3.7,0.6,0.4,0.6,0.4
1,627cb611aa6f212355e0b617,2023-03-27,4.9,3.7,21.3,15.9,14.5,11.7,10.0,6.5,4.2,7.3,,,,
2,627cb611aa6f212355e0b617,2023-03-28,5.0,3.1,19.8,14.3,17.3,12.1,8.5,8.2,4.1,5.3,0.5,0.7,0.4,0.7
3,627cb611aa6f212355e0b617,2023-03-29,5.2,3.1,18.5,14.8,17.5,12.0,8.8,8.9,4.2,5.4,0.7,1.0,,
4,627cb611aa6f212355e0b617,2023-03-30,3.3,2.4,21.7,13.9,14.8,12.5,9.9,8.8,4.0,6.1,0.7,0.9,0.5,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96594,66230ee6d8da110bb0744b2d,2024-04-29,2.2,5.9,5.2,13.8,4.5,13.6,9.9,14.5,7.1,12.4,2.6,4.1,1.9,2.2
96595,66230ee6d8da110bb0744b2d,2024-04-30,2.5,5.4,5.3,11.7,4.7,11.8,11.3,15.6,7.7,13.2,2.9,4.0,1.7,2.1
96596,66230ee6d8da110bb0744b2d,2024-05-01,1.8,6.0,4.9,12.4,4.5,12.7,10.0,15.0,7.5,13.6,3.2,4.2,1.9,2.0
96597,66230ee6d8da110bb0744b2d,2024-05-02,2.1,6.4,4.9,12.6,4.2,11.1,10.8,14.6,8.0,13.4,3.3,4.2,2.1,2.3


### youtube_report

In [153]:
youtube_report = youtube_dict['youtube_report']

In [154]:
len(youtube_report['youtube_user_id'].unique())
## 유튜버 계정 : 194개

194

In [155]:
youtube_report = youtube_report[['youtube_user_id','contents','phone_number','request']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)

In [156]:
# Convert the report date text to datetime
# The year is not part of the extracted text; 2024 is assumed, as before.
# NOTE(review): confirm that every document in this collection is from 2024.
report_year = 2024

# Take the third line of 'contents', keep what follows the first ':' and join
# its first two space-separated tokens (month and day).
youtube_report['date'] = youtube_report['contents'].str.split('\n').str[2].str.split(':').str[1].str.strip().str.split(' ').str[:2].apply(''.join)

# Parse with the year included. Parsing '%m월%d일' alone uses the default year
# 1900 (not a leap year), so '02월29일' raised ValueError before the year
# could be replaced.
youtube_report['date'] = youtube_report['date'].apply(lambda x : datetime.strptime(f'{report_year}년{x}', '%Y년%m월%d일'))

In [157]:
# Pull the needed report lines out of 'contents'
# Split once and reuse; each target column takes one fixed line of the message.
contents_lines = youtube_report['contents'].str.split('\n')
line_by_column = (
    ('subscriber_count', 5),
    ('views', 7),
    ('watched_sum', 9),
    ('watched_avg', 11),
)
for column, line_no in line_by_column:
    youtube_report[column] = contents_lines.str[line_no]

# The raw message is no longer needed
youtube_report = youtube_report.drop('contents',axis=1)

In [158]:
# 최종 데이터셋 생성
youtube_report = youtube_report[['youtube_user_id', 'date', 'phone_number', 'subscriber_count', 'views', 'watched_sum', 'watched_avg', 'request']]
youtube_report = youtube_report.sort_values(['youtube_user_id','date']).drop_duplicates().reset_index(drop=True)

In [159]:
youtube_report

Unnamed: 0,youtube_user_id,date,phone_number,subscriber_count,views,watched_sum,watched_avg,request
0,627b58c880a4763fdf8e13de,2024-05-11,01066057996,👥구독자 : 0명,👀조회수 : 0회,🎥시청 : 0시간 0분,💻평균 시청 지속 시간 : 0분 0초,True
1,627b58c880a4763fdf8e13de,2024-05-12,01066057996,👥구독자 : 0명,👀조회수 : 0회,🎥시청 : 0시간 0분,💻평균 시청 지속 시간 : 0분 0초,True
2,627b58c880a4763fdf8e13de,2024-05-13,01066057996,👥구독자 : 0명,👀조회수 : 0회,🎥시청 : 0시간 0분,💻평균 시청 지속 시간 : 0분 0초,True
3,627b58c880a4763fdf8e13de,2024-05-14,01066057996,👥구독자 : 0명,👀조회수 : 0회,🎥시청 : 0시간 0분,💻평균 시청 지속 시간 : 0분 0초,True
4,627b58c880a4763fdf8e13de,2024-05-15,01066057996,👥구독자 : 0명,👀조회수 : 0회,🎥시청 : 0시간 0분,💻평균 시청 지속 시간 : 0분 0초,True
...,...,...,...,...,...,...,...,...
3541,62a490289d41c93ff90d242b,2024-06-11,01051284390,👥구독자 : 5명,👀조회수 : 15회,🎥시청 : 0시간 3분,💻평균 시청 지속 시간 : 0분 14초,False
3542,62a5283e9d41c93ff90d28a2,2024-06-10,01073022797,👥구독자 : 57명,👀조회수 : 683회,🎥시청 : 9시간 8분,💻평균 시청 지속 시간 : 0분 48초,True
3543,62a5283e9d41c93ff90d28a2,2024-06-11,01073022797,👥구독자 : 57명,👀조회수 : 324회,🎥시청 : 4시간 15분,💻평균 시청 지속 시간 : 0분 47초,False
3544,62a5c2409d41c93ff90e5dfa,2024-06-11,01051035276,👥구독자 : 263명,👀조회수 : 4회,🎥시청 : 0시간 19분,💻평균 시청 지속 시간 : 4분 46초,False


### youtube_users

In [160]:
youtube_users = youtube_dict['youtube_users']


In [161]:
len(youtube_users['channel_id'].unique())
## 유튜버 계정 : 883개

883

In [162]:
# 필요컬럼추출
youtube_users = youtube_users[['channel_id', 'channel_title', 'phone_num', 'report_user_id', 'statistics','published_at']] 
## published_at : 유튜브 가입일
## 'statistics' 컬럼의 'subscriberCount' 정보와 'subscriber_count' 컬럼 정보가 다름 --> 구독자 수
## 'channel_id' 컬럼, 'contentDetails' 컬럼의 'uploads' 같은 정보

In [163]:
# Flatten 'statistics' into columns to build the final dataset
# NOTE(review): the axis=1 concat pairs rows by index label; this assumes youtube_users still has the default 0..n-1 index — confirm.
youtube_users = pd.concat([youtube_users, pd.json_normalize(youtube_users['statistics'])],axis=1)
youtube_users = youtube_users.drop(['statistics','hiddenSubscriberCount'],axis=1)
# Drop rows in which every column is missing
youtube_users = youtube_users.dropna(how = 'all')

# Replace missing counts with 0
youtube_users[['viewCount', 'subscriberCount', 'videoCount']] = youtube_users[['viewCount', 'subscriberCount', 'videoCount']].fillna(0)

youtube_users = youtube_users.sort_values('channel_id').drop_duplicates().reset_index(drop=True)

In [164]:
youtube_users

Unnamed: 0,channel_id,channel_title,phone_num,report_user_id,published_at,viewCount,subscriberCount,videoCount
0,UC-4K3WFY7e_TzaqjXgwxRoA,대신 밍튜브,,,2013-08-25 04:05:50.000,0,496,0
1,UC-5O4BK-89lZh2r015JBYOg,민경,01095770671,,2018-07-21 16:07:06.000,0,0,0
2,UC-5ra7EcbVUPeNdbaccWJ_A,컴퓨터읽어주는남자 컴읽남,01041548930,,2019-04-11 12:25:55.000,0,0,0
3,UC-8BokR4IeKjUgysUTDutnw,최 이 월,,6416a206c4d04f017d23e6d7,2023-03-12 14:58:40.264,0,0,0
4,UC-Hk-8YkjlFeEmtdESSbFbA,VoidNeverstop,01026248499,,2014-01-19 06:21:57.000,196505,1160,225
...,...,...,...,...,...,...,...,...
929,UCzIP5zrZaiqlpWcSw5V8-eg,엔트리뷰 [누구나 재미있는 테크리뷰],,63f7726d55baf50e2df73caf,2020-03-21 13:05:43.289,1767452,5090,152
930,UCzNsDl-geB-n6MFFkqKDftA,맛있는부산 쥰맛지도,01043209436,,2011-11-22 07:34:54.000,651042,579,94
931,UCzYpYFxdQ4XWUfWxE53DrPw,서인아,,63c159ec4238543bcaf01c56,2013-02-15 12:33:32.000,0,0,0
932,UCz_5BgKiCiQIYl3G0Zvw0IQ,GroomDaddy 구름대디,,,2023-05-15 01:24:40.571,1060,6,10


### youtube_channel_locations
- 채널 구독자 위치

In [165]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']

In [166]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [167]:
# 필요컬럼추출
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)]
youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]

In [168]:
# Melt: one row per element of the 'locations' list; the index is reset so the
# axis=1 concat below pairs rows positionally
youtube_channel_locations = youtube_channel_locations.explode(['locations']).reset_index(drop=True)

# Cast: flatten each location value into columns to build the final dataset
youtube_channel_locations = pd.concat([youtube_channel_locations,pd.json_normalize(youtube_channel_locations['locations'])], axis=1)
# Remove the nested source column and the two subscriber gain/loss columns
youtube_channel_locations = youtube_channel_locations.drop(['locations','subscribersGained','subscribersLost'],axis=1)
youtube_channel_locations = youtube_channel_locations.drop_duplicates().sort_values(['youtube_user_id','end_date']).reset_index(drop=True)

In [169]:
youtube_channel_locations

Unnamed: 0,youtube_user_id,end_date,country,views,estimatedMinutesWatched,averageViewDuration,averageViewPercentage
0,627cb611aa6f212355e0b617,2023-03-26,AZ,16,37,139,20.32
1,627cb611aa6f212355e0b617,2023-03-26,BD,28,74,159,24.79
2,627cb611aa6f212355e0b617,2023-03-26,BR,42,129,185,28.78
3,627cb611aa6f212355e0b617,2023-03-26,DE,16,53,199,30.98
4,627cb611aa6f212355e0b617,2023-03-26,DZ,26,69,160,24.90
...,...,...,...,...,...,...,...
1146945,66230ee6d8da110bb0744b2d,2024-05-03,TH,134,93,41,90.49
1146946,66230ee6d8da110bb0744b2d,2024-05-03,TR,14,8,37,76.74
1146947,66230ee6d8da110bb0744b2d,2024-05-03,TW,51,33,39,86.75
1146948,66230ee6d8da110bb0744b2d,2024-05-03,US,1454,1168,48,105.43


### youtube_daily_channel_basics

In [170]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [171]:
len(youtube_daily_channel_basics['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [172]:
# 필요컬럼추출
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)]
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## 'end_date' --> 'day' 컬럼 활용
youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True) ## melt

In [173]:
# Bring the 'daily_basics' values into one flat list of records, then cast them to columns.
# A value that is itself a list contributes each of its elements; anything else is taken as is.
flattened_records = [
    record
    for item in youtube_daily_channel_basics['daily_basics']
    for record in (item if isinstance(item, list) else [item])
]

youtube_daily_channel_basics_cast = pd.json_normalize(flattened_records)

In [174]:
# Melt + cast: build the final dataset
youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics, youtube_daily_channel_basics_cast],axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.drop('daily_basics',axis=1)
youtube_daily_channel_basics = youtube_daily_channel_basics.fillna(0) ## replace nulls with 0

# Drop rows in which every metric is 0.
# Metrics are selected by name (everything except the two key columns) instead
# of `columns[3:]`, which skipped the first metric column. The test is "any
# value != 0" instead of "row sum != 0", so rows whose non-zero values cancel
# out are no longer dropped.
metric_cols = [c for c in youtube_daily_channel_basics.columns if c not in ('youtube_user_id', 'day')]
youtube_daily_channel_basics = youtube_daily_channel_basics[youtube_daily_channel_basics[metric_cols].ne(0).any(axis=1)]

youtube_daily_channel_basics = youtube_daily_channel_basics.sort_values(['youtube_user_id','day']).reset_index(drop=True)
del youtube_daily_channel_basics_cast

In [175]:
youtube_daily_channel_basics

Unnamed: 0,youtube_user_id,day,annotationClickThroughRate,annotationCloseRate,averageViewDuration,comments,dislikes,estimatedMinutesWatched,likes,shares,...,views,redViews,estimatedRevenue,estimatedAdRevenue,estimatedRedPartnerRevenue,grossRevenue,cpm,monetizedPlaybacks,adImpressions,playbackBasedCpm
0,627cb611aa6f212355e0b617,2014-10-18,0.0,0.0,78,0,0,137,0,0,...,106,0,0.000,0.0,0.000,0.0,0.0,0.0,0.0,0.0
1,627cb611aa6f212355e0b617,2014-10-19,0.0,0.0,97,0,0,39,0,0,...,24,0,0.000,0.0,0.000,0.0,0.0,0.0,0.0,0.0
2,627cb611aa6f212355e0b617,2014-10-20,0.0,0.0,58,0,0,20,0,0,...,21,0,0.000,0.0,0.000,0.0,0.0,0.0,0.0,0.0
3,627cb611aa6f212355e0b617,2014-10-21,0.0,0.0,88,0,0,26,1,0,...,18,0,0.000,0.0,0.000,0.0,0.0,0.0,0.0,0.0
4,627cb611aa6f212355e0b617,2014-10-22,0.0,0.0,89,0,0,23,0,0,...,16,0,0.000,0.0,0.000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926929,66230ee6d8da110bb0744b2d,2024-04-30,0.0,0.0,42,44,42,120534,1400,235,...,171412,27211,29.418,0.0,2.003,0.0,0.0,0.0,0.0,0.0
926930,66230ee6d8da110bb0744b2d,2024-05-01,0.0,0.0,42,46,41,106071,1690,202,...,150360,25502,25.879,0.0,1.861,0.0,0.0,0.0,0.0,0.0
926931,66230ee6d8da110bb0744b2d,2024-05-02,0.0,0.0,42,28,45,100593,1889,176,...,143511,21693,27.725,0.0,1.757,0.0,0.0,0.0,0.0,0.0
926932,66230ee6d8da110bb0744b2d,2024-05-03,0.0,0.0,40,52,186,158599,5361,299,...,232700,35609,40.867,0.0,2.837,0.0,0.0,0.0,0.0,0.0


### youtube_subscriber

In [176]:
youtube_subscriber = youtube_dict['youtube_subscriber']

In [177]:
len(youtube_subscriber['youtube_user_id'].unique())
## 유튜버 계정 : 994개

994

In [178]:
# Keep only the needed columns
youtube_subscriber = youtube_subscriber[['youtube_user_id','data_created_at','subscribers_gained','subscribers_lost','subscribers_count']]
youtube_subscriber = youtube_subscriber.drop_duplicates()

# Drop rows in which all three subscriber metrics are 0 (missing counts as 0).
# Columns are named explicitly and tested with "any value != 0" rather than
# "row sum != 0", so the filter matches its stated intent even if values cancel.
subscriber_metric_cols = ['subscribers_gained', 'subscribers_lost', 'subscribers_count']
youtube_subscriber = youtube_subscriber[youtube_subscriber[subscriber_metric_cols].fillna(0).ne(0).any(axis=1)]
youtube_subscriber = youtube_subscriber.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

In [179]:
youtube_subscriber

Unnamed: 0,youtube_user_id,data_created_at,subscribers_gained,subscribers_lost,subscribers_count
0,627b58c880a4763fdf8e13de,2022-06-21,1,0,1
1,627b58c880a4763fdf8e13de,2022-06-22,0,0,1
2,627b58c880a4763fdf8e13de,2022-06-23,0,0,1
3,627b58c880a4763fdf8e13de,2022-06-24,0,0,1
4,627b58c880a4763fdf8e13de,2022-06-25,0,0,1
...,...,...,...,...,...
1311795,66230ee6d8da110bb0744b2d,2024-04-29,135,99,290092
1311796,66230ee6d8da110bb0744b2d,2024-04-30,159,76,290175
1311797,66230ee6d8da110bb0744b2d,2024-05-01,133,63,290245
1311798,66230ee6d8da110bb0744b2d,2024-05-02,149,82,290312


### youtube_videos
- 계정별 일일 콘텐츠 정보

In [None]:
# Server-side extraction of the video documents. This replaces the earlier
# pandas approach on youtube_dict['youtube_videos'] (keep non-empty 'videos',
# sort by account and end_date, keep three fields) and additionally restricts
# the result to the accounts in youtube_user_id_inner.
collection = db['youtube_videos']  # collection to query

# Pipeline stages, in execution order
match_inner_accounts = {"$match": {"youtube_user_id": {"$in": youtube_user_id_inner}}}
match_non_empty_videos = {"$match": {"videos": {"$ne": []}}}
sort_by_account_and_date = {"$sort": {"youtube_user_id": 1, "end_date": 1}}
keep_needed_fields = {"$project": {"youtube_user_id": 1, "end_date": 1, "videos": 1}}

pipeline = [
    match_inner_accounts,
    match_non_empty_videos,
    sort_by_account_and_date,
    keep_needed_fields,
]

# Run the pipeline; allowDiskUse is passed through to aggregate()
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# Convert the result to a pandas DataFrame
youtube_videos = pd.DataFrame(result)

In [333]:
len(youtube_videos['youtube_user_id'].unique())

28

In [None]:
# Melt + cast: build the final per-video dataset
# Melt: one row per element of the 'videos' list, with a fresh 0..n-1 index
exploded_videos = youtube_videos.explode(['videos']).reset_index(drop=True)

# Cast: flatten each video value into columns
video_fields = pd.json_normalize(exploded_videos['videos'])

youtube_videos = (
    pd.concat([exploded_videos, video_fields], axis=1)
    .drop('videos', axis=1)
    .fillna(0)  # replace nulls with 0
    .sort_values(['youtube_user_id', 'end_date'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Release the intermediates
del exploded_videos, video_fields

In [None]:
youtube_videos.head()

Unnamed: 0,youtube_user_id,end_date,video,views,redViews,comments,likes,dislikes,videosAddedToPlaylists,videosRemovedFromPlaylists,...,subscribersGained,subscribersLost,estimatedRevenue,estimatedAdRevenue,grossRevenue,estimatedRedPartnerRevenue,monetizedPlaybacks,playbackBasedCpm,adImpressions,cpm
0,627cb611aa6f212355e0b617,2023-03-27,LbTPH_DkrVs,1209,39,0,11,0,3,2,...,1,0,0.692,0.662,1.204,0.03,592.0,2.034,709.0,1.698
1,627cb611aa6f212355e0b617,2023-03-27,41In6nquflQ,221,59,0,-1,0,1,0,...,2,0,0.629,0.564,1.025,0.065,201.0,5.1,337.0,3.042
2,627cb611aa6f212355e0b617,2023-03-27,Q9jpOSyETg4,156,53,0,0,0,0,0,...,0,0,0.561,0.47,0.855,0.09,133.0,6.429,214.0,3.995
3,627cb611aa6f212355e0b617,2023-03-27,JYNGDZROUYE,130,41,0,3,0,0,0,...,0,0,0.384,0.324,0.588,0.06,118.0,4.983,164.0,3.585
4,627cb611aa6f212355e0b617,2023-03-27,4wUr2sLSfEA,128,29,0,0,0,0,0,...,1,0,0.147,0.123,0.224,0.024,123.0,1.821,140.0,1.6


### youtube_datas

In [339]:
# Server-side extraction of youtube_datas. This replaces the earlier pandas
# column selection on youtube_dict['youtube_datas'] and restricts the result to
# the accounts in youtube_user_id_outer.
collection = db['youtube_datas']  # collection to query

# Fields kept by the $project stage, in the original order
projected_fields = [
    'youtube_user_id',
    'data_created_at',
    'published_at',
    'channel_id',
    'channel_title',
    'traffic_source_type',
    'yt_search_keyword',
    'subscribed_status',
    'subscriber_count',
    'video_count',
    'view_count',
    'comment_count',
    'like_count',
    'dislike_count',
    'estimated_minutes_watched',
    'average_view_duration',
    'status_code',
    'red_view_count',
]

pipeline = [
    {"$match": {"youtube_user_id": {"$in": youtube_user_id_outer}}},
    {"$sort": {"youtube_user_id": 1, "data_created_at": 1}},
    {"$project": dict.fromkeys(projected_fields, 1)},
]

# Run the pipeline; allowDiskUse is passed through to aggregate()
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# Convert the result to a pandas DataFrame
youtube_datas = pd.DataFrame(result)

In [340]:
len(youtube_datas['youtube_user_id'].unique())

937

In [319]:
# Cast: flatten the two nested columns into columns of their own and append them side by side
youtube_datas = pd.concat([youtube_datas, pd.json_normalize(youtube_datas['traffic_source_type']), pd.json_normalize(youtube_datas['subscribed_status'])],axis=1)
youtube_datas = youtube_datas.drop(['traffic_source_type','subscribed_status'],axis=1)

# NOTE(review): columns[6:] is positional; which column it starts at depends on the field order returned by the query — confirm it covers exactly the numeric metrics.
youtube_datas[youtube_datas.columns[6:]] = youtube_datas[youtube_datas.columns[6:]].fillna(0) # replace nulls with 0
# Drop rows whose selected columns sum to 0 (intended: rows where every metric is 0)
youtube_datas = youtube_datas[youtube_datas[youtube_datas.columns[6:]].sum(axis=1)!=0]

youtube_datas = youtube_datas.sort_values(['youtube_user_id','data_created_at']).reset_index(drop=True)

In [322]:
youtube_datas

Unnamed: 0,youtube_user_id,published_at,data_created_at,channel_id,channel_title,yt_search_keyword,subscriber_count,video_count,view_count,comment_count,...,EXT_URL,ANNOTATION,NOTIFICATION,PRODUCT_PAGE,SHORTS,HASHTAGS,SOUND_PAGE,ADVERTISING,UNSUBSCRIBED,SUBSCRIBED
0,627b58c880a4763fdf8e13de,2018-10-31 07:39:22,2022-05-10,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,627b58c880a4763fdf8e13de,2018-10-31 07:39:22,2022-05-11,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,627b58c880a4763fdf8e13de,2018-10-31 07:39:22,2022-05-12,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,627b58c880a4763fdf8e13de,2018-10-31 07:39:22,2022-05-13,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,627b58c880a4763fdf8e13de,2018-10-31 07:39:22,2022-05-14,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16988,62f11e332be6ae3ff35b2183,NaT,2022-08-09,,,"{'오트리 메달리스트': 16, '오트리': 12, '오트리 사이즈': 5, '몽골...",0.0,0.0,279,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,265.0,14.0
16989,62f11e332be6ae3ff35b2183,NaT,2022-08-10,,,"{'오트리 메달리스트': 22, '오트리': 13, '오트리 사이즈': 5, '오트...",0.0,0.0,286,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,269.0,17.0
16990,62f3b3552be6ae3ff35ed82c,NaT,2022-08-08,,,"{'아이패드': 1, '프로크리에이트': 1}",0.0,0.0,17,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,6.0
16991,62f3b3552be6ae3ff35ed82c,NaT,2022-08-09,,,"{'아이패드 드로잉': 2, '고양이 일러스트': 1}",0.0,0.0,25,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,4.0


In [183]:
# Account counts based on the final (cleaned) datasets
# youtube_users is left out, as in the original cell; youtube_revenue was
# listed twice there, which had no effect on the set operations.
final_account_frames = [
    youtube_revenue,
    youtube_report_v2,
    youtube_channel_demographics,
    youtube_report,
    youtube_channel_locations,
    youtube_daily_channel_basics,
    youtube_subscriber,
]
account_id_sets = [set(frame['youtube_user_id']) for frame in final_account_frames]

# Accounts present in at least one dataset
youtube_user_id_outer = list(set().union(*account_id_sets))
len(youtube_user_id_outer)
## Original note: 941 accounts in total (not displayed by this cell)

# Accounts present in every dataset
youtube_user_id_inner = list(set.intersection(*account_id_sets))

len(youtube_user_id_inner)
## Recorded output: 28 accounts are common to all datasets

28

In [None]:
# Collect every DataFrame stored in youtube_dict, in dictionary order
dfs_to_merge = [
    frame
    for frame in youtube_dict.values()
    if isinstance(frame, pd.DataFrame)
]

In [None]:
# Example: merge several DataFrames with Dask
merge_key = 'youtube_user_id'

# Convert to Dask DataFrames, skipping frames that lack the merge key
# (merging such a frame raises KeyError).
dask_df_list = [
    dd.from_pandas(df, npartitions=10)
    for df in dfs_to_merge
    if merge_key in df.columns
]

# Start from the first Dask frame. The previous version started from the
# pandas frames, so `dask_df_list` was never used and `.compute()` was called
# on a pandas result.
merged_df = dask_df_list[0]

# Merge the remaining frames one by one. A distinct right-hand suffix per step
# keeps repeated column names ('_id', 'created_at', ...) unique; the default
# '_x'/'_y' suffixes collide once more than two frames share a name.
# NOTE(review): an outer join on a non-unique key yields one row per matching
# combination, so the result can be far larger than any input.
for position, ddf in enumerate(dask_df_list[1:], start=1):
    merged_df = dd.merge(merged_df, ddf, on=merge_key, how='outer', suffixes=('', f'_{position}'))

# Evaluate the lazy result into a pandas DataFrame
merged_df = merged_df.compute()

# Show the result
print(merged_df)


In [None]:
import pandas as pd
from functools import reduce

merge_key = 'youtube_user_id'

# Collect the DataFrames stored in youtube_dict, skipping any frame that lacks
# the merge key (merging such a frame raises KeyError).
dfs_to_merge = [
    df
    for df in youtube_dict.values()
    if isinstance(df, pd.DataFrame) and merge_key in df.columns
]

# Inner-merge all frames on the key, left to right. A distinct right-hand
# suffix per step keeps repeated column names ('_id', 'created_at', ...)
# unique; the default '_x'/'_y' suffixes collide once more than two frames
# share a name.
# NOTE(review): the key is not unique within these frames, so each merge yields
# one row per matching combination — the result can be very large.
merged_df = reduce(
    lambda left, step: pd.merge(left, step[1], on=merge_key, how='inner', suffixes=('', f'_{step[0]}')),
    enumerate(dfs_to_merge[1:], start=1),
    dfs_to_merge[0],
)

# Show the result
print(merged_df)