In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
import pymongo
from dotenv import load_dotenv

In [2]:
# Load environment variables from the .env file.
load_dotenv('C:/py_src/awake/env')

# Read the MongoDB connection credentials from the environment.
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

# Fail fast with a clear message: os.getenv returns None for an unset variable,
# and a None formatted into a connection string becomes the literal text "None".
_missing_vars = [
    var_name
    for var_name, var_value in (('MONGO_USER', mongo_user), ('MONGO_PASSWORD', mongo_password))
    if not var_value
]
if _missing_vars:
    raise RuntimeError(
        f"Missing MongoDB environment variable(s): {', '.join(_missing_vars)}"
    )

In [3]:
# Build the MongoDB connection URL.
# The user name and password are percent-encoded because reserved characters
# such as '@', ':', '/' or '%' in raw credentials make the URI unparsable.
# For purely alphanumeric credentials the resulting URL is unchanged.
url = (
    f"mongodb+srv://{quote_plus(mongo_user)}:{quote_plus(mongo_password)}"
    "@meercat-external.udyfs.mongodb.net/"
    "?retryWrites=true&w=majority&appName=meercat-external"
)
# serverSelectionTimeoutMS is in milliseconds (100000 ms = 100 s).
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# Verify the connection
from pymongo.errors import ServerSelectionTimeoutError

try:
    # Query the server for the list of database names
    databases = client.list_database_names()
except ServerSelectionTimeoutError as exc:
    print("Connection failed:", exc)
else:
    print("Connected successfully. Databases:", databases)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# Check the databases: the returned list of names is displayed as the cell output
client.list_database_names()

['Test', 'admin', 'config', 'local']

In [6]:
# Select the 'Test' database
db = client['Test']

# Names of the collections in the 'Test' database
collections = db.list_collection_names()

In [21]:
# Collections of interest. Instagram-related data is excluded, and
# 'youtube_videos' is checked separately; the left-out collection names
# are kept below as commented-out entries.
collections_need = [
    'hashtag_hashtag',
    'youtube_revenue',
    'youtube_report_v2',
    # 'influencer_datas',
    'youtube_channel_demographics',
    'user_aggregations',
    'campaign_bookmarks',
    # 'youtube_videos',
    'hashtag_hashtaglog',
    'youtube_report',
    'youtube_users',
    'alpha_tests',
    # 'influencer_media_datas',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    # 'influencer_media_comments',
    'user_bookmarks',
    'youtube_subscriber',
    'campaigns',
    'youtube_datas',
    # 'instagram_stories'
    ]

In [7]:
# Unit conversion
def convert_bytes(num):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024, and the matching
    unit from bytes, KB, MB, GB, TB is appended. Values of 1024 TB or more are
    reported in PB instead of running past the unit list and returning None.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string such as "13.39 MB".
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return f"{num:.2f} {x}"
        num /= 1024.0
    # The loop divided by 1024 once per listed unit, so num is now in PB.
    return f"{num:.2f} PB"

In [8]:
for collection_name in collections:
    # Fetch the statistics document for this collection
    stats = db.command("collStats", collection_name)

    # Report the collection size, document count and total index size,
    # followed by a separator line
    report_lines = (
        f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}",
        f"Collection '{collection_name}' document count: {stats['count']}",
        f"Total index size: {convert_bytes(stats['totalIndexSize'])}",
        '--------------------------------------------------------------------',
    )
    print("\n".join(report_lines))

Collection 'hashtag_hashtag' size: 13.39 MB
Collection 'hashtag_hashtag' document count: 3052
Total index size: 172.00 KB
--------------------------------------------------------------------
Collection 'youtube_revenue' size: 259.47 MB
Collection 'youtube_revenue' document count: 811499
Total index size: 46.44 MB
--------------------------------------------------------------------
Collection 'youtube_report_v2' size: 4.60 MB
Collection 'youtube_report_v2' document count: 5303
Total index size: 484.00 KB
--------------------------------------------------------------------
Collection 'influencer_datas' size: 7.78 GB
Collection 'influencer_datas' document count: 3340624
Total index size: 246.63 MB
--------------------------------------------------------------------
Collection 'youtube_channel_demographics' size: 59.67 MB
Collection 'youtube_channel_demographics' document count: 297683
Total index size: 4.69 MB
--------------------------------------------------------------------
Collection

In [None]:
# for document in db['youtube_videos'].find().batch_size(1000):
#     # Process each document
#     print(document)

In [42]:
# Load up to 100 documents from the 'youtube_datas' collection into a DataFrame
youtube_datas_cursor = db['youtube_datas'].find().skip(0).limit(100)
aa = pd.DataFrame(list(youtube_datas_cursor))

In [50]:
# Display the first 20 columns of the sample frame
aa.iloc[:, :20]

Unnamed: 0,_id,subscriber_count,video_count,view_count,comment_count,like_count,dislike_count,estimated_minutes_watched,average_view_duration,youtube_user_id,kakao_account_id,channel_id,channel_title,channel_description,published_at,thumbnail_url,traffic_source_type,yt_search_keyword,subscribed_status,created_at
0,627dd732c0dfe30fc1abe9e1,5390,8,62,0,0,0,61,59,627c8fdbce0b951b01262140,60937cd098670f5d30a17022,UCqvom1WpXg0ueWiv2qDJflA,이재이Lee Jay,안녕하세요! 이재이입니다.\n앞으로 제 유튜브 채널을 통한 그동안 보여드리지 못했던...,2020-05-03 13:30:03.443,https://yt3.ggpht.com/ytc/AKedOLTFfk60oJI23PC0...,"{'YT_SEARCH': 44, 'YT_CHANNEL': 10, 'SUBSCRIBE...","{'비키니 하울': 7, '한달에 10kg 빼기': 6, '1달만에 10kg빼기':...","{'UNSUBSCRIBED': 59, 'SUBSCRIBED': 2}",2022-05-13 12:57:34.573
1,627dd732c0dfe30fc1abe9e3,108000,30,4354,0,41,-1,24735,340,627cb611aa6f212355e0b617,627cb5f6aa6f212355e0b5b4,UCxuEudcvmg4zMQhW7isWi-w,성팩 SPAAK,좌충우돌 성팩이의 유튜브! 항상 열심히하겠습니다 구독해주세요 :) \n비즈니스 문의...,2013-07-31 07:43:00.000,https://yt3.ggpht.com/L_EHboXUjidjrXFGg2XAmEE9...,"{'RELATED_VIDEO': 2197, 'SUBSCRIBER': 945, 'YT...","{'성팩': 70, '수빙수': 45, '원칩': 30, '원칩챌린지 먹방': 23...","{'UNSUBSCRIBED': 18476, 'SUBSCRIBED': 6259}",2022-05-13 12:57:34.576
2,627dd736c0dfe30fc1abe9e5,0,0,0,0,0,0,0,0,627cd09faa6f212355e14268,615904dfbc9b4840b3f35221,UCe144H51UZ7XYEmAV204QdA,박효진,,2019-11-13 14:22:44.649,https://yt3.ggpht.com/ytc/AKedOLRO4Dc2UbMRylPo...,,,,2022-05-13 12:57:34.579
3,627dd736c0dfe30fc1abe9e7,0,0,0,0,0,0,0,0,627b58c880a4763fdf8e13de,625686fee63da410fc853f66,UCVon3pofMxnCdelBNr6PQ-g,갱배고파,,2018-10-31 07:39:22.000,https://yt3.ggpht.com/ytc/AKedOLRS60VXBs6aZfDl...,,,,2022-05-13 12:57:34.568
4,627f0d23aa39226247c5f263,116,1,4,0,0,0,3,57,627deb73aa39226247c461a1,629d2745eaf5732d6decbdc3,UC7lPLG5gKr5PSMqAkSfhJWA,김민준,"안녕하세요, 김민준입니다.\n크리에이터테크 스타트업 '어웨이크코퍼레이션' 을 운영하...",2018-11-06 06:20:14.000,https://yt3.ggpht.com/m1j0Pk02oz9Qa0IcGkp2-frf...,"{'NO_LINK_OTHER': 2, 'YT_SEARCH': 2}",,{'UNSUBSCRIBED': 3},2022-05-14 11:00:00.212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,628847a5fb15712a8cbb0c3d,28,1,1,0,0,0,0,46,628722e8fb15712a8cb932a4,628722dbfb15712a8cb93293,UCteqWzsri8RJ4QZW49jnpeQ,Joshua,,2013-10-17 09:47:29.000,https://yt3.ggpht.com/ytc/AKedOLQCx5U4SQIs4piN...,{'YT_SEARCH': 1},,{'UNSUBSCRIBED': 0},2022-05-21 11:00:00.772
96,628847a5fb15712a8cbb0c3f,7,4,0,0,0,0,0,0,62879fa5fb15712a8cbb0958,626cbe4a3f804b323362dfee,UCkIreAMtmXeEu6P3JKVX5AA,퍼플러썸 Purpleossom,,2022-04-15 12:31:35.097,https://yt3.ggpht.com/u6U_MssqhiQCdad9EzxlM6DM...,,,,2022-05-21 11:00:00.880
97,628847a5fb15712a8cbb0c41,193,17,1,0,0,0,0,49,62810d9eaa39226247c994f2,62810cc2aa39226247c994e4,UCFE1HeTKYLhUuErpx72eiSA,유걸YU-Girl TV,그냥 취미로 찍는 일상 유튜브 입니다\n영상 재밌게 봐주세요,2020-07-09 07:21:35.657,https://yt3.ggpht.com/u0jTnAEmggwPXV67DycBsRLW...,{'EXT_URL': 1},,{'UNSUBSCRIBED': 0},2022-05-21 11:00:00.745
98,628847a5fb15712a8cbb0c43,16,14,0,0,0,0,0,0,6287efb5fb15712a8cbb0af1,620bb39df6653147defaed3d,UCeychZVkPmUvAW--avJZ_Ug,아이니,,2021-10-10 08:07:31.139,https://yt3.ggpht.com/ksK07UumZfHn3O5HzCta3Hkg...,,,,2022-05-21 11:00:00.883
