In [1]:
import pymongo
import pandas as pd
import math

import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

import os
from dotenv import load_dotenv

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load environment variables from the .env file.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
load_dotenv('C:/py_src/awake/env')

# Read the MongoDB credentials from the environment.
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

# Fail fast when the credentials are missing: otherwise the connection string is built
# with the text "None" and the problem only surfaces as a connection failure later.
if not mongo_user or not mongo_password:
    raise RuntimeError(
        "MONGO_USER / MONGO_PASSWORD are not set; check that the .env file exists and defines both."
    )

In [3]:
# MongoDB connection URL.
# NOTE(review): the credentials are interpolated as-is. PyMongo expects the username and
# password inside a URI to be percent-escaped (e.g. urllib.parse.quote_plus) when they contain
# reserved characters such as ':', '/', '@' or '%' — confirm the stored values are URI-safe.
url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
# serverSelectionTimeoutMS is in milliseconds, i.e. 100 seconds here.
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [4]:
# 연결 확인
from pymongo.errors import ServerSelectionTimeoutError

try:
    # 데이터베이스 목록을 가져오는 쿼리
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [5]:
# 데이터베이스 확인
client.list_database_names()

['Test', 'admin', 'config', 'local']

In [6]:
# Test DB 선택
db = client.Test

# Test DB 컬렉션 확인
collections = db.list_collection_names()

In [7]:
len(collections)

21

In [8]:
# Unit conversion
def convert_bytes(num):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string such as "13.39 MB". Values of 1024 TB or more are reported
        in PB instead of falling off the end of the loop and returning None.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB')
    for unit in units[:-1]:
        if num < 1024.0:
            return f"{num:.2f} {unit}"
        num /= 1024.0
    # Anything that is still >= 1024 after TB is expressed in the largest unit.
    return f"{num:.2f} {units[-1]}"

In [9]:
for collection_name in collections:
    # 컬렉션 통계 정보 가져오기
    stats = db.command("collStats", collection_name)

    # 컬렉션의 크기와 문서 수 출력    
    print(f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}")
    print(f"Collection '{collection_name}' document count: {stats['count']}")
    print(f"Total index size: {convert_bytes(stats['totalIndexSize'])}")
    print('--------------------------------------------------------------------')

Collection 'hashtag_hashtag' size: 13.39 MB
Collection 'hashtag_hashtag' document count: 3052
Total index size: 172.00 KB
--------------------------------------------------------------------
Collection 'youtube_revenue' size: 259.47 MB
Collection 'youtube_revenue' document count: 811499
Total index size: 46.44 MB
--------------------------------------------------------------------
Collection 'youtube_report_v2' size: 4.60 MB
Collection 'youtube_report_v2' document count: 5303
Total index size: 484.00 KB
--------------------------------------------------------------------
Collection 'influencer_datas' size: 7.78 GB
Collection 'influencer_datas' document count: 3340624
Total index size: 246.63 MB
--------------------------------------------------------------------
Collection 'youtube_channel_demographics' size: 59.67 MB
Collection 'youtube_channel_demographics' document count: 297683
Total index size: 4.69 MB
--------------------------------------------------------------------
Collection

In [10]:
# 인스타 관련 데이터 제외 --> 'youtube_videos', 'youtube_datas'만 5% 샘플링 ('youtube_daily_channel_basics', 'youtube_subscriber'는 현재 전체 로드)
collections_need = [
    # 'hashtag_hashtag',
    'youtube_revenue',
    'youtube_report_v2',
    # 'influencer_datas',
    'youtube_channel_demographics',
    # 'user_aggregations',
    # 'campaign_bookmarks',
    'youtube_videos',
    # 'hashtag_hashtaglog',
    'youtube_report',
    'youtube_users',
    # 'alpha_tests',
    # 'influencer_media_datas',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    # 'influencer_media_comments',
    # 'user_bookmarks',
    'youtube_subscriber',
    # 'campaigns',
    'youtube_datas',
    # 'instagram_stories'
    ]

In [11]:
# Collections that are only partially loaded; every other collection is loaded in full.
tables_to_sample = ['youtube_videos', 'youtube_datas']# , 'youtube_daily_channel_basics', 'youtube_subscriber']
# tables_to_sample = collections_need.copy()

# Fraction of each partially loaded collection to read (0.05 = 5%).
SAMPLE_FRACTION = 0.05
# 1-based page of size `sample_size` to read; page 1 starts at the first document.
page_number = 1

youtube_dict = {}
for collection_name in collections_need:
    source_collection = db[collection_name]

    if collection_name in tables_to_sample:
        # Total number of documents in the collection.
        total_documents = source_collection.count_documents({})

        # Number of documents to read.
        sample_size = math.ceil(total_documents * SAMPLE_FRACTION)

        # NOTE: this reads one contiguous page in the order the cursor returns documents;
        # it is not a random sample.
        cursor = source_collection.find().skip((page_number - 1) * sample_size).limit(sample_size)
    else:
        cursor = source_collection.find()

    # Materialise the MongoDB documents as a DataFrame.
    youtube_dict[collection_name] = pd.DataFrame(list(cursor))

    print(collection_name)
    print(youtube_dict[collection_name].columns)
    print(">> Success")
    print("--------------------------------------")
    print("")

youtube_revenue
Index(['_id', 'estimated_revenue', 'estimated_ad_revenue',
       'estimated_red_partner_revenue', 'gross_revenue', 'cpm',
       'ad_impressions', 'monetized_playbacks', 'playback_based_cpm',
       'youtube_user_id', 'data_created_at', '__v', 'created_at',
       'updated_at'],
      dtype='object')
>> Success
--------------------------------------

youtube_report_v2
Index(['_id', 'requested', 'youtube_user_id', 'content', 'phone_number',
       'template_code', 'created_at', 'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_channel_demographics
Index(['_id', 'youtube_user_id', 'end_date', 'created_at', 'updated_at', '__v',
       'demographics'],
      dtype='object')
>> Success
--------------------------------------

youtube_videos
Index(['_id', 'videos', 'youtube_user_id', 'end_date', 'created_at',
       'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_report

In [12]:
# Normalise the key columns: store '_id' and 'youtube_user_id' as strings in every table.
for table_name, frame in youtube_dict.items():
    # Skip anything in the dictionary that is not a DataFrame.
    if not isinstance(frame, pd.DataFrame):
        continue

    # Convert each id column that is present in this table.
    for id_column in ('_id', 'youtube_user_id'):
        if id_column in frame.columns:
            frame[id_column] = frame[id_column].astype(str)

    # Store the updated DataFrame back under the same key.
    youtube_dict[table_name] = frame

In [13]:
youtube_dict.keys()

dict_keys(['youtube_revenue', 'youtube_report_v2', 'youtube_channel_demographics', 'youtube_videos', 'youtube_report', 'youtube_users', 'youtube_channel_locations', 'youtube_daily_channel_basics', 'youtube_subscriber', 'youtube_datas'])

In [36]:
# Tables whose `youtube_user_id` values are combined. 'youtube_users' is left out, as are
# the partially loaded 'youtube_videos' and 'youtube_datas'.
outer_id_tables = [
    'youtube_revenue',
    'youtube_report_v2',
    'youtube_channel_demographics',
    'youtube_report',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    'youtube_subscriber',
]

# Union of the ids: every youtube_user_id that appears in at least one of the tables.
youtube_user_id_outer = list(
    set().union(*(youtube_dict[table]['youtube_user_id'] for table in outer_id_tables))
)
len(youtube_user_id_outer)

# youtube_dict['youtube_videos']['youtube_user_id']
# youtube_dict['youtube_datas']['youtube_user_id']

## youtube_user_id 모두 포함 996개

996

In [37]:
# Tables whose `youtube_user_id` values are intersected. 'youtube_users' is left out, as are
# the partially loaded 'youtube_videos' and 'youtube_datas'.
inner_id_tables = [
    'youtube_revenue',
    'youtube_report_v2',
    'youtube_channel_demographics',
    'youtube_report',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    'youtube_subscriber',
]

# Intersection of the ids: only the youtube_user_id values present in every table.
youtube_user_id_inner = list(
    set.intersection(*(set(youtube_dict[table]['youtube_user_id']) for table in inner_id_tables))
)

len(youtube_user_id_inner)
# youtube_dict['youtube_videos']['youtube_user_id']
# youtube_dict['youtube_datas']['youtube_user_id']

## youtube_user_id  공통 포함 30개

30

### youtube_revenue

In [86]:
youtube_revenue = youtube_dict['youtube_revenue']

In [87]:
len(youtube_revenue['youtube_user_id'].unique())
## 유튜버 계정 : 265개

265

In [88]:
# Total estimated revenue.
# Numeric metric columns, excluding the '__v' bookkeeping column by name. Dropping "the last
# numeric column" by position would remove a real metric if this cell is re-run after
# `youtube_revenue` has been replaced by the grouped frame, which no longer has '__v'.
num_col = youtube_revenue.select_dtypes(['int64','float64']).columns.drop('__v', errors='ignore')
youtube_revenue[num_col].sum()
## 광고 수익 가장 높음
## 프리미엄 구독자 수익 가장 낮음

estimated_revenue                1.168341e+10
estimated_ad_revenue             8.573537e+09
estimated_red_partner_revenue    1.408400e+09
gross_revenue                    1.358574e+10
cpm                              3.294414e+09
ad_impressions                   1.686921e+09
monetized_playbacks              2.539314e+09
playback_based_cpm               2.182599e+09
dtype: float64

In [89]:
# 계정별 일일 수익
youtube_revenue = youtube_revenue.groupby(['youtube_user_id','data_created_at'])[num_col].sum().reset_index()

In [None]:
# 계정별 수익
youtube_revenue.groupby(['youtube_user_id'])[num_col].sum().reset_index()

Unnamed: 0,youtube_user_id,estimated_revenue,estimated_ad_revenue,estimated_red_partner_revenue,gross_revenue,cpm,ad_impressions,monetized_playbacks,playback_based_cpm
0,627cb611aa6f212355e0b617,3.674704e+07,3.305527e+07,3690439.615,6.007924e+07,16621484,4940600.580,11924080,6652335.668
1,627f59ccaa39226247c60b01,5.746649e+05,2.320120e+05,17105.274,4.351192e+05,122757,6131410.654,99429,7865634.616
2,6287228afb15712a8cb931d7,1.207998e+06,1.146186e+06,61811.485,2.082840e+06,496226,4226569.212,408292,5192609.145
3,6287229efb15712a8cb93225,8.068942e+06,4.234351e+05,1067163.601,7.652388e+05,192950,2655257.311,144510,3557279.325
4,628722c8fb15712a8cb9326e,3.585489e+05,3.206090e+05,27309.213,5.816992e+05,160338,1949029.113,135069,2314707.493
...,...,...,...,...,...,...,...,...,...
260,65cc401305bf1c0baa425146,3.017992e+07,1.954593e+05,869431.936,3.546106e+05,126403,350341.067,113708,392553.468
261,65e7b773d8da110bb072e2b5,2.462857e+06,1.606118e+06,197159.453,2.919909e+06,362053,5143871.476,280330,6589129.744
262,65f7b17ed8da110bb0733b7b,9.671508e+03,7.694567e+03,1559.687,1.398844e+04,3647,45409.424,3026,54595.828
263,65fecf7ed8da110bb0736199,2.378037e+07,1.802437e+07,4066074.918,3.276256e+07,6714688,6730409.894,4892883,9210131.308


### youtube_report_v2

In [90]:
youtube_report_v2 = youtube_dict['youtube_report_v2']

In [91]:
len(youtube_report_v2['youtube_user_id'].unique())
## 유튜버 계정 : 276개

276

In [92]:
youtube_report_v2 = youtube_report_v2[['youtube_user_id','content','phone_number','requested']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)

### youtube_channel_demographics
- 유튜버계정 일일 연령대, 성별 분포
- demographics 컬럼 분포 합 100%

In [94]:
youtube_channel_demographics = youtube_dict['youtube_channel_demographics']

In [95]:
len(youtube_channel_demographics['youtube_user_id'].unique())
## 유튜버 계정 : 905개

905

In [96]:
# Keep only rows that have a `demographics` value, restricted to the columns used later,
# ordered by account and date.
demographics_present = youtube_channel_demographics['demographics'].notna()
youtube_channel_demographics = (
    youtube_channel_demographics
    .loc[demographics_present]
    .sort_values(['youtube_user_id', 'end_date'])
    .reset_index(drop=True)
    [['youtube_user_id', 'end_date', 'demographics']]
)

### youtube_videos
- 계정별 일일 콘텐츠 정보

In [97]:
youtube_videos = youtube_dict['youtube_videos']

In [98]:
len(youtube_videos['youtube_user_id'].unique())
## 유튜버 계정 : 603개(샘플링 데이터라서 정확x)

603

In [99]:
# Keep rows whose `videos` list is non-empty, ordered by account and date, then expand the
# list so that each video gets its own row.
has_videos = youtube_videos['videos'].map(len).gt(0)
youtube_videos = (
    youtube_videos
    .loc[has_videos]
    .sort_values(['youtube_user_id', 'end_date'])
    .reset_index(drop=True)
    [['youtube_user_id', 'end_date', 'videos']]
    .explode(['videos'])
    .reset_index(drop=True)
)

In [100]:
youtube_videos_cast = pd.json_normalize(youtube_videos['videos'])
youtube_videos = pd.concat([youtube_videos, youtube_videos_cast],axis=1)
del youtube_videos_cast

### youtube_report

In [102]:
youtube_report = youtube_dict['youtube_report']

In [103]:
len(youtube_report['youtube_user_id'].unique())
## 유튜버 계정 : 194개

194

In [104]:
youtube_report = youtube_report[['youtube_user_id','contents','phone_number','request']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)

### youtube_users

In [63]:
youtube_users = youtube_dict['youtube_users']

In [64]:
def count_empty_lists_and_nans(df):
    """Count missing values and empty lists in each column of a DataFrame.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of `df`, with two columns:
        'NaN Count' (cells for which `isna()` is true) and
        'Empty List Count' (cells that are a `list` of length 0).
    """
    # Number of NaN values per column.
    nan_counts = df.isna().sum()

    # `DataFrame.applymap` is deprecated in favour of `DataFrame.map`; use the new name when
    # this pandas version provides it and fall back to `applymap` otherwise.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap

    # Number of empty lists per column.
    empty_list_counts = elementwise(lambda x: isinstance(x, list) and len(x) == 0).sum()

    return pd.DataFrame({'NaN Count': nan_counts, 'Empty List Count': empty_list_counts})

# 결과를 확인하고 출력
result = count_empty_lists_and_nans(youtube_users)

  empty_list_counts = df.applymap(lambda x: isinstance(x, list) and len(x) == 0).sum()


In [65]:
# Share of NaN / empty-list cells per column, relative to the number of rows in youtube_users
# (derived from the frame instead of a hard-coded row count).
result.sum(axis=1) / len(youtube_users)

_id                     0.000000
country                 0.049041
phone_num               0.605544
kakao_nick              0.605544
kakao_account_id        0.223881
user_kind               0.605544
created_at              0.000000
__v                     0.000000
channel_title           0.005330
channel_id              0.005330
thumbnail_url           0.005330
published_at            0.004264
subscriber_count        0.000000
is_rev_saved            0.000000
is_subs_saved           0.000000
updated_at              0.000000
brandingSettings        0.105544
contentDetails          0.105544
contentOwnerDetails     0.105544
etag                    0.105544
id                      0.105544
kind                    0.105544
snippet                 0.105544
statistics              0.105544
status                  0.105544
topicDetails            0.284648
connected               0.068230
refresh_error           0.961620
localizations           0.877399
ads_array               0.786780
age       

In [None]:
youtube_users = youtube_users[['channel_title','channel_id','phone_num','country','age','gender','subscriber_count','statistics','status']]
youtube_users = youtube_users.drop_duplicates(['channel_title', 'channel_id', 'phone_num', 'country', 'age', 'gender','subscriber_count'])
## null값 비율 90% 이상 삭제 - 'refresh_error', 'localizations', 'account_type', 'is_accept_suggestion', 'is_add_info', 'user_id'
## 필요한 컬럼 추출

In [68]:
youtube_users['statistics'][0]

{'viewCount': '14328509',
 'subscriberCount': '118000',
 'hiddenSubscriberCount': False,
 'videoCount': '36'}

In [None]:
youtube_users[youtube_users['channel_title']=='목소리 연기자 유지컬']['statistics'].iloc[0] ## statistics --> cast 형태로

{'viewCount': '795441139',
 'subscriberCount': '291000',
 'hiddenSubscriberCount': False,
 'videoCount': '278'}

### youtube_channel_locations
- 채널 구독자 위치

In [105]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']

In [106]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 906개

906

In [107]:
# Keep rows whose `locations` value is non-empty, restricted to the columns used later,
# ordered by account and date.
has_locations = youtube_channel_locations['locations'].map(len).gt(0)
youtube_channel_locations = (
    youtube_channel_locations
    .loc[has_locations]
    .sort_values(['youtube_user_id', 'end_date'])
    .reset_index(drop=True)
    [['youtube_user_id', 'end_date', 'locations']]
)

### youtube_daily_channel_basics

In [110]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [111]:
# Keep only rows whose `daily_basics` is a non-empty list. The filtered frame has to be
# assigned back: otherwise the empty lists stay in the data and `explode` turns each of
# them into a NaN row.
has_daily_basics = youtube_daily_channel_basics['daily_basics'].apply(lambda x: isinstance(x, list) and len(x) > 0)
youtube_daily_channel_basics = youtube_daily_channel_basics[has_daily_basics].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','daily_basics']] ## the per-entry 'day' field is used instead of 'end_date'
# One row per element of `daily_basics`.
youtube_daily_channel_basics = youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True)

In [112]:
# Bring the `daily_basics` column into tabular form: collect the individual records
# (flattening any value that is still a list) and expand their keys into columns.
daily_basics_records = [
    record
    for item in youtube_daily_channel_basics['daily_basics']
    for record in (item if isinstance(item, list) else [item])
]

youtube_daily_channel_basics_cast = pd.json_normalize(daily_basics_records)

In [114]:
# 데이터셋 cast 형식
youtube_daily_channel_basics = pd.concat([youtube_daily_channel_basics,youtube_daily_channel_basics_cast],axis=1).sort_values(['youtube_user_id','day']).reset_index(drop=True)
del youtube_daily_channel_basics_cast

In [82]:
# Aggregation pipeline for the 'youtube_daily_channel_basics' collection: drop documents with
# an empty `daily_basics` array, emit one document per array element and order the result by
# account and day.
pipeline = [
    # Keep only documents whose `daily_basics` array is not empty.
    {
        "$match": {
            "daily_basics": {"$ne": []}
        }
    },
    # NOTE(review): the final stage sorts again by (youtube_user_id, day), so this earlier sort
    # looks redundant — confirm whether the order of ties matters before removing it.
    {
        "$sort": {
            "youtube_user_id": 1,
            "end_date": 1
        }
    },
    # Carry forward only the fields used below.
    {
        "$project": {
            "youtube_user_id": 1,
            "daily_basics": 1
        }
    },
    # One output document per element of `daily_basics`.
    {
        "$unwind": "$daily_basics"
    },
    # Promote the per-entry date to a top-level `day` field.
    {
        "$addFields": {
            "day": "$daily_basics.day", 
            # "annotationClickThroughRate" : "$daily_basics.annotationClickThroughRate",
            # "annotationCloseRate" : "$daily_basics.annotationCloseRate", 
            # "averageViewDuration" : "$daily_basics.averageViewDuration", 
            # "comments" : "$daily_basics.comments", 
            # "dislikes" : "$daily_basics.dislikes",
            # "estimatedMinutesWatched" : "$daily_basics.estimatedMinutesWatched", 
            # "likes" : "$daily_basics.likes", 
            # "shares" : "$daily_basics.shares", 
            # "subscribersGained" : "$daily_basics.subscribersGained",
            # "subscribersLost" : "$daily_basics.subscribersLost", 
            # "views" : "$daily_basics.views", 
            # "redViews" : "$daily_basics.redViews", 
            # "estimatedRevenue" : "$daily_basics.estimatedRevenue",
            # "estimatedAdRevenue" : "$daily_basics.estimatedAdRevenue", 
            # "estimatedRedPartnerRevenue" : "$daily_basics.estimatedRedPartnerRevenue", 
            # "grossRevenue" : "$daily_basics.grossRevenue",
            # "cpm" : "$daily_basics.cpm", 
            # "monetizedPlaybacks" : "$daily_basics.monetizedPlaybacks", 
            # "adImpressions" : "$daily_basics.adImpressions", 
            # "playbackBasedCpm" : "$daily_basics.playbackBasedCpm"
            # Other fields can be promoted the same way when needed.
        }
    },
    # Final ordering: by account, then by day.
    {
        "$sort": {
            "youtube_user_id": 1,
            "day": 1
        }
    }
]


In [83]:
collection = db['youtube_daily_channel_basics']  # select the collection

# Run the pipeline and materialise every result document in memory.
result = list(collection.aggregate(pipeline, allowDiskUse=True))

# Convert the result into a pandas DataFrame (pandas is already imported at the top of the notebook).
df = pd.DataFrame(result)

print(df.head())  # inspect the pre-processed data

                        _id  \
0  6425568b99ec57e61cbbf0e4   
1  6425568b99ec57e61cbbf0e4   
2  6425568b99ec57e61cbbf0e4   
3  6425568b99ec57e61cbbf0e4   
4  6425568b99ec57e61cbbf0e4   

                                        daily_basics  \
0  {'day': '2013-07-31', 'annotationClickThroughR...   
1  {'day': '2013-08-01', 'annotationClickThroughR...   
2  {'day': '2013-08-02', 'annotationClickThroughR...   
3  {'day': '2013-08-03', 'annotationClickThroughR...   
4  {'day': '2013-08-04', 'annotationClickThroughR...   

            youtube_user_id         day  
0  627cb611aa6f212355e0b617  2013-07-31  
1  627cb611aa6f212355e0b617  2013-08-01  
2  627cb611aa6f212355e0b617  2013-08-02  
3  627cb611aa6f212355e0b617  2013-08-03  
4  627cb611aa6f212355e0b617  2013-08-04  


### youtube_subscriber

In [117]:
youtube_dict['youtube_subscriber']

Unnamed: 0,_id,subscribers_gained,subscribers_lost,subscribers_count,youtube_user_id,data_created_at,__v,created_at,updated_at
0,62cfbae7b4dda42f509e4545,32,2,30,62806935aa39226247c7c59a,2021-07-13,0,2022-07-14 06:42:47.296,2022-07-14 06:42:47.296
1,62cfbae7b4dda42f509e4548,2,1,43,62806935aa39226247c7c59a,2021-07-16,0,2022-07-14 06:42:47.297,2022-07-14 06:42:47.297
2,62cfbae7b4dda42f509e4546,8,0,38,62806935aa39226247c7c59a,2021-07-14,0,2022-07-14 06:42:47.297,2022-07-14 06:42:47.297
3,62cfbae7b4dda42f509e4549,2,0,45,62806935aa39226247c7c59a,2021-07-17,0,2022-07-14 06:42:47.297,2022-07-14 06:42:47.297
4,62cfbae7b4dda42f509e454d,9,0,114,62806935aa39226247c7c59a,2021-07-21,0,2022-07-14 06:42:47.297,2022-07-14 06:42:47.297
...,...,...,...,...,...,...,...,...,...
151769,62cfbaf0b4dda42f50a07856,0,0,9,62879fa5fb15712a8cbb0958,2022-05-25,0,2022-07-14 06:42:56.920,2022-07-14 06:42:56.920
151770,62cfbaf0b4dda42f50a07858,0,0,9,62879fa5fb15712a8cbb0958,2022-05-27,0,2022-07-14 06:42:56.920,2022-07-14 06:42:56.920
151771,62cfbaf0b4dda42f50a0785a,0,0,9,62879fa5fb15712a8cbb0958,2022-05-29,0,2022-07-14 06:42:56.920,2022-07-14 06:42:56.920
151772,62cfbaf0b4dda42f50a07861,0,0,10,62879fa5fb15712a8cbb0958,2022-06-05,0,2022-07-14 06:42:56.920,2022-07-14 06:42:56.920


### youtube_datas

In [29]:
youtube_dict['youtube_datas']['youtube_user_id'].unique()

array(['627c8fdbce0b951b01262140', '627cb611aa6f212355e0b617',
       '627cd09faa6f212355e14268', '627b58c880a4763fdf8e13de',
       '627deb73aa39226247c461a1', '627dfaacaa39226247c46296',
       '627fcf4eaa39226247c7c2a2', '627f59ccaa39226247c60b01',
       '62800378aa39226247c7c35c', '62810d9eaa39226247c994f2',
       '62806935aa39226247c7c59a', '6282f5f0aa39226247cb6bd3',
       '6281e84caa39226247c999de', '62830f86b06c53763e02ba20',
       '628319ebb06c53763e02bab3', '62830ae3b06c53763e02b9db',
       '62848afa4e1c157f51ab0827', '62850de04e1c157f51ac7550',
       '628722e8fb15712a8cb932a4', '62879fa5fb15712a8cbb0958',
       '6287efb5fb15712a8cbb0af1', '628728defb15712a8cb93569',
       '62872293fb15712a8cb9320e', '62873929fb15712a8cb93749',
       '62872441fb15712a8cb933e8', '62872523fb15712a8cb93479',
       '6287229ffb15712a8cb93227', '6287288ffb15712a8cb9355d',
       '6287228afb15712a8cb931d7', '628722abfb15712a8cb93238',
       '628724d8fb15712a8cb93456', '628723b7fb15712a8cb

In [None]:
MERGE_KEY = 'youtube_user_id'

# Collect the tables that can be merged on MERGE_KEY. Tables without that column are skipped
# because a merge on it would raise. Every other column is renamed to '<table>.<column>' so
# that columns shared by several tables ('_id', 'created_at', ...) do not collide when the
# frames are merged one after another.
dfs_to_merge = [
    frame.rename(columns={column: f"{table}.{column}" for column in frame.columns if column != MERGE_KEY})
    for table, frame in youtube_dict.items()
    if isinstance(frame, pd.DataFrame) and MERGE_KEY in frame.columns
]

In [None]:
# Example: merge several DataFrames with Dask.
# Convert each pandas DataFrame into a Dask DataFrame.
dask_df_list = [dd.from_pandas(df, npartitions=10) for df in dfs_to_merge]

# Start from the first Dask DataFrame. The Dask frames (not the original pandas frames) are
# merged, so the conversion above is actually used and `.compute()` below is called on a
# Dask object.
merged_df = dask_df_list[0]

# Merge the remaining Dask DataFrames one after another.
# NOTE(review): several tables appear to hold many rows per youtube_user_id, so merging on
# that key alone multiplies rows; consider aggregating per account first.
for dask_df in dask_df_list[1:]:
    merged_df = dd.merge(merged_df, dask_df, on='youtube_user_id', how='outer')

# Evaluate the merge and convert the result into a pandas DataFrame.
merged_df = merged_df.compute()

# Show the result.
print(merged_df)


In [None]:
from functools import reduce

MERGE_KEY = 'youtube_user_id'

# Collect the tables that can be merged on MERGE_KEY. Tables without that column are skipped
# because a merge on it would raise. Every other column is renamed to '<table>.<column>' so
# that columns shared by several tables ('_id', 'created_at', ...) do not collide when the
# frames are merged one after another.
dfs_to_merge = [
    frame.rename(columns={column: f"{table}.{column}" for column in frame.columns if column != MERGE_KEY})
    for table, frame in youtube_dict.items()
    if isinstance(frame, pd.DataFrame) and MERGE_KEY in frame.columns
]

# Inner-merge all tables on the account id.
# NOTE(review): several tables appear to hold many rows per youtube_user_id, so merging on
# that key alone multiplies rows; consider aggregating per account first.
merged_df = reduce(lambda left, right: pd.merge(left, right, on=MERGE_KEY, how='inner'), dfs_to_merge)

# Show the result.
print(merged_df)