In [8]:
import pymongo
import pandas as pd
import math

import dask.dataframe as dd ## 대규모 데이터 처리
# import vaex ## 대규모 데이터 처리

import os
from dotenv import load_dotenv

In [9]:
# .env 파일에서 환경 변수 로드
load_dotenv('C:/py_src/awake/env')

# 환경 변수에서 MongoDB 연결 정보 가져오기
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_user = os.getenv('MONGO_USER')

In [10]:
# MongoDB 연결 URL
url = f"mongodb+srv://{mongo_user}:{mongo_password}@meercat-external.udyfs.mongodb.net/?retryWrites=true&w=majority&appName=meercat-external"
client = pymongo.MongoClient(url, serverSelectionTimeoutMS=100000)

In [11]:
# 연결 확인
from pymongo.errors import ServerSelectionTimeoutError

try:
    # 데이터베이스 목록을 가져오는 쿼리
    databases = client.list_database_names()
    print("Connected successfully. Databases:", databases)
    
except ServerSelectionTimeoutError as err:
    print("Connection failed:", err)

Connected successfully. Databases: ['Test', 'admin', 'config', 'local']


In [12]:
# 데이터베이스 확인
client.list_database_names()

['Test', 'admin', 'config', 'local']

In [13]:
# Test DB 선택
db = client.Test

# Test DB 컬렉션 확인
collections = db.list_collection_names()

In [14]:
len(collections)

21

In [15]:
# 단위 환산
def convert_bytes(num):
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return f"{num:.2f} {x}"
        num /= 1024.0

In [16]:
for collection_name in collections:
    # 컬렉션 통계 정보 가져오기
    stats = db.command("collStats", collection_name)

    # 컬렉션의 크기와 문서 수 출력    
    print(f"Collection '{collection_name}' size: {convert_bytes(stats['size'])}")
    print(f"Collection '{collection_name}' document count: {stats['count']}")
    print(f"Total index size: {convert_bytes(stats['totalIndexSize'])}")
    print('--------------------------------------------------------------------')

Collection 'hashtag_hashtag' size: 13.39 MB
Collection 'hashtag_hashtag' document count: 3052
Total index size: 172.00 KB
--------------------------------------------------------------------
Collection 'youtube_revenue' size: 259.47 MB
Collection 'youtube_revenue' document count: 811499
Total index size: 46.44 MB
--------------------------------------------------------------------
Collection 'youtube_report_v2' size: 4.60 MB
Collection 'youtube_report_v2' document count: 5303
Total index size: 484.00 KB
--------------------------------------------------------------------
Collection 'influencer_datas' size: 7.78 GB
Collection 'influencer_datas' document count: 3340624
Total index size: 246.63 MB
--------------------------------------------------------------------
Collection 'youtube_channel_demographics' size: 59.67 MB
Collection 'youtube_channel_demographics' document count: 297683
Total index size: 4.69 MB
--------------------------------------------------------------------
Collection

In [17]:
# 인스타 관련 데이터 제외 --> # 'youtube_videos', 'youtube_datas', 'youtube_daily_channel_basics', 'youtube_subscriber' 10% 샘플링
collections_need = [
    # 'hashtag_hashtag',
    'youtube_revenue',
    'youtube_report_v2',
    # 'influencer_datas',
    'youtube_channel_demographics',
    # 'user_aggregations',
    # 'campaign_bookmarks',
    'youtube_videos',
    # 'hashtag_hashtaglog',
    'youtube_report',
    'youtube_users',
    # 'alpha_tests',
    # 'influencer_media_datas',
    'youtube_channel_locations',
    'youtube_daily_channel_basics',
    # 'influencer_media_comments',
    # 'user_bookmarks',
    'youtube_subscriber',
    # 'campaigns',
    'youtube_datas',
    # 'instagram_stories'
    ]

In [18]:
# 샘플링 대상 테이블 리스트
tables_to_sample = ['youtube_videos', 'youtube_datas', 'youtube_daily_channel_basics', 'youtube_subscriber']
# tables_to_sample = collections_need.copy()

youtube_dict={}
for collection_name in collections_need:
    if collection_name in tables_to_sample:
        # 전체 데이터수 확인
        total_documents  = db[collection_name].count_documents({})

        # 샘플링할 문서 수 계산 (10%)
        page_number = 1
        sample_size = math.ceil(total_documents * 0.05)

        # 샘플링 수행
        sampled_documents = db[collection_name].find().skip((page_number - 1) * sample_size).limit(sample_size)

        # 샘플링된 문서를 리스트로 변환
        documents_list = list(sampled_documents)

        # MongoDB 문서를 DataFrame으로 변환
        youtube_dict[collection_name] = pd.DataFrame(documents_list)
        
        print(collection_name)
        print(youtube_dict[collection_name].columns)
        print(">> Success")
        print("--------------------------------------")
        print("")
        
    else:
        youtube_dict[collection_name] = pd.DataFrame(list(db[collection_name].find()))
        
        print(collection_name)
        print(youtube_dict[collection_name].columns)
        print(">> Success")
        print("--------------------------------------")
        print("")

youtube_revenue
Index(['_id', 'estimated_revenue', 'estimated_ad_revenue',
       'estimated_red_partner_revenue', 'gross_revenue', 'cpm',
       'ad_impressions', 'monetized_playbacks', 'playback_based_cpm',
       'youtube_user_id', 'data_created_at', '__v', 'created_at',
       'updated_at'],
      dtype='object')
>> Success
--------------------------------------

youtube_report_v2
Index(['_id', 'requested', 'youtube_user_id', 'content', 'phone_number',
       'template_code', 'created_at', 'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_channel_demographics
Index(['_id', 'youtube_user_id', 'end_date', 'created_at', 'updated_at', '__v',
       'demographics'],
      dtype='object')
>> Success
--------------------------------------

youtube_videos
Index(['_id', 'videos', 'youtube_user_id', 'end_date', 'created_at',
       'updated_at', '__v'],
      dtype='object')
>> Success
--------------------------------------

youtube_report

In [19]:
# 키값 형식 맞추기
for key, df in youtube_dict.items():
    # 각 딕셔너리의 value가 DataFrame인지 확인
    if isinstance(df, pd.DataFrame):
        # '_id' 컬럼이 있는지 확인하고 문자열로 변환
        if '_id' in df.columns:
            df['_id'] = df['_id'].astype(str)
        
        # 'youtube_user_id' 컬럼이 있는지 확인하고 문자열로 변환
        if 'youtube_user_id' in df.columns:
            df['youtube_user_id'] = df['youtube_user_id'].astype(str)

        # 변경된 DataFrame을 다시 딕셔너리에 저장
        youtube_dict[key] = df

In [20]:
youtube_dict.keys()

dict_keys(['youtube_revenue', 'youtube_report_v2', 'youtube_channel_demographics', 'youtube_videos', 'youtube_report', 'youtube_users', 'youtube_channel_locations', 'youtube_daily_channel_basics', 'youtube_subscriber', 'youtube_datas'])

### youtube_revenue

In [247]:
youtube_revenue = youtube_dict['youtube_revenue']

In [248]:
len(youtube_revenue['youtube_user_id'].unique())
## 유튜버 계정 : 265개

265

In [249]:
# 총 예상 수익
num_col = youtube_revenue.select_dtypes(['int64','float64']).columns[:-1]
youtube_revenue[num_col].sum()
## 광고 수익 가장 높음
## 프리미엄 구독자 수익 가장 낮음

estimated_revenue                1.168341e+10
estimated_ad_revenue             8.573537e+09
estimated_red_partner_revenue    1.408400e+09
gross_revenue                    1.358574e+10
cpm                              3.294414e+09
ad_impressions                   1.686921e+09
monetized_playbacks              2.539314e+09
playback_based_cpm               2.182599e+09
dtype: float64

In [250]:
# 계정별 일일 수익
youtube_revenue = youtube_revenue.groupby(['youtube_user_id','data_created_at'])[num_col].sum().reset_index()
youtube_revenue

Unnamed: 0,youtube_user_id,data_created_at,estimated_revenue,estimated_ad_revenue,estimated_red_partner_revenue,gross_revenue,cpm,ad_impressions,monetized_playbacks,playback_based_cpm
0,627cb611aa6f212355e0b617,2013-07-31,0.000,0.0,0.000,0.0,0,0.0,0,0.0
1,627cb611aa6f212355e0b617,2013-08-01,0.000,0.0,0.000,0.0,0,0.0,0,0.0
2,627cb611aa6f212355e0b617,2013-08-02,0.000,0.0,0.000,0.0,0,0.0,0,0.0
3,627cb611aa6f212355e0b617,2013-08-03,0.000,0.0,0.000,0.0,0,0.0,0,0.0
4,627cb611aa6f212355e0b617,2013-08-04,0.000,0.0,0.000,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...
507258,66230ee6d8da110bb0744b2d,2024-04-29,32797.134,0.0,2522.759,0.0,0,0.0,0,0.0
507259,66230ee6d8da110bb0744b2d,2024-04-30,40680.753,0.0,2770.255,0.0,0,0.0,0,0.0
507260,66230ee6d8da110bb0744b2d,2024-05-01,35864.687,0.0,2579.477,0.0,0,0.0,0,0.0
507261,66230ee6d8da110bb0744b2d,2024-05-02,38089.134,0.0,2414.442,0.0,0,0.0,0,0.0


In [251]:
# 계정별 수익
youtube_revenue.groupby(['youtube_user_id'])[num_col].sum().reset_index()

Unnamed: 0,youtube_user_id,estimated_revenue,estimated_ad_revenue,estimated_red_partner_revenue,gross_revenue,cpm,ad_impressions,monetized_playbacks,playback_based_cpm
0,627cb611aa6f212355e0b617,3.674704e+07,3.305527e+07,3690439.615,6.007924e+07,16621484,4940600.580,11924080,6652335.668
1,627f59ccaa39226247c60b01,5.746649e+05,2.320120e+05,17105.274,4.351192e+05,122757,6131410.654,99429,7865634.616
2,6287228afb15712a8cb931d7,1.207998e+06,1.146186e+06,61811.485,2.082840e+06,496226,4226569.212,408292,5192609.145
3,6287229efb15712a8cb93225,8.068942e+06,4.234351e+05,1067163.601,7.652388e+05,192950,2655257.311,144510,3557279.325
4,628722c8fb15712a8cb9326e,3.585489e+05,3.206090e+05,27309.213,5.816992e+05,160338,1949029.113,135069,2314707.493
...,...,...,...,...,...,...,...,...,...
260,65cc401305bf1c0baa425146,3.017992e+07,1.954593e+05,869431.936,3.546106e+05,126403,350341.067,113708,392553.468
261,65e7b773d8da110bb072e2b5,2.462857e+06,1.606118e+06,197159.453,2.919909e+06,362053,5143871.476,280330,6589129.744
262,65f7b17ed8da110bb0733b7b,9.671508e+03,7.694567e+03,1559.687,1.398844e+04,3647,45409.424,3026,54595.828
263,65fecf7ed8da110bb0736199,2.378037e+07,1.802437e+07,4066074.918,3.276256e+07,6714688,6730409.894,4892883,9210131.308


### youtube_report_v2

In [103]:
youtube_report_v2 = youtube_dict['youtube_report_v2']

In [105]:
len(youtube_report_v2['youtube_user_id'].unique())
## 유튜버 계정 : 276개

276

In [125]:
youtube_report_v2 = youtube_report_v2[['youtube_user_id','content','phone_number','requested']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)

### youtube_channel_demographics
- 유튜버계정 일일 연령대, 성별 분포
- demographics 컬럼 분포 합 100%

In [254]:
youtube_channel_demographics = youtube_dict['youtube_channel_demographics']

In [255]:
len(youtube_channel_demographics['youtube_user_id'].unique())
## 유튜버 계정 : 905개

905

In [256]:
youtube_channel_demographics = youtube_channel_demographics[~youtube_channel_demographics['demographics'].isnull()].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_channel_demographics = youtube_channel_demographics[['youtube_user_id', 'end_date', 'demographics']]
youtube_channel_demographics

Unnamed: 0,youtube_user_id,end_date,demographics
0,627cb611aa6f212355e0b617,2023-03-26,"{'age13-17': {'female': 3.6, 'male': 2.8}, 'ag..."
1,627cb611aa6f212355e0b617,2023-03-27,"{'age13-17': {'female': 4.9, 'male': 3.7}, 'ag..."
2,627cb611aa6f212355e0b617,2023-03-28,"{'age13-17': {'female': 5, 'male': 3.1}, 'age1..."
3,627cb611aa6f212355e0b617,2023-03-29,"{'age13-17': {'female': 5.2, 'male': 3.1}, 'ag..."
4,627cb611aa6f212355e0b617,2023-03-30,"{'age13-17': {'female': 3.3, 'male': 2.4}, 'ag..."
...,...,...,...
96594,66230ee6d8da110bb0744b2d,2024-04-29,"{'age13-17': {'female': 2.2, 'male': 5.9}, 'ag..."
96595,66230ee6d8da110bb0744b2d,2024-04-30,"{'age13-17': {'female': 2.5, 'male': 5.4, 'gen..."
96596,66230ee6d8da110bb0744b2d,2024-05-01,"{'age13-17': {'female': 1.8, 'male': 6}, 'age1..."
96597,66230ee6d8da110bb0744b2d,2024-05-02,"{'age13-17': {'female': 2.1, 'male': 6.4}, 'ag..."


### youtube_videos
- 계정별 일일 콘텐츠 정보

In [261]:
youtube_videos = youtube_dict['youtube_videos']

In [262]:
len(youtube_videos['youtube_user_id'].unique())
## 유튜버 계정 : 603개(샘플링 데이터라서 정확x)

603

In [263]:
youtube_videos = youtube_videos[youtube_videos['videos'].apply(lambda x: len(x) > 0)].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_videos = youtube_videos[['youtube_user_id','end_date','videos']]
youtube_videos

Unnamed: 0,youtube_user_id,end_date,videos
0,627cb611aa6f212355e0b617,2023-03-27,"[{'video': 'LbTPH_DkrVs', 'views': 1209, 'redV..."
1,627cb611aa6f212355e0b617,2023-03-28,"[{'video': 'LbTPH_DkrVs', 'views': 1505, 'redV..."
2,627cb611aa6f212355e0b617,2023-03-29,"[{'video': 'LbTPH_DkrVs', 'views': 1691, 'redV..."
3,627cb611aa6f212355e0b617,2023-03-30,"[{'video': 'LbTPH_DkrVs', 'views': 2093, 'redV..."
4,627cb611aa6f212355e0b617,2023-03-31,"[{'video': 'LbTPH_DkrVs', 'views': 2701, 'redV..."
...,...,...,...
11275,6439618b659261656b3f015f,2023-04-13,"[{'video': 'WppwFFL6o1o', 'views': 6, 'redView..."
11276,64399d0b659261656b3f0681,2023-04-12,"[{'video': 'ClbWBYyWOWc', 'views': 127136, 're..."
11277,64399d0b659261656b3f0681,2023-04-13,"[{'video': 'SIzhrm7oyYk', 'views': 47171, 'red..."
11278,643a42af659261656b3f0e8b,2023-04-13,"[{'video': 'wXuD04dzw5U', 'views': 286, 'redVi..."


### youtube_report

In [265]:
youtube_report = youtube_dict['youtube_report']

In [268]:
len(youtube_report['youtube_user_id'].unique())
## 유튜버 계정 : 194개

194

In [277]:
youtube_report = youtube_report[['youtube_user_id','contents','phone_number','request']].drop_duplicates().sort_values('youtube_user_id').reset_index(drop=True)
youtube_report

Unnamed: 0,youtube_user_id,contents,phone_number,request
0,627b58c880a4763fdf8e13de,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 갱배고파\n데이터 분석 시점 : 6...,01059137067,True
1,627b58c880a4763fdf8e13de,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 갱배고파\n데이터 분석 시점 : 5...,01066057996,True
2,627b58c880a4763fdf8e13de,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 갱배고파\n데이터 분석 시점 : 5...,01059137067,True
3,627b58c880a4763fdf8e13de,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 갱배고파\n데이터 분석 시점 : 6...,01059137067,True
4,627b58c880a4763fdf8e13de,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 갱배고파\n데이터 분석 시점 : 6...,01059137067,True
...,...,...,...,...
3541,62a490289d41c93ff90d242b,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : MOTOSTATION_PAJU\n데...,01051284390,False
3542,62a5283e9d41c93ff90d28a2,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 술먹꼬\n데이터 분석 시점 : 6월...,01073022797,False
3543,62a5283e9d41c93ff90d28a2,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : 술먹꼬\n데이터 분석 시점 : 6월...,01073022797,True
3544,62a5c2409d41c93ff90e5dfa,📈 미어캣 유튜브 분석 리포트\n유튜브 계정 : DO지은\n데이터 분석 시점 : 6...,01051035276,False


### youtube_users

In [340]:
youtube_users = youtube_dict['youtube_users']

In [341]:
def count_empty_lists_and_nans(df):
    # NaN 값 개수
    nan_counts = df.isna().sum()

    # 비어있는 리스트 개수
    empty_list_counts = df.applymap(lambda x: isinstance(x, list) and len(x) == 0).sum()

    return pd.DataFrame({'NaN Count': nan_counts, 'Empty List Count': empty_list_counts})

# 결과를 확인하고 출력
result = count_empty_lists_and_nans(youtube_users)

  empty_list_counts = df.applymap(lambda x: isinstance(x, list) and len(x) == 0).sum()


In [342]:
result.apply(sum,axis=1) / 938

_id                     0.000000
country                 0.049041
phone_num               0.605544
kakao_nick              0.605544
kakao_account_id        0.223881
user_kind               0.605544
created_at              0.000000
__v                     0.000000
channel_title           0.005330
channel_id              0.005330
thumbnail_url           0.005330
published_at            0.004264
subscriber_count        0.000000
is_rev_saved            0.000000
is_subs_saved           0.000000
updated_at              0.000000
brandingSettings        0.105544
contentDetails          0.105544
contentOwnerDetails     0.105544
etag                    0.105544
id                      0.105544
kind                    0.105544
snippet                 0.105544
statistics              0.105544
status                  0.105544
topicDetails            0.284648
connected               0.068230
refresh_error           0.961620
localizations           0.877399
ads_array               0.786780
age       

In [356]:
youtube_users = youtube_users[['channel_title','channel_id','phone_num','country','age','gender','subscriber_count','statistics','status']]
youtube_users = youtube_users.drop_duplicates(['channel_title', 'channel_id', 'phone_num', 'country', 'age', 'gender','subscriber_count'])
## null값 비율 90% 이상 삭제 - 'refresh_error', 'localizations', 'account_type', 'is_accept_suggestion', 'is_add_info', 'user_id'
## 필요한 컬럼 추출

In [361]:
youtube_users[youtube_users['channel_title']=='목소리 연기자 유지컬']['statistics'].iloc[0] ## statistics --> cast 형태로

{'viewCount': '795441139',
 'subscriberCount': '291000',
 'hiddenSubscriberCount': False,
 'videoCount': '278'}

### youtube_channel_locations

In [21]:
youtube_channel_locations = youtube_dict['youtube_channel_locations']

In [105]:
len(youtube_channel_locations['youtube_user_id'].unique())
## 유튜버 계정 : 647개

647

In [22]:
youtube_channel_locations = youtube_channel_locations[youtube_channel_locations['locations'].apply(lambda x: len(x) > 0)].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_channel_locations = youtube_channel_locations[['youtube_user_id','end_date','locations']]
youtube_channel_locations

Unnamed: 0,youtube_user_id,end_date,locations
0,627cb611aa6f212355e0b617,2023-03-26,"[{'country': 'AZ', 'views': 16, 'estimatedMinu..."
1,627cb611aa6f212355e0b617,2023-03-27,"[{'country': 'BD', 'views': 28, 'estimatedMinu..."
2,627cb611aa6f212355e0b617,2023-03-28,"[{'country': 'AZ', 'views': 19, 'estimatedMinu..."
3,627cb611aa6f212355e0b617,2023-03-29,"[{'country': 'AZ', 'views': 14, 'estimatedMinu..."
4,627cb611aa6f212355e0b617,2023-03-30,"[{'country': 'AZ', 'views': 21, 'estimatedMinu..."
...,...,...,...
139856,66230ee6d8da110bb0744b2d,2024-04-29,"[{'country': 'AE', 'views': 24, 'estimatedMinu..."
139857,66230ee6d8da110bb0744b2d,2024-04-30,"[{'country': 'AE', 'views': 17, 'estimatedMinu..."
139858,66230ee6d8da110bb0744b2d,2024-05-01,"[{'country': 'AE', 'views': 11, 'estimatedMinu..."
139859,66230ee6d8da110bb0744b2d,2024-05-02,"[{'country': 'AE', 'views': 12, 'estimatedMinu..."


### youtube_daily_channel_basics

In [34]:
youtube_daily_channel_basics = youtube_dict['youtube_daily_channel_basics']

In [37]:
youtube_daily_channel_basics[youtube_daily_channel_basics['daily_basics'].apply(lambda x: len(x) > 0)].sort_values(['youtube_user_id','end_date']).reset_index(drop=True)
youtube_daily_channel_basics = youtube_daily_channel_basics[['youtube_user_id','end_date','daily_basics']]
youtube_daily_channel_basics

Unnamed: 0,youtube_user_id,end_date,daily_basics
0,62806935aa39226247c7c59a,2024-05-04,"[{'day': '2021-07-13', 'annotationClickThrough..."
1,62872293fb15712a8cb9320e,2024-05-04,"[{'day': '2022-03-08', 'annotationClickThrough..."
2,627cd09faa6f212355e14268,2024-05-04,"[{'day': '2019-11-13', 'annotationClickThrough..."
3,627fcf4eaa39226247c7c2a2,2024-05-04,"[{'day': '2019-02-22', 'annotationClickThrough..."
4,6287228afb15712a8cb931d7,2024-05-04,"[{'day': '2019-11-08', 'annotationClickThrough..."
5,628722b0fb15712a8cb93243,2024-05-04,"[{'day': '2021-01-28', 'annotationClickThrough..."
6,62872343fb15712a8cb93309,2024-05-04,"[{'day': '2022-01-22', 'annotationClickThrough..."
7,62872370fb15712a8cb93337,2024-05-04,"[{'day': '2022-03-14', 'annotationClickThrough..."
8,62810d9eaa39226247c994f2,2024-05-04,"[{'day': '2020-07-09', 'annotationClickThrough..."
9,628722c8fb15712a8cb9326e,2024-05-04,"[{'day': '2020-12-08', 'annotationClickThrough..."


In [115]:
youtube_daily_channel_basics.explode(['daily_basics']).reset_index(drop=True)
## 리스트안에 딕셔너리 구조가 아님 --> 리스트안에 리스트가 있고, 하나의 리스트당 하나의 딕셔너리

Unnamed: 0,youtube_user_id,end_date,daily_basics
0,62806935aa39226247c7c59a,2024-05-04,"{'day': '2021-07-13', 'annotationClickThroughR..."
1,62806935aa39226247c7c59a,2024-05-04,"{'day': '2021-07-14', 'annotationClickThroughR..."
2,62806935aa39226247c7c59a,2024-05-04,"{'day': '2021-07-15', 'annotationClickThroughR..."
3,62806935aa39226247c7c59a,2024-05-04,"{'day': '2021-07-16', 'annotationClickThroughR..."
4,62806935aa39226247c7c59a,2024-05-04,"{'day': '2021-07-17', 'annotationClickThroughR..."
...,...,...,...
126353,6287240afb15712a8cb933c1,2024-05-04,"[{'day': '2024-04-30', 'annotationClickThrough..."
126354,6287240afb15712a8cb933c1,2024-05-04,"[{'day': '2024-05-01', 'annotationClickThrough..."
126355,6287240afb15712a8cb933c1,2024-05-04,"[{'day': '2024-05-02', 'annotationClickThrough..."
126356,6287240afb15712a8cb933c1,2024-05-04,"[{'day': '2024-05-03', 'annotationClickThrough..."


In [43]:
pd.DataFrame(youtube_daily_channel_basics['daily_basics'][0])

AttributeError: 'list' object has no attribute 'keys'

In [39]:
youtube_daily_channel_basics['daily_basics'][0]

[{'day': '2021-07-13',
  'annotationClickThroughRate': 0,
  'annotationCloseRate': 0,
  'averageViewDuration': 121,
  'comments': 2,
  'dislikes': 0,
  'estimatedMinutesWatched': 554,
  'likes': 11,
  'shares': 0,
  'subscribersGained': 32,
  'subscribersLost': 2,
  'views': 273,
  'redViews': 30},
 {'day': '2021-07-14',
  'annotationClickThroughRate': 0,
  'annotationCloseRate': 0,
  'averageViewDuration': 125,
  'comments': 0,
  'dislikes': 0,
  'estimatedMinutesWatched': 156,
  'likes': 2,
  'shares': 0,
  'subscribersGained': 8,
  'subscribersLost': 0,
  'views': 75,
  'redViews': 9},
 {'day': '2021-07-15',
  'annotationClickThroughRate': 0,
  'annotationCloseRate': 0,
  'averageViewDuration': 114,
  'comments': 1,
  'dislikes': 0,
  'estimatedMinutesWatched': 90,
  'likes': 0,
  'shares': 0,
  'subscribersGained': 4,
  'subscribersLost': 0,
  'views': 47,
  'redViews': 4},
 {'day': '2021-07-16',
  'annotationClickThroughRate': 0,
  'annotationCloseRate': 0,
  'averageViewDuration'

In [54]:
dfs_to_merge = []
# youtube_dict의 각 데이터프레임을 리스트에 추가
for df in youtube_dict.values():
    if isinstance(df, pd.DataFrame):  # DataFrame인지 확인
        dfs_to_merge.append(df)

In [None]:
# 예제: 여러 개의 데이터프레임을 병합
# DataFrame들을 Dask DataFrame으로 변환
dask_df_list = [dd.from_pandas(df, npartitions=10) for df in dfs_to_merge]

# 첫 번째 DataFrame을 기준으로 병합을 시작
merged_df = dfs_to_merge[0]

# 나머지 DataFrame들과 순차적으로 병합
for df in dfs_to_merge[1:]:
    merged_df = dd.merge(merged_df, df, on='youtube_user_id', how='outer')

# 최종 병합된 DataFrame을 계산하여 Pandas DataFrame으로 변환
merged_df = merged_df.compute()

# 결과 출력
print(merged_df)


In [None]:
import pandas as pd
from functools import reduce

dfs_to_merge = []
# youtube_dict의 각 데이터프레임을 리스트에 추가
for df in youtube_dict.values():
    if isinstance(df, pd.DataFrame):  # DataFrame인지 확인
        dfs_to_merge.append(df)

# 데이터프레임 병합
merged_df = reduce(lambda left, right: pd.merge(left, right, on='youtube_user_id', how='inner'), dfs_to_merge)

# 결과 출력
print(merged_df)