In [1]:
import os, sys, subprocess, json
import pandas as pd

from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser

# import timeout_decorator
from time import sleep
RETRY = 2

sys.path.append('/secret')
import devkey
DEVELOPER_KEY = devkey.api1
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# connection build
YT = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

In [2]:
# 参考: https://developers.google.com/youtube/v3/docs/videos?hl=ja#resource
infoTree = {
    'id': None,
    'snippet': ['title','description','liveBroadcastContent','tags','publishedAt','thumbnails'],
    'statistics': ['viewCount','likeCount','favoriteCount','dislikeCount','commentCount'],
    'contentDetails':['caption','definition','dimension','duration','projection'],
    'topicDetails':['TopicIds', 'relevantTopicIds'],
}

commentValues = [
'videoId',
'id',
'totalReplyCount',
'authorChannelId',
'authorDisplayName',
'authorProfileImageUrl',
'likeCount',
'publishedAt',
'textOriginal',
'canRate',
'viewerRating'
]

In [3]:
def get_channel_vlist_quick( yt, cid, max_num ):
    channel_videos_tmp = []
    nextPageToken = ''
    pageNum = 50
    while 1:
        for ridx in range(RETRY):
            try: 
                feed = yt.search().list( channelId=cid, maxResults=pageNum,
                                        order='date', type='video', part='id', pageToken = nextPageToken
                                       ).execute()
                break
            except  Exception as e:
                print( ridx, e, '...' )
                sleep(5)
                continue    
        if ridx == RETRY -1:
            print( 'retry error ...' )
            break
        vids = [ i['id']['videoId'] for i in feed.get('items') ]
        channel_videos_tmp.extend( vids )
        if 'nextPageToken' in feed: nextPageToken = feed[ 'nextPageToken' ]
        else: break
        if len( channel_videos_tmp ) > ( max_num - 60 ): pageNum = ( pageNum / 2 ) + 1
        if len( channel_videos_tmp ) > max_num: break
    return channel_videos_tmp

def get_video_detail( vid ):
    try:
        video_detail = YT.videos().list(part="id,snippet,statistics,contentDetails,topicDetails",id=vid).execute()
        return video_detail.get("items",[])[0]
    except:
        print( 'skipped' )
        return None
        
# @timeout_decorator.timeout(300)
def get_comments_multiplepages( vid, maxNum ):
    comments = []
    nextPageToken = ''
    pageNum = 100
    while 1:
        for ridx in range(RETRY):
            try: 
                commentThreads = YT.commentThreads().list(
                    part="snippet",
                    videoId=vid,
                    maxResults=pageNum,
                    pageToken=nextPageToken,
                    order='relevance',
                    textFormat='plainText'
                ).execute()
                break
            except  Exception as e:
                print( ridx, e, '...' )
                sleep(3)
                continue
        if ridx == RETRY -1:
            print( 'retry error ...' )
            break
        commentsTmp = [ i for i in commentThreads.get('items') ]
        comments.extend( commentsTmp )
        if 'nextPageToken' in commentThreads:
            nextPageToken = commentThreads[ 'nextPageToken' ]
        else:
            break
        if len( comments ) > ( maxNum - 101 ):
            pageNum = ( pageNum / 2 ) + 1
        if len( comments ) > maxNum:
            break
    return comments

def get_comments( vid_tmp ):
    comment_threads = YT.commentThreads().list(
        part="snippet,replies",videoId=vid_tmp,maxResults=100,
        order='relevance',textFormat='plainText' ).execute()
    return comment_threads['items']

def get_rows_comment( comment_info_tmp ):
    rows = []
    for i, com in comment_info_tmp.items():
        # if int(i) % 10000 == 1: print( i )
        try:
            row = [
                com['videoId'],
                com['topLevelComment']['id'],
                com['totalReplyCount'],
                com['topLevelComment']['snippet']['authorChannelId']['value'],
                com['topLevelComment']['snippet']['authorDisplayName'],
                com['topLevelComment']['snippet']['authorProfileImageUrl'],
                com['topLevelComment']['snippet']['likeCount'],
                com['topLevelComment']['snippet']['publishedAt'],
                com['topLevelComment']['snippet']['textOriginal'],
                com['topLevelComment']['snippet']['canRate'],
                com['topLevelComment']['snippet']['viewerRating']
            ]
            rows.append( row )
        except:
            print( com )
            
    return rows

def get_cols( tmp ):
    df_cols_tmp = []
    for parent, child in tmp.items():
        if child == None: df_cols_tmp.append( parent )
        else:
            for c in child: df_cols_tmp.append( c )
    return df_cols_tmp

def get_rows( channel_video_info_tmp, info_tree_tmp ):
    rows = []
    for i, v in channel_video_info_tmp.items():
        # 特定のkeyに値がない場合Noneを入れる
        for parent, child in info_tree_tmp.items():
            if not parent in v:
                v[parent] = {}
                for c in child:
                    v[parent][c] = None
            else:
                if child == None: continue
                for c in child:
                    if not c in v[parent]:
                        v[parent][c] = None
        # 
        row = []
        for parent, child in info_tree_tmp.items():
            if child == None: row.append( v[parent] )
            elif parent == 'snippet':
                for c in child:
                    if c == 'thumbnails':
                        row.append( v[parent][c]['default']['url'] )
                    else:
                        row.append( v[parent][c] )
            else:
                for c in child:
                    row.append( v[parent][c] )
        rows.append( row )
    return rows

In [4]:
df_channels = pd.read_csv('/data/idol_channels.csv')
channel_id_list = df_channels['channel_id'].values

In [5]:
save_dir = '/youtube/idol_channels_20191128'
if not os.path.exists( save_dir ): os.mkdir(save_dir)
if not os.path.exists( f'{save_dir}/videos' ): os.mkdir(f'{save_dir}/videos')
if not os.path.exists( f'{save_dir}/comments' ): os.mkdir(f'{save_dir}/comments')

In [6]:
for cidx, channel_id in enumerate( channel_id_list ):

    print( cidx, channel_id )
    
    channel_dir = f'{save_dir}/videos/{channel_id}'
    if not os.path.exists( channel_dir ):
        os.mkdir(channel_dir)

    channel_videos = get_channel_vlist_quick( YT, channel_id, 500 )
    
    for vidx, vid in enumerate( channel_videos ):
        video_path = f'{channel_dir}/{vid}.json'
        if os.path.exists( video_path ): continue
        vDetail = get_video_detail( vid )
        if vDetail == None: continue
        with open(video_path, 'w') as f:
            json.dump(vDetail, f, indent=4)

0 UCCRb6nYKaT8tzLA8CwDdUtw
1 UCxjXU89x6owat9dA8Z-bzdw
2 UCUzpZpX2wRYOk3J8QTFGxDg
3 UCmr9bYmymcBmQ1p2tLBRvwg
4 UC6YNWTm6zuMFsjqd0PO3G-Q
5 UCnhrIe3jZNmqDEL_zSBXADQ
6 UCoKXb95K5h3sME3c9OCBaeA
7 UCR0V48DJyWbwEAdxLL5FjxA
8 UCG-5D9k_fL4FnMeNuraeAtA
9 UC8RJBf7ftto2R1EKZC3YE0A
10 UCcS2E_TVMSwbN4Q5eH7F_oA
11 UCPQ0GEWwLaam1lTX9P-CgGA
12 UCDwcZ85zjLKD-3-jqlv1wQQ
13 UC6FadPgGviUcq6VQ0CEJqdQ
14 UCWBTV02MVmyLqNPWRGDrl6A
15 UCLQlkHgD2ltaptlgs20eZ6Q
16 UCRkCifCbAw0qwu7XzOa9d7w
17 UCQ6Br7m6vP61FZvjv4lwR5w
18 UCIfuY0NRq1szr_6tzFy23NQ
19 UCw6Y7niyneaJezhnkBSOqkQ
20 UCQSWhzO_0ij0GrN7VaZbXcA
21 UCv7VutirxDn3RWIJXI68n_A
22 UCQuNrURlzPsbJZD17icx7Gw
23 UCShhCnBdAL5nYpKrbL9JOTA
24 UCt3f_Tu1lNua1xLQZu2Td-w
25 UC5GqigDlifXby0ANB8dgT1g
26 UCXTsCXNGHmePgo3a47hnsAA
27 UCa8GISK9_hsZ8aEJEL1u1Sg
28 UCEAn5QItECbYbLW6Mvq3bPg
29 UCwFZSxCiCiucEhxtlV4L5Lg
30 UCJxYlnZBZc7Ol4FZB9kwl4g
31 UCiaowdmU8TziJd9RspD60yQ
32 UCkafLPXB6MgLmTbq8UHE8Cg
33 UCBmvHfXdGCvi_b6lFeU-E1Q
34 UCkEiBC95Yzi7x1pIcdVNiDw
35 UCwJDv8UoHrxTCc_10hnEGgw
36

In [8]:
for cidx, channel_id in enumerate( channel_id_list ):

    print( cidx, channel_id )
    channel_dir = f'{save_dir}/comments/{channel_id}'
    if not os.path.exists( channel_dir ): os.mkdir(channel_dir)

    tmp = os.listdir( f'{save_dir}/videos/{channel_id}' )
    vid_list = [ line.split('.')[0] for line in tmp ] 
    for vidx, vid in enumerate( vid_list ):
        
        comment_path = f'{channel_dir}/{vid}.csv'
        if os.path.exists( comment_path  ): continue

        for ridx in range(RETRY):
            try:
                comments = get_comments_multiplepages( vid, 1000 )
                break
            except:
                print( 'timeout ... get_comments_multiplepages()')
                continue
        
        commentInfo = { cidx:com['snippet'] for cidx, com in enumerate( comments ) }
        rows = get_rows_comment( commentInfo )
        df = pd.DataFrame( rows, columns=commentValues )
        df.to_csv( comment_path , index=False  )
        print( 'ok', len( comments ) )

0 UCCRb6nYKaT8tzLA8CwDdUtw
ok 863
ok 1004
ok 1004
ok 1050
ok 1049
ok 1050
ok 885
ok 917
ok 304
{'videoId': 'X3H-4crGD6k', 'topLevelComment': {'kind': 'youtube#comment', 'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/42xdiDVbSdHWLkJ_2jSXkDZKW-E"', 'id': 'Ugy2RLlHbOLd6B1Kp6F4AaABAg', 'snippet': {'authorDisplayName': '', 'authorProfileImageUrl': '//s.ytimg.com/yts/img/avatar_32-vflI3ugzv.png', 'authorChannelUrl': '', 'videoId': 'X3H-4crGD6k', 'textDisplay': 'This is 5jackson twice ver', 'textOriginal': 'This is 5jackson twice ver', 'canRate': True, 'viewerRating': 'none', 'likeCount': 0, 'publishedAt': '2019-11-19T11:14:33.000Z', 'updatedAt': '2019-11-19T11:14:33.000Z'}}, 'canReply': True, 'totalReplyCount': 0, 'isPublic': True}
ok 1004
0 <HttpError 400 when requesting https://www.googleapis.com/youtube/v3/commentThreads returned "The API server failed to successfully process the request. While this can be a transient error, it usually indicates that the requests input is invalid. Check the struct