# YouTube Data API v3 usage for channel mining

# Install 
### pip install --upgrade google-api-python-client
### pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2
### pip install --upgrade oauth2client

In [1]:
import os, sys
import pandas as pd
import datetime
from pprint import pprint
from tqdm import tqdm

import requests
from bs4 import BeautifulSoup

from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser

import devkey
# API key taken from the `devkey` module; handed to the client builder.
DEVELOPER_KEY = devkey.api1
# Service name and version handed to the client builder.
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Reference: https://developers.google.com/youtube/v3/docs/videos?hl=ja#resource
# Maps each top-level part of a video resource to the fields that become
# columns; None means the part itself (a scalar) is the column.
info_tree = {
    'id': None,
    'snippet': ['title','description','liveBroadcastContent','tags','publishedAt','thumbnails'],
    'statistics': ['viewCount','likeCount','favoriteCount','dislikeCount','commentCount'],
    'contentDetails':['caption','definition','dimension','duration','projection'],
    # The API spells this field 'topicIds' (lower-case t); the former
    # 'TopicIds' could never match a key in the response.
    'topicDetails':['topicIds', 'relevantTopicIds'],
}

# Same layout as info_tree, for channel resources.
channel_info_tree = {
    'id': None,
    'snippet': ['title','description','publishedAt','thumbnails'],
    'statistics': ['viewCount','subscriberCount','hiddenSubscriberCount','videoCount','commentCount'],
}

In [2]:
def get_channel_vlist( yt, cid, max_num ):
    """Collect up to ``max_num`` video ids of a channel, one id per request.

    Each request asks the search endpoint for a single result ordered by
    date and follows ``nextPageToken`` to reach the next one. A progress bar
    sized to ``max_num`` is advanced once per collected id.

    Args:
        yt: API client exposing ``search().list(...).execute()``.
        cid: Channel id passed as ``channelId``.
        max_num: Maximum number of video ids to collect.

    Returns:
        list: Video ids in the order the API returned them (at most
        ``max_num``; fewer when the channel runs out of results).
    """
    video_ids = []
    request_args = dict( channelId=cid, maxResults=1, order='date', type='video', part='id' )
    pbar = tqdm(total=max_num)
    try:
        while len( video_ids ) < max_num:
            feed = yt.search().list( **request_args ).execute()
            items = feed.get( 'items' ) or []
            if not items: break  # nothing (more) to read
            # Record the item before looking at the token so that the result
            # on the last page is not lost.
            video_ids.append( items[0]['id']['videoId'] )
            pbar.update(1)
            if 'nextPageToken' not in feed: break
            request_args['pageToken'] = feed['nextPageToken']
    finally:
        # Close the bar even when a request raises.
        pbar.close()
    return video_ids

def get_channel_vlist_quick( yt, cid, max_num ):
    """Collect up to ``max_num`` video ids of a channel, a page at a time.

    Pages are requested from the search endpoint ordered by date. Every
    request asks only for as many results as are still needed (capped at 50
    per page), and the ids of the final page are kept as well.

    Args:
        yt: API client exposing ``search().list(...).execute()``.
        cid: Channel id passed as ``channelId``.
        max_num: Maximum number of video ids to return.

    Returns:
        list: At most ``max_num`` video ids in the order the API returned
        them.
    """
    max_page_size = 50
    video_ids = []
    request_args = dict( channelId=cid, order='date', type='video', part='id' )
    while len( video_ids ) < max_num:
        # Ask only for what is still missing instead of dropping to
        # single-result pages near the end.
        request_args['maxResults'] = min( max_page_size, max_num - len( video_ids ) )
        feed = yt.search().list( **request_args ).execute()
        items = feed.get( 'items' ) or []
        if not items: break
        # Keep this page before checking the token; otherwise the last (or
        # only) page would be thrown away.
        video_ids.extend( item['id']['videoId'] for item in items )
        if 'nextPageToken' not in feed: break
        request_args['pageToken'] = feed['nextPageToken']
    return video_ids[:max_num]

def get_video_detail( vid ):
    """Fetch the video resource for one video id.

    Uses the module-level ``YT`` client and requests the parts
    ``id, snippet, statistics, contentDetails, topicDetails``.

    Args:
        vid: Video id passed as ``id`` to the videos endpoint.

    Returns:
        dict: The first item of the response.

    Raises:
        IndexError: If the response contains no items for ``vid``.
    """
    response = YT.videos().list(
        part="id,snippet,statistics,contentDetails,topicDetails", id=vid
    ).execute()
    items = response.get( "items" ) or []
    if not items:
        # Same exception type as the bare [0] lookup, but it names the id.
        raise IndexError( "no video resource returned for id %r" % ( vid, ) )
    return items[0]

def get_cols( tmp ):
    """Flatten an info tree into a list of column names.

    A part whose value is None contributes its own key; any other part
    contributes each of its listed field names, in order.

    Args:
        tmp: Mapping of part name to a list of field names, or to None.

    Returns:
        list: Column names in tree order.
    """
    columns = []
    for section, fields in tmp.items():
        columns.extend( [section] if fields is None else fields )
    return columns

def _tree_to_rows( resources, tree ):
    """Flatten each resource in ``resources`` into one row laid out by ``tree``."""
    rows = []
    for resource in resources.values():
        row = []
        for parent, children in tree.items():
            if children is None:
                # The part itself is the value (e.g. 'id'); None when absent.
                row.append( resource.get( parent ) )
                continue
            section = resource.get( parent ) or {}
            for child in children:
                value = section.get( child )
                if parent == 'snippet' and child == 'thumbnails':
                    # Only the URL of the default thumbnail is kept.
                    value = ( ( value or {} ).get( 'default' ) or {} ).get( 'url' )
                row.append( value )
        rows.append( row )
    return rows

def get_rows( channel_video_info_tmp, info_tree_tmp ):
    """Build table rows from video resources.

    Every part or field named in the tree that is missing from a resource
    yields None in that position. The input dicts are not modified.

    Args:
        channel_video_info_tmp: Mapping whose values are video resource dicts.
        info_tree_tmp: Mapping of part name to a list of field names, or to
            None when the part itself is the value.

    Returns:
        list: One list per resource, with values in tree order. For
        ``snippet.thumbnails`` the value is the default thumbnail's URL.
    """
    return _tree_to_rows( channel_video_info_tmp, info_tree_tmp )

def get_rows_channel( channel_info_tmp, channel_info_tree_tmp ):
    """Build table rows from channel resources.

    Every part or field named in the tree that is missing from a resource
    yields None in that position. The input dicts are not modified.

    Args:
        channel_info_tmp: Mapping whose values are channel resource dicts.
        channel_info_tree_tmp: Mapping of part name to a list of field names,
            or to None when the part itself is the value.

    Returns:
        list: One list per resource, with values in tree order. For
        ``snippet.thumbnails`` the value is the default thumbnail's URL.
    """
    return _tree_to_rows( channel_info_tmp, channel_info_tree_tmp )

In [3]:
# Fetch the list of affiliated YouTubers from the official UUUM site.
target_url = 'https://www.uuum.jp/creator/'
r = requests.get(target_url, timeout=30)
r.raise_for_status()  # stop here rather than parse an error page
soup = BeautifulSoup(r.text, 'lxml')
# Maps the second path segment of each YouTube link (user name or channel id)
# to the first one ('user', 'c' or 'channel').
channel_dic = {}
for i in soup.select('a[href^="https://www.youtube.com/"]'):
    # Drop any query string and trailing slashes before splitting the path.
    url = i.get('href').split('?', 1)[0].rstrip('/')
    url = url.split('/')
    # Expected shape: https://www.youtube.com/<kind>/<name>; skip shorter links.
    if len(url) < 5: continue
    channel_dic[url[4]] = url[3]

In [9]:
channel_dic

{'0214mex': 'user',
 '70cleam': 'user',
 'ABTVnetwork': 'user',
 'ASFFFFD1': 'user',
 'AmiBeautyTV': 'user',
 'Blackbear1207': 'user',
 'FroggyChiu': 'user',
 'HiROKiMovies': 'user',
 'HikakinTV': 'user',
 'IORIKUNTV': 'user',
 'KanonRintarou': 'c',
 'LEON1CHANNEL1': 'user',
 'MASAIandHamzael': 'user',
 'MEGUMIbernadette': 'user',
 'Maguro29Jp': 'c',
 'MasuoTV': 'user',
 'MizukittyV': 'user',
 'PDSKabushikiGaisha': 'user',
 'SRGAME1000': 'user',
 'SeikinTV': 'user',
 'SekineRisa': 'user',
 'ShihoRimi': 'user',
 'TAKASHIsTV': 'user',
 'TUTTITV1': 'user',
 'TheMorusaism': 'user',
 'TokaiOnAir': 'user',
 'UC-8H678xX1SNBOM10_ReY6Q': 'channel',
 'UC-C5UUZqky2qJmG4EzRnI0g': 'channel',
 'UC0JR9ZTJIvB9VX8xz7LyujA': 'channel',
 'UC0_nOHqfcBd8Jzx9YGXJ6AQ': 'channel',
 'UC0cpUGlqr4YlEmRG7543dEg': 'channel',
 'UC1MwWJ2mFWp51xy9yzyhTlg': 'channel',
 'UC2RdeFmVA1PrDqmFqJMG7hA': 'channel',
 'UC3-I23qIQk-2rwi7cnntIJA': 'channel',
 'UC3LAEwEhZ0Yn3l4-WatZo2Q': 'channel',
 'UC4-TMrb7Mm4KnYx1VsUgcJA': 'ch

In [4]:
# Build the API client (YT) that the functions and cells in this file use
# for their requests, authenticated with the developer key.
YT = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

In [5]:
# Look up the channel resource of every scraped entry, keyed by channel id.
channel_info_dic = {}
for uid, flag in channel_dic.items():
    if flag == 'c': continue # 'c' entries are skipped; no lookup is done for them here
    # 'user' entries are looked up by user name, everything else by channel id.
    if flag == 'user': lookup = {'forUsername': uid}
    else: lookup = {'id': uid}
    channel_feed = YT.channels().list(part="id,snippet,statistics", **lookup).execute()
    items = channel_feed.get("items") or []
    if not items:
        # No channel came back; skip it instead of failing on items[0].
        print( 'channel not found:', uid, flag )
        continue
    channel_info_dic[items[0]['id']] = items[0]

In [8]:
# YouTuber
# Build one table row per channel and save the table as CSV.
df_cols = get_cols( channel_info_tree )
rows = get_rows_channel( channel_info_dic, channel_info_tree )
df = pd.DataFrame( rows, columns=df_cols )
# id ... d.replace('-','__') # replace '-' in the id with '__'

os.makedirs( 'output', exist_ok=True )  # to_csv does not create missing directories
df.to_csv( 'output/UUUM.csv' )

In [10]:
# For every scraped channel, collect its latest video ids, fetch the details
# of each video and write them to <save_dir>/<channel id>.csv. Channels whose
# CSV already exists are skipped, so the cell can be re-run to resume.
save_dir = 'output/channel_csv_0702'
os.makedirs( save_dir, exist_ok=True )  # to_csv does not create missing directories
df_cols = get_cols( info_tree )  # identical for every channel, so computed once
for roop, ( uid, flag ) in enumerate( channel_dic.items(), start=1 ):
    print( roop, uid, flag )
    if flag == 'c':
        print( 'skip...' )
        continue # 'c' entries are skipped; no lookup is done for them here
    if flag == 'user':
        # Resolve the user name to a channel id.
        channel_feed = YT.channels().list(part="id",forUsername=uid).execute()
        items = channel_feed.get("items") or []
        if not items:
            # No channel came back for this user name.
            print( 'skip...' )
            continue
        CHANNEL_ID = items[0]['id']
    else:
        CHANNEL_ID = uid
    csv_path = '%s/%s.csv' % ( save_dir, CHANNEL_ID )
    if os.path.exists( csv_path ):
        continue
    print("\t get_video_idx ...")
    channel_videos = get_channel_vlist_quick( YT, CHANNEL_ID, 500 )
    print("\t get_video_details ...")
    vlist_details = [ get_video_detail( i ) for i in channel_videos ]
    channel_video_info = dict( enumerate( vlist_details ) )
    rows = get_rows( channel_video_info, info_tree )
    df = pd.DataFrame( rows, columns=df_cols )
    df.to_csv( csv_path )

1 HikakinTV user
2 0214mex user
3 SeikinTV user
4 eguri89 user
5 PDSKabushikiGaisha user
6 MasuoTV user
7 kinoyuu0204 user
8 kazuch0924 user
9 jintomikku user
10 jetdaisuke user
11 MEGUMIbernadette user
12 avntisdouga user
13 MASAIandHamzael user
14 UCpOjLndjOqMoffA-fr8cbKA channel
15 UCosy7jwSdXUbLbJBYgyRcnw channel
16 torokeroTV user
17 UCCJel9mmTsxDU9RiCwdiLiA channel
18 hittyaso user
19 kandorishinobu user
20 yoorai0121 user
21 satotintv user
22 nm0525skywalker user
23 TAKASHIsTV user
24 himawari7859 user
25 UC1MwWJ2mFWp51xy9yzyhTlg channel
26 UCM5R7AjTi7irA9fXorxRTRg channel
27 ASFFFFD1 user
28 kirizakieizi user
29 UCoG2iDwh5Vw61kZx0kWL0WA channel
30 UCMsuwHzQPFMDtHaoR7_HDxg channel
31 kashimacitycom user
32 UCqQ6BtItfK2FI3f4wubJmoA channel
33 UC0JR9ZTJIvB9VX8xz7LyujA channel
34 tryTV92 user
35 IORIKUNTV user
36 UCR1_4Tla5MlZlDK8WM5vklA channel
37 UCCO_itOx-wopeBohj5oPkzg channel
38 bunkei1113 user
39 UCh1K2JtxIZLnUaTPlxPPzjw channel
40 UCPzqjzrvmSA9dVydRdAX6Xg channel
41 UC9nM3v1