# Import libraries

In [1]:
import os
import pandas as pd
from dateutil import parser as iso_date_parser

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

# Prepare client credentials

In [2]:
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

api_service_name = "youtube"
api_version = "v3"
client_secrets_file = "client_secret.json"

# Get credentials and create an API client
flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)
credentials = flow.run_console()
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, credentials=credentials)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=481920947367-s2qhm173nqhqhcic2iuuhdp10ujk5iid.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.readonly&state=PEis3NHqyGor7C4VUeDZ1xHyAmzhDb&prompt=consent&access_type=offline


Enter the authorization code:  4/1AX4XfWglCBgqt5bXnVVPgirI54T2yKixk2ZrmxDBtZf6O6r_7jH3Y_bYrbE


# Define utility functions

In [3]:
# Get the "uploads" (main) playlist of the channel
def getChannelPlaylist(channelId):
    request = youtube.channels().list(
        part="contentDetails",
        id=channelId
    )
    resp = request.execute()
    return resp['items'][0]['contentDetails']['relatedPlaylists']['uploads']

In [4]:
# Get playlist item on one page
def queryPlaylistItems(playlistId, pageToken=''):
    request = youtube.playlistItems().list(
        part='snippet,contentDetails',
        pageToken=pageToken,
        playlistId=playlistId,
        maxResults=50
    )
    resp = request.execute()
    
    return {
        'itemIds': [obj['contentDetails']['videoId'] for obj in resp['items']],
        'nextPageToken': resp['nextPageToken'] if 'nextPageToken' in resp else None
    }

In [5]:
# Get all video ids of a playlist
def getAllVidIdsFromList(playlistId):
    allVideoIds = []
    vidCount = 0
    pageCount = 0
    nextPageToken = ''
    
    while True:
        currentPageData = queryPlaylistItems(playlistId, nextPageToken)
        allVideoIds += currentPageData['itemIds']
        nextPageToken = currentPageData['nextPageToken']
        
        vidCount += len(currentPageData['itemIds'])
        pageCount += 1
        print('Scraping page %d. Total vid count: %d' % (pageCount, vidCount))
        if not nextPageToken:
            break 
    
    return allVideoIds

In [6]:
# Get the following information of a list of video ids
# - Video title 'title'
# - Total views 'viewCount'
# - Published date 'publishedAt'
def getVideoData(vidIds):
    joinedVidIds = ','.join(vidIds)
    
    request = youtube.videos().list(
        part="contentDetails,statistics,snippet",
        id=joinedVidIds
    )
    resp = request.execute()
    
    return [{
        'publishedAt': item['snippet']['publishedAt'],
        'title': item['snippet']['title'],
        'viewCount': item['statistics']['viewCount']
    } for item in resp['items']]

In [7]:
def splitBins(start, end, interval):
    bins = []
    while start + interval < end:
        bins.append((start, start + interval))
        start += interval
    bins.append((start, end))
    return bins

In [8]:
def fetchAllVideoData(vidIds):
    bins = splitBins(0, len(vidIds) - 1, 50)
    allData = []
    
    for start, end in bins:
        print('Processing videos from %d to %d' % (start, end))
        binVidIds = vidIds[start:end]
        allData += getVideoData(binVidIds)
    return allData

## Data scraping
### Love Live

In [9]:
loveLivePlaylists = ['PLmgGL3shzkGM9akfMoobnVhE3XCP42lHb',
                     'PLmgGL3shzkGM96YqVhygd3Skb42CTa-hy']
loveLivePlaylists.append(getChannelPlaylist('UCTkyJbRhal4voLZxmdRSssQ'))
loveLivePlaylists.append(getChannelPlaylist('UCWTbUllFchDX1IxfJF-N0UA'))

In [10]:
loveLiveTotalVidData = []
for playlistId in loveLivePlaylists:
    allVidIds = getAllVidIdsFromList(playlistId)
    allVidData = fetchAllVideoData(allVidIds)
    loveLiveTotalVidData += allVidData

Scraping page 1. Total vid count: 50
Scraping page 2. Total vid count: 100
Scraping page 3. Total vid count: 116
Processing videos from 0 to 50
Processing videos from 50 to 100
Processing videos from 100 to 115
Scraping page 1. Total vid count: 50
Scraping page 2. Total vid count: 100
Scraping page 3. Total vid count: 150
Scraping page 4. Total vid count: 200
Scraping page 5. Total vid count: 215
Processing videos from 0 to 50
Processing videos from 50 to 100
Processing videos from 100 to 150
Processing videos from 150 to 200
Processing videos from 200 to 214
Scraping page 1. Total vid count: 50
Scraping page 2. Total vid count: 100
Scraping page 3. Total vid count: 150
Scraping page 4. Total vid count: 200
Scraping page 5. Total vid count: 250
Scraping page 6. Total vid count: 300
Scraping page 7. Total vid count: 350
Scraping page 8. Total vid count: 400
Scraping page 9. Total vid count: 450
Scraping page 10. Total vid count: 500
Scraping page 11. Total vid count: 550
Scraping page 1

In [11]:
loveLiveDf = pd.DataFrame(loveLiveTotalVidData)

In [12]:
loveLiveDf['viewCount'] = loveLiveDf['viewCount'].astype(int)

In [13]:
totalViews = loveLiveDf['viewCount'].sum()
numVideo = len(loveLiveDf)
totalViews / numVideo

257886.54412786657

## Data scraping
### Bandori

In [14]:
bandoriPlaylists = []
bandoriPlaylists.append(getChannelPlaylist('UCN-bFIdJM0gQlgX7h6LKcZA'))
bandoriPlaylists.append(getChannelPlaylist('UCPityslSknKsWUq9iy8p9fw'))

In [15]:
bandoriTotalVidData = []
for playlistId in bandoriPlaylists:
    allVidIds = getAllVidIdsFromList(playlistId)
    allVidData = fetchAllVideoData(allVidIds)
    bandoriTotalVidData += allVidData

Scraping page 1. Total vid count: 50
Scraping page 2. Total vid count: 100
Scraping page 3. Total vid count: 150
Scraping page 4. Total vid count: 200
Scraping page 5. Total vid count: 250
Scraping page 6. Total vid count: 300
Scraping page 7. Total vid count: 350
Scraping page 8. Total vid count: 400
Scraping page 9. Total vid count: 450
Scraping page 10. Total vid count: 500
Scraping page 11. Total vid count: 550
Scraping page 12. Total vid count: 600
Scraping page 13. Total vid count: 650
Scraping page 14. Total vid count: 700
Scraping page 15. Total vid count: 750
Scraping page 16. Total vid count: 800
Scraping page 17. Total vid count: 850
Scraping page 18. Total vid count: 900
Scraping page 19. Total vid count: 950
Scraping page 20. Total vid count: 1000
Scraping page 21. Total vid count: 1050
Scraping page 22. Total vid count: 1100
Scraping page 23. Total vid count: 1150
Scraping page 24. Total vid count: 1200
Scraping page 25. Total vid count: 1250
Scraping page 26. Total vid c

In [16]:
bandoriDf = pd.DataFrame(bandoriTotalVidData)

In [17]:
bandoriDf['viewCount'] = bandoriDf['viewCount'].astype(int)

In [18]:
totalViews = bandoriDf['viewCount'].sum()
numVideo = len(bandoriDf)
totalViews / numVideo

176246.70393258426

In [19]:
loveLiveDf.to_csv('lovelive.csv', index=False)

In [20]:
bandoriDf.to_csv('bandori.csv', index=False)