In [62]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
import os
import googleapiclient.discovery
import googleapiclient.errors
from google.oauth2 import service_account
import json
import datetime
import ipywidgets as widgets
import pandas as pd

In [164]:
# globals
filename = 'speeches.lg.json'

SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

api_service_name = "youtube"
api_version = "v3"
service_account_file = "service_account.json"

# format date
def f_date(in_date): 
    return in_date.isoformat()#+'+04:00' #+4:00 is needed to set to EST

# run a youtube search over a date range
def run_search(query, start_date, end_date, page=None):
    request = youtube.search().list(
        part="snippet",
        maxResults=50,
        pageToken=page,
        publishedAfter=f_date(start_date),
        publishedBefore=f_date(end_date),
        q=query,
        topicId="/m/05qt0",
        type="video",
        videoCaption="videoCaptionUnspecified",
        videoDuration="long"
    )
    return request.execute()

# match the json structure from "create_records.py"
def build_video_details(videoId, candidate):
    captions = None
    
    request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=videoId
        )
    response = request.execute()
    
    try:
        captions = YouTubeTranscriptApi.get_transcript(videoId)
    except TranscriptsDisabled:
        print("Transcripts disabled for video: %s" % videoId)
    except NoTranscriptFound:
        print("No transcript found for video: %s" % videoId)
    except VideoUnavailable:
        print("Video no longer available: %s" % videoId)
    
    response['items'][0]['candidate'] = candidate
    response['items'][0]['captions'] = captions
    
    return response


In [71]:
# Get credentials and create an API client
credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=SCOPES)
youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

In [100]:
candidates = [
    {"person": "biden",
     "q": '"biden" speech|remarks|stream -trump'},
    {"person": "trump",
     "q": '"trump" speech|remarks|stream -biden -melania'},
    {"person": "harris",
     "q": '"kamala" "harris" speech|remarks|stream'},
    {"person": "pence",
     "q": 'mike "pence" speech|remarks|stream'}
]

In [76]:
# setup before loops - set this date to wherever we left off last time
start_date = datetime.datetime(2020, 7, 29, tzinfo=datetime.timezone.utc)
ci = 3 # candidate index

# create blank dataframe to start
df = pd.DataFrame(columns=['id','person','title','date'])

In [150]:
# re-run from here

if ci==3:
    # reset candidate loop, go back 1 week
    ci=0
    # week loop
    end_date = start_date
    start_date = end_date - datetime.timedelta(days=7)
else:
    ci = ci + 1
    
# candidate loop
person = candidates[ci]['person']
query = candidates[ci]['q']

print("Candidate: " + person)
print("Week starting: {}".format(start_date))

# run a search on this candidate for this week
response = run_search(query, start_date, end_date)
titles = [[i['id']['videoId'],i['snippet']['title']] for i in response['items']]
sm = widgets.SelectMultiple(
    options=[t[0]+' '+t[1] for t in titles],
    #value=,
    #rows=10,
    description='Speeches',
    disabled=False,
    layout=widgets.Layout(width='90%', height='800px')
)
print("Select the speeches to keep, then run the next block")
display(sm)

Candidate: pence
Week starting: 2020-07-29 00:00:00+00:00
Select the speeches to keep, then run the next block


SelectMultiple(description='Speeches', layout=Layout(height='800px', width='90%'), options=('GXp7SZQSHuc Vice …

In [151]:
ids = [t[0] for t in titles if t[0]+' '+t[1] in (sm.value)]
titles = [t[1] for t in titles if t[0] in ids]
dates = [i['snippet']['publishedAt'][:10] for i in response['items'] if i['id']['videoId'] in ids]
persons = [person for i in ids]
df = df.append(pd.DataFrame({"id":ids, "person":persons, "title":titles, "date":dates}))
print("Added {} items".format(len(ids)))
# end loop - go back and run the previous block

Added 8 items


In [152]:
df

Unnamed: 0,id,person,title,date
0,bduvZIp6UCo,biden,Joe Biden remarks on the latest jobs report,2020-09-04
1,cuuJ9czNT0g,biden,LIVE: Presidential candidate Joe Biden deliver...,2020-09-04
2,DubLPc_l0Mc,biden,Democratic presidential nominee Joe Biden deli...,2020-09-02
3,uBigQgyIPkE,biden,Joe Biden&#39;s full campaign speech HD 8/31/2020,2020-09-03
4,3BImHVEOY6g,biden,Joe Biden Delivers Remarks Democratic presiden...,2020-09-02
...,...,...,...,...
3,00CXl6jqZC4,pence,Vice President Pence Delivers Remarks on the T...,2020-07-31
4,k8kYwhNn3L8,pence,WATCH- Vice President Pence at Pennsylvania -C...,2020-07-30
5,VyZqo-PZsAo,pence,WATCH: Vice President Pence at Pennsylvania &q...,2020-07-30
6,rArvMOM80HE,pence,Vice President Mike Pence speaks at a Cops for...,2020-07-30


In [153]:
#Read in existing JSON, or create if JSON does not exist
if os.path.isfile(filename):
    speeches=json.load(open(filename))
else:
    speeches=[]

In [163]:
for row in df.iterrows():
    r = row[1] # actual row
    print('Video {}'.format(r['id']))
    speeches.append(build_video_details(r['id'], r['person']))
    

Video bduvZIp6UCo
Video cuuJ9czNT0g
Video DubLPc_l0Mc
Video uBigQgyIPkE
Video 3BImHVEOY6g
Video FKsNEsEPxTI
Video TCwxRU1ypSE
Video -kA8QjQItGU
Video YNMQzLnkJ94
Video HGwE2LNWj4o
Video c0lt0Cf2kQc
Video zddirtktOgA
Transcripts disabled for video: zddirtktOgA
Video Jtm2HOUljtg
Video 8e4XufaznSo
Video YsrBhCXhlBY
Video xfLUAJ2DiM0
Video eAM948gYPe8
Transcripts disabled for video: eAM948gYPe8
Video 4rSKWIyOYas
Transcripts disabled for video: 4rSKWIyOYas
Video eBVEf0ly6G8
Video A5oJSJjFk2A
Video vXB_30_3Je8
Video LcpPA2C4mew
Video nUGjYs3YTes
Transcripts disabled for video: nUGjYs3YTes
Video 2zCEP4tkDWQ
Video V9HgTko9y3U
Video oIBl19NxLtU
Video 1-7mom0W3gY
Video TKXdcfqkXDc
Video Gg0Uo6EKflk
Video a2ET_oiFS5s
Video Kirmh-y-fVU
Transcripts disabled for video: Kirmh-y-fVU
Video QwBkleZI8TQ
Video naDuBCBaaz0
Video QVst_8O8YEw
Video -VVIkXMR2I4
Video Rp2reLIym9I
Transcripts disabled for video: Rp2reLIym9I
Video E9XJB5AWXLs
Video f2CQn4BGD0U
Video 4JcGu49XONM
Video oCZ-Z5F53mw
Video sF4IJJYB0s

KeyError: 'playerCaptionsTracklistRenderer'

In [166]:
len(speeches)

108

In [165]:
# write json
with open(filename, 'w') as outfile:
     json.dump(speeches, outfile, sort_keys = True, indent = 4,
               ensure_ascii = False)