In [1]:
import json
import os

import pandas as pd
import pymongo
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

In [2]:
# open a client against the MongoDB instance on localhost and
# select the "speeches" database used by the rest of the notebook
myclient = pymongo.MongoClient("mongodb://localhost/")
db = myclient.speeches

In [3]:
# YouTube playlists whose video transcripts are collected.
# Each entry is [playlist_id, candidate, source_description].
sources = [
    [
        'PLB92o2PvjqnfXeskcxX3GR6alCzdVFHjQ',
        'Biden',
        'Joe Biden Official YouTube Channel. Playlist: Livestreams, Speeches, and Debates',
    ],
    [
        'PLKOAoICmbyV2XOjXa9u00njJ6fTLpOK5x',
        'Trump',
        'Donald J Trump Official YouTube Channel. Playlist: Trump Rallies',
    ],
]

## Build library of videos

In [4]:
# pull the video IDs, titles, etc from the playlist
def playlist_index(source, api_key=None):
    """Fetch metadata for every video in a YouTube playlist.

    Parameters
    ----------
    source : sequence
        ``[playlist_id, candidate, source_description]``, as listed in
        ``sources``.
    api_key : str, optional
        YouTube Data API v3 key. When omitted it is read from the
        ``YOUTUBE_API_KEY`` environment variable, so the key is never
        stored in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per playlist item with columns ``videoId``, ``publishedAt``,
        ``title``, ``source``, ``candidate`` and ``source_desc``. An empty
        DataFrame is returned when nothing could be fetched. If a request
        fails part-way through, the rows fetched so far are returned.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``YOUTUBE_API_KEY`` is not set.
    """
    if api_key is None:
        api_key = os.environ.get("YOUTUBE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No YouTube API key: pass api_key or set the YOUTUBE_API_KEY "
            "environment variable"
        )

    params = {
        "part": "id,contentDetails,snippet",
        "maxResults": 25,  # API default is 5
        "playlistId": source[0],
        "key": api_key,
    }
    columns = ['videoId', 'publishedAt', 'title']
    pages = []
    while True:  # loop through pages of 25 results
        r = requests.get("https://www.googleapis.com/youtube/v3/playlistItems",
                         params=params,
                         headers={'Accept': 'application/json'},
                         timeout=30)  # never hang the notebook on a stalled request
        if r.status_code != 200:
            print("Error for playlist " + source[0])
            print(r.text)
            break
        pl_json = r.json()
        rows = [[i['snippet']['resourceId']['videoId'],
                 i['snippet']['publishedAt'],
                 i['snippet']['title']] for i in pl_json.get('items', [])]
        if rows:  # an empty page contributes nothing (and must not crash)
            df = pd.DataFrame(rows, columns=columns)
            df['source'] = "youtube"
            df['candidate'] = source[1]
            df['source_desc'] = source[2]
            pages.append(df)
        token = pl_json.get("nextPageToken")
        if not token:
            break
        params["pageToken"] = token

    if not pages:
        return pd.DataFrame()
    # single concat instead of repeated DataFrame.append (removed in pandas 2.0);
    # per-page row labels are kept, as before
    return pd.concat(pages)

In [5]:
# set up a library of documents:
# start from what is already stored in mongodb, then add the current
# contents of every playlist in `sources`

try:
    stored = list(db["library"].find())
except pymongo.errors.PyMongoError as exc:  # can't load from mongodb, create new collection
    print("Couldn't load library, creating new one")
    print(exc)
    stored = []

# an empty collection has no "_id" column, so don't fail when it is missing
library = pd.DataFrame(stored).drop(columns="_id", errors="ignore")

# one concat instead of DataFrame.append in a loop (append was removed in pandas 2.0);
# empty frames are skipped so they cannot disturb column dtypes
frames = [f for f in [library] + [playlist_index(s) for s in sources] if not f.empty]
library = pd.concat(frames) if frames else pd.DataFrame()

In [6]:
# drop repeated videos: order newest-first by publishedAt, then keep
# only the first row seen for each videoId
library = (
    library
    .sort_values(by=["publishedAt"], ascending=False)
    .drop_duplicates(subset=["videoId"], keep="first")
)

In [7]:
# preview the first few rows of the library
library.head()

Unnamed: 0,videoId,publishedAt,title,source,candidate,source_desc
0,ZcR_kuCJvD8,2020-07-06T01:36:43Z,Vice President Biden delivers remarks on Trump...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
1,m_lYduRUumA,2020-06-29T19:45:09Z,Vice President Biden's Remarks at Global Pride...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
2,yy6jBRVDcpI,2020-06-21T01:53:05Z,Vice President Biden Delivers Remarks On Havin...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
3,9CBReIcgQYw,2020-06-19T20:42:10Z,"LIVE: President Trump in Tulsa, Oklahoma",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
4,-VB24DMVUwY,2020-06-05T03:13:42Z,Join Vice President Biden for A Town Hall with...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...


In [7]:
# is videoId ok as a unique key? or should we generate a unique speech id?
# what if we have different videos for the same speech from different sources?
# or what if we want to compare alternate transcripts for the same video?

In [10]:
# checkpoint - save a working copy to file
#library.to_parquet("library.parquet")
#library.to_json("library.json", orient='split')


In [11]:
# write new entries to mongodb

lc = db["library"]

cursor = lc.find()
# an empty collection has no "_id" column, so don't fail when it is missing
old_lib = pd.DataFrame(list(cursor)).drop(columns="_id", errors="ignore")
known_ids = old_lib["videoId"] if "videoId" in old_lib.columns else []

# Compare on the videoId column without re-indexing `library`: later cells
# still read library['videoId'], and to_json(orient="records") omits the
# index, so videoId must stay a column to be stored with each record.
new_lib = library[~library["videoId"].isin(known_ids)]  # new entries only; might be none
print("Writing {} new records to collection".format(new_lib.shape[0]))

# round-trip through JSON to get plain Python values that pymongo can store
records = json.loads(new_lib.to_json(orient="records"))
if records:  # insert_many rejects an empty list
    lc.insert_many(records)

Writing 0 new records to collection


In [32]:
# test - query the library collection to make sure it looks good
#cursor = db["library"].find()
#pd.DataFrame(list(cursor)).drop(columns="_id") 

Unnamed: 0,videoId,publishedAt,title,source,candidate,source_desc
0,ZcR_kuCJvD8,2020-07-06T01:36:43Z,Vice President Biden delivers remarks on Trump...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
1,m_lYduRUumA,2020-06-29T19:45:09Z,Vice President Biden's Remarks at Global Pride...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
2,yy6jBRVDcpI,2020-06-21T01:53:05Z,Vice President Biden Delivers Remarks On Havin...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
3,9CBReIcgQYw,2020-06-19T20:42:10Z,"LIVE: President Trump in Tulsa, Oklahoma",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
4,-VB24DMVUwY,2020-06-05T03:13:42Z,Join Vice President Biden for A Town Hall with...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
...,...,...,...,...,...,...
99,QeYyLho2bnQ,2019-04-08T14:11:19Z,"LIVE: President Trump in Macon, GA",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
100,B1iI8AMLGiI,2019-04-08T14:11:19Z,"LIVE: President Trump in Fort Myers, FL",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
101,zWICNmMVhLs,2019-04-08T14:11:19Z,"LIVE: President Trump in Lebanon, OH",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
102,yGfh4eLgKYw,2019-04-08T14:11:19Z,"LIVE: President Trump in Council Bluffs, IA",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...


## Build document df of raw transcripts

In [65]:
# load existing doc from a file if it exists
try:
    doc = pd.read_json("doc.json", orient="split")
    # orient="split" already restores the saved index; only promote an
    # explicit "index" column if the file happens to contain one
    if "index" in doc.columns:
        doc = doc.set_index("index")
except (OSError, ValueError) as exc:  # missing or unreadable file: create blank
    print("Couldn't load doc.json, starting with an empty doc: {}".format(exc))
    doc = pd.DataFrame()

In [17]:
# Download transcripts for every library video that is not already in doc.
# TranscriptsDisabled / NoTranscriptFound come from the imports cell at the top.

# library may carry videoId as a column or, after re-indexing, as its index
video_ids = library['videoId'] if 'videoId' in library.columns else library.index

# set lookup instead of scanning the whole doc frame for every video
known_ids = set(doc['videoId']) if 'videoId' in doc.columns else set()

new_frames = []
try:
    for n, v in enumerate(video_ids, start=1):
        if v in known_ids:
            continue  # already exists, don't overwrite it. skip to next loop
        try:
            vdf = pd.DataFrame(YouTubeTranscriptApi.get_transcript(v))
            vdf['videoId'] = v
            new_frames.append(vdf)
            known_ids.add(v)
        except TranscriptsDisabled:
            print("Transcripts disabled for video: %s" % v)
        except NoTranscriptFound:
            print("No transcript found for video: %s" % v)
        if n % 10 == 0:
            print(n)  # print progress numbers
finally:
    # Merge once at the end (DataFrame.append was removed in pandas 2.0).
    # The finally clause keeps transcripts fetched before an unexpected error.
    if new_frames:
        doc = pd.concat(([] if doc.empty else [doc]) + new_frames)

10
20
30
40
Transcripts disabled for video: MJLq0Nw9YgM
50
No transcript found for video: hDhploWe9Ec
60
70
Transcripts disabled for video: wHdN6KFxmgs
Transcripts disabled for video: imluGUa6YEo
Transcripts disabled for video: YdeaPEP5K9k
80
Transcripts disabled for video: vEwJwLUZjyU
Transcripts disabled for video: Z3KW869TSL0
Transcripts disabled for video: WrRZYoPWWFU
Transcripts disabled for video: QfV9mw42dvs
Transcripts disabled for video: vWCt3fcQqgM
90
Transcripts disabled for video: 2IlvPU2-BIM
Transcripts disabled for video: G_KNy7bdc80
Transcripts disabled for video: QeYyLho2bnQ
Transcripts disabled for video: zWICNmMVhLs
Transcripts disabled for video: naiRdSkaVpk
Transcripts disabled for video: cln_VNkl688
Transcripts disabled for video: 9PT6KJxYdnQ
100
Transcripts disabled for video: U6OPcaMMQyQ


In [18]:
# display the accumulated transcript rows
doc

Unnamed: 0,text,start,duration,videoId
0,">>Good afternoon,\neveryone.",97.596,2.369,yy6jBRVDcpI
1,I apologize for\nthe slight delay.,99.999,1.501,yy6jBRVDcpI
2,We wanted to make\nsure everyone,101.533,1.335,yy6jBRVDcpI
3,was able to get here.,102.902,2.202,yy6jBRVDcpI
4,Let me think the\nlocal officials,105.137,1.068,yy6jBRVDcpI
...,...,...,...,...
280,we got 1.6 billion dollars we have,1153.830,6.330,Bw0WRGkxi7I
281,another 1.6 million we have a third 1.6,1156.590,5.760,Bw0WRGkxi7I
282,billion coming but we want to build it,1160.160,5.130,Bw0WRGkxi7I
283,quickly at one time we don't want to do,1162.350,5.330,Bw0WRGkxi7I


In [31]:
# checkpoint - save a copy
# (orient="split" stores columns, index and data under separate JSON keys)
#doc.to_parquet("doc.parquet")
doc.to_json("doc.json", orient="split")

In [67]:
# load it in another notebook with:
#doc = pd.read_json("doc.json", orient="split")  # orient="split" restores the saved index; no set_index needed

In [68]:
# display the library for inspection
library

Unnamed: 0,videoId,publishedAt,title,source,candidate,source_desc
0,yy6jBRVDcpI,2020-06-21T01:53:05Z,Vice President Biden Delivers Remarks On Havin...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
0,9CBReIcgQYw,2020-06-19T20:42:10Z,"LIVE: President Trump in Tulsa, Oklahoma",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
1,-VB24DMVUwY,2020-06-05T03:13:42Z,Join Vice President Biden for A Town Hall with...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
2,VYqEo1ehYII,2020-06-02T22:28:33Z,Joe Biden Addresses the Unfolding Situation in...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
3,b2aWDG87nJo,2020-06-02T22:23:53Z,Joe Biden Addresses the Nation On the Civil Un...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
...,...,...,...,...,...,...
15,B1iI8AMLGiI,2019-04-08T14:11:19Z,"LIVE: President Trump in Fort Myers, FL",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
14,zWICNmMVhLs,2019-04-08T14:11:19Z,"LIVE: President Trump in Lebanon, OH",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
13,naiRdSkaVpk,2019-04-08T14:11:19Z,"LIVE: President Trump in Richmond, KY",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
19,LrCHIJr4pmE,2019-04-08T14:11:19Z,"LIVE: President Trump in Mosinee, WI",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...
