In [7]:
import pandas as pd
import requests
import json
import pymongo
from youtube_transcript_api import YouTubeTranscriptApi

In [8]:
# connect to mongodb
myclient = pymongo.MongoClient("mongodb://localhost/")
db = myclient["speeches"]

In [9]:
# define youtube playlists to pull video transcripts from
# playlist_id, candidate, source_description
sources = [['PLB92o2PvjqnfXeskcxX3GR6alCzdVFHjQ','Biden','Joe Biden Official YouTube Channel. Playlist: Livestreams, Speeches, and Debates'],
           ['PLKOAoICmbyV2XOjXa9u00njJ6fTLpOK5x','Trump','Donald J Trump Official YouTube Channel. Playlist: Trump Rallies']]

## Build library of videos

In [10]:
# pull the video IDs, titles, etc from the playlist
def playlist_index(source):
    # api key AIzaSyC7mEGnVGAFmoy23HdHJo877vZQ7DsYLeg
    pl_df = pd.DataFrame()
    next_page_str = ""
    while True: # loop through pages of 25 results
        r = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?" + 
                         "part=id%2CcontentDetails%2Csnippet&" +
                         "maxResults=25&" + # default of 5
                         "playlistId=" + source[0] + "&" + next_page_str +
                         "key=AIzaSyC7mEGnVGAFmoy23HdHJo877vZQ7DsYLeg", 
                         headers = {'Accept': 'application/json'})
        if r.status_code != 200:
            print("Error for playlist " + source[0])
            print(r.text)
            break
        pl_json = json.loads(r.text)
        df = pd.DataFrame(
            [i['snippet']['resourceId']['videoId'],
             i['snippet']['publishedAt'],
             i['snippet']['title']] for i in pl_json['items']
        )
        df.columns = ['videoId','publishedAt','title']
        df['source'] = "youtube"
        df['candidate'] = source[1]
        df['source_desc'] = source[2]
        pl_df = pl_df.append(df)
        if "nextPageToken" not in pl_json:
            break
        next_page_str = "pageToken="+ pl_json['nextPageToken'] + "&" 
    # clean up playlist
    pl_df = pl_df
    return pl_df

In [13]:
# set up a library of documents

try:
    cursor = db["library"].find()
    library = pd.DataFrame(list(cursor)).drop(columns="_id")
except: # can't load from mongodb, create new collection
    print("Couldn't load library, creating new one")
    library = pd.DataFrame()
    db.create_collection('library')
for s in sources:
    library = library.append(playlist_index(s))

Couldn't load library, creating new one


CollectionInvalid: collection library already exists

In [27]:
# remove all the duplicates added
library = library.sort_values(["publishedAt"], ascending=False)\
        .drop_duplicates(["videoId"],keep="first")

In [28]:
library.sort_values('publishedAt',ascending=False).head()

Unnamed: 0,videoId,publishedAt,title,source,candidate,source_desc
0,T5pTdbSMcrI,2020-07-14T18:26:29Z,Joe Biden Delivers Remarks On His “Build Back ...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
1,ZcR_kuCJvD8,2020-07-06T01:36:43Z,Vice President Biden delivers remarks on Trump...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
2,m_lYduRUumA,2020-06-29T19:45:09Z,Vice President Biden's Remarks at Global Pride...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
3,yy6jBRVDcpI,2020-06-21T01:53:05Z,Vice President Biden Delivers Remarks On Havin...,youtube,Biden,Joe Biden Official YouTube Channel. Playlist: ...
0,9CBReIcgQYw,2020-06-19T20:42:10Z,"LIVE: President Trump in Tulsa, Oklahoma",youtube,Trump,Donald J Trump Official YouTube Channel. Playl...


In [None]:
# is videoId ok as a unique key? or should we generate a unique speech id?
# what if we have different videos for the same speech from different sources?
# or what if we want to compare alternate transcripts for the same video?


In [None]:
# write new entries to mongodb

#lc = db["library"]
cursor = db["library"].find()
curlist = list(cursor)

if len(curlist) == 0:
    old_lib = pd.DataFrame(columns=library.columns)
else:
    old_lib = pd.DataFrame(curlist).drop(columns="_id") # existing library entries

library=library.set_index(['videoId']) 
new_lib = library[~library.index.isin(old_lib.videoId)].reset_index() # new entries only; might be none

print("Writing {} new records to collection".format(new_lib.shape[0]))
records = json.loads(new_lib.to_json(orient="records")) # convert new entries to JSON
if len(records) > 0:
    db["library"].insert_many(records) # write to db

library = library.reset_index()

In [None]:
# test - query the library collection to make sure it looks good
#cursor = db["library"].find()
#pd.DataFrame(list(cursor)).drop(columns="_id")

In [None]:
#db.library.drop()

## Build document df of raw transcripts

In [29]:
# load existing doc from a file if it exists
    
try:
    cursor = db["transcripts"].find()
    curlist = list(cursor)
    if len(curlist) == 0:
        transcripts = pd.DataFrame(columns=["text","start","duration","videoId"])
    else:
        transcripts = pd.DataFrame(curlist).drop(columns="_id") # existing raw entries
    
except: # can't load from mongodb, create new collection
    print("Couldn't load library, creating new one")
    transcripts = pd.DataFrame(columns=["text","start","duration","videoId"])
    #db.create_collection('transcripts')


In [30]:
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

n = 0
for v in library['videoId']:
    if 'videoId' in transcripts.columns and transcripts[transcripts['videoId'] == v]['videoId'].count() > 0:
        n = n + 1
        continue # already exists, don't overwrite it. skip to next loop
    try:
        t = YouTubeTranscriptApi.get_transcript(v)
        rec = {"videoId": v, "transcript": t}
        db["transcripts"].insert_one(rec)
    except TranscriptsDisabled:
        print("Transcripts disabled for video: %s" % v)
    except NoTranscriptFound:
        print("No transcript found for video: %s" % v)
    n = n + 1
    if n % 10 == 0:
        print(n) # print progress numbers

10
20
30
Transcripts disabled for video: MJLq0Nw9YgM
No transcript found for video: hDhploWe9Ec
60
Transcripts disabled for video: wHdN6KFxmgs
Transcripts disabled for video: imluGUa6YEo
Transcripts disabled for video: YdeaPEP5K9k
Transcripts disabled for video: vEwJwLUZjyU
Transcripts disabled for video: WrRZYoPWWFU
Transcripts disabled for video: Z3KW869TSL0
Transcripts disabled for video: QfV9mw42dvs
Transcripts disabled for video: vWCt3fcQqgM
Transcripts disabled for video: 2IlvPU2-BIM
Transcripts disabled for video: G_KNy7bdc80
Transcripts disabled for video: QeYyLho2bnQ
Transcripts disabled for video: zWICNmMVhLs
Transcripts disabled for video: naiRdSkaVpk
100
Transcripts disabled for video: cln_VNkl688
Transcripts disabled for video: 9PT6KJxYdnQ
Transcripts disabled for video: U6OPcaMMQyQ


In [None]:
#transcripts
#db.transcripts.drop()

In [32]:
#v = "T5pTdbSMcrI"
#t = YouTubeTranscriptApi.get_transcript(v)
#rec = {"videoId": v, "transcript": t}
#db["transcripts"].insert_one(rec)
cursor = db["transcripts"].find()
token = pd.DataFrame(list(cursor)).drop(columns="_id").set_index('videoId')


In [26]:
library = playlist_index(sources[0])
library = library.append(playlist_index(sources[1]))


In [34]:
df = library.join(token, on="videoId")

In [38]:
df[~df['transcript'].isnull().values]['videoId'].tolist()

['T5pTdbSMcrI',
 'ZcR_kuCJvD8',
 'm_lYduRUumA',
 'yy6jBRVDcpI',
 '9CBReIcgQYw',
 '-VB24DMVUwY',
 'VYqEo1ehYII',
 'b2aWDG87nJo',
 '0VpB4tllFkk',
 'nEE8XmPqkH4',
 'uJr0dEjSkHE',
 'Z0Asi6UAUG8',
 '7S4hdImZXdw',
 'C5583GLr__Y',
 'Bc3wdOPWPLs',
 'sn1hWHUwvn4',
 'E1XUeJA0-f0',
 '1lbMK2CG63k',
 'EkHIsiqkL8I',
 '0oX8uWNAgKs',
 '9y5bdQ3r-Qc',
 'GXuEUeCH8hg',
 'aVzLyQ1_Sm4',
 'iIB4zpW6-Q0',
 'GdpV6_UVK5k',
 '4PEkEc1ZFBs',
 'lnG13ft9BeI',
 '3TFqeMSsTx0',
 'JfBVHeZkVFs',
 'QxhsV_6dWVs',
 '-8KiohxTJ0Y',
 'l1-6cwetV5k',
 'LXqI5QFNiP8',
 'NdWlxDtt8Qk',
 'AzrkZp0RxI8',
 'rUbJ4jhFF7g',
 'no0MnomPzLw',
 '0pJnrHKoBHY',
 '89ewdfbqGxo',
 'AEJ99-7Rm_k',
 '9e8mjUJcSb8',
 'Fgq_mOdAUq0',
 'v-es4fBfCCY',
 'WXtnH_r1HC8',
 'R33k5M4a8aU',
 'PA0duhpkkT0',
 '2LC-Phx4_nU',
 'ygLy6HcExTg',
 'mMkBS2WzfHY',
 'jrDlx7v5WcQ',
 'DvWrekxwo2o',
 'cH1LXd0dZ8g',
 'FiPVOx-cAfQ',
 'scZgh2GDM6U',
 'sL1UF0KDwsE',
 'xoBgzVWub2A',
 'WP0QAeuSFsI',
 'j0nUc34rCzY',
 'YYQ5FAp6INU',
 'lanHUiGPWfo',
 'T-oF_qIDxUw',
 'dwRA5qmm-e4',
 'KgLM08