# Download Data
Downloads and saves the channel's non-video data (upload metadata and captions) using the YouTube Data API.

In [70]:
# Reload imported modules automatically before each cell runs, so edits to
# project code (e.g. lib.Episode) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import os

import re
import google_auth_oauthlib.flow
from googleapiclient.discovery import build
import googleapiclient.errors
from pprint import pprint
import json
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm
import youtube_dl
import pickle
from glob import glob
import pandas as pd
import numpy as np
from lib.Episode import Episode, EpisodeFactory
import time
import shutil

In [8]:
# Short name of the channel being processed; every path below derives from it.
ID = "lex"

# Directories: raw data, downloaded videos, and the visualisation website.
BASE = f"../data/{ID}"
VIDEOS_LOCATION = f"/Volumes/{ID}/videos/"
WEBSITE = f"../../{ID}-vis/public/"

# JSON files written and read by the cells of this notebook.
UPLOADS = f"{BASE}/uploads.json"
STATS = f"{BASE}/stats.json"
COMMENTS = f"{BASE}/comments.json"
CAPTIONS = f"{BASE}/captions.json"
CAPTIONS_FAILED = f"{BASE}/captions_failure.json"

In [9]:
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

# The API key is read from the environment so that it is never committed with
# the notebook. The key that was previously hardcoded in this cell has been
# exposed and should be revoked and replaced.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key"
    )

# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

api_service_name = "youtube"
api_version = "v3"

# Client used by every API call in this notebook.
# https://developers.google.com/resources/api-libraries/documentation/youtube/v3/python/latest/
youtube = build(api_service_name, api_version, developerKey=API_KEY)

In [11]:
# Look up the channel by username; the response's
# items[0]["contentDetails"]["relatedPlaylists"]["uploads"] is the id of the
# playlist that holds every upload of the channel.
# (Other parts that were tried: "snippet,contentDetails,statistics,uploads".)
channel_request = youtube.channels().list(
    part="contentDetails",
    forUsername="lexfridman",
)
channel_request.execute()

{'kind': 'youtube#channelListResponse',
 'etag': 'aY9fEZBveDyoom-c6zy897RtVYA',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': '00S2G4SJGIaEORsQ1zHp-OEyjB4',
   'id': 'UCSHZKyawb77ixDdsGog4iWA',
   'contentDetails': {'relatedPlaylists': {'likes': '',
     'favorites': '',
     'uploads': 'UUSHZKyawb77ixDdsGog4iWA'}}}]}

In [12]:
# Uploads-playlist id per channel, keyed by the dataset ID set in the config cell.
# NOTE(review): "jre" is assumed to be the ID used for the JRE dataset — confirm.
UPLOADS_IDS = {
    "jre": "UUzQUP1qoWDoEbmsQxvdjxgQ",  # JRE
    "lex": "UUSHZKyawb77ixDdsGog4iWA",  # LEX
}

# Select the playlist that matches ID instead of relying on which assignment
# happens to come last; an unknown ID fails loudly with a KeyError.
UPLOADS_ID = UPLOADS_IDS[ID]

In [14]:
### Get uploads
def get_video_ids(upload_id):
    """Return the video id of every item in the playlist `upload_id`.

    Requests the playlist 50 items at a time through the module-level
    `youtube` client and follows `nextPageToken` until the last page.
    A single-line progress indicator is printed before each request.
    """
    collected = []
    next_token = None
    reported_total = 0
    more_pages = True

    while more_pages:
        print(f"{len(collected)}/{reported_total} -> {next_token}", end="\r")

        page = youtube.playlistItems().list(
            part="id,contentDetails",
            playlistId=upload_id,
            pageToken=next_token,
            maxResults=50,
        ).execute()

        for entry in page["items"]:
            collected.append(entry["contentDetails"]["videoId"])

        reported_total = page["pageInfo"]["totalResults"]

        # The last page carries no continuation token.
        more_pages = "nextPageToken" in page
        if more_pages:
            next_token = page["nextPageToken"]

    return collected

# Fetch the id of every upload. The error is printed and then re-raised:
# swallowing it would leave `video_ids` undefined (or stale from an earlier
# run) and the cells below would fail later or silently use old data.
try:
    video_ids = get_video_ids(UPLOADS_ID)
except Exception as e:
    print(e)
    raise

500/509 -> CPQDEAA

In [15]:
# Number of video ids collected from the uploads playlist.
len(video_ids)

509

In [16]:
def get_uploads():
    """Fetch the video resource for every id in the global `video_ids`.

    Issues one `videos().list` request per id through the module-level
    `youtube` client.

    Returns:
        list: the first item of each response, in the order of `video_ids`.
        Ids for which the API returns no item are reported and skipped.
    """
    uploads = []

    for video_id in tqdm(video_ids):
        response = youtube.videos().list(
            id=video_id,
            part="contentDetails,id,liveStreamingDetails,recordingDetails,snippet,statistics,topicDetails,status",
        ).execute()

        items = response.get("items", [])

        # Sanity check to ensure all responses have length 1
        if len(items) != 1:
            print("Expected exactly one item for", video_id, "but got", len(items))

        # An empty response used to crash on items[0]; skip the id instead.
        if not items:
            continue

        uploads.append(items[0])

    return uploads

# Download the metadata of every upload and persist it as JSON.
uploads = get_uploads()
with open(UPLOADS, "w") as f:
    json.dump(uploads, f)

100%|██████████| 509/509 [01:14<00:00,  6.84it/s]


In [17]:
# Read the saved uploads back from disk.
with open(UPLOADS, "r") as f:
    uploads = json.loads(f.read())
# Contents of uploads: how many there are, plus one sample entry
(len(uploads), uploads[15])

(509,
 {'kind': 'youtube#video',
  'etag': '_iMQxh8EnTTt_V86KIUqiYp3erk',
  'id': '_L3gNaAVjQ4',
  'snippet': {'publishedAt': '2020-10-22T01:09:25Z',
   'channelId': 'UCSHZKyawb77ixDdsGog4iWA',
   'title': 'George Hotz: Hacking the Simulation & Learning to Drive with Neural Nets | Lex Fridman Podcast #132',
   'description': "George Hotz (geohot) is a programmer, hacker, and the founder of Comma.ai. Please support this podcast by checking out our sponsors:\n- Four Sigmatic: https://foursigmatic.com/lex and use code LexPod to get up to 40% & free shipping\n- Decoding Digital: https://appdirect.com/decoding-digital\n- ExpressVPN: https://expressvpn.com/lexpod and use code LexPod to get 3 months free\n\nEPISODE LINKS:\nComma.ai's Twitter: https://twitter.com/comma_ai\nComma.ai's Website: https://comma.ai/\nGeorge's Instagram: https://www.instagram.com/georgehotz\nGeorge's Twitch: https://www.twitch.tv/georgehotz\nGeorge's Twitter: https://twitter.com/realgeorgehotz\nComma.ai YouTube (unof

In [20]:
def get_captions():
    """Download the transcript of every upload that has none saved yet.

    Transcripts already present in the CAPTIONS file are kept, so the
    function can be re-run to resume an interrupted download. Progress is
    written back to CAPTIONS after every 10 newly downloaded transcripts.

    Returns:
        tuple: (captions, failed_to_get_captions) where `captions` maps a
        video id to its transcript and `failed_to_get_captions` lists the
        ids whose transcript could not be fetched during this call.
    """
    failed_to_get_captions = []

    # Start from the previously saved transcripts, if there are any; on the
    # very first run the file does not exist yet.
    captions = {}
    if os.path.exists(CAPTIONS):
        with open(CAPTIONS, "r") as f:
            captions = json.load(f)
    with open(UPLOADS, "r") as f:
        uploads = json.load(f)

    downloaded = 0
    for upload in tqdm(uploads):
        video_id = upload["id"]

        if video_id in captions:
            continue

        try:
            captions[video_id] = YouTubeTranscriptApi.get_transcript(video_id)
        except Exception:
            # Best effort: remember the failure and move on. `Exception`
            # (not a bare except) lets Ctrl-C still interrupt the loop.
            failed_to_get_captions.append(video_id)
            continue

        downloaded += 1

        # save progress, only when a new multiple of 10 has just been reached
        if downloaded % 10 == 0:
            with open(CAPTIONS, "w") as f:
                json.dump(captions, f)

    print("Downloaded", downloaded, "items")
    return captions, failed_to_get_captions

# Fetch the missing transcripts, then persist both the transcripts and the
# ids that failed.
captions, failed_to_get_captions = get_captions()

with open(CAPTIONS, "w") as f:
    json.dump(captions, f)

with open(CAPTIONS_FAILED, "w") as f:
    json.dump(failed_to_get_captions, f)

print(len(captions), len(failed_to_get_captions))

100%|██████████| 509/509 [06:42<00:00,  1.27it/s]


Downloaded 435 items
435 74


In [21]:
print(len(captions), len(failed_to_get_captions))

print("Missing the following items")

# Index the uploads by id once instead of rescanning the whole list for
# every failed id. Iterating in reverse keeps the first upload when an id
# occurs more than once, matching the previous "[0]" lookup.
uploads_by_id = {u["id"]: u for u in reversed(uploads)}

# (title, ISO-8601 duration) of every video whose transcript is missing
[
    (
        uploads_by_id[video_id]["snippet"]["title"],
        uploads_by_id[video_id]["contentDetails"]["duration"],
    )
    for video_id in failed_to_get_captions
]

435 74
Missing the following items


[('Manolis Kellis: Meaning of Life, the Universe, and Everything | Lex Fridman Podcast #142',
  'PT2H10M57S'),
 ("10 things I'm grateful for this Thanksgiving", 'PT7M37S'),
 ('Erik Brynjolfsson: Economics of AI, Social Networks, and Technology | Lex Fridman Podcast #141',
  'PT1H39M50S'),
 ('Lisa Feldman Barrett: Love, Evolution, and the Human Brain | Lex Fridman Podcast #140',
  'PT2H15M38S'),
 ('Andrew Huberman: Neuroscience of Optimal Performance | Lex Fridman Podcast #139',
  'PT2H32M15S'),
 ('Yaron Brook: Ayn Rand and the Philosophy of Objectivism | Lex Fridman Podcast #138',
  'PT2H49M46S'),
 ('Alex Filippenko: Supernovae, Dark Energy, Aliens & the Expanding Universe | Lex Fridman Podcast #137',
  'PT2H35M48S'),
 ('Dan Carlin: Hardcore History | Lex Fridman Podcast #136', 'PT3H21M26S'),
 ('Charles Isbell: Computing, Interactive AI, and Race in America | Lex Fridman Podcast #135',
  'PT2H23M51S'),
 ('Eric Weinstein: On the Nature of Good and Evil, Genius and Madness | Lex Fridman 

# Create Episodes cache
Create an `Episode` object (see `lib/Episode`) from each upload and cache the resulting list as a pickle file.

In [22]:
# Two cache flavours: every episode, or only the first 100 of them.
CACHE_ALL = f"./{ID}-episodes.pickle"
CACHE_SMALL = f"./{ID}-episodes-small.pickle"
CACHE = CACHE_ALL

print(f"Generating new pickle {CACHE}...")

factory = EpisodeFactory(BASE)
episodes = factory.create_episodes(skip_comments=True)

# The small cache stores a 100-episode slice; the full cache stores everything.
episodes_to_cache = episodes[:100] if CACHE == CACHE_SMALL else episodes
with open(CACHE, "wb") as f:
    pickle.dump(episodes_to_cache, f)

print(f"Number of loaded episodes: {len(episodes)}")

Generating new pickle ./lex-episodes.pickle...
Number of loaded episodes: 509


In [24]:
# Save CSV version of this to website directory
# Output column name -> Episode attribute the column is read from.
EPISODE_COLUMNS = {
    "id": "video_id",
    "title": "title",
    "number": "number",
    "published": "published_at",
    "guests": "guests",
    "main": "is_main_episode",
    "likes": "likes",
    "dislikes": "dislikes",
    "views": "views",
    "commentCount": "comment_count",
}

# One row per episode, columns in the order declared above.
ep_df = pd.DataFrame(
    [
        tuple(getattr(e, attribute) for attribute in EPISODE_COLUMNS.values())
        for e in episodes
    ],
    columns=list(EPISODE_COLUMNS),
)

# ep_df.to_csv(WEBSITE + "episodes.csv")
ep_df.head()

Unnamed: 0,id,title,number,published,guests,main,likes,dislikes,views,commentCount
0,W7wJDJ56c88,DeepMind solves protein folding | AlphaFold 2,,2020-12-02T22:39:08Z,,False,1579,7,12640,140
1,bgNzUxyS-kQ,"Manolis Kellis: Meaning of Life, the Universe,...",142.0,2020-11-30T19:52:50Z,,False,2302,97,59002,564
2,ipQBP1wRFNM,10 things I'm grateful for this Thanksgiving,,2020-11-27T03:03:49Z,,False,2837,44,34296,410
3,NOReE-3EBhI,"Erik Brynjolfsson: Economics of AI, Social Net...",141.0,2020-11-25T18:00:29Z,,False,2663,95,84286,381
4,Lks97-GLElk,Comma.ai Drive and Tour with George Hotz and L...,,2020-11-23T19:32:08Z,,False,5208,70,91814,583


# Download Videos

In [43]:
CACHE = f"./{ID}-episodes.pickle"

# Load the episode cache written by the "Create Episodes cache" section.
# pickle can execute arbitrary code on load: only open caches this notebook
# generated itself.
with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

# Keep only the episodes flagged as main episodes.
main_eps = list(filter(lambda ep: ep.is_main_episode, episodes))
print("Total main episodes", len(main_eps))

Total main episodes 1470


In [75]:
# Videos already on disk. File names end in "-<11-character video id>.mp4",
# so the id is the 11 characters before the extension.
video_files = list(glob(VIDEOS_LOCATION + "*.mp4"))
video_files_ids = [v[-15:-4] for v in video_files]
downloaded_ids = set(video_files_ids)  # set: constant-time membership tests
downloaded_main = [e for e in main_eps if e.video_id in downloaded_ids]

# sort to give priority to newer episodes first
# (filtering by id keeps a deterministic order and does not require Episode
# objects to be hashable)
missing_videos = sorted(
    [e for e in main_eps if e.video_id not in downloaded_ids],
    key=lambda ep: ep.number,
    reverse=True,
)

print("Total missing main videos", len(missing_videos))

ydl = youtube_dl.YoutubeDL(
    {
        # MP4 at 360p
        "format": "18",
        # Write straight into the videos directory using youtube-dl's default
        # "<title>-<id>.<ext>" naming. The previous manual move rebuilt the
        # file name from the raw title, which does not match the name on disk
        # whenever youtube-dl sanitizes the title (e.g. "|" or ":").
        "outtmpl": VIDEOS_LOCATION + "%(title)s-%(id)s.%(ext)s",
        # "cookiefile": "./youtube-dl-cookies.txt"
    }
)

downloaded = 0
with ydl:
    for ep in missing_videos:
        video = f"http://www.youtube.com/watch?v={ep.video_id}"

        try:
            result = ydl.download([video])
            print(result)
            downloaded += 1
        except Exception as e:
            # Best effort: report the reason and continue with the next one.
            print("could not download", ep.title, e)

        # Pause between requests to avoid hammering the server.
        time.sleep(15)

print("Downloaded", downloaded, "videos")

Total missing main videos 11
[youtube] wC5TVZ3p_H4: Downloading webpage
[download] Destination: Joe Rogan Experience #1548 - Roy Jones Jr.-wC5TVZ3p_H4.mp4
[download] 100% of 306.32MiB in 00:2467MiB/s ETA 00:005
0
[youtube] ckjwkCbGIu8: Downloading webpage
[download] Destination: Joe Rogan Experience #1547 - Colin Quinn-ckjwkCbGIu8.mp4
[download] 100% of 367.64MiB in 00:2737MiB/s ETA 00:002
0
[youtube] A9PfeA9qFp8: Downloading webpage
[download] Destination: Joe Rogan Experience #1546 - Evan Hafer & Mat Best-A9PfeA9qFp8.mp4
[download] 100% of 431.54MiB in 00:2803MiB/s ETA 00:002
0
[youtube] j-bSjzIPRro: Downloading webpage
[download] Destination: Joe Rogan Experience #1545 - W. Keith Campbell-j-bSjzIPRro.mp4
[download] 100% of 430.29MiB in 00:2899MiB/s ETA 00:001
0
[youtube] ikJq6wcgrXI: Downloading webpage
[download] Destination: Joe Rogan Experience #1544 - Tim Dillon-ikJq6wcgrXI.mp4
[download] 100% of 400.05MiB in 00:2442MiB/s ETA 00:001
0
[youtube] gzAQ7SklDxo: Downloading webpage
[

ERROR: This video is not available.


could not download Joe Rogan Experience #674 - Brian Redban
[youtube] 0swiKKUHIiU: Downloading webpage
[youtube] 0swiKKUHIiU: Downloading embed webpage
[youtube] 0swiKKUHIiU: Refetching age-gated info webpage


ERROR: This video contains content from NFL, who has blocked it on copyright grounds.


could not download Joe Rogan Experience #572 - Dom Irrera
Downloaded 9 videos


## Unused but may be helpful to someone

In [None]:
def get_video_comments(videoId, max=1500):
    """Fetch the comment threads of one video, most relevant first.

    Pages through `commentThreads().list` on the module-level `youtube`
    client until at least `max` threads have been collected or there are no
    more pages. Whole pages are appended, so the result can hold slightly
    more than `max` threads.

    This helper used to be named `get_comments` as well, which made the
    zero-argument `get_comments` below shadow it and then fail when calling
    it with a video id.
    """
    page_token = None
    items = []

    while len(items) < max:
        request = youtube.commentThreads().list(
            part="id,snippet",
            videoId=videoId,
            pageToken=page_token,
            # 100 is the largest page size the API accepts for this endpoint.
            maxResults=100,
            order="relevance",
        )
        response = request.execute()
        items.extend(response["items"])

        if "nextPageToken" in response:
            page_token = response["nextPageToken"]
        else:
            break

    return items


def get_comments():
    """Download the comment threads of every upload into the COMMENTS file.

    Threads already stored in COMMENTS are kept, so the function can be
    re-run to resume. The file is rewritten after every 25 uploads and once
    more at the end. Returns nothing.
    """
    with open(UPLOADS, "r") as f:
        uploads = json.load(f)

    # likeCount, textDisplay, textOriginal, publishedAt
    # authorDisplayName, authorChannelId
    comment_threads = {}
    if os.path.exists(COMMENTS):
        # Resume from an earlier run; on the first run there is no file yet.
        with open(COMMENTS, "r") as f:
            comment_threads = json.load(f)

    for i, upload in enumerate(tqdm(uploads), start=1):
        # Uploads are video resources (saved from videos().list), whose id
        # is the top-level "id" field.
        videoId = upload["id"]

        if videoId not in comment_threads:
            comments = get_video_comments(videoId)
            comment_threads[videoId] = comments
            print(videoId, len(comments))
        else:
            print("Already saved thread for", videoId)

        # save progress
        if i % 25 == 0:
            with open(COMMENTS, "w") as f:
                json.dump(comment_threads, f)

    with open(COMMENTS, "w") as f:
        json.dump(comment_threads, f)






In [None]:
# I use video data instead
def download_audio():
    """Download an audio-only stream for every upload that has none on disk.

    Prefers an "audio/mp4" stream and falls back to the first audio stream
    available. Uploads whose target path already exists, whose video cannot
    be loaded, or that expose no audio stream are reported and skipped.

    NOTE(review): `YouTube` is not imported anywhere in the visible notebook
    (presumably pytube's `YouTube` — confirm); it must be imported before
    this function can run.
    """

    def audio_file(video_id):
        # Target path of the audio for one video.
        return f"data/jre/audio/{video_id}"

    # Load the uploads first so the file is not kept open during downloads.
    with open(UPLOADS, "r") as f:
        uploads = json.load(f)

    for upload in tqdm(uploads):
        # Uploads are video resources (saved from videos().list), whose id
        # is the top-level "id" field; they have no contentDetails.videoId.
        video_id = upload["id"]

        if os.path.exists(audio_file(video_id)):
            print("File exists", video_id, "skipping...")
            continue

        try:
            yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
        except Exception as e:
            print("Could not load api", video_id, str(e))
            continue

        audio_streams = yt.streams.filter(only_audio=True)

        if len(audio_streams) == 0:
            print("No audio streams available for", video_id)
            continue

        mp4_streams = [s for s in audio_streams if s.mime_type == "audio/mp4"]
        stream = audio_streams[0] if len(mp4_streams) == 0 else mp4_streams[0]

        print(f"Downloading [{video_id}] {stream.mime_type} {stream.abr}")
        stream.download(audio_file(video_id))
        print("\t... Done")

In [None]:
# Unused utils
def get_timestamps(c):
    """Return (likeCount, textOriginal) when a comment's text has a timestamp.

    `c` is either a comment thread, whose text lives under
    c["snippet"]["topLevelComment"]["snippet"], or a plain comment, whose
    text lives directly under c["snippet"]. Returns None when the text
    contains nothing that looks like a timestamp (e.g. "12:05", "1:23:45").
    """
    timestamp = re.compile(r"(\d?\d?:)?\d?\d?:\d{2}")

    # Pick the snippet that actually carries the text and the like count.
    outer = c["snippet"]
    if "topLevelComment" in outer:
        details = outer["topLevelComment"]["snippet"]
    else:
        details = outer

    text = details["textOriginal"]
    if timestamp.search(text) is None:
        return None
    return details["likeCount"], text