## Design:

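Search YouTube for titles of the form `IMG <4 random digits>` and keep collecting results until the list reaches the target size (originally 200; the Params cell currently sets `vidNumber = 70`).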

In [14]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [15]:
# Params: knobs for the video search; edit these before running the notebook.

# NOTE(review): presumably a region code for the search — verify the search cell actually reads it.
location = '' # 'RU' or '' for default
# NOTE(review): intended first/last year of the publication window — verify the search cell reads them.
startDate = 2009
endDate = 2011
videoDuration = 'short'  # forwarded as the search request's videoDuration filter
vidNumber = 70  # the search loop keeps requesting until the results table has at least this many rows

## Pre Reqs

In [16]:
# Load variables from a local .env file (if present) into the process
# environment, then read the YouTube Data API key from it.
load_dotenv()
SECRET_KEY = os.getenv('YOUTUBE_API_KEY')
if not SECRET_KEY:
    # Fail fast here: without a key every search request would be rejected
    # later, in the middle of the collection loop, with a less obvious error.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export it or add it to a .env file."
    )

# Empty results table; one row per collected video.
df = pd.DataFrame(columns=[
    'title',
    'link',
    'id',
    'description',
    'author',
    'datePublished'
])

## Helpers

In [17]:
import random
def rand_nums() -> str:
    """Return four random decimal digits as a string (leading zeros kept).

    The generated string is also printed, which leaves a trace of every
    number that was tried.
    """
    digits = "".join(str(random.randint(0, 9)) for _ in range(4))
    print(digits)
    return digits

In [18]:
def convert_year_to_rfc3339(year, today=None):
    """Return midnight UTC of yesterday's month/day in ``year`` as RFC 3339.

    Parameters
    ----------
    year : int
        Calendar year to place the date in.
    today : optional
        Reference date (anything ``pd.Timestamp`` accepts). Defaults to the
        current local date; mainly useful for testing.

    Returns
    -------
    str
        Timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    reference = pd.Timestamp.today() if today is None else pd.Timestamp(today)
    # Real date arithmetic instead of ``day - 1``: on the 1st of a month the
    # latter yields day 0, which is not a valid date.
    yesterday = reference.normalize() - pd.Timedelta(days=1)
    month, day = yesterday.month, yesterday.day
    # Feb 29 does not exist in non-leap target years; fall back to Feb 28.
    if month == 2 and day == 29 and not pd.Timestamp(year=year, month=1, day=1).is_leap_year:
        day = 28
    timestamp = pd.Timestamp(year=year, month=month, day=day, hour=0, minute=0, second=0, tz='UTC')
    return timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')

In [19]:
def _yesterday_month_day(year, today=None):
    """Return (month, day) of the day before ``today``, valid within ``year``."""
    reference = pd.Timestamp.today() if today is None else pd.Timestamp(today)
    # Real date arithmetic instead of ``day - 1``: on the 1st of a month the
    # latter yields day 0, which is not a valid date.
    yesterday = reference.normalize() - pd.Timedelta(days=1)
    month, day = yesterday.month, yesterday.day
    # Feb 29 does not exist in non-leap target years; fall back to Feb 28.
    if month == 2 and day == 29 and not pd.Timestamp(year=year, month=1, day=1).is_leap_year:
        day = 28
    return month, day


def start_day(year, today=None):
    """Return 00:00:00 UTC of yesterday's month/day in ``year`` (RFC 3339).

    ``today`` optionally overrides the reference date (defaults to now).
    """
    month, day = _yesterday_month_day(year, today)
    timestamp = pd.Timestamp(year=year, month=month, day=day, hour=0, minute=0, second=0, tz='UTC')
    return timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')


def end_day(year, today=None):
    """Return 23:59:00 UTC of yesterday's month/day in ``year`` (RFC 3339).

    ``today`` optionally overrides the reference date (defaults to now).
    """
    month, day = _yesterday_month_day(year, today)
    # NOTE(review): ends at 23:59:00, so the final minute of the day is
    # excluded — confirm this is intended.
    timestamp = pd.Timestamp(year=year, month=month, day=day, hour=23, minute=59, second=0, tz='UTC')
    return timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')


# Quick visual check; the cell displays the value of the last expression.
start_day(2011)
end_day(2011)

'2011-12-27T23:59:00Z'

# TODO: Videos from today n years ago
# TODO: Viewer count for video

# maybe location or other interesting things

Front end notes:
If the user has viewed all of the videos, go to the old ones.


## Core

In [20]:

# Collect videos by searching YouTube for "IMG <4 random digits>" until at
# least `vidNumber` distinct videos have been gathered.

# `apiclient` is only a legacy alias of the same google-api-python-client package.
from googleapiclient.discovery import build

youtube = build('youtube', 'v3', developerKey=SECRET_KEY)

# Request parameters shared by every search; the publication window and the
# duration filter come from the Params cell instead of being hard-coded.
search_kwargs = dict(
    part='snippet',
    type='video',
    maxResults=3,
    publishedAfter=convert_year_to_rfc3339(startDate),
    publishedBefore=convert_year_to_rfc3339(endDate),
    videoDuration=videoDuration,
    videoEmbeddable='true',
)
if location:
    # NOTE(review): assumes `location` is meant as an ISO 3166-1 alpha-2
    # region code (e.g. 'RU') for the API's `regionCode` filter — confirm.
    search_kwargs['regionCode'] = location

records = []       # one dict per collected video; turned into a DataFrame once at the end
seen_ids = set()   # video ids already collected, to skip repeats across searches

# Backstop so the loop cannot spin forever when searches keep coming back
# empty (each search can contribute at most 3 new videos).
max_requests = 5 * vidNumber
requests_made = 0

while len(records) < vidNumber and requests_made < max_requests:

    num = rand_nums()
    requests_made += 1

    res = youtube.search().list(q=f'IMG {num}', **search_kwargs).execute()

    for item in res.get('items', []):
        video_id = item['id']['videoId']
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)

        snippet = item['snippet']
        records.append({
            'title': snippet['title'],
            'link': 'https://www.youtube.com/watch?v=' + video_id,
            'id': video_id,
            'description': snippet['description'],
            'author': snippet['channelTitle'],
            'datePublished': snippet['publishedAt'],
        })

if len(records) < vidNumber:
    print(f'Stopped after {requests_made} requests with {len(records)} of {vidNumber} videos.')

# Build the table in one go; appending row by row with pd.concat is quadratic.
df = pd.DataFrame(records, columns=[
    'title',
    'link',
    'id',
    'description',
    'author',
    'datePublished'
])

8239
5609
4664
3944
4249
4483
1417
1041
8549
0633
5755
7087
5831
6951
1230
0797
1454
6240
5698
0110
8704
1570
2844
2373


In [21]:
# Preview the collected rows (at most the first 60).
df.head(60)

Unnamed: 0,title,link,id,description,author,datePublished
0,IMG_8239.AVI,https://www.youtube.com/watch?v=xTeq1yqqIRc,xTeq1yqqIRc,民國100年11/5日至高雄市永安區於永新漁港舉辦第一屆海洋音樂季；所掌鏡錄影潘朵拉樂團所帶...,王者逆天,2011-11-11T22:46:00Z
1,MVI 8239,https://www.youtube.com/watch?v=yFTttx6Qqow,yFTttx6Qqow,,vansang Tran,2011-06-04T05:19:04Z
2,IMG 0505,https://www.youtube.com/watch?v=UQqgwMsDCj4,UQqgwMsDCj4,,vansang Tran,2011-06-02T08:38:00Z
3,IMG 5609,https://www.youtube.com/watch?v=kJnA-3c_eBQ,kJnA-3c_eBQ,,philthuha,2011-08-20T18:23:02Z
4,IMG 5609,https://www.youtube.com/watch?v=KikbaV3o8uk,KikbaV3o8uk,,Dandara Voltarello,2011-12-23T23:38:05Z
5,MVI 5609,https://www.youtube.com/watch?v=66LRCEAIOJI,66LRCEAIOJI,,Julia Goldbach Carney,2011-05-19T20:46:05Z
6,IMG 4664,https://www.youtube.com/watch?v=ku3kFTgGBKo,ku3kFTgGBKo,,winnielove4u1986,2011-11-29T15:16:00Z
7,IMG 4664,https://www.youtube.com/watch?v=mNHp3FG7MYo,mNHp3FG7MYo,,Marìa Victoria Silva,2011-03-06T18:43:48Z
8,IMG_4664.mov,https://www.youtube.com/watch?v=Zn13Kev6RVs,Zn13Kev6RVs,,Marie Guilloux,2010-06-23T17:53:05Z
9,IMG 3944,https://www.youtube.com/watch?v=azLpm7-Ug4o,azLpm7-Ug4o,,ladybug3713,2011-04-21T04:46:32Z


## Postprocess

Remove all videos where title is not in the IMG_XXXX format

In [22]:
# A title qualifies when it is exactly "IMG_dddd" or "IMG dddd", optionally
# followed by a file extension (".MOV", ".mp4", ".AVI", ...).
regex_pattern = r"IMG[_ ]\d{4}(?:\.[A-Za-z0-9]{2,4})?"

# fullmatch anchors both ends. str.match only anchors the start, which made
# the optional extension group a no-op and let through titles such as
# "IMG 12345" or "IMG 1234 party". Missing titles count as non-matching.
is_img_title = df['title'].str.fullmatch(regex_pattern, na=False)
filtered_df = df[is_img_title].reset_index(drop=True)

filtered_df.head(60)

Unnamed: 0,title,link,id,description,author,datePublished
0,IMG_8239.AVI,https://www.youtube.com/watch?v=xTeq1yqqIRc,xTeq1yqqIRc,民國100年11/5日至高雄市永安區於永新漁港舉辦第一屆海洋音樂季；所掌鏡錄影潘朵拉樂團所帶...,王者逆天,2011-11-11T22:46:00Z
1,IMG 0505,https://www.youtube.com/watch?v=UQqgwMsDCj4,UQqgwMsDCj4,,vansang Tran,2011-06-02T08:38:00Z
2,IMG 5609,https://www.youtube.com/watch?v=kJnA-3c_eBQ,kJnA-3c_eBQ,,philthuha,2011-08-20T18:23:02Z
3,IMG 5609,https://www.youtube.com/watch?v=KikbaV3o8uk,KikbaV3o8uk,,Dandara Voltarello,2011-12-23T23:38:05Z
4,IMG 4664,https://www.youtube.com/watch?v=ku3kFTgGBKo,ku3kFTgGBKo,,winnielove4u1986,2011-11-29T15:16:00Z
5,IMG 4664,https://www.youtube.com/watch?v=mNHp3FG7MYo,mNHp3FG7MYo,,Marìa Victoria Silva,2011-03-06T18:43:48Z
6,IMG_4664.mov,https://www.youtube.com/watch?v=Zn13Kev6RVs,Zn13Kev6RVs,,Marie Guilloux,2010-06-23T17:53:05Z
7,IMG 3944,https://www.youtube.com/watch?v=azLpm7-Ug4o,azLpm7-Ug4o,,ladybug3713,2011-04-21T04:46:32Z
8,IMG 3944,https://www.youtube.com/watch?v=06bkgoYlr6o,06bkgoYlr6o,,SoradaSurapaeng,2011-08-20T15:48:10Z
9,IMG 3944,https://www.youtube.com/watch?v=wdpSb1AGEQU,wdpSb1AGEQU,,netskartucho,2011-05-31T04:57:24Z


## Daily videos

In [23]:
# Export the filtered table as today's video list: a single pretty-printed
# JSON array with one object per row.
daily_path = "../DB/videos.json"

filtered_df.to_json(
    path_or_buf=daily_path,
    orient='records',
    indent=4,
    lines=False,
)

## Archive for research purposes

In [24]:
# Append today's filtered videos to the long-running archive file.
archive_path = "../DB/old_videos.json"

if os.path.isfile(archive_path) and os.path.getsize(archive_path) > 0:
    archive_df = pd.read_json(archive_path, orient='records')
    updated_df = pd.concat([archive_df, filtered_df], ignore_index=True)
else:
    # First run (or an empty placeholder file): there is nothing to read,
    # so the archive starts out as just today's rows.
    archive_df = pd.DataFrame(columns=filtered_df.columns)
    updated_df = filtered_df.reset_index(drop=True)

updated_df.to_json(archive_path, orient='records', lines=False, indent=4)