## Design:

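Search YouTube for titles of the form `IMG XXXX`, where `XXXX` is a random four-digit number, until the target number of videos has been collected (the design goal is 200; the loop below currently stops at 20).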

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [2]:
# Params: search settings read by the cells below.

location = '' # 'RU' or '' for default; NOTE(review): not referenced by any cell visible in this notebook
startDate = 2009  # year; converted to Jan 1 00:00:00 UTC of that year and sent as publishedAfter
endDate = 2011  # year; converted to Jan 1 00:00:00 UTC of that year and sent as publishedBefore
videoDuration = 'short'  # passed through as the search request's videoDuration filter
vidNumber = 20  # presumably the number of videos to collect; verify it matches the collection loop's threshold

## Pre Reqs

In [3]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
SECRET_KEY = os.getenv('YOUTUBE_API_KEY')
if not SECRET_KEY:
    # Stop here with a clear message rather than handing None to the API
    # client as its developerKey further down the notebook.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; define it in the environment or in a .env file."
    )

# Empty results table; one row per video found by the collection loop.
df = pd.DataFrame(columns=[
    'title',
    'link',
    'description',
    'author',
    'date published'
])

## Helpers

In [4]:
import random
def rand_nums() -> str:
    """Return four random decimal digits as a string, echoing it to stdout.

    Each digit is drawn independently from 0-9, so leading zeros are kept
    and the result always has length 4 (e.g. '0955').
    """
    digits = "".join(str(random.randint(0, 9)) for _ in range(4))
    print(digits)
    return digits

In [5]:
def convert_year_to_rfc3339(year):
    """Return midnight UTC on 1 January of `year` as an RFC 3339 string.

    Example: 2009 -> '2009-01-01T00:00:00Z'.
    """
    # Hour, minute and second default to zero, giving the very start of the year.
    start_of_year = pd.Timestamp(year=year, month=1, day=1, tz='UTC')
    return start_of_year.strftime('%Y-%m-%dT%H:%M:%SZ')

## Core

In [6]:

# `googleapiclient` is the canonical module name of google-api-python-client;
# `apiclient` is only its legacy alias.
from googleapiclient.discovery import build

youtube = build('youtube', 'v3', developerKey=SECRET_KEY)

# The publication window is the same for every request, so compute it once.
published_after = convert_year_to_rfc3339(startDate)
published_before = convert_year_to_rfc3339(endDate)

# Accumulate rows in a plain list and build the DataFrame once at the end,
# instead of re-copying the whole frame with pd.concat for every video.
# Seeding from `df` keeps any rows it already holds, so re-running this cell
# after the target is reached is still a no-op.
records = df.to_dict('records')

# Stop once the target from the Params cell is reached (was a hard-coded 20).
# Each request can add up to 3 rows, so the final count may overshoot slightly.
# NOTE(review): there is no cap on the number of requests; if the searches keep
# returning nothing this loop only ends when a request raises. Consider a limit.
# NOTE(review): the `location` parameter is not applied to the request.
while len(records) < vidNumber:

    num = rand_nums()

    request = youtube.search().list(
        q=f'IMG {num}',
        part='snippet',
        type='video',
        maxResults='3',
        publishedAfter=published_after,
        publishedBefore=published_before,
        videoDuration=videoDuration,
    )

    res = request.execute()

    # Tolerate a response without an 'items' key by treating it as empty.
    for item in res.get('items', []):
        snippet = item['snippet']
        records.append({
            'title': snippet['title'],
            'link': 'https://www.youtube.com/watch?v=' + item['id']['videoId'],
            'description': snippet['description'],
            'author': snippet['channelTitle'],
            'date published': snippet['publishedAt'],
        })

# Passing the existing columns keeps the schema even if no rows were collected.
df = pd.DataFrame(records, columns=df.columns)

1525
9067
3304
6518
5403
0955
8179


In [7]:
# Preview the collected rows; head(60) shows at most the first 60.
df.head(60)

Unnamed: 0,title,link,description,author,date published
0,IMG 1525,https://www.youtube.com/watch?v=RAbJm8yp4AU,,poiqsdiospqifdo,2010-12-21T15:14:50Z
1,IMG 1525,https://www.youtube.com/watch?v=4brgBUQ2rFA,,梅晟義,2010-10-23T07:12:45Z
2,IMG 1525,https://www.youtube.com/watch?v=NuDo0JpXoec,видео снято iphone4.,oostin333,2010-10-30T19:10:24Z
3,00008.MTS,https://www.youtube.com/watch?v=vTwVwn6PVcI,,jayhajj,2010-06-01T13:07:08Z
4,"Stirling engine Reichelt , model LH",https://www.youtube.com/watch?v=50jCFTYy0g4,http://dampmaskiner.webs.com/ R. Reichelt Stir...,75echo,2010-09-21T07:26:14Z
5,audi a4 sline,https://www.youtube.com/watch?v=aCK65Zt0GzY,parotech audi a4 s4 rs4.,MsRs44,2010-12-28T20:26:23Z
6,IMG 3304,https://www.youtube.com/watch?v=N32c2ojtprA,,BabyShrimpfu,2009-08-11T07:00:01Z
7,MVI 3304,https://www.youtube.com/watch?v=fRcHs7EvZUw,,Tevis Diaz,2010-12-19T17:43:50Z
8,IMG 1984,https://www.youtube.com/watch?v=4BMxh2MuJtA,,Thao DOan,2010-11-20T16:27:00Z
9,MVI_6518.MOV,https://www.youtube.com/watch?v=qklRInWjUY8,"happy birthday, papi!",chiarabird,2010-01-14T20:58:41Z


## Postprocess

Remove all videos whose title is not in the `IMG_XXXX` / `IMG XXXX` format (four digits, optionally followed by `.MOV` or `.mp4`).

In [8]:
# A default camera file name: "IMG", an underscore or a space, exactly four
# digits, and optionally a ".MOV" or ".mp4" extension.
regex_pattern = r"IMG[_ ]\d{4}(?:\.MOV|\.mp4)?" 

# fullmatch requires the whole title to fit the pattern. str.match anchors
# only the start, so titles with extra trailing text slipped through and the
# optional extension group had no effect. Missing titles count as non-matches.
title_is_img = df['title'].str.fullmatch(regex_pattern, na=False)
filtered_df = df[title_is_img].reset_index(drop=True)

filtered_df.head(60)

Unnamed: 0,title,link,description,author,date published
0,IMG 1525,https://www.youtube.com/watch?v=RAbJm8yp4AU,,poiqsdiospqifdo,2010-12-21T15:14:50Z
1,IMG 1525,https://www.youtube.com/watch?v=4brgBUQ2rFA,,梅晟義,2010-10-23T07:12:45Z
2,IMG 1525,https://www.youtube.com/watch?v=NuDo0JpXoec,видео снято iphone4.,oostin333,2010-10-30T19:10:24Z
3,IMG 3304,https://www.youtube.com/watch?v=N32c2ojtprA,,BabyShrimpfu,2009-08-11T07:00:01Z
4,IMG 1984,https://www.youtube.com/watch?v=4BMxh2MuJtA,,Thao DOan,2010-11-20T16:27:00Z
5,IMG 0955,https://www.youtube.com/watch?v=VoPH9BBRvSE,,sorani1980,2010-08-21T20:58:00Z
6,IMG 0955,https://www.youtube.com/watch?v=Ek1mrB9WAnI,,BrokeSideProductions,2010-09-02T02:19:22Z
7,IMG 0955,https://www.youtube.com/watch?v=S_47YN7mKJA,,Terry Smith,2010-12-22T06:02:04Z


## Daily videos

In [9]:
# Write the filtered videos to the daily file as an indented JSON array of records.
daily_path = "../DB/videos.json"

filtered_df.to_json(
    daily_path,
    orient='records',
    lines=False,
    indent=4,
)

## Archive for research purposes

In [None]:
# Append today's filtered videos to the running archive. On the first run the
# archive file does not exist yet, so the archive starts from today's videos
# alone instead of failing on the read.
# NOTE(review): re-running this cell appends the same rows again; de-duplicate
# (e.g. on 'link') if repeats are not wanted in the archive.
archive_path = "../DB/old_videos.json"

if os.path.isfile(archive_path):
    archive_df = pd.read_json(archive_path, orient='records')
    updated_df = pd.concat([archive_df, filtered_df], ignore_index=True)
else:
    # Empty frame with the same columns, so `archive_df` is defined either way.
    archive_df = filtered_df.iloc[0:0]
    updated_df = filtered_df.reset_index(drop=True)

updated_df.to_json(archive_path, orient='records', lines=False, indent=4)