## Design:

Find videos with random numbers until list is of size 200.

In [54]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

In [55]:
# Params:

location = '' # 'RU' or '' for default
startDate = 2009
endDate = 2011
videoDuration = 'short'
vidNumber = 30

## Pre Reqs

In [56]:
load_dotenv()
SECRET_KEY = os.getenv('YOUTUBE_API_KEY')

df = pd.DataFrame(columns=[
    'title',
    'link',
    'id',
    'description',
    'author',
    'datePublished',
    'viewCount'
])

## Helpers

In [57]:
import random
def rand_nums() -> str:
    val = ""
    for x in range (4):
        y = random.randint(0,9)
        val += str(y)
    print(val)
    return val

In [58]:
def convert_year_to_rfc3339(year):
    timestamp = pd.Timestamp(year=year, month=pd.Timestamp.today().month, day=pd.Timestamp.today().day, hour=0, minute=0, second=0, tz='UTC')
    rfc3339_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
    return rfc3339_str


In [59]:
# def start_day(year):
#     timestamp = pd.Timestamp(year=year, month=pd.Timestamp.today().month, day=pd.Timestamp.today().day-1, hour=0, minute=0, second=0, tz='UTC')
#     rfc3339_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
#     return rfc3339_str

# def end_day(year):
#     timestamp = pd.Timestamp(year=year, month=pd.Timestamp.today().month, day=pd.Timestamp.today().day-1, hour=23, minute=59, second=0, tz='UTC')
#     rfc3339_str = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
#     return rfc3339_str

# start_day(2011)
# end_day(2011)

# TODO: Videos from today n years ago
# TODO: Viewer count for video

# maybe location or other interesting things

Front end notes:
if viewed all go to old ones


## Core

In [60]:

from apiclient.discovery import build

youtube = build('youtube','v3',developerKey = SECRET_KEY)

while (df.shape[0] < vidNumber): #might need adjustments

    num = rand_nums()

    request = youtube.search().list(
        q=f'IMG {num}',
        part='snippet',
        type='video', 
        maxResults='3',
        publishedAfter=f'{convert_year_to_rfc3339(2009)}',
        publishedBefore=f'{convert_year_to_rfc3339(2011)}',
        videoDuration=f'{videoDuration}',
        videoEmbeddable='true'
        )

    res = request.execute()

    for item in res['items']:

        title = item['snippet']['title']
        video_link = 'https://www.youtube.com/watch?v=' + item['id']['videoId']
        video_id = item['id']['videoId']
        description = item['snippet']['description']
        author = item['snippet']['channelTitle']
        date_published = item['snippet']['publishedAt']

        new_row_df = pd.DataFrame([{
            'title': title,
            'link': video_link,
            'id': video_id,
            'description': description,
            'author': author,
            'datePublished': date_published,
            'viewCount' : 0
        }])

        df = pd.concat([df, new_row_df], ignore_index=True)

1882
3305
7335
4321
9620
9421
4408
8142
3048
8666


In [61]:
df.head(60)

Unnamed: 0,title,link,id,description,author,datePublished,viewCount
0,IMG 1882,https://www.youtube.com/watch?v=xZHD_lCZs20,xZHD_lCZs20,Duncan the cat.,g550guy,2009-10-15T15:36:19Z,0
1,IMG 1882 mpeg1video,https://www.youtube.com/watch?v=K2_BP44v2Lc,K2_BP44v2Lc,Visit http://www.rubber-power.com for more inf...,Darcy Whyte,2009-07-20T15:50:35Z,0
2,IMG 1882,https://www.youtube.com/watch?v=wZHjQF-LDdo,wZHjQF-LDdo,,rlsilverstone,2010-11-07T10:29:37Z,0
3,IMG 3305,https://www.youtube.com/watch?v=LLGGNt4V1Qk,LLGGNt4V1Qk,,BabyShrimpfu,2009-08-11T07:08:10Z,0
4,MVI_3305.AVI,https://www.youtube.com/watch?v=9WYmRoJjazU,9WYmRoJjazU,,Christine Paveling,2009-11-23T02:41:13Z,0
5,IMG 3288,https://www.youtube.com/watch?v=NjQv_sMdb-U,NjQv_sMdb-U,,BabyShrimpfu,2009-08-11T06:40:06Z,0
6,●●引っ張らないの！！！ｗ,https://www.youtube.com/watch?v=tMX9ajGGvw4,tMX9ajGGvw4,女の子の影響で発情しそうになってしまったチャーリー。 なぜか、じぶんの○○引っ張ってます＾＾；,名無し,2009-09-29T07:31:19Z,0
7,jona met ballonnen,https://www.youtube.com/watch?v=M3FaTxtjsic,M3FaTxtjsic,,Michel Hendriks,2010-01-19T19:35:48Z,0
8,7759 Wheel Chair Ramp,https://www.youtube.com/watch?v=iJgyvdU08WI,iJgyvdU08WI,7759 loading up wheel chair ramp.,Downsview34,2009-10-07T03:00:54Z,0
9,IMG 4321,https://www.youtube.com/watch?v=BWRD2k2la94,BWRD2k2la94,,KF825,2009-08-18T15:15:38Z,0


## Postprocess

Remove all videos where title is not in the IMG_XXXX format

In [62]:
regex_pattern = r"IMG[_ ]\d{4}(?:\.MOV|\.mp4)?" 

filtered_df = df[df['title'].str.match(regex_pattern, na=False)]
filtered_df = filtered_df.reset_index(drop=True)

filtered_df.head(60)

Unnamed: 0,title,link,id,description,author,datePublished,viewCount
0,IMG 1882,https://www.youtube.com/watch?v=xZHD_lCZs20,xZHD_lCZs20,Duncan the cat.,g550guy,2009-10-15T15:36:19Z,0
1,IMG 1882 mpeg1video,https://www.youtube.com/watch?v=K2_BP44v2Lc,K2_BP44v2Lc,Visit http://www.rubber-power.com for more inf...,Darcy Whyte,2009-07-20T15:50:35Z,0
2,IMG 1882,https://www.youtube.com/watch?v=wZHjQF-LDdo,wZHjQF-LDdo,,rlsilverstone,2010-11-07T10:29:37Z,0
3,IMG 3305,https://www.youtube.com/watch?v=LLGGNt4V1Qk,LLGGNt4V1Qk,,BabyShrimpfu,2009-08-11T07:08:10Z,0
4,IMG 3288,https://www.youtube.com/watch?v=NjQv_sMdb-U,NjQv_sMdb-U,,BabyShrimpfu,2009-08-11T06:40:06Z,0
5,IMG 4321,https://www.youtube.com/watch?v=BWRD2k2la94,BWRD2k2la94,,KF825,2009-08-18T15:15:38Z,0
6,IMG 0148,https://www.youtube.com/watch?v=MYB9gXy0XEw,MYB9gXy0XEw,,threelittebirds4321,2010-08-26T05:27:47Z,0
7,IMG 0215,https://www.youtube.com/watch?v=u4J81uIZ7mM,u4J81uIZ7mM,,Ally Trent,2010-12-13T03:52:03Z,0
8,IMG 4233,https://www.youtube.com/watch?v=43I5n3L6PUM,43I5n3L6PUM,,hannahbefogg,2010-08-16T13:14:02Z,0
9,IMG 4408,https://www.youtube.com/watch?v=Jy9T8_BuUIE,Jy9T8_BuUIE,ابو صالح وابو حنفى.,mostafaart25,2009-06-25T02:08:47Z,0


In [63]:
# Fetch view count
from apiclient.discovery import build

for idx, row in filtered_df.iterrows():

    print(row['id'])

    youtube = build("youtube", "v3", developerKey=SECRET_KEY)
    
    # Query the videos endpoint with the specified part and video ID
    response = youtube.videos().list(
        part="statistics",
        id=row['id']
    ).execute()
    
    # Extract items from the response
    items = response.get("items", [])
    if not items:
        raise ValueError(f"No items found for videoId: {row['id']}")
    
    # Extract the 'statistics' object, then the 'viewCount'
    stats = items[0].get("statistics", {})
    view_count_str = stats.get("viewCount", 0)

    print(view_count_str)

    filtered_df.at[idx, 'viewCount'] = view_count_str


xZHD_lCZs20
32
K2_BP44v2Lc
653
wZHjQF-LDdo
32
LLGGNt4V1Qk
15
NjQv_sMdb-U
25
BWRD2k2la94
31
MYB9gXy0XEw
18
u4J81uIZ7mM
24
43I5n3L6PUM
98
Jy9T8_BuUIE
24
xwqRttqQIBo
25
_OFqpT06C90
36
U1M6Y42LYlY
11
7_EVFqhFTCo
27
POPW4azfJkc
7
cVZjI62SIhY
463
u7t0xa4gHuU
0
-KawCMlxJ3c
107


In [64]:
filtered_df.head(60)

Unnamed: 0,title,link,id,description,author,datePublished,viewCount
0,IMG 1882,https://www.youtube.com/watch?v=xZHD_lCZs20,xZHD_lCZs20,Duncan the cat.,g550guy,2009-10-15T15:36:19Z,32
1,IMG 1882 mpeg1video,https://www.youtube.com/watch?v=K2_BP44v2Lc,K2_BP44v2Lc,Visit http://www.rubber-power.com for more inf...,Darcy Whyte,2009-07-20T15:50:35Z,653
2,IMG 1882,https://www.youtube.com/watch?v=wZHjQF-LDdo,wZHjQF-LDdo,,rlsilverstone,2010-11-07T10:29:37Z,32
3,IMG 3305,https://www.youtube.com/watch?v=LLGGNt4V1Qk,LLGGNt4V1Qk,,BabyShrimpfu,2009-08-11T07:08:10Z,15
4,IMG 3288,https://www.youtube.com/watch?v=NjQv_sMdb-U,NjQv_sMdb-U,,BabyShrimpfu,2009-08-11T06:40:06Z,25
5,IMG 4321,https://www.youtube.com/watch?v=BWRD2k2la94,BWRD2k2la94,,KF825,2009-08-18T15:15:38Z,31
6,IMG 0148,https://www.youtube.com/watch?v=MYB9gXy0XEw,MYB9gXy0XEw,,threelittebirds4321,2010-08-26T05:27:47Z,18
7,IMG 0215,https://www.youtube.com/watch?v=u4J81uIZ7mM,u4J81uIZ7mM,,Ally Trent,2010-12-13T03:52:03Z,24
8,IMG 4233,https://www.youtube.com/watch?v=43I5n3L6PUM,43I5n3L6PUM,,hannahbefogg,2010-08-16T13:14:02Z,98
9,IMG 4408,https://www.youtube.com/watch?v=Jy9T8_BuUIE,Jy9T8_BuUIE,ابو صالح وابو حنفى.,mostafaart25,2009-06-25T02:08:47Z,24


## Daily videos

In [65]:
daily_path = "../DB/videos.json"

filtered_df.to_json(daily_path, orient='records', lines=False, indent=4)

## Archive for research purposes

In [66]:
# might error if this has not been created previously
archive_path = "../DB/old_videos.json"

archive_df = pd.read_json(archive_path, orient='records')

updated_df = pd.concat([archive_df, filtered_df], ignore_index=True)

updated_df.to_json(archive_path, orient='records', lines=False, indent=4)