# Data Collection

In [4]:
import asyncio
import json
import random
import re
import time
from apism import YouTubeAPI

from keys import key
from functions import *

## Search and Video Data

In [49]:
# Per-endpoint request parameters; collect_data hands this mapping to
# YouTubeAPI as its `params` argument.
params = dict(
    search=dict(
        part='snippet',
        type='video',
        maxResults=50,
        relevanceLanguage='en',
    ),
    videos=dict(part='id,statistics,topicDetails'),
)

# Bucket name passed to upload_and_delete_local_file by the upload cells.
bucket_name = 'youtube-data-api'

In [None]:
async def collect_data(query):
    """Run the search and the follow-up video lookup for *query*.

    Creates a ``YouTubeAPI`` client from the module-level ``key`` and
    ``params``, awaits ``search(query)`` and then ``videos()`` on it, and
    returns the client itself. Later cells read ``results['search']`` and
    ``results['videos']`` from the returned object.
    """
    client = YouTubeAPI(api_key=key, params=params)
    await client.search(query)
    await client.videos()
    return client

In [10]:
# Run collect_data (search, then videos) for the query 'Scishow'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
scishow_videos = await collect_data('Scishow')

In [14]:
# Run collect_data (search, then videos) for the query 'johnny harris'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
jh_videos = await collect_data('johnny harris')

In [16]:
# Run collect_data (search, then videos) for the query 'pbs space time'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
pbs_videos = await collect_data('pbs space time')

In [17]:
# Run collect_data (search, then videos) for the query '3blue1brown'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
_3b1b_videos = await collect_data('3blue1brown')

In [18]:
# Run collect_data (search, then videos) for the query 'Damilee'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
Damilee = await collect_data('Damilee')

In [19]:
# Run collect_data (search, then videos) for the query 'Fireship'.
# Top-level await: this only works in an async-aware shell such as IPython/Jupyter.
Fireship = await collect_data('Fireship')

In [20]:
# Print the number of entries in each client's results['videos'], grouped
# under the two headings used for this study (ADS / NO ADS).
video_count_report = (
    ("------ ADS ------", (
        ("SciShow", scishow_videos),
        ("Johnny Harris", jh_videos),
        ("PBS Space Time", pbs_videos),
    )),
    ("------ NO ADS ------", (
        ("3blue1brown", _3b1b_videos),
        ("Dami Lee", Damilee),
        ("Fireship", Fireship),
    )),
)
for heading, channels in video_count_report:
    print(heading)
    for label, api in channels:
        print(f"{label} : {len(api.results['videos'])}")

------ ADS ------
SciShow : 541
Johnny Harris : 594
PBS Space Time : 590
------ NO ADS ------
3blue1brown : 560
Dami Lee : 496
Fireship : 572


In [35]:
# Keep only the search hits whose channelTitle equals the expected channel
# name exactly; hits from any other channel are dropped.
channel_sources = {
    'SciShow': scishow_videos,
    'Johnny Harris': jh_videos,
    'PBS Space Time': pbs_videos,
    '3Blue1Brown': _3b1b_videos,
    'DamiLee': Damilee,
    'Fireship': Fireship,
}

videoIDs = {}
for channel, api in channel_sources.items():
    hits = api.results['search']
    videoIDs[channel] = [
        hit['id']['videoId']
        for hit in hits
        if hit['snippet']['channelTitle'] == channel
    ]

In [38]:
# Show how many video IDs were kept for each channel in videoIDs.
for channel in (
    'SciShow',
    'Johnny Harris',
    'PBS Space Time',
    '3Blue1Brown',
    'DamiLee',
    'Fireship',
):
    kept = len(videoIDs[channel])
    print(f"{channel}: {kept}")

SciShow: 363
Johnny Harris: 178
PBS Space Time: 355
3Blue1Brown: 139
DamiLee: 189
Fireship: 317


In [50]:
# Upload to GCS: write one JSON file per channel containing only that
# channel's search hits and video records, then hand it to the upload helper.
for yt_api, name in zip(
    [scishow_videos, jh_videos, pbs_videos, _3b1b_videos, Damilee, Fireship],
    ['SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship'],
):
    # videoIDs[name] is a list; testing membership against it for every hit
    # and every video record is quadratic. Build the set once per channel.
    wanted_ids = set(videoIDs[name])

    data = {
        'search': [i for i in yt_api.results['search'] if i['id']['videoId'] in wanted_ids],
        'videos': [i for i in yt_api.results['videos'] if i['id'] in wanted_ids],
    }

    # Writing to a JSON file
    filename = f'{name}.json'
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    # NOTE(review): helper is not defined in this notebook (presumably from
    # the `functions` star import); by its name it uploads the local file to
    # the bucket and then removes it -- confirm.
    upload_and_delete_local_file(bucket_name, filename, f'youtube-ads/{name}.json')

In [63]:
# Sample up to 50 distinct videos from each channel
SAMPLE_SIZE = 50

sampled_IDs = {}
for yt_api, name in zip(
    [scishow_videos, jh_videos, pbs_videos, _3b1b_videos, Damilee, Fireship],
    ['SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship'],
):
    # Set membership instead of scanning the videoIDs[name] list per record.
    wanted_ids = set(videoIDs[name])

    # dict.fromkeys drops repeated IDs while keeping first-seen order, so a
    # video that occurs more than once in the results cannot be drawn twice.
    candidates = list(dict.fromkeys(
        i['id'] for i in yt_api.results['videos'] if i['id'] in wanted_ids
    ))

    # random.sample raises ValueError when asked for more items than the
    # population holds; cap the request and report the shortfall instead.
    sample_size = min(SAMPLE_SIZE, len(candidates))
    if sample_size < SAMPLE_SIZE:
        print(f"{name}: only {len(candidates)} videos available, sampling all of them")

    # NOTE: the draw is unseeded, so re-running this cell picks a different sample.
    sampled_IDs[name] = random.sample(candidates, sample_size)

with open('sampledIDs.json', 'w') as json_file:
    json.dump(sampled_IDs, json_file, indent=4)

upload_and_delete_local_file(bucket_name, 'sampledIDs.json', 'youtube-ads/sampledIDs.json')

## Collect Transcripts

In [72]:
transcripts = {}
for yt_api, name in zip(
    [scishow_videos, jh_videos, pbs_videos, _3b1b_videos, Damilee, Fireship],
    ['SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship']
    ):
    
    transcripts[name] = await transcript(sampled_IDs[name])

In [74]:
# Upload transcripts to GCS: dump the whole transcripts mapping to a local
# JSON file, then pass that file to the upload helper.
transcripts_file = 'transcripts.json'
with open(transcripts_file, 'w') as fh:
    json.dump(transcripts, fh, indent=4)

upload_and_delete_local_file(
    bucket_name,
    transcripts_file,
    'youtube-ads/transcripts.json',
)

In [93]:
# Per channel, count the transcript entries that carry a truthy 'generated'
# value and those that carry a truthy 'manual' value. Falsy entries
# (e.g. None) are skipped.
for channel in ('SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship'):
    entries = [entry for entry in transcripts[channel] if entry]
    generated_count = sum(1 for entry in entries if entry.get('generated'))
    manual_count = sum(1 for entry in entries if entry.get('manual'))
    print(f"{channel}: Generated = {generated_count} Manual = {manual_count}")

SciShow: Generated = 47 Manual = 28
Johnny Harris: Generated = 48 Manual = 44
PBS Space Time: Generated = 44 Manual = 48
3Blue1Brown: Generated = 9 Manual = 49
DamiLee: Generated = 48 Manual = 9
Fireship: Generated = 47 Manual = 0
