# Getting training data

This notebook contains code to download metadata and transcripts for the videos listed in the training data (`Data/normalized_data/train.csv`) and to process them into a pandas DataFrame. The code is based on the code provided by [Papadamou et al.](https://github.com/kostantinos-papadamou/pseudoscience-paper/tree/main/youtubehelpers)

## Download videos' metadata and transcripts

In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from socket import error as SocketError
import time
import glob
import subprocess
import pandas as pd
import os
import json

In [None]:
# Service name and version passed to `googleapiclient.discovery.build`.
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'
# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# real key never has to be written into (and committed with) the notebook.
# When the variable is not set, the placeholder below is used and must be
# replaced by hand.
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY', 'YOUR_YOUTUBE_DATA_API_KEY')

In [None]:
# Shared API client; `download_video_metadata` issues its requests through it.
youtube_api = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=YOUTUBE_API_KEY,
)

In [None]:
# Root directory for downloaded data. Each video gets its own
# `<video_id>/` sub-directory holding `<video_id>.json` (metadata) and
# `<video_id>_transcript.*` (transcript). The path is relative to the
# notebook's working directory.
VIDEO_BASE_DIR = '../Data/raw_data/videos_data'

In [None]:
def download_video_metadata(video_id, max_retries=None, retry_delay=30):
    """Fetch the metadata of a single video through the YouTube Data API.

    Requests the `id`, `snippet`, `contentDetails` and `statistics` parts of
    the video and retries whenever the request fails with an `HttpError` or a
    socket error.

    Args:
        video_id: YouTube video ID to look up.
        max_retries: Maximum number of retries after a failed request. The
            default `None` retries forever (the historical behaviour); pass a
            number to avoid looping endlessly on a permanent error.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The first entry of the response's `items` list, or `None` when the
        response has no `items` key or the list is empty.

    Raises:
        HttpError, SocketError: Only when `max_retries` is set and exhausted;
            the last error is re-raised.
    """
    failed_attempts = 0
    while True:
        try:
            # Send request to get video's information
            response = youtube_api.videos().list(
                part='id,snippet,contentDetails,statistics',
                id=video_id
            ).execute()
        except (HttpError, SocketError) as error:
            print(f'--- HTTP Error occurred while retrieving information for VideoID: {video_id}. [ERROR]: {error}')
            failed_attempts += 1
            if max_retries is not None and failed_attempts > max_retries:
                raise
            time.sleep(retry_delay)
            continue

        # Get Video Details. Only the "no usable item" cases are mapped to
        # None; any other unexpected error is allowed to surface.
        try:
            return response['items'][0]
        except (KeyError, IndexError):
            return None


def is_video_transcript_downloaded(video_id):
    """Return True when a transcript file for `video_id` already exists.

    A transcript counts as present if at least one file matches
    `<VIDEO_BASE_DIR>/<video_id>/<video_id>_transcript.*`, whatever its
    extension.
    """
    pattern = f'{VIDEO_BASE_DIR}/{video_id}/{video_id}_transcript.*'
    return bool(glob.glob(pattern))


def download_video_transcript(video_id):
    """Download the transcript of a video with `download_video_transcript.sh`.

    The helper script is run through `bash` (resolved relative to the current
    working directory) with the video URL and the output path prefix
    `<VIDEO_BASE_DIR>/<video_id>/<video_id>_transcript` as its two arguments.

    This is best-effort: failures are reported on stdout and never raised, so
    a missing transcript does not interrupt the caller.

    Args:
        video_id: YouTube video ID whose transcript should be downloaded.

    Returns:
        None.
    """
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    path = f'{VIDEO_BASE_DIR}/{video_id}/{video_id}_transcript'

    try:
        # Pass the arguments as a list (no shell) so that characters such as
        # `?` or `&` in the URL, or anything unexpected in the video ID read
        # from the CSV, are never interpreted by a shell.
        output = subprocess.check_output(
            ['bash', 'download_video_transcript.sh', video_url, path]
        )
    except (subprocess.CalledProcessError, OSError) as error:
        # Non-zero exit status, or bash could not be started at all.
        print(f'--- Transcript download failed for VideoID: {video_id}. [ERROR]: {error}')
        return

    # check_output returns bytes; decode before searching and printing.
    decoded_output = output.decode('utf-8', errors='replace')
    if 'HTTP_ERROR' in decoded_output:
        print(decoded_output)


def download_video(video_id):
    """Download a video's metadata and, if not yet present, its transcript.

    The transcript is only requested when the metadata lookup succeeded and
    no transcript file exists for the video yet.

    Returns:
        The metadata returned by `download_video_metadata`, or `None` when no
        metadata is available for the video.
    """
    print('Downloading metadata...')
    metadata = download_video_metadata(video_id)

    if metadata is not None:
        transcript_missing = not is_video_transcript_downloaded(video_id)
        if transcript_missing:
            print('Downloading transcript...')
            download_video_transcript(video_id)
        return metadata

    # Without metadata the video is treated as unavailable.
    print(f'ERROR: Video Metadata not available for video: {video_id}')
    return None

In [None]:
# Load the training set. The cells below read its `youtube_id` and
# `annotation` columns.
videos = pd.read_csv('../Data/normalized_data/train.csv')

In [None]:
# IDs of all training videos, in CSV row order (duplicates, if any, are kept).
youtube_ids = list(videos['youtube_id'])
# Displayed as the cell output: number of videos in the training set.
len(youtube_ids)

In [None]:
# Create the download root together with any missing parent directories
# (e.g. `../Data/raw_data`); an already existing directory is not an error.
os.makedirs(VIDEO_BASE_DIR, exist_ok=True)

In [None]:
# Download metadata (and transcript) for every video that has no metadata file
# yet. NOTE(review): only the first 5 IDs are processed - presumably a demo
# limit; remove the slice to download the full training set.
for video_id in youtube_ids[:5]:
    video_dir = f'{VIDEO_BASE_DIR}/{video_id}'
    path = f'{video_dir}/{video_id}.json'
    if os.path.exists(path):
        # Metadata already downloaded on a previous run.
        continue

    print('Downloading video', video_id)
    video_details = download_video(video_id)
    if video_details is not None:
        # Make sure the per-video directory exists before writing into it.
        os.makedirs(video_dir, exist_ok=True)
        with open(path, 'w') as f:
            json.dump(video_details, f)
    # When no metadata is available nothing is written: a file containing
    # `null` would break the processing step that indexes into the JSON.
    print('Done')
    # Pause between videos to stay gentle on the API / YouTube.
    time.sleep(10)

## Process downloaded data

In [None]:
from datetime import datetime
import numpy as np
import csv

In [None]:
def process_video_metadata(path):
    """Flatten a downloaded video metadata JSON file into a single record.

    Args:
        path: Path of the `<video_id>.json` file written by the download step.

    Returns:
        A dict with the keys `youtube_id`, `published_at`, `updated_at`,
        `view_count`, `like_count`, `dislike_count`, `favourite_count`,
        `comment_count` and `duration`. Counters missing from the file's
        `statistics` section are set to `np.nan`; present values are passed
        through unchanged. `updated_at` is the UTC time at which this function
        runs (not the time of the download), formatted like
        `2024-01-31T12:00:00Z`.

    Raises:
        KeyError: If `id`, `snippet.publishedAt`, `statistics` or
            `contentDetails.duration` is missing from the file.
    """
    # Local import: the notebook only imports `datetime` from this module.
    from datetime import timezone

    with open(path, 'r') as f:
        video = json.load(f)

    statistics = video['statistics']
    return {
        'youtube_id': video['id'],
        'published_at': video['snippet']['publishedAt'],
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted string is identical.
        'updated_at': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        'view_count': statistics.get('viewCount', np.nan),
        'like_count': statistics.get('likeCount', np.nan),
        'dislike_count': statistics.get('dislikeCount', np.nan),
        'favourite_count': statistics.get('favoriteCount', np.nan),
        'comment_count': statistics.get('commentCount', np.nan),
        'duration': video['contentDetails']['duration']
    }

def get_video_transcript(path):
    """Return the text of the first file matching the glob pattern `path`.

    Args:
        path: Glob pattern, e.g. `<dir>/<video_id>_transcript.*`.

    Returns:
        The full content of the first match reported by `glob.glob`, or an
        empty string when nothing matches.
    """
    matches = glob.glob(path)
    if not matches:
        # No transcript was downloaded for this video.
        return ''

    with open(matches[0], 'r') as transcript_file:
        content = transcript_file.read()
    return content

In [None]:
# Build the id -> annotation lookup once instead of filtering the whole
# DataFrame for every video. `setdefault` keeps the first row of each id,
# which is what the per-video `.values[0]` filter would select.
annotation_by_id = {}
for known_id, annotation in zip(videos['youtube_id'].values, videos['annotation'].values):
    annotation_by_id.setdefault(known_id, annotation)

processed_videos = []

for video_id in youtube_ids:
    path = f'{VIDEO_BASE_DIR}/{video_id}/{video_id}.json'
    transcript_path = f'{VIDEO_BASE_DIR}/{video_id}/{video_id}_transcript.*'
    # Videos whose metadata file was never downloaded are skipped.
    if not os.path.exists(path):
        continue

    processed_video = process_video_metadata(path)
    # Empty string when no transcript file exists for the video.
    processed_video['transcript'] = get_video_transcript(transcript_path)
    processed_video['annotation'] = annotation_by_id[video_id]

    processed_videos.append(processed_video)

In [None]:
# One row per processed video; columns are the keys of the record dicts
# (metadata fields plus `transcript` and `annotation`).
processed_videos_df = pd.DataFrame(processed_videos)

In [None]:
# Sanity check: column dtypes and non-null counts of the processed data.
processed_videos_df.info()

In [None]:
# Preview the first rows of the processed data.
processed_videos_df.head()

In [None]:
# Persist the processed training data. QUOTE_ALL wraps every field in quotes -
# presumably so that transcripts containing commas or newlines stay in a
# single CSV field.
processed_videos_df.to_csv('../Data/normalized_data/train_processed.csv', index=False, quoting=csv.QUOTE_ALL)