# Data Collection

In [1]:
# Allows us to connect to the YouTube API service
from googleapiclient.discovery import build
import math
import pandas as pd

In [2]:
API_KEY = ""

# Read in the API key from a plain-text file one directory above the notebook.
# Iterating over a file yields each line with its trailing newline, so every
# line is stripped before use. Blank lines are skipped so that a trailing empty
# line cannot overwrite the key. If several non-blank lines exist, the last wins.
with open("../YouTube API Key.txt") as f:
    for line in f:
        stripped_line = line.strip()
        if stripped_line:
            API_KEY = stripped_line

# Fail here with a clear message rather than later inside the first API request.
if not API_KEY:
    raise ValueError('No API key found in "../YouTube API Key.txt"')

# Create the service
youtube_service = build('youtube', 'v3', developerKey=API_KEY)

In [24]:
def search_videos(search_terms, pages=5, results_per_page = 50):
    """
    Search videos on YouTube given a list of various keywords or key phrases.
    Return a list of YouTube video IDs.

    Every search is restricted to results of type 'video' in the music
    video category.

    search_terms: a list of search terms. each search term will be a new search
    pages: the maximum number of pages we want to retrieve per search (default 5).
           fewer pages are retrieved when a search runs out of results, and a
           value below 1 retrieves nothing
    results_per_page: the number of results per page we want to retrieve per search (max is 50)
    returns a list of video ids, in the order the pages were received, one
            search term after the other. ids are not de-duplicated, so a video
            matched by several search terms appears several times

    """

    # initialize empty list to store video ids
    list_of_ids = []

    # ID for the video category of music is 10
    MUSIC_CATEGORY_ID = 10

    for search_term in search_terms:

        # request for the first page of videos for the search term
        request = youtube_service.search().list(
                    part='snippet',
                    maxResults=results_per_page,
                    q=search_term,
                    type='video',
                    videoCategoryId = MUSIC_CATEGORY_ID,
                    )

        pages_fetched = 0

        # list_next() hands back None once there is no further page, so the
        # loop stops early instead of calling execute() on None
        while request is not None and pages_fetched < pages:
            response = request.execute()
            pages_fetched += 1

            # save just the video id's from this page
            list_of_ids.extend(item['id']['videoId']
                               for item in response.get('items', []))

            # build the request for the following page (None if there is none)
            request = youtube_service.search().list_next(
                                    previous_request = request,
                                    previous_response = response)

    # done!
    return list_of_ids

In [34]:
def _extract_video_record(item):
    """
    Flatten one item of a videos().list response into a dict with one entry
    per output column of get_video_data.

    Fields that may be absent from the response get a placeholder instead of
    raising: 'none' for tags and topic details, -1 for the statistics counts.
    """
    snippet = item['snippet']
    localized = snippet['localized']
    statistics = item.get('statistics', {})

    return {
        'video_id': item['id'],
        'title': snippet['title'],
        'localized_title': localized['title'],
        'description': snippet['description'],
        'localized_description': localized['description'],
        'tags': snippet.get('tags', 'none'),
        'channel_title': snippet['channelTitle'],
        'duration': item['contentDetails']['duration'],
        'view_count': statistics.get('viewCount', -1),
        'like_count': statistics.get('likeCount', -1),
        'dislike_count': statistics.get('dislikeCount', -1),
        'comment_count': statistics.get('commentCount', -1),
        # the whole topicDetails object is stored, not just its category list
        'topic_categories': item.get('topicDetails', 'none'),
    }


def get_video_data(video_ids):
    """
    Returns a pandas dataframe (for now) of video data.

    The ids are sent to the API in batches of at most 50 and every item in the
    responses becomes one row. Values are stored as they arrive from the API;
    no type conversion is done here.

    video_ids: the list of video ids
    returns a dataframe with the columns video_id, title, localized_title,
            description, localized_description, tags, channel_title, duration,
            view_count, like_count, dislike_count, comment_count and
            topic_categories. the dataframe is empty (but keeps these columns)
            when no ids are given
    """

    columns = ['video_id','title','localized_title','description','localized_description',
               'tags','channel_title','duration','view_count','like_count','dislike_count',
               'comment_count','topic_categories']

    # 50 is the max items we can get per request
    MAX_IDS_PER_REQUEST = 50

    # work on the argument itself (not on a notebook-level variable) and make
    # sure it can be sliced
    video_ids = list(video_ids)

    # rows are collected in a plain list and turned into a dataframe once at
    # the end, which avoids re-copying the dataframe for every added row
    records = []

    for start in range(0, len(video_ids), MAX_IDS_PER_REQUEST):
        batch = video_ids[start:start + MAX_IDS_PER_REQUEST]

        request = youtube_service.videos().list(
            part='snippet, contentDetails, statistics, topicDetails',
            id=batch)

        response = request.execute()

        records.extend(_extract_video_record(item)
                       for item in response.get('items', []))

    return pd.DataFrame(records, columns=columns)

<hr>

In [35]:
# Search terms used for data collection; each entry is passed to the video
# search as a separate query.
# Alternative term sets (left commented out):
#keywords = ['xkito', 'electric swing','reol','animal crossing']
#keywords = ['study','hype','zelda']
keywords = [
    'academia',
    'piano',
    'zelda',
    'genshin impact',
    'kpop',
]

In [36]:
# Collect video ids: two result pages for every search term
ids = search_videos(search_terms=keywords, pages=2)

In [37]:
# Fetch the per-video details for the collected ids into a dataframe
df = get_video_data(video_ids=ids)

In [38]:
# Display the collected video data as the cell output
df

Unnamed: 0,video_id,title,localized_title,description,localized_description,tags,channel_title,duration,view_count,like_count,dislike_count,comment_count,topic_categories
0,XYynzekp3nE,a light academia classical study playlist 🕯🥐☕,a light academia classical study playlist 🕯🥐☕,🍑Like and Subscribe if u enjoyed the video🍯✨ \...,🍑Like and Subscribe if u enjoyed the video🍯✨ \...,none,crachead,PT1H17M16S,927581,51457,166,1312,{'topicCategories': ['https://en.wikipedia.org...
1,n-by0HlBCoY,CHOOSE YOUR ACADEMIA // find your aesthetic 20...,CHOOSE YOUR ACADEMIA // find your aesthetic 20...,hey everyone! thank you so much for watching a...,hey everyone! thank you so much for watching a...,none,sara baji,PT11M13S,482707,36975,198,1916,{'topicCategories': ['https://en.wikipedia.org...
2,nVlurNHtFhw,A romantic academia x light academia classical...,A romantic academia x light academia classical...,🍑Like and Subscribe if u enjoyed the video🍯✨ \...,🍑Like and Subscribe if u enjoyed the video🍯✨ \...,none,crachead,PT1H14M49S,147931,11579,20,187,{'topicCategories': ['https://en.wikipedia.org...
3,Fo1n217fnn0,Melhor Sequencia de Músicas para Malhar 2021 ...,Melhor Sequencia de Músicas para Malhar 2021 ...,Melhor Sequencia de Músicas para Malhar 2021 ...,Melhor Sequencia de Músicas para Malhar 2021 ...,"[musicas de academia, musicas de academia 2021...",Fire Music - Música Academia,PT1H22M40S,805310,10549,569,208,{'topicCategories': ['https://en.wikipedia.org...
4,5g59341uZ_A,𝐃𝐚𝐫𝐤 𝐚𝐜𝐚𝐝𝐞𝐦𝐢𝐚 𝐦𝐮𝐬𝐢𝐜 - 𝐂𝐥𝐚𝐬𝐬𝐢𝐜𝐚𝐥 𝐞𝐝𝐢𝐭𝐢𝐨𝐧,𝐃𝐚𝐫𝐤 𝐚𝐜𝐚𝐝𝐞𝐦𝐢𝐚 𝐦𝐮𝐬𝐢𝐜 - 𝐂𝐥𝐚𝐬𝐬𝐢𝐜𝐚𝐥 𝐞𝐝𝐢𝐭𝐢𝐨𝐧,songs: \n0:00 Vivaldi - Storm\n2:32 Toshifumi ...,songs: \n0:00 Vivaldi - Storm\n2:32 Toshifumi ...,none,Pleasant atmosphere,PT54M1S,2372145,138676,377,3048,{'topicCategories': ['https://en.wikipedia.org...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,4xk8qGDLSNs,"POP vs KPOP 대결 (SING OFF vs. 에릭남) | BTS, 레드벨벳,...","POP vs KPOP 대결 (SING OFF vs. 에릭남) | BTS, 레드벨벳,...",17 songs on one beat (pop vs kpop) sing off\n\...,17 songs on one beat (pop vs kpop) sing off\n\...,"[nida, eric nam, pop vs kpop, mashup, sing off...",NIDA,PT4M,3533925,321167,779,6507,{'topicCategories': ['https://en.wikipedia.org...
496,zPZAKRKNrjw,GUESS THE KPOP SONGS BY THE EMOJI || KPOP GAME,GUESS THE KPOP SONGS BY THE EMOJI || KPOP GAME,Hey thanks for playing! Like and Subscribe for...,Hey thanks for playing! Like and Subscribe for...,none,Kpop DR,PT11M2S,141530,3602,113,197,{'topicCategories': ['https://en.wikipedia.org...
497,uJm6yatUqe4,[KPOP IN PUBLIC NYC] TWICE - CRY FOR ME Dance ...,[KPOP IN PUBLIC NYC] TWICE - CRY FOR ME Dance ...,It's been such a long time since we covered TW...,It's been such a long time since we covered TW...,"[twice, cry for me, cry for me dance cover, tw...",Harmonyc Movement,PT3M46S,484,116,1,13,{'topicCategories': ['https://en.wikipedia.org...
498,7wcE5xVUPxw,NCT U 'FADED IN MY LAST SONG' kpop Reaction Hi...,NCT U 'FADED IN MY LAST SONG' kpop Reaction Hi...,https://www.higherfaculty.com/\n\nJOIN OUR PAT...,https://www.higherfaculty.com/\n\nJOIN OUR PAT...,"[nct, nctu, reaction, kpop reaction, kpop]",Higher Faculty,PT12M49S,14964,1795,6,262,{'topicCategories': ['https://en.wikipedia.org...


In [17]:
# Save the collected video data as a CSV file (path is relative to the
# current working directory).
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; pass index=False if readers of this file do not expect it — confirm
# against whatever loads 'teehee.csv'.
df.to_csv('teehee.csv')