## Keyword search for videos and getting their metadata




In [None]:
!pip install --upgrade google-api-python-client
!pip install --upgrade google-auth-oauthlib google-auth-httplib2
!pip install isodate

import pandas as pd
from apiclient.discovery import build
from apiclient.errors import HttpError

In [None]:
# API key handed to the YouTube client; the value looks like a placeholder --
# presumably to be replaced with your own key before running.
key = "YouTubeAPI"

In [None]:
# Settings for the API client: the key defined in the cell above plus the
# service name and version handed to `build`.
DEVELOPER_KEY = key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Module-level client ("engine") used by every function below.
youtube = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=DEVELOPER_KEY,
)

In [None]:
def youtube_search(q, max_results=50, order="relevance", type="video", language=None, token=None, location=None, location_radius=None, total=200):
    """Run a paginated YouTube keyword search and collect the result items.

    Pages are requested through the module-level ``youtube`` client until
    enough pages to cover ``total`` results have been fetched, or until the
    API response no longer contains a ``nextPageToken``. The ``pageInfo`` of
    every page is printed as a progress indicator.

    Args:
        q: Search query string.
        max_results: Number of results requested per page (``maxResults``).
        order: Sort order, passed through as ``order``.
        type: Resource type to search for. The name shadows the builtin but
            is kept so existing keyword calls continue to work.
        language: Passed through as ``relevanceLanguage``.
        token: Page token to start from; ``None`` starts at the first page.
        location: Passed through as ``location``.
        location_radius: Passed through as ``locationRadius``.
        total: Maximum number of result items to return.

    Returns:
        A list of at most ``total`` raw search result items, in the order the
        API returned them.
    """
    search_list = []
    # Ceiling division: number of pages needed to cover `total` results.
    iterations = -(-total // max_results)

    for _ in range(iterations):
        search_response = youtube.search().list(
            q=q,
            type=type,
            pageToken=token,
            order=order,
            part="id,snippet",
            maxResults=max_results,
            relevanceLanguage=language,
            location=location,
            locationRadius=location_radius,
        ).execute()
        search_list.extend(search_response.get("items", []))
        print(search_response.get("pageInfo", {}))

        # Continue with the next page, or stop when the API offers none.
        token = search_response.get("nextPageToken")
        if not token:
            break

    # Whole pages are collected, so the last page can overshoot `total` when
    # it is not a multiple of `max_results`; trim to the requested amount.
    return search_list[:total]

def search_to_df(test_search):
    """Convert keyword-search result items into a pandas DataFrame.

    Only items that are dicts whose ``"id"`` entry is a dict holding a
    non-empty ``"videoId"`` are kept; everything else is skipped silently.

    Args:
        test_search: Iterable of search result items.

    Returns:
        A DataFrame with the columns ``Id``, ``Channel``, ``Title`` and
        ``Description``, one row per kept item.
    """
    rows = []
    for entry in test_search:
        if not isinstance(entry, dict):
            continue
        id_part = entry.get("id")
        # Skip results without a usable video id.
        if not isinstance(id_part, dict) or not id_part.get("videoId"):
            continue
        snippet = entry["snippet"]
        rows.append(
            (
                id_part["videoId"],
                snippet["channelTitle"],
                snippet["title"],
                snippet["description"],
            )
        )
    return pd.DataFrame(rows, columns=["Id", "Channel", "Title", "Description"])

def metaDataExtractor(video_ids):
    """Download metadata for the given video IDs and flatten it into dicts.

    One ``videos().list`` request is made per ID through the module-level
    ``youtube`` client, and a progress line is printed for each request.

    Args:
        video_ids: Iterable of video ID strings.

    Returns:
        A list with one dict per requested ID, in input order, with the keys
        ``Id``, ``Channel``, ``Date``, ``Time``, ``Title``, ``Description``,
        ``Duration``, ``Tags``, ``Views``, ``Likes``, ``Favourite`` and
        ``Comments``. ``Date`` and ``Time`` are slices of ``publishedAt``
        (characters 0-9 and 11-18), ``Duration`` is in whole seconds, and any
        field missing from the response is an empty string.
    """
    import isodate  # For parsing the ISO 8601 duration string

    # Materialise once so generators work too and len() is available below.
    video_ids = list(video_ids)

    video_data = []
    for num, video_id in enumerate(video_ids, start=1):
        # The part name must be "contentDetails": it is the key the duration
        # is read from below.
        res = youtube.videos().list(
            id=video_id, part="snippet,statistics,contentDetails"
        ).execute()
        video_data.append(res)
        print(f"\rDownloading metadata for video {num} of {len(video_ids)}. Please wait...", end="")

    # Output column -> path of keys/indexes into the API response.
    # "Date" and "Time" both start as publishedAt and are split afterwards.
    keys = {
        "Id": ("items", 0, "id"),
        "Channel": ("items", 0, "snippet", "channelTitle"),
        "Date": ("items", 0, "snippet", "publishedAt"),
        "Time": ("items", 0, "snippet", "publishedAt"),
        "Title": ("items", 0, "snippet", "title"),
        "Description": ("items", 0, "snippet", "description"),
        "Duration": ("items", 0, "contentDetails", "duration"),
        "Tags": ("items", 0, "snippet", "tags"),
        "Views": ("items", 0, "statistics", "viewCount"),
        "Likes": ("items", 0, "statistics", "likeCount"),
        "Favourite": ("items", 0, "statistics", "favoriteCount"),
        "Comments": ("items", 0, "statistics", "commentCount"),
    }

    metadata_list = []
    for response in video_data:
        row = {}
        for column, path in keys.items():
            try:
                value = response
                for step in path:
                    value = value[step]
                if column == "Duration":
                    # total_seconds() keeps whole days; .seconds would drop
                    # them for videos of 24 hours or longer.
                    value = int(isodate.parse_duration(value).total_seconds())
            except (KeyError, IndexError, TypeError, ValueError):
                # Missing field or empty response; ValueError is meant to
                # cover a duration string that cannot be parsed (presumably
                # isodate's parse error derives from it -- verify).
                value = ""
            row[column] = value

        # Split publishedAt into its date and time-of-day parts.
        published_at = row["Date"]
        if published_at:
            row["Date"] = published_at[:10]
            row["Time"] = published_at[11:19]
        metadata_list.append(row)

    return metadata_list



def MetaDownloadDF(video_list):
    """Return a pandas DataFrame with metadata for the given video IDs.

    Thin wrapper: passes ``video_list`` to ``metaDataExtractor`` and loads
    the records it returns into a DataFrame.
    """
    return pd.DataFrame(metaDataExtractor(video_list))

# Performing a search

In [None]:
# Keywords to search for -- exchange for your own.
search_terms = "بيت المولد"

In [None]:
# Ask for up to 10000 results; paging stops earlier once the API response
# carries no further page token.
search = youtube_search(search_terms, total=10000)

# Tabulate the hits and display them as the cell output.
searchdf = search_to_df(search)
searchdf


### Save search results to CSV
The code below saves the search results as a CSV file on the mounted Google Drive. The file will appear in the file browser to the left (click the folder symbol under the key symbol if the browser is not visible).

In [None]:
searchdf.to_csv("/content/drive/MyDrive/Forskning/Relikprojekt/Youtube/Resultat/"+search_terms+".csv")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Perhaps you want all metadata for all videos in the search results (publication date, channel, views, likes, etc.)
Then we first extract all the video IDs.

In [None]:
# Collect the video IDs from the "Id" column of the search results.
video_list = searchdf["Id"].tolist()

Then we use the `MetaDownloadDF` function from above.

In [None]:
# Download the metadata for every video ID and display the resulting table.
metadf = MetaDownloadDF(video_list)
metadf

And then we can save the result as a CSV file.

In [None]:
# Write the metadata table to "searchmetadf.csv" (a relative path, so it is
# resolved against the current working directory).
metadf.to_csv("searchmetadf.csv")