# YouTube Data Pipeline

In [1]:
import os

import pandas as pd
from html import unescape
from datetime import datetime
import isodate
import googleapiclient.discovery

from IPython.display import JSON

In [2]:
api_key = os.environ["YOUTUBE_API_KEY"]

## Channel Information

### Simple Code

In [3]:
# Build the YouTube Data API v3 client, authenticated with the developer key.
api_service_name, api_version = "youtube", "v3"

youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=api_key,
)

In [4]:
# Look up a single channel by username, requesting only the "snippet" part.
# Further parts ("contentDetails", "statistics") can be appended to `part`, and
# an `id="<channel id>,<channel id>"` filter can be used instead of `forUsername`.
request = youtube.channels().list(forUsername="harsh1kumar", part="snippet")
response = request.execute()

In [5]:
JSON(response)

<IPython.core.display.JSON object>

### Function for channel information

In [6]:
def get_channel_info(youtube, channel_ids):
    """Fetch name, statistics and uploads-playlist id for a set of channels.

    Parameters
    ----------
    youtube
        API client exposing ``channels().list(...).execute()``.
    channel_ids : iterable of str
        Channel ids to look up. An empty iterable issues no request.

    Returns
    -------
    pandas.DataFrame
        One row per channel present in the responses, with the columns
        ``channel_name``, ``view_count``, ``subscriber_count``,
        ``video_count``, ``channel_id`` and ``playlist_id``. Counts are kept
        exactly as they appear in the response; a statistic that is absent
        from the response is stored as a missing value. The frame always has
        these columns, even when no channel is returned.
    """
    columns = [
        "channel_name",
        "view_count",
        "subscriber_count",
        "video_count",
        "channel_id",
        "playlist_id",
    ]
    channel_ids = list(channel_ids)
    # NOTE(review): 50 ids per request is assumed to be the limit of the `id`
    # filter - confirm against the API reference.
    chunk_size = 50

    rows = []
    for start in range(0, len(channel_ids), chunk_size):
        chunk = channel_ids[start:start + chunk_size]
        response = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=",".join(chunk),
        ).execute()

        # "items" is looked up defensively so a response without that key
        # yields no rows instead of a KeyError.
        for item in response.get("items", []):
            stats = item.get("statistics", {})
            rows.append(
                {
                    "channel_name": item["snippet"]["title"],
                    "view_count": stats.get("viewCount"),
                    "subscriber_count": stats.get("subscriberCount"),
                    "video_count": stats.get("videoCount"),
                    "channel_id": item["id"],
                    "playlist_id": item["contentDetails"]["relatedPlaylists"]["uploads"],
                }
            )

    return pd.DataFrame(rows, columns=columns)

In [7]:
# Channels to analyse; the trailing comment on each id is the channel's handle.
channel_ids = [
    "UCueeXkuJezkCqu0YryvJnnQ",  # @harsh1kumar
    "UCs8a-hjf6X4pa-O0orSoC8w",  # @amitvarma
    "UCJQJAI7IjbLcpsjWdSzYz0Q",  # @Thuvu5
]

channel_info = get_channel_info(youtube, channel_ids=channel_ids)
channel_info

Unnamed: 0,channel_name,view_count,subscriber_count,video_count,channel_id,playlist_id
0,Harsh Kumar,76490,706,14,UCueeXkuJezkCqu0YryvJnnQ,UUueeXkuJezkCqu0YryvJnnQ
1,Thu Vu data analytics,5520237,169000,76,UCJQJAI7IjbLcpsjWdSzYz0Q,UUJQJAI7IjbLcpsjWdSzYz0Q
2,amitvarma,146605,5770,16,UCs8a-hjf6X4pa-O0orSoC8w,UUs8a-hjf6X4pa-O0orSoC8w


## Playlist Information

### Simple Code

In [8]:
# Fetch one page (up to 50 items) of a single playlist.
request = youtube.playlistItems().list(
    playlistId="UUJQJAI7IjbLcpsjWdSzYz0Q",
    part="snippet,contentDetails",
    maxResults=50,
)
response = request.execute()

In [9]:
JSON(response)

<IPython.core.display.JSON object>

### Function for Playlist Information

In [10]:
def get_playlist_info(youtube, playlist_ids):
    """List every item of the given playlists, following pagination.

    Parameters
    ----------
    youtube
        API client exposing ``playlistItems().list(...).execute()``.
    playlist_ids : iterable of str
        Playlist ids to read, processed in the order given.

    Returns
    -------
    pandas.DataFrame
        One row per playlist item with the columns ``title``,
        ``published_at``, ``channel_name``, ``channel_id`` and ``video_id``.
        ``channel_name`` is a missing value for items whose snippet carries no
        ``videoOwnerChannelTitle``. The frame always has these columns, even
        when nothing is returned.
    """
    columns = ["title", "published_at", "channel_name", "channel_id", "video_id"]

    rows = []
    for playlist_id in playlist_ids:
        page_token = None

        while True:
            params = {
                "part": "snippet,contentDetails",
                "maxResults": 50,
                "playlistId": playlist_id,
            }
            # Only send pageToken once the previous response supplied one, so
            # the first request carries no empty token.
            if page_token:
                params["pageToken"] = page_token

            response = youtube.playlistItems().list(**params).execute()

            for item in response.get("items", []):
                snippet = item["snippet"]
                rows.append(
                    {
                        "title": snippet["title"],
                        "published_at": snippet["publishedAt"],
                        # Looked up with .get so an item lacking this field
                        # does not abort the whole download.
                        "channel_name": snippet.get("videoOwnerChannelTitle"),
                        "channel_id": snippet["channelId"],
                        "video_id": snippet["resourceId"]["videoId"],
                    }
                )

            page_token = response.get("nextPageToken")
            if not page_token:
                break

    return pd.DataFrame(rows, columns=columns)

In [11]:
# Read the uploads playlist of every channel fetched above.
playlist_info = get_playlist_info(
    youtube,
    playlist_ids=channel_info["playlist_id"].tolist(),
)
playlist_info.head()

Unnamed: 0,title,published_at,channel_name,channel_id,video_id
0,MLFlow Tutorial | Hands-on | ML Tracking and S...,2023-04-17T10:30:03Z,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,7Mv91hcxCCI
1,isort for sorting Python imports #shorts #pyth...,2023-03-26T13:41:44Z,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,rs8_I0sYGhw
2,flake8 for linting in Python #shorts #python #...,2023-03-19T18:34:57Z,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,bURvz4g-XIg
3,pytest Tutorial: How to write tests in Python ...,2023-02-06T14:06:40Z,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,bhjaQssIXiw
4,Simple Neural Network using Tensorflow and Ker...,2021-10-18T06:32:21Z,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,DqlPAWkkQC8


In [12]:
# Parse the "published_at" strings (e.g. 2023-04-17T10:30:03Z) into datetimes.
playlist_info["published_at"] = pd.to_datetime(
    playlist_info["published_at"],
    format="%Y-%m-%dT%H:%M:%SZ",
)

In [13]:
playlist_info.shape

(106, 5)

In [14]:
playlist_info.groupby("channel_name", as_index=False).size()

Unnamed: 0,channel_name,size
0,Harsh Kumar,14
1,Thu Vu data analytics,76
2,amitvarma,16


## Get video stats

### Simple Code

In [15]:
# Fetch content details, snippet and statistics for a single video.
request = youtube.videos().list(
    id="7Mv91hcxCCI",
    part="contentDetails,snippet,statistics",
)
response = request.execute()

In [16]:
JSON(response)

<IPython.core.display.JSON object>

### Function for Video Information

In [17]:
def get_video_details(youtube, video_ids, batch_size=50):
    """Fetch title, publish time, duration and statistics for each video.

    Parameters
    ----------
    youtube
        API client exposing ``videos().list(...).execute()``.
    video_ids : iterable of str
        Video ids to look up. An empty iterable issues no request.
    batch_size : int, default 50
        Number of ids sent per request. Larger batches mean fewer requests.
        NOTE(review): 50 is assumed to be the maximum the ``id`` filter
        accepts - confirm against the API reference.

    Returns
    -------
    pandas.DataFrame
        One row per video present in the responses, with the columns
        ``video_id``, ``title``, ``published_at``, ``duration``,
        ``view_count``, ``like_count`` and ``comment_count``. Values are kept
        exactly as they appear in the response; a statistic that is absent
        from the response is stored as a missing value. The frame always has
        these columns, even when no video is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    columns = [
        "video_id",
        "title",
        "published_at",
        "duration",
        "view_count",
        "like_count",
        "comment_count",
    ]
    video_ids = list(video_ids)

    rows = []
    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]

        response = youtube.videos().list(
            part="contentDetails,snippet,statistics",
            id=",".join(batch),
        ).execute()

        for item in response.get("items", []):
            # Statistics are read with .get so that a video lacking one of the
            # counters does not abort the whole download.
            stats = item.get("statistics", {})
            rows.append(
                {
                    "video_id": item["id"],
                    "title": item["snippet"]["title"],
                    "published_at": item["snippet"]["publishedAt"],
                    "duration": item["contentDetails"]["duration"],
                    "view_count": stats.get("viewCount"),
                    "like_count": stats.get("likeCount"),
                    "comment_count": stats.get("commentCount"),
                }
            )

    return pd.DataFrame(rows, columns=columns)

In [18]:
video_details = get_video_details(youtube, playlist_info["video_id"].to_list())
video_details

Unnamed: 0,video_id,title,published_at,duration,view_count,like_count,comment_count
0,7Mv91hcxCCI,MLFlow Tutorial | Hands-on | ML Tracking and S...,2023-04-17T10:30:03Z,PT13M48S,1016,36,2
1,rs8_I0sYGhw,isort for sorting Python imports #shorts #pyth...,2023-03-26T13:41:44Z,PT56S,195,6,0
2,bURvz4g-XIg,flake8 for linting in Python #shorts #python #...,2023-03-19T18:34:57Z,PT53S,118,7,1
3,bhjaQssIXiw,pytest Tutorial: How to write tests in Python ...,2023-02-06T14:06:40Z,PT11M28S,93,6,0
4,DqlPAWkkQC8,Simple Neural Network using Tensorflow and Ker...,2021-10-18T06:32:21Z,PT13M55S,229,15,3
...,...,...,...,...,...,...,...
101,43jaxQBbjvM,"My Hero, OPPENHEIMER | Episode 5 | Everything ...",2023-07-28T06:42:19Z,PT53M49S,8842,391,41
102,Dc3NnU4tKVM,The WRONG Way to Fight Extremists | Episode 4 ...,2023-07-21T06:09:35Z,PT56M17S,9320,299,25
103,OuJc5vi93xk,Are You Just One Version of Yourself? | Episo...,2023-07-14T13:25:47Z,PT46M56S,7338,251,35
104,Ohkw2zoHZ5w,Is the Singularity Near? | Episode 2 | Everyth...,2023-07-08T03:19:18Z,PT1H16S,9841,283,46


### Detail of latest video for each channel

In [19]:
# Rank each channel's videos from newest (rank 1) to oldest; method="first"
# gives tied timestamps distinct ranks in row order.
playlist_info["recency_rank"] = (
    playlist_info
    .groupby("channel_id")["published_at"]
    .rank(method="first", ascending=False)
)
playlist_info[playlist_info["recency_rank"] == 1]

Unnamed: 0,title,published_at,channel_name,channel_id,video_id,recency_rank
0,MLFlow Tutorial | Hands-on | ML Tracking and S...,2023-04-17 10:30:03,Harsh Kumar,UCueeXkuJezkCqu0YryvJnnQ,7Mv91hcxCCI,1.0
14,How to AI PROOF Your Career,2023-10-16 22:03:49,Thu Vu data analytics,UCJQJAI7IjbLcpsjWdSzYz0Q,lp6SfYCRGLM,1.0
90,The China Model is Broken | Episode 16 | Every...,2023-10-13 04:54:24,amitvarma,UCs8a-hjf6X4pa-O0orSoC8w,eHX9sgBt1nE,1.0


In [20]:
# Ids of the most recent video (rank 1) of every channel.
latest_video_list = playlist_info.loc[
    playlist_info["recency_rank"].eq(1), "video_id"
].tolist()

In [21]:
latest_video_details = get_video_details(youtube, latest_video_list)
latest_video_details

Unnamed: 0,video_id,title,published_at,duration,view_count,like_count,comment_count
0,7Mv91hcxCCI,MLFlow Tutorial | Hands-on | ML Tracking and S...,2023-04-17T10:30:03Z,PT13M48S,1016,36,2
1,lp6SfYCRGLM,How to AI PROOF Your Career,2023-10-16T22:03:49Z,PT11M37S,1025,94,11
2,eHX9sgBt1nE,The China Model is Broken | Episode 16 | Every...,2023-10-13T04:54:24Z,PT1H28M51S,6391,232,47


## Get Video Comments

### Simple Code

In [22]:
# Fetch one page (up to 25 threads) of comment threads for a single video.
request = youtube.commentThreads().list(
    videoId="eHX9sgBt1nE",
    part="snippet,replies",
    maxResults=25,
)
response = request.execute()

In [23]:
JSON(response)

<IPython.core.display.JSON object>

### Function for Comment Information

In [24]:
def get_video_comments(youtube, video_ids, max_comments_per_video=100):
    """Collect top-level comments for each video, following pagination.

    Parameters
    ----------
    youtube
        API client exposing ``commentThreads().list(...).execute()``.
    video_ids : iterable of str
        Videos whose comment threads are read, in the order given.
    max_comments_per_video : int or None, default 100
        Upper bound on the number of comment threads collected per video.
        Further pages are requested until this many threads were collected or
        the response carries no ``nextPageToken``. ``None`` removes the bound
        and reads every page.

    Returns
    -------
    pandas.DataFrame
        One row per comment thread with the columns ``comment_id``,
        ``video_id``, ``channel_id``, ``published_at``, ``text_display``,
        ``author_name`` and ``like_count``, all taken from the thread's
        top-level comment (replies are requested but not flattened into
        rows). The frame always has these columns, even when nothing is
        returned.

    Notes
    -----
    Errors raised by ``execute()`` are not caught and propagate to the caller.
    """
    columns = [
        "comment_id",
        "video_id",
        "channel_id",
        "published_at",
        "text_display",
        "author_name",
        "like_count",
    ]
    page_size = 100  # page size used for every full page, as in the original request

    rows = []
    for video_id in video_ids:
        collected = 0
        page_token = None

        while max_comments_per_video is None or collected < max_comments_per_video:
            if max_comments_per_video is None:
                wanted = page_size
            else:
                # Never ask for more threads than are still missing.
                wanted = min(page_size, max_comments_per_video - collected)

            params = {
                "part": "snippet,replies",
                "maxResults": wanted,
                "videoId": video_id,
            }
            if page_token:
                params["pageToken"] = page_token

            response = youtube.commentThreads().list(**params).execute()
            threads = response.get("items", [])

            for thread in threads:
                top_level = thread["snippet"]["topLevelComment"]["snippet"]
                rows.append(
                    {
                        "comment_id": thread["id"],
                        "video_id": thread["snippet"]["videoId"],
                        "channel_id": thread["snippet"]["channelId"],
                        "published_at": top_level["publishedAt"],
                        "text_display": top_level["textDisplay"],
                        "author_name": top_level["authorDisplayName"],
                        "like_count": top_level["likeCount"],
                    }
                )

            collected += len(threads)
            page_token = response.get("nextPageToken")
            if not page_token:
                break

    return pd.DataFrame(rows, columns=columns)

In [25]:
comment_details = get_video_comments(youtube, latest_video_list)
comment_details.head()

Unnamed: 0,comment_id,video_id,channel_id,published_at,text_display,author_name,like_count
0,UgyIqOxKydbUPhSovAh4AaABAg,7Mv91hcxCCI,UCueeXkuJezkCqu0YryvJnnQ,2023-06-22T02:59:00Z,Great demo and walkthrough!,Narasimha Murthy,0
1,UgxWwT57bocTPxMWN6l4AaABAg,7Mv91hcxCCI,UCueeXkuJezkCqu0YryvJnnQ,2023-04-17T12:08:51Z,Thanks,Sery christian renaud,1
2,UgzIJ22znlie0ljeRsp4AaABAg,lp6SfYCRGLM,UCJQJAI7IjbLcpsjWdSzYz0Q,2023-10-17T07:11:22Z,There is NO framework for securing your job fr...,vaibhav jadhav,0
3,UgwLPaL2maAunywGQmN4AaABAg,lp6SfYCRGLM,UCJQJAI7IjbLcpsjWdSzYz0Q,2023-10-17T06:19:04Z,Hi Thu Vu! Thank you for sharing this great vi...,Isa Lutfi,0
4,UgwZSDNbHlYe1s1YvLl4AaABAg,lp6SfYCRGLM,UCJQJAI7IjbLcpsjWdSzYz0Q,2023-10-17T06:09:24Z,My first thought: not an other AI video.. - bu...,BG,0


In [26]:
comment_details.shape

(41, 7)

In [27]:
comment_details.groupby("video_id", as_index=False).size()

Unnamed: 0,video_id,size
0,7Mv91hcxCCI,2
1,eHX9sgBt1nE,28
2,lp6SfYCRGLM,11


## Data Post Processing

Decode HTML character references (e.g. `&amp;`, `&#39;`) in the comment text

In [28]:
# Decode HTML character references in every comment text.
comment_details["text_display"] = comment_details["text_display"].map(unescape)

Convert the ISO 8601 duration strings (e.g. `PT13M48S`) to seconds

In [29]:
# Convert the ISO 8601 duration strings (e.g. "PT13M48S") to seconds and drop
# the raw column. Guarded on the source column so that re-running this cell is
# a no-op instead of failing because `duration` has already been dropped.
if 'duration' in latest_video_details.columns:
    latest_video_details['duration_sec'] = (
        latest_video_details['duration']
        .apply(isodate.parse_duration)
        .dt.total_seconds()
    )
    latest_video_details = latest_video_details.drop(columns='duration')

Fix datatypes

In [30]:
# Cast the channel counters to integers.
channel_info = channel_info.astype(
    {"view_count": int, "subscriber_count": int, "video_count": int}
)

In [31]:
# Cast the video counters to integers and parse the publish timestamps.
video_details = video_details.astype(
    {"view_count": int, "like_count": int, "comment_count": int}
)

video_details["published_at"] = pd.to_datetime(
    video_details["published_at"],
    format="%Y-%m-%dT%H:%M:%SZ",
)

In [32]:
# Cast the counters of the latest videos to integers and parse the publish timestamps.
latest_video_details = latest_video_details.astype(
    {"view_count": int, "like_count": int, "comment_count": int}
)

latest_video_details["published_at"] = pd.to_datetime(
    latest_video_details["published_at"],
    format="%Y-%m-%dT%H:%M:%SZ",
)

In [33]:
# Cast the comment like counter to integer and parse the publish timestamps.
comment_details = comment_details.astype({"like_count": int})

comment_details["published_at"] = pd.to_datetime(
    comment_details["published_at"],
    format="%Y-%m-%dT%H:%M:%SZ",
)

## Text Analysis of Comments

In [34]:
from transformers import pipeline

### Classify based on sentiments

In [35]:
# Text-classification pipeline used for sentiment; model files are cached
# under ../model_cache.
sentiment_classifier = pipeline(
    "text-classification",
    model="nickwong64/bert-base-uncased-poems-sentiment",
    model_kwargs=dict(cache_dir="../model_cache"),
)

Keep only the first 400 words of each comment so that model inference does not fail on overly long input

In [52]:
# Whitespace-split every comment and keep at most its first 400 words.
comments_list = [
    " ".join(comment_text.split()[:400])
    for comment_text in comment_details["text_display"].tolist()
]

In [53]:
# truncation=True lets the tokenizer cut any comment that still exceeds the
# model's maximum sequence length: a cap on the number of words does not bound
# the number of tokens.
sentiments = pd.DataFrame(sentiment_classifier(comments_list, truncation=True))

In [54]:
# Give the classifier output descriptive column names.
sentiments = sentiments.rename(
    columns={"label": "sentiment", "score": "sentiment_score"}
)

In [55]:
# Attach the sentiment columns by index. Columns left over from a previous run
# of this cell are dropped first, so re-running does not raise on overlapping
# column names.
comment_details = (
    comment_details
    .drop(columns=list(sentiments.columns), errors="ignore")
    .join(sentiments)
)

### Classify based on sentiments (different multilingual model)

In [56]:
# Second sentiment pipeline, backed by a multilingual model; model files are
# cached under ../model_cache.
multi_lingual_sentiment_classifier = pipeline(
    "text-classification",
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    model_kwargs=dict(cache_dir="../model_cache"),
)

In [57]:
# truncation=True lets the tokenizer cut any comment that still exceeds the
# model's maximum sequence length: a cap on the number of words does not bound
# the number of tokens.
sentiments = pd.DataFrame(
    multi_lingual_sentiment_classifier(comments_list, truncation=True)
)

In [58]:
# Give the classifier output descriptive column names.
sentiments = sentiments.rename(
    columns={"label": "sentiment", "score": "sentiment_score"}
)

In [59]:
sentiments.sentiment.drop_duplicates()

0    positive
2    negative
6     neutral
Name: sentiment, dtype: object

In [60]:
# comment_details[["text_display"]].join(sentiments)

### Classify as questions vs statements

In [61]:
# Pipeline that labels each text as a question or a statement; model files are
# cached under ../model_cache.
question_statement_classifier = pipeline(
    "text-classification",
    model="shahrukhx01/question-vs-statement-classifier",
    model_kwargs=dict(cache_dir="../model_cache"),
)

In [62]:
# truncation=True lets the tokenizer cut any comment that still exceeds the
# model's maximum sequence length: a cap on the number of words does not bound
# the number of tokens.
question_labels = pd.DataFrame(
    question_statement_classifier(comments_list, truncation=True)
)

In [63]:
# Rename the classifier output columns and translate the raw labels:
# LABEL_0 -> "statement", LABEL_1 -> "question".
question_labels = question_labels.rename(
    columns={"label": "question_category", "score": "question_score"}
)

question_labels["question_category"] = question_labels["question_category"].replace(
    {"LABEL_0": "statement", "LABEL_1": "question"}
)

In [64]:
# Attach the question/statement columns by index. Columns left over from a
# previous run of this cell are dropped first, so re-running does not raise on
# overlapping column names.
comment_details = (
    comment_details
    .drop(columns=list(question_labels.columns), errors="ignore")
    .join(question_labels)
)

## Push data to BQ

Before pushing, it is useful to add load_timestamp to the table

In [48]:
gcp_project_id = "wide-hexagon-397214"

Push channel details

In [49]:
# Stamp every row with the time of this load, then append to the BigQuery
# table so earlier loads are kept.
channel_info = channel_info.assign(load_timestamp=datetime.now())

channel_info.to_gbq(
    "youtube_data.channel_info",
    project_id=gcp_project_id,
    if_exists="append",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1015.32it/s]


Push all video details

In [50]:
# Stamp every row with the time of this load, then overwrite the BigQuery table.
video_details = video_details.assign(load_timestamp=datetime.now())

video_details.to_gbq(
    "youtube_data.video_details",
    project_id=gcp_project_id,
    if_exists="replace",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 5309.25it/s]


Push latest video details

In [51]:
# Stamp every row with the time of this load, then overwrite the BigQuery table.
latest_video_details = latest_video_details.assign(load_timestamp=datetime.now())

latest_video_details.to_gbq(
    "youtube_data.latest_video_details",
    project_id=gcp_project_id,
    if_exists="replace",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3521.67it/s]


Push comment details

In [52]:
# Stamp every row with the time of this load, then overwrite the BigQuery table.
comment_details = comment_details.assign(load_timestamp=datetime.now())

comment_details.to_gbq(
    "youtube_data.comment_details",
    project_id=gcp_project_id,
    if_exists="replace",
)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1618.17it/s]
