In [1]:
import pandas as pd
import os
from googleapiclient.discovery import build
import isodate
from sqlalchemy import create_engine,text

### YouTube API credentials

In [2]:
# Read the API key from the environment so it never appears in the notebook
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
if not YOUTUBE_API_KEY:
    # Stop here with a clear message instead of building a client without a
    # key and letting the later API requests fail.
    raise RuntimeError("YOUTUBE_API_KEY environment variable is not set")

In [3]:
# Service name and version passed to the API client builder
api_service_name = "youtube"
api_version = "v3"

# Build the API client, authenticated with the developer key read above
youtube = build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=YOUTUBE_API_KEY,
)

### Import channel IDs from CSV

In [4]:
# Location of the CSV file that lists the channels to import
csv_file_path = 'channels/channel_ids.csv'

# Read the file without a header row into a DataFrame, naming the column "channel_id"
import_channel_df = pd.read_csv(
    csv_file_path,
    header=None,
    names=["channel_id"],
)

Connect to db

In [5]:
# SQLAlchemy URL of the SQLite database file
db_string = 'sqlite:///db/youtube.db'
# Create an engine for that database
engine = create_engine(db_string)
# Open a connection; the read queries below reuse it and the final cell closes it
conn = engine.connect()

Check if channel data already exists for each channel in db

In [6]:
# Load the channel IDs that are already stored in the channel table
query = text('SELECT channel_id FROM channel')
db_channels_df = pd.read_sql_query(query, conn)

In [7]:
# Keep only the imported channels whose ID is not yet in the database
already_stored = import_channel_df['channel_id'].isin(db_channels_df['channel_id'])
new_channels_df = import_channel_df.loc[~already_stored]

# Plain Python list of the new IDs, to pass to the API helper
channel_ids = new_channels_df['channel_id'].tolist()

new_channels_df.head()

Unnamed: 0,channel_id
0,UCtYLUTtgS3k1Fg4y5tAhLbw
1,UC4JX40jDee_tINbkjycV4Sg
2,UC8butISFwT-Wl7EV0hUK0BQ
3,UCFbNIlppjAuEX4znoulh0Cw
4,UC6AVa0vSrCpuskzGDDKz_EQ


### Function to return channel details

In [8]:
def get_channel_stats(youtube, channel_ids):
    """Fetch snippet, statistics and uploads-playlist data for channels.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_ids: List of channel id strings.

    Returns:
        pandas.DataFrame with one row per channel returned by the API and the
        columns channel_name, channel_id, description, subscriber_count,
        view_count, video_count, playlist_id, start_date and country. The
        columns are present even when no rows were collected, so callers can
        always select them. Count values are stored as returned by the API.

    Errors are handled per request: a failing batch is reported with
    ``print`` and skipped, and the remaining batches are still fetched.
    """
    columns = [
        'channel_name', 'channel_id', 'description', 'subscriber_count',
        'view_count', 'video_count', 'playlist_id', 'start_date', 'country',
    ]
    all_data = []

    # The ids are sent in batches of up to 50 per request
    for start in range(0, len(channel_ids), 50):
        id_chunk = channel_ids[start:start + 50]

        try:
            request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(id_chunk)
            )
            response = request.execute()

            for item in response.get('items', []):
                snippet = item['snippet']
                # Statistics fields may be omitted from the response (for
                # example when a channel hides its subscriber count), so they
                # are read with .get() instead of failing the whole batch.
                statistics = item.get('statistics', {})
                all_data.append({
                    'channel_name': snippet['title'],
                    'channel_id': item['id'],
                    'description': snippet['description'],
                    'subscriber_count': statistics.get('subscriberCount'),
                    'view_count': statistics.get('viewCount'),
                    'video_count': statistics.get('videoCount'),
                    'playlist_id': item['contentDetails']['relatedPlaylists']['uploads'],
                    'start_date': snippet['publishedAt'],
                    'country': snippet.get('country', None),
                })
        except Exception as e:
            # Report and continue with the next batch rather than dropping
            # every remaining channel.
            print(f"Error occurred: {e}")

    return pd.DataFrame(all_data, columns=columns)

### Get channel data if there are new channels to process

In [9]:
# Fetch channel details only when the CSV contains channels missing from the database
if len(new_channels_df) > 0:
    channels_df = get_channel_stats(youtube, channel_ids)
else:
    # Nothing new to import: still define channels_df (empty, with the columns
    # get_channel_stats produces) so the cells below do not raise a NameError.
    channels_df = pd.DataFrame(columns=[
        'channel_name', 'channel_id', 'description', 'subscriber_count',
        'view_count', 'video_count', 'playlist_id', 'start_date', 'country',
    ])

### Convert start date to date format

In [10]:
# Parse the start_date values as datetimes, then keep only the date part
start_timestamps = pd.to_datetime(channels_df['start_date'])
channels_df['start_date'] = start_timestamps.dt.date

In [11]:
# Preview the first rows of the channel data
channels_df.head()

Unnamed: 0,channel_name,channel_id,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,Dataquest,UC_lePY0Lm0E2-_IkYUWpI5A,Learn AI and data skills 10x faster with Dataq...,49400,1684659,49,UU_lePY0Lm0E2-_IkYUWpI5A,2019-01-30,US
1,Luke Barousse,UCLLw7jmFsvfIVaUFsLs8mlQ,"What's up, Data Nerds! I'm Luke, a data analys...",412000,20450551,153,UULLw7jmFsvfIVaUFsLs8mlQ,2020-08-03,US
2,Learn with Lukas,UCw_LFe2pS8x3NyipGNJgeEA,"Hey there, I'm Lukas! 👋\n\nOn this channel I'l...",33000,1843768,72,UUw_LFe2pS8x3NyipGNJgeEA,2020-11-26,US
3,Ayush Singh,UCkRFwipiIqBTakN-mkZ-GcQ,,70200,2075280,24,UUkRFwipiIqBTakN-mkZ-GcQ,2022-06-25,IN
4,Mo Chen,UCDybamfye5An6p-j1t2YMsg,"👋 Hey there, my name is Mo Chen and I work as ...",82100,2658864,88,UUDybamfye5An6p-j1t2YMsg,2022-12-25,GB


In [12]:
# Show every row whose channel_id repeats an earlier row (empty output means no duplicates)
is_repeat = channels_df['channel_id'].duplicated()
channels_df.loc[is_repeat]

Unnamed: 0,channel_name,channel_id,description,subscriber_count,view_count,video_count,playlist_id,start_date,country


Push channel data to db

In [13]:
# Append the channel rows to the "channel" table without writing the DataFrame index
channels_df.to_sql(
    name="channel",
    con=engine,
    if_exists='append',
    index=False,
)

100

In [14]:
# Read the channel table back and preview it to check the rows that are stored
query = text('SELECT * FROM channel')
test_df = pd.read_sql_query(query, conn)
test_df.head()

Unnamed: 0,channel_id,channel_name,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,UC_lePY0Lm0E2-_IkYUWpI5A,Dataquest,Learn AI and data skills 10x faster with Dataq...,49400,1684659,49,UU_lePY0Lm0E2-_IkYUWpI5A,2019-01-30,US
1,UCLLw7jmFsvfIVaUFsLs8mlQ,Luke Barousse,"What's up, Data Nerds! I'm Luke, a data analys...",412000,20450551,153,UULLw7jmFsvfIVaUFsLs8mlQ,2020-08-03,US
2,UCw_LFe2pS8x3NyipGNJgeEA,Learn with Lukas,"Hey there, I'm Lukas! 👋\n\nOn this channel I'l...",33000,1843768,72,UUw_LFe2pS8x3NyipGNJgeEA,2020-11-26,US
3,UCkRFwipiIqBTakN-mkZ-GcQ,Ayush Singh,,70200,2075280,24,UUkRFwipiIqBTakN-mkZ-GcQ,2022-06-25,IN
4,UCDybamfye5An6p-j1t2YMsg,Mo Chen,"👋 Hey there, my name is Mo Chen and I work as ...",82100,2658864,88,UUDybamfye5An6p-j1t2YMsg,2022-12-25,GB


### Get all video ids from a channel

In [15]:
def get_video_ids(youtube, playlist_id):
    """Collect the video ids of all items in a playlist, page by page.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: Id of the playlist to read.

    Returns:
        List of video id strings in the order the API returned them. If a
        request fails, the error is printed and the ids gathered so far are
        returned.
    """
    collected = []

    # Arguments shared by every page request; a page token is added once the
    # API reports that another page exists.
    params = {
        'part': 'contentDetails',
        'playlistId': playlist_id,
        'maxResults': 50,
    }

    try:
        while True:
            response = youtube.playlistItems().list(**params).execute()

            for entry in response.get('items', []):
                collected.append(entry['contentDetails']['videoId'])

            # No token in the response means this was the last page
            if 'nextPageToken' not in response:
                break
            params['pageToken'] = response['nextPageToken']
    except Exception as e:
        print(f"Error occurred: {e}")

    return collected

### Get video details for video IDs

In [16]:
def get_video_details(youtube, video_ids):
    """Fetch snippet, statistics and content details for videos.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: List of video id strings.

    Returns:
        pandas.DataFrame with one row per video returned by the API and the
        columns channel_id, video_id, video_title, description, tags,
        published, view_count, like_count, favourite_count, comment_count,
        duration, definition, caption and category_id. ``tags`` holds a list
        (empty when the video has none); other optional fields are None when
        absent. The columns are present even when no rows were collected.

    Errors are handled per request: a failing batch is reported with
    ``print`` and skipped, and the remaining batches are still fetched.
    """
    columns = [
        'channel_id', 'video_id', 'video_title', 'description', 'tags',
        'published', 'view_count', 'like_count', 'favourite_count',
        'comment_count', 'duration', 'definition', 'caption', 'category_id',
    ]
    all_video_info = []

    # The ids are sent in batches of up to 50 per request
    for start in range(0, len(video_ids), 50):
        id_chunk = video_ids[start:start + 50]

        try:
            request = youtube.videos().list(
                part="snippet,contentDetails,statistics",
                id=','.join(id_chunk)
            )
            response = request.execute()

            for video in response.get('items', []):
                snippet = video['snippet']
                statistics = video.get('statistics', {})
                content = video.get('contentDetails', {})
                all_video_info.append({
                    'channel_id': snippet['channelId'],
                    'video_id': video['id'],
                    'video_title': snippet['title'],
                    'description': snippet['description'],
                    'tags': snippet.get('tags', []),
                    'published': snippet['publishedAt'],
                    'view_count': statistics.get('viewCount', None),
                    'like_count': statistics.get('likeCount', None),
                    # The API spells this field "favoriteCount"
                    'favourite_count': statistics.get('favoriteCount', None),
                    'comment_count': statistics.get('commentCount', None),
                    'duration': content.get('duration', None),
                    'definition': content.get('definition', None),
                    'caption': content.get('caption', None),
                    'category_id': snippet.get('categoryId', None),
                })
        except Exception as e:
            # Report and continue with the next batch rather than dropping
            # every remaining video.
            print(f"Error occurred: {e}")

    return pd.DataFrame(all_video_info, columns=columns)


### Get video data for each channel

In [27]:
# Collect one DataFrame of video details per channel, then combine them once
video_frames = []

# Pair each channel id with its own playlist id row by row. Looking the
# playlist up by channel name would pick the wrong playlist whenever two
# channels share a name.
for channel_id, playlist_id in zip(channels_df['channel_id'], channels_df['playlist_id']):
    # Get all video ids for channel
    video_ids = get_video_ids(youtube, playlist_id)
    num_of_videos = len(video_ids)
    print(f"{num_of_videos} videos found for channel ID: {channel_id}")

    # Get video data for each video
    video_data = get_video_details(youtube, video_ids)
    video_frames.append(video_data)

# A single concat avoids re-copying the accumulated frame on every iteration;
# pd.concat rejects an empty list, so fall back to an empty frame.
videos_df = pd.concat(video_frames, ignore_index=True) if video_frames else pd.DataFrame()

### Convert published dates to datetime

In [28]:
# Parse the published values into pandas datetimes
published_at = pd.to_datetime(videos_df['published'])
videos_df['published'] = published_at

### Convert duration to seconds using isodate

In [29]:
def duration_to_seconds(duration_str):
    """Convert a duration string to a whole number of seconds.

    Args:
        duration_str: Duration text accepted by ``isodate.parse_duration``,
            or a missing value (None / NaN).

    Returns:
        The duration in seconds as an int, or None when the input is missing.
    """
    # get_video_details stores None when a video has no duration; pass the
    # missing value through instead of letting the parser raise.
    if pd.isna(duration_str):
        return None

    duration = isodate.parse_duration(duration_str)
    # total_seconds() returns a float; fractional seconds are truncated
    return int(duration.total_seconds())

In [30]:
# Replace each duration string with its length in seconds
duration_seconds = videos_df['duration'].apply(duration_to_seconds)
videos_df['duration'] = duration_seconds

### Convert tag list to string

In [31]:
# Flatten each tag list into one comma-separated string
joined_tags = videos_df['tags'].apply(lambda tag_list: ', '.join(tag_list))
videos_df['tags'] = joined_tags

In [32]:
# Preview the last rows of the video data
videos_df.tail()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
44780,UCqBFsuAz41sqWcFjZkqmJqQ,WBiMeRD5yXk,Choropleth Map (Submit Button) - Python Dash P...,Learn how to make an interactive Choropleth ma...,"data analysis, data visualization, python, dat...",2020-02-09 08:59:29+00:00,16205,280,0,55,1194,hd,False,27
44781,UCqBFsuAz41sqWcFjZkqmJqQ,DCHkv3x3Vs8,Scatter Plot (RangeSlider) - Python Dash Plotly,"Using RangeSlider and a Scatter Plot, this tut...","Dash, Plotly, Scatter Plot, data analysis, dat...",2020-01-29 06:31:13+00:00,9142,134,0,22,1072,hd,False,27
44782,UCqBFsuAz41sqWcFjZkqmJqQ,7R7VMSLwooo,Scatter Plot Mapbox (Checkbox) - Dash Python,Learn to create an interactive Scatter MapBox ...,"Dash, Plotly, Bar graph, Bar Chart, data analy...",2020-01-21 14:43:10+00:00,18183,288,0,60,2012,hd,False,27
44783,UCqBFsuAz41sqWcFjZkqmJqQ,FuJOsZgo4nU,Bar Graph (RadioItems) - Python Dash Plotly,"Using Radioitems and bar graphs, this tutorial...","Dash tutorial series, Interacting With Plotly ...",2020-01-18 22:22:47+00:00,12175,173,0,33,1739,hd,True,27
44784,UCqBFsuAz41sqWcFjZkqmJqQ,iV51JqP6y_Q,Pie Chart (Dropdowns) - Python Dash Plotly,"Using Dropdown and a pie chart, this tutorial ...",GETTING STARTED WITH SCATTERPLOT AND PIE CHART...,2020-01-12 14:26:08+00:00,32700,535,0,110,1063,hd,False,27


### Export video df to database

In [33]:
# Append the video rows to the "video" table without writing the DataFrame index.
# NOTE(review): because of if_exists='append', re-running this cell would add the
# same rows again unless the table enforces uniqueness — confirm against the schema.
videos_df.to_sql(name="video", con=engine, if_exists='append', index=False)

44785

In [34]:
# Read the video table back and preview it to check the rows that are stored
query = text("SELECT * FROM video")
test_df = pd.read_sql_query(query, conn)
test_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
0,UC_lePY0Lm0E2-_IkYUWpI5A,EDt811lX6B4,Why you Should Keep Working on Harder (and Har...,Become a Python Pro! Take on Bigger Challenges...,,2023-10-02 14:00:12.000000,414.0,21.0,0,0.0,58,hd,False,27
1,UC_lePY0Lm0E2-_IkYUWpI5A,blXQZFGzAdA,How to Create Your Own Python Projects?,After you’ve worked through a few structured p...,"learn python, learn to code, how to learn pyth...",2023-09-28 14:00:22.000000,676.0,26.0,0,0.0,32,hd,False,27
2,UC_lePY0Lm0E2-_IkYUWpI5A,WM47-5hvtmA,Why Start with Guided Projects? Learn Python t...,"When it comes to guided projects, there is no ...","learn python, learn to code, how to learn pyth...",2023-09-25 14:00:23.000000,301.0,14.0,0,0.0,44,hd,False,27
3,UC_lePY0Lm0E2-_IkYUWpI5A,VycKHaHXlEc,"Learn the Basic Syntax, Quickly!",We know you can't wait to start your own proje...,"learn python, learn to code, how to learn pyth...",2023-09-22 14:00:23.000000,404.0,13.0,0,0.0,27,hd,False,27
4,UC_lePY0Lm0E2-_IkYUWpI5A,-9-U6BwLU7M,Why do You Want to Learn Python?,Ever wondered why you should learn Python? Ste...,"how to learn programming language, learn progr...",2023-09-18 14:00:00.000000,586.0,28.0,0,1.0,60,hd,False,27


### Get comments for each video

In [30]:
def get_comments_by_video_ids(youtube, video_ids, max_videos=5):
    """Fetch top-level comments for a list of videos.

    One request is made per video, asking for up to 100 comment threads
    ordered by relevance; further pages are not requested.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_ids: List of video id strings.
        max_videos: Number of videos (from the start of ``video_ids``) to
            process. Defaults to 5, the limit used while testing; pass None
            to process every video.

    Returns:
        pandas.DataFrame with one row per comment thread and the columns
        video_id, comment_id, body, comment_likes and comment_replies.
        Videos whose request or parsing fails are reported with ``print``
        and skipped.
    """
    all_comments = []

    selected_ids = video_ids if max_videos is None else video_ids[:max_videos]

    for video_id in selected_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=100,
                order="relevance",
            )
            response = request.execute()

            for thread in response.get('items', []):
                thread_snippet = thread['snippet']
                top_level = thread_snippet['topLevelComment']
                all_comments.append({
                    'video_id': thread_snippet['videoId'],
                    'comment_id': top_level['id'],
                    'body': top_level['snippet']['textOriginal'],
                    'comment_likes': top_level['snippet']['likeCount'],
                    'comment_replies': thread_snippet['totalReplyCount'],
                })

        except Exception as e:
            # Catch Exception (not a bare except) so KeyboardInterrupt still
            # stops the loop, and include the reason in the message.
            print(f'Failed to get comments for video id: {video_id} ({e})')

    return pd.DataFrame(all_comments)

In [33]:
# Convert video_ids column to a list to pass to the API
video_ids = videos_df['video_id'].tolist()

comments_df = get_comments_by_video_ids(youtube, video_ids)

In [34]:
# Preview the first rows of the comment data
comments_df.head()

Unnamed: 0,video_id,comment_id,body,comment_likes,comment_replies
0,H6kKmMB-LdQ,UgxRLIp2ZTtu2fQp9FR4AaABAg,Create your free ClickUp account here: https:/...,0,1
1,H6kKmMB-LdQ,UgwIg-QP4EidK4B0Rx94AaABAg,What about an updated desk set up tour?,0,0
2,H6kKmMB-LdQ,Ugzfwg6Q_1m5KtAiPw54AaABAg,This is amazing!,0,0
3,H6kKmMB-LdQ,Ugx_csBjo-76WzOjBr94AaABAg,"Goede video Dave, thanks!",0,0
4,H6kKmMB-LdQ,Ugx3gUipt86ZAOJDEjl4AaABAg,Lets go!,0,0


In [35]:
# Append the comment rows to the "comment" table without writing the DataFrame index
comments_df.to_sql(name="comment", con=engine, if_exists='append', index=False)

56

In [221]:
# Close the connection opened at the start of the notebook
conn.close()
# Dispose of the engine now that the connection is closed
engine.dispose()