In [8]:
import pandas as pd
import os
from googleapiclient.discovery import build
import isodate
from sqlalchemy import create_engine,text

### YouTube API credentials

In [9]:
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

In [10]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=YOUTUBE_API_KEY)

### Import channel IDs from CSV

In [11]:
# File path for CSV import
csv_file_path = 'channels/channel_ids.csv'

# Read channel IDs from CSV file into a list
import_channel_df = pd.read_csv(csv_file_path, header=None, names=["channel_id"])

Connect to db

In [50]:
db_string = 'sqlite:///db/youtube.db'
# Create a engine
engine = create_engine(db_string)
# Create connection
conn = engine.connect()

Check if channel data already exists for each channel in db

In [13]:
# Test database with simple query
query = text('SELECT channel_id FROM channel')
db_channels_df = pd.read_sql_query(query, conn)

In [14]:
# Get only channels that don't exist already in the db
new_channels_df = import_channel_df[~import_channel_df['channel_id'].isin(db_channels_df['channel_id'])]

# Convert new channels to a list to pass to the API
channel_ids = new_channels_df['channel_id'].tolist()

new_channels_df.head()

Unnamed: 0,channel_id
0,UCtYLUTtgS3k1Fg4y5tAhLbw
1,UC4JX40jDee_tINbkjycV4Sg
2,UC8butISFwT-Wl7EV0hUK0BQ
3,UCFbNIlppjAuEX4znoulh0Cw
4,UC6AVa0vSrCpuskzGDDKz_EQ


### Function to return channel details

In [15]:
def get_channel_stats(youtube, channel_ids):
    all_data = []

    try:
        # Split channel_ids list into chunks of up to 50 ids each
        id_chunks = [channel_ids[i:i+50] for i in range(0, len(channel_ids), 50)]

        for id_chunk in id_chunks:
            request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(id_chunk)
            )
            response = request.execute()

            for item in response.get('items', []):
                data = {
                    'channel_name': item['snippet']['title'],
                    'channel_id': item['id'],
                    'description': item['snippet']['description'],
                    'subscriber_count': item['statistics']['subscriberCount'],
                    'view_count': item['statistics']['viewCount'],
                    'video_count': item['statistics']['videoCount'],
                    'playlist_id': item['contentDetails']['relatedPlaylists']['uploads'],
                    'start_date': item['snippet']['publishedAt'],
                    'country': item['snippet'].get('country', None),
                }
                all_data.append(data)
    except Exception as e:
        print(f"Error occurred: {e}")

    return pd.DataFrame(all_data)

### Get channel data if there are new channels to process

In [16]:
# Check if there are new channels to process
if len(new_channels_df) > 0:
    # Get channel details
    channels_df = get_channel_stats(youtube, channel_ids)

### Covert start date to dt format

In [17]:
channels_df['start_date'] = pd.to_datetime(channels_df['start_date']).dt.date

In [19]:
channels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channel_name      100 non-null    object
 1   channel_id        100 non-null    object
 2   description       100 non-null    object
 3   subscriber_count  100 non-null    object
 4   view_count        100 non-null    object
 5   video_count       100 non-null    object
 6   playlist_id       100 non-null    object
 7   start_date        100 non-null    object
 8   country           87 non-null     object
dtypes: object(9)
memory usage: 7.2+ KB


In [20]:
# Check for duplicates
channels_df[channels_df['channel_id'].duplicated()]

Unnamed: 0,channel_name,channel_id,description,subscriber_count,view_count,video_count,playlist_id,start_date,country


Push channel data to db

In [21]:
# Push df to database
channels_df.to_sql("channel", engine, if_exists='append', index=False)

100

In [22]:
# Test database with simple query
query = text('SELECT * FROM channel')
test_df = pd.read_sql_query(query, conn)
test_df.head()

Unnamed: 0,channel_id,channel_name,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,Learn to code for free.,9040000,661570512,1585,UU8butISFwT-Wl7EV0hUK0BQ,2014-12-16,US
1,UCEBpSZhI1X8WaP-kY_2LLcg,365 Data Science,At 365 Data Science we make #DataScience acces...,307000,13782817,224,UUEBpSZhI1X8WaP-kY_2LLcg,2017-08-07,BG
2,UCHXa4OpASJEwrHrLeIzw7Yg,Nicholas Renotte,"Sup!\n\nWelcome to the channel. So, if you're ...",235000,15263747,304,UUHXa4OpASJEwrHrLeIzw7Yg,2019-01-26,AU
3,UCDybamfye5An6p-j1t2YMsg,Mo Chen,"👋 Hey there, my name is Mo Chen and I work as ...",82100,2658864,88,UUDybamfye5An6p-j1t2YMsg,2022-12-25,GB
4,UCkRFwipiIqBTakN-mkZ-GcQ,Ayush Singh,,70200,2075280,24,UUkRFwipiIqBTakN-mkZ-GcQ,2022-06-25,IN


### Get all video ids from a channel

In [23]:
def get_video_ids(youtube, playlist_id):

    video_ids = []

    try:
        # Initial request for the first page of videos
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50
        )
        response = request.execute()

        # Extract video IDs from the first page of response
        for item in response.get('items', []):
            video_ids.append(item['contentDetails']['videoId'])

        # Fetch additional pages of videos, if available
        while 'nextPageToken' in response:
            request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=response['nextPageToken']
            )
            response = request.execute()

            # Extract video IDs from the current page of response
            for item in response.get('items', []):
                video_ids.append(item['contentDetails']['videoId'])
    except Exception as e:
        print(f"Error occurred: {e}")

    return video_ids

### Get video details for video IDs

In [24]:
def get_video_details(youtube, video_ids):

    all_video_info = []

    try:
        # Split video_ids list into chunks of up to 50 IDs each
        id_chunks = [video_ids[i:i+50] for i in range(0, len(video_ids), 50)]

        for id_chunk in id_chunks:
            request = youtube.videos().list(
                part="snippet,contentDetails,statistics",
                id=','.join(id_chunk)
            )
            response = request.execute()

            for video in response.get('items', []):
                video_info = {
                    'channel_id': video['snippet']['channelId'],
                    'video_id': video['id'],
                    'video_title': video['snippet']['title'],
                    'description': video['snippet']['description'],
                    'tags': video['snippet'].get('tags', []),
                    'published': video['snippet']['publishedAt'],
                    'view_count': video['statistics'].get('viewCount', None),
                    'like_count': video['statistics'].get('likeCount', None),
                    'favourite_count': video['statistics'].get('favoriteCount', None),
                    'comment_count': video['statistics'].get('commentCount', None),
                    'duration': video['contentDetails'].get('duration', None),
                    'definition': video['contentDetails'].get('definition', None),
                    'caption': video['contentDetails'].get('caption', None),
                    'category_id': video['snippet'].get('categoryId', None),
                }
                all_video_info.append(video_info)
    except Exception as e:
        print(f"Error occurred: {e}")

    return pd.DataFrame(all_video_info)


### Get video data for each channel

In [25]:
videos_df = pd.DataFrame()

for channel in channels_df['channel_name']:
    # Get playlist id for channel
    playlist_id = channels_df.loc[channels_df['channel_name'] == channel, 'playlist_id'].iloc[0]
    # Get all video ids for channel
    video_ids = get_video_ids(youtube, playlist_id)
    num_of_videos = len(video_ids)
    print(f"{num_of_videos} videos found for channel: {channel}")

    # Get video data for each video
    video_data = get_video_details(youtube, video_ids)

    # Concat to main df
    videos_df = pd.concat([videos_df, video_data], ignore_index=True)

1586 videos found for channel ID: freeCodeCamp.org
224 videos found for channel ID: 365 Data Science
305 videos found for channel ID: Nicholas Renotte
88 videos found for channel ID: Mo Chen
24 videos found for channel ID: Ayush Singh
9 videos found for channel ID: Yosh
145 videos found for channel ID: Rob Mulla
183 videos found for channel ID: Programming with Mosh
198 videos found for channel ID: UVA School of Data Science
126 videos found for channel ID: Sundas Khalid
233 videos found for channel ID: Corey Schafer
161 videos found for channel ID: YUNIKARN
282 videos found for channel ID: Alex The Analyst
83 videos found for channel ID: zedstatistics
803 videos found for channel ID: codebasics
134 videos found for channel ID: Internet Made Coder
38 videos found for channel ID: Egor Howell
673 videos found for channel ID: Web Dev Simplified
202 videos found for channel ID: Machine Learning Street Talk
7172 videos found for channel ID: Simplilearn
98 videos found for channel ID: Joma T

### Convert dates to dt

In [41]:
videos_df['published'] = pd.to_datetime(videos_df['published'])

### Covert duration to seconds using ISODATE

In [42]:
def duration_to_seconds(duration_str):
    duration = isodate.parse_duration(duration_str)
    total_seconds = duration.total_seconds()
    return int(total_seconds)

In [None]:
videos_df['duration'] = videos_df['duration'].apply(duration_to_seconds)

### Convert tag list to string

In [45]:
videos_df['tags'] = videos_df['tags'].apply(', '.join)

In [46]:
videos_df.tail()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
44787,UCwBhBDsqiQflTMLy2epbQVw,RO9rfa8-vwo,Life Engine Update (now with graphs! 📈),Create your own life in the Life Engine: https...,,2021-02-19 22:00:16+00:00,16563,588,0,25,475,hd,False,28
44788,UCwBhBDsqiQflTMLy2epbQVw,HpgXTphPCP0,Bugs are Features in Evolution [The Life Engine],Play the Life Engine: https://thelifeengine.ne...,,2021-02-05 21:55:05+00:00,54992,1268,0,60,982,hd,False,28
44789,UCwBhBDsqiQflTMLy2epbQVw,uGkkm023BSs,Building a Zoo with Evolution [The Life Engine],Here I demonstrate a very different path of ev...,,2021-01-29 21:40:38+00:00,83000,2303,0,74,912,hd,False,28
44790,UCwBhBDsqiQflTMLy2epbQVw,WJyHaPFwFSQ,Evolution of Eyes and Brains [The Life Engine],Play the Life Engine here: https://thelifeengi...,,2020-08-28 15:04:14+00:00,103449,3886,0,141,810,hd,False,28
44791,UCwBhBDsqiQflTMLy2epbQVw,4XEklaH9k6k,Evolution Simulator [The Life Engine],"In this video I introduce the Life Engine, an ...","E, v, o, l, u, t, i, o, n, , s, i, m, u, l, a...",2020-08-07 22:21:22+00:00,194728,6481,0,299,3048,hd,False,28


### Export video df to database

In [None]:
# Push df to database
videos_df.to_sql(name="video", con=engine, if_exists='append', index=False)

In [53]:
# Test database with simple query
query = text("SELECT * FROM video")
test_df = pd.read_sql_query(query, conn)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44792 entries, 0 to 44791
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   channel_id       44792 non-null  object 
 1   video_id         44792 non-null  object 
 2   video_title      44792 non-null  object 
 3   description      44792 non-null  object 
 4   tags             44792 non-null  object 
 5   published        44792 non-null  object 
 6   view_count       44788 non-null  float64
 7   like_count       44624 non-null  float64
 8   favourite_count  44792 non-null  int64  
 9   comment_count    44700 non-null  float64
 10  duration         44792 non-null  int64  
 11  definition       44792 non-null  object 
 12  caption          44792 non-null  object 
 13  category_id      44792 non-null  int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 4.8+ MB


### Get comment for each video

In [None]:
def get_comments_by_video_ids(youtube, video_ids):

    all_comments = []
    
    for id in video_ids[:5]: # limit to 5 videos for testing
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=id,
                maxResults=100,
                order="relevance",
            )
            response = request.execute()

            for comment in response.get('items', []):
                comment_data = {
                    'video_id': comment['snippet']['videoId'],
                    'comment_id': comment['snippet']['topLevelComment']['id'],
                    'body': comment['snippet']['topLevelComment']['snippet']['textOriginal'],
                    'comment_likes': comment['snippet']['topLevelComment']['snippet']['likeCount'],
                    'comment_replies': comment['snippet']['totalReplyCount']
                }

                all_comments.append(comment_data)
            
        except: 
            # Handle if there are no comments for video
            print(f'Failed to get comments for video id: {id}')

    #return response    
    return pd.DataFrame(all_comments)

In [None]:
# Convert video_ids column to a list to pass to the API
video_ids = videos_df['video_id'].tolist()

comments_df = get_comments_by_video_ids(youtube, video_ids)

In [None]:
comments_df.head()

Unnamed: 0,video_id,comment_id,body,comment_likes,comment_replies
0,YdWkUdMxMvM,UgyJJ1jV6CQzwJpKJcB4AaABAg,"I've been trying to change to a Coding career,...",6,3
1,YdWkUdMxMvM,UgyTupQVEMxwqAN7mSh4AaABAg,This code camp toughens the competition. Stop ...,6,1
2,YdWkUdMxMvM,UgwGXdzPdCPVpCe-vFx4AaABAg,Great video 👍,0,0
3,YdWkUdMxMvM,UgxnPUv4Fr-7PyLUWwp4AaABAg,on point,0,0
4,YdWkUdMxMvM,Ugy9oMMo28xQwVFQHwd4AaABAg,Thanks,0,0


In [None]:
# Push df to database
comments_df.to_sql(name="comment", con=engine, if_exists='append', index=False)

217

In [None]:
# Close the connection
conn.close()
engine.dispose()