## Web Scraping with the YouTube API

- This project interacts with the YouTube Data API to retrieve and analyze various statistics about a YouTube channel and its videos. The code is designed to fetch channel statistics, retrieve video IDs from a playlist, and gather detailed statistics for each video. 

- This information can be used for various analytical purposes, such as understanding channel performance, video popularity, and audience engagement.

In [67]:
# Import the required libraries

from googleapiclient.discovery import build 
import pandas as pd
import seaborn as sns 
from IPython.display import HTML
import warnings
warnings.simplefilter("ignore")

In [68]:
# Credentials and target channel: replace both placeholders before running.
api_key = "#Your API key"
channel_id = "#Your Channel ID"

# Service object for the YouTube Data API v3, authenticated with the key above.
youtube = build("youtube", "v3", developerKey=api_key)

## Function to get Channel Statistics 

In [69]:
def get_channel_stats(youtube, channel_id):
    """Fetch headline statistics for a single YouTube channel.

    Args:
        youtube: YouTube Data API service object (as returned by ``build``).
        channel_id: ID of the channel to look up.

    Returns:
        dict with the keys ``Channel_name``, ``Subscribers``, ``Views``,
        ``Total_videos`` and ``Playlist_id`` (the channel's "uploads"
        playlist ID). Values are passed through as returned by the API; a
        statistic missing from the response is reported as ``None``.

    Raises:
        ValueError: if the response contains no channel for ``channel_id``.
    """
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=channel_id,
    )
    response = request.execute()

    # Fail with a clear message instead of an opaque KeyError/IndexError when
    # the lookup matched nothing.
    items = response.get("items")
    if not items:
        raise ValueError(f"No channel found for channel_id={channel_id!r}")

    channel = items[0]
    statistics = channel["statistics"]

    return dict(
        Channel_name=channel["snippet"]["title"],
        # .get(): individual counters may be absent from the response
        # (e.g. a hidden subscriber count).
        Subscribers=statistics.get("subscriberCount"),
        Views=statistics.get("viewCount"),
        Total_videos=statistics.get("videoCount"),
        Playlist_id=channel["contentDetails"]["relatedPlaylists"]["uploads"],
    )

In [None]:
# Fetch the channel summary and show it as the cell output.
channel_data = get_channel_stats(youtube, channel_id)
channel_data

## Function to fetch Video IDs

In [None]:
# ID of the channel's "uploads" playlist, taken from the channel summary.
playlist_id = channel_data["Playlist_id"]

playlist_id

In [72]:
def get_video_ids(youtube, playlist_id):
    """Return the video IDs of every item in a playlist, following pagination.

    Args:
        youtube: YouTube Data API service object (as returned by ``build``).
        playlist_id: ID of the playlist to enumerate.

    Returns:
        list of video ID strings, in the order the API returns them.
    """
    video_ids = []
    next_page_token = None

    while True:
        # Request the largest page size (50) to minimise round trips.
        params = dict(
            part="contentDetails",
            playlistId=playlist_id,
            maxResults=50,
        )
        if next_page_token is not None:
            params["pageToken"] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        video_ids.extend(
            item["contentDetails"]["videoId"]
            for item in response.get("items", [])
        )

        # Advance the token once per page, not once per item, so that a page
        # with no items cannot cause the same page to be requested forever.
        next_page_token = response.get("nextPageToken")
        if next_page_token is None:
            break

    return video_ids

In [None]:
# Collect the video IDs of every item in the uploads playlist.
video_ids = get_video_ids(youtube, playlist_id)

video_ids

## Function to get Video Details 

In [74]:
def get_video_details(youtube, video_ids):
    """Fetch title, publish date, duration and counters for each video.

    Args:
        youtube: YouTube Data API service object (as returned by ``build``).
        video_ids: list of video ID strings.

    Returns:
        list of dicts, one per video returned by the API, with the keys
        ``Title``, ``Published_date``, ``Duration``, ``Views``, ``Comments``
        and ``Likes``. A counter missing from the response is reported as
        ``None`` instead of raising ``KeyError``.
    """
    all_video_stats = []

    # IDs are sent in comma-separated batches of 50 per request.
    for start in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=",".join(video_ids[start:start + 50]),
        )
        response = request.execute()

        for video in response.get("items", []):
            snippet = video["snippet"]
            # Individual counters can be absent for a video (for example when
            # comments are disabled or like counts are hidden), so read them
            # with .get() rather than indexing.
            statistics = video.get("statistics", {})
            all_video_stats.append(
                dict(
                    Title=snippet["title"],
                    Published_date=snippet["publishedAt"],
                    Duration=video["contentDetails"]["duration"],
                    Views=statistics.get("viewCount"),
                    Comments=statistics.get("commentCount"),
                    Likes=statistics.get("likeCount"),
                )
            )

    return all_video_stats
        
    
    

In [75]:
# Per-video details for every ID collected above.
video_details = get_video_details(youtube, video_ids)

In [76]:
# Tabulate the list of per-video dicts: one row per video, one column per key.
video_data = pd.DataFrame(video_details)

## Dataset

In [None]:
# Display the raw table (one row per video).
video_data

# Data Cleaning 

In [None]:
# Preview the first rows before cleaning.
video_data.head()

In [79]:
# (rows, columns) of the raw table.
video_data.shape

(154, 6)

In [80]:
# Column dtypes and non-null counts; every column is still object dtype here.
video_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           154 non-null    object
 1   Published_date  154 non-null    object
 2   Duration        154 non-null    object
 3   Views           154 non-null    object
 4   Comments        154 non-null    object
 5   Likes           154 non-null    object
dtypes: object(6)
memory usage: 7.3+ KB


In [81]:
# Convert the counter columns from strings to numeric dtype
for column in ("Views", "Comments", "Likes"):
    video_data[column] = pd.to_numeric(video_data[column])

In [82]:
# Split 'Published_date' at the "T" separator into 'Date' and 'Time' columns
video_data[["Date", "Time"]] = video_data["Published_date"].str.split("T", expand=True)


In [83]:
# Hour of day at which each video was published, parsed from 'Published_date'
video_data["Upload_hour"] = pd.to_datetime(video_data["Published_date"]).dt.hour


In [None]:
# Split 'Time' at "Z": the part before it stays in 'Time', the remainder goes
# to the throwaway 'Time__' column (dropped in a later cell).
video_data[["Time", "Time__"]] = video_data["Time"].str.split("Z", expand=True)

video_data

In [85]:
# Pull the minutes component out of a duration string using str.split
def extract_minutes_split(duration):
    """Return the minutes component of a duration string such as 'PT1H30M12S'.

    Only the number directly before the first 'M' is returned; any hours or
    seconds in the string are ignored. Returns 0 when the string contains
    no 'M'.
    """
    if 'M' not in duration:
        return 0

    before_minutes = duration.split('M')[0]
    # The minutes digits follow 'H' when an hours part is present,
    # otherwise they follow the 'T' marker.
    separator = 'H' if 'H' in before_minutes else 'T'
    return int(before_minutes.split(separator)[1])

# Replace each raw 'Duration' string with its minutes component
video_data["Duration"] = video_data["Duration"].apply(extract_minutes_split)


In [86]:
# Remove the original timestamp column and the throwaway split remainder.
video_data.drop(columns=["Published_date", "Time__"], inplace=True)



In [88]:
# Parse 'Date' into datetimes, then derive the weekday name from it.
video_data["Date"] = pd.to_datetime(video_data["Date"])
video_data["Day_of_week"] = video_data["Date"].dt.day_name()



In [89]:

# Parse the 'Time' strings and keep only the time-of-day part.
video_data["Time"] = pd.to_datetime(video_data["Time"]).dt.time

In [90]:
# Re-check column dtypes after cleaning.
video_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Title        154 non-null    object        
 1   Duration     154 non-null    int64         
 2   Views        154 non-null    int64         
 3   Comments     154 non-null    int64         
 4   Likes        154 non-null    int64         
 5   Date         154 non-null    datetime64[ns]
 6   Time         154 non-null    object        
 7   Upload_hour  154 non-null    int32         
 8   Day_of_week  154 non-null    object        
dtypes: datetime64[ns](1), int32(1), int64(4), object(3)
memory usage: 10.4+ KB


In [None]:
# Display the cleaned table.
video_data

In [93]:
# Check the Python type of the first value stored in the 'Time' column.
print(type(video_data['Time'][0]))

<class 'datetime.time'>


In [94]:
# Write the cleaned table to a CSV file, leaving out the row index.
file_path = "video_data.csv"

video_data.to_csv(file_path, index=False)

In [95]:
# Build a download link for a saved file
def create_download_link(filename):
    """Return an ``HTML`` object containing a download anchor for *filename*."""
    anchor = f'<a href="{filename}" download>Download CSV file</a>'
    return HTML(anchor)

# Display the download link for the exported CSV as the cell output
create_download_link(file_path)