In [30]:
import os

import pandas as pd
from googleapiclient.discovery import build
from IPython.display import JSON

In [31]:
# Never commit the API key: it is read from the YOUTUBE_API_KEY environment
# variable instead. The key that used to be hard-coded in this cell has been
# exposed and should be revoked and replaced.
api_key = os.environ.get('YOUTUBE_API_KEY', '').strip()
if not api_key:
    # Fail here with a clear message rather than later with an opaque API error.
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key "
        "before running this notebook."
    )

In [32]:
# Load one channel ID per line. The `with` block closes the file handle, and
# blank or whitespace-only lines are skipped so they cannot end up as empty
# entries in the comma-separated `id` parameter sent to the API.
with open('channel_ids.txt', 'r', encoding='utf-8') as ids_file:
    channel_ids = [line.strip() for line in ids_file if line.strip()]

In [33]:
# Sanity check: number of channel IDs read from channel_ids.txt.
len(channel_ids)

32

In [34]:
# Build the API client that every request in this notebook goes through
youtube = build(serviceName='youtube', version='v3', developerKey=api_key)

In [35]:
# Fetch snippet and statistics for all channel IDs in one request
channel_response = (
    youtube.channels()
    .list(id=','.join(channel_ids), part='statistics,snippet')
    .execute()
)

In [36]:
# Pretty print the data
import uuid
from IPython.display import display_javascript, display_html, display
import json

class RenderJSON(object):
    """Render JSON data as a collapsible tree in the notebook output.

    Parameters
    ----------
    json_data : dict, list, str, bytes or other JSON-serializable value
        A ``str`` (or UTF-8 ``bytes``) is assumed to already hold JSON text and
        is used unchanged; anything else is serialized with ``json.dumps``.

    Raises
    ------
    TypeError
        If ``json_data`` is neither text nor JSON-serializable.
    """

    # RawGit (the original host of this script) has been shut down; jsDelivr
    # serves the same file straight from the GitHub repository.
    RENDERJSON_URL = "https://cdn.jsdelivr.net/gh/caldwell/renderjson@master/renderjson.js"

    def __init__(self, json_data):
        if isinstance(json_data, bytes):
            json_data = json_data.decode('utf-8')
        if isinstance(json_data, str):
            self.json_str = json_data
        else:
            # Lists and scalars must be serialized too: interpolating their
            # Python repr (None/True/False, single quotes) is not valid JS.
            self.json_str = json.dumps(json_data)
        # Unique element id so several instances can be shown in one notebook.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """Emit the placeholder <div> and the script that fills it in."""
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid), raw=True)
        display_javascript("""
        require(["%s"], function() {
        document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.RENDERJSON_URL, self.uuid, self.json_str), raw=True)


In [37]:
# Empty frame that will hold one row of metadata per channel
df = pd.DataFrame(
    columns=[
        'channel_id',
        'channel_title',
        'view_count',
        'subscriber_count',
        'country',
    ]
)

In [38]:
def get_channel_data(channel_ids, page_token=None, client=None):
    """
    Retrieve metadata for the specified channels from the YouTube Data API.

    Every result page is followed (iteratively) and a fresh DataFrame is
    returned on each call. The function no longer appends to the module-level
    ``df``, so re-running the cell cannot duplicate rows.

    Parameters
    ----------
    channel_ids : iterable of str
        Channel IDs to look up; they are sent as one comma-separated ``id``.
    page_token : str, optional
        Token of the first page to request; ``None`` starts at the beginning.
    client : optional
        API client to use. Defaults to the notebook-level ``youtube`` client.

    Returns
    -------
    pandas.DataFrame
        One row per channel with the columns ``channel_id``, ``channel_title``,
        ``view_count``, ``subscriber_count``, ``country`` and ``video_count``.
        Counts are kept exactly as found in the response; a count that is
        missing from the response (for example a hidden subscriber count) is
        reported as ``'0'`` so later integer conversion does not fail.
    """
    columns = ['channel_id', 'channel_title', 'view_count',
               'subscriber_count', 'country', 'video_count']
    ids = list(channel_ids)
    if not ids:
        # Nothing to look up: skip the API call entirely.
        return pd.DataFrame(columns=columns)

    service = youtube if client is None else client
    rows = []
    while True:
        channel_response = service.channels().list(
            part='statistics,snippet',
            id=','.join(ids),
            pageToken=page_token
        ).execute()

        # 'items' is looked up defensively so a response without it (no
        # matching channel) yields zero rows instead of a KeyError.
        for channel in channel_response.get('items', []):
            snippet = channel['snippet']
            statistics = channel['statistics']
            rows.append({
                'channel_id': channel['id'],
                'channel_title': snippet['title'],
                'view_count': statistics.get('viewCount', '0'),
                'subscriber_count': statistics.get('subscriberCount', '0'),
                'country': snippet.get('country', ''),
                'video_count': statistics.get('videoCount', '0'),
            })

        # Keep going while the API reports another page.
        page_token = channel_response.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(rows, columns=columns)

In [39]:
# Fetch the metadata for every channel ID and keep the returned frame as `df`.
df = get_channel_data(channel_ids)

In [40]:
import plotly.express as px

In [41]:
# The two count columns become integers so they can be plotted and sorted
for count_column in ('view_count', 'subscriber_count'):
    df[count_column] = df[count_column].astype(int)

# Channel IDs are identifiers, so they are kept as strings
df['channel_id'] = df['channel_id'].astype(str)

In [42]:
# Bar chart: total view count per channel
fig = px.bar(
    data_frame=df,
    x='channel_title',
    y='view_count',
    title='Total View Count by Channel',
)
fig.show()

In [43]:
# Bar chart: total subscriber count per channel
fig = px.bar(
    data_frame=df,
    x='channel_title',
    y='subscriber_count',
    title='Total Subscriber Count by Channel',
)
fig.show()

In [44]:
# Same view-count chart, with channels ordered from fewest to most views.
# Note that `df` itself is re-bound to the sorted frame.
df = df.sort_values(by='view_count')

# Build the chart and label / tilt the x-axis ticks in one chained expression
fig = px.bar(
    data_frame=df,
    x='channel_title',
    y='view_count',
    title='Total View Count by Channel',
).update_xaxes(title='Channel', tickangle=-45)

# Display the chart
fig.show()

In [None]:
# Order channels from fewest to most subscribers (re-binds `df`)
df = df.sort_values(by='subscriber_count')

# Bar chart of the total subscriber count for each channel
fig = px.bar(
    data_frame=df,
    x='channel_title',
    y='subscriber_count',
    title='Total Subscriber Count by Channel',
)

# Display the chart
fig.show()

In [None]:


# Lists to store the data
channel_stats_data = []
video_stats_data = []
video_metadata_data = []
video_category_data = []
audience_data = []

# Channel ID behind each entry of channel_stats_data (same order), so the
# per-channel metric can be joined back onto individual videos further down.
channel_stats_ids = []

# Loop through the artist IDs and retrieve data for each artist
for artist_id in channel_ids:
    # Retrieve channel data using the artist ID
    channel_data = youtube.channels().list(
        part='snippet, statistics',
        id=artist_id
    ).execute()

    # A response without items means the channel could not be found; skip it
    # instead of failing on ['items'][0].
    channel_items = channel_data.get('items', [])
    if not channel_items:
        continue

    # Append the channel statistics data to the list
    channel_stats_data.append(channel_items[0]['statistics'])
    channel_stats_ids.append(artist_id)

    # Walk every page of the channel's video search results
    next_page_token = None
    while True:
        search_response = youtube.search().list(
            q="",
            type="video",
            part="id,snippet",
            channelId=artist_id,
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]

        if video_ids:
            # One videos.list call per search page (at most 50 IDs) instead of
            # one call per video: same data, far fewer requests.
            video_data = youtube.videos().list(
                part='statistics, snippet',
                id=','.join(video_ids)
            ).execute()

            # Videos missing from the response are simply skipped.
            for video in video_data.get('items', []):
                # Append the video statistics data to the list
                video_stats_data.append(video['statistics'])

                # Append the video metadata data to the list
                video_metadata_data.append(video['snippet'])

                # Append the video category data to the list
                video_category_data.append(video['snippet']['categoryId'])

        # Check if there are more results
        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

# Convert the data to pandas dataframes
channel_stats_df = pd.DataFrame(channel_stats_data)
video_stats_df = pd.DataFrame(video_stats_data)
video_metadata_df = pd.DataFrame(video_metadata_data)
video_category_df = pd.DataFrame(video_category_data)

# Merge the video dataframes
video_df = pd.concat([video_stats_df, video_metadata_df, video_category_df], axis=1)

# Calculate engagement metrics.
# The counts are stored as text, so they are converted to numbers for the
# arithmetic only; the columns of video_df keep their original values.
video_comment_counts = pd.to_numeric(video_df['commentCount'], errors='coerce')
video_view_counts = pd.to_numeric(video_df['viewCount'], errors='coerce')
# Zero views becomes NaN so the ratio is NaN rather than inf.
video_df['comments_to_views_ratio'] = (
    video_comment_counts / video_view_counts.where(video_view_counts > 0)
)

# views_per_video is a per-channel figure: compute it per channel, then look
# it up for each video through the video's channelId. Assigning the channel
# series directly would align on the row index and hit the wrong rows.
channel_view_counts = pd.to_numeric(channel_stats_df['viewCount'], errors='coerce')
channel_video_counts = pd.to_numeric(channel_stats_df['videoCount'], errors='coerce')
views_per_video_by_channel = dict(zip(
    channel_stats_ids,
    channel_view_counts / channel_video_counts.where(channel_video_counts > 0),
))
video_df['views_per_video'] = video_df['channelId'].map(views_per_video_by_channel)

# Extract audience data

   



In [None]:
# Work on an independent copy so video_df stays untouched by the cleaning steps
df_copy = video_df.copy(deep=True)
# Show which columns are available
df_copy.columns

In [None]:
# Show the videos that have no likeCount value.
# (The trailing `.ip` was removed: it is not a DataFrame attribute and raised
# AttributeError.)
df_copy[df_copy['likeCount'].isna()]

In [None]:
# Drop columns that won't be used in analysis.
# errors='ignore' keeps this cell re-runnable (the columns are already gone on
# a second run) and tolerant of a column that is absent from this pull.
df_copy.drop(
    columns=['categoryId', 'defaultLanguage', 'favoriteCount', 'localized'],
    inplace=True,
    errors='ignore',
)


In [None]:
# View counts are converted to integers as they are
df_copy['viewCount'] = video_df['viewCount'].astype(int)

# Like and comment counts can be missing; treat a missing value as 0 first
for optional_count in ('likeCount', 'commentCount'):
    df_copy[optional_count] = video_df[optional_count].fillna(0).astype(int)

In [None]:
# Parse the publication timestamps (e.g. 2020-01-31T12:00:00Z) into datetimes
df_copy['publishedAt'] = pd.to_datetime(
    video_df['publishedAt'],
    format='%Y-%m-%dT%H:%M:%SZ',
)


In [None]:
# Prefix the video-level columns so they stay distinguishable from the
# channel-level columns
df_copy.rename(
    columns={
        'publishedAt': 'videoPublishedAt',
        'viewCount': 'videoViewCount',
        'likeCount': 'videoLikeCount',
    },
    inplace=True,
)


In [None]:
# Independent copy of the channel frame for the merge preparation
df_channel = df.copy(deep=True)
# Only the last expression of a cell is rendered, i.e. df_copy.columns
df_channel.columns
df_copy.columns

In [None]:
# Replace missing values with 0, then give the channel columns the camelCase
# names used on the video side (channelId is the merge key)
df_channel.fillna(value=0, inplace=True)
df_channel.rename(
    columns={
        'view_count': 'channelViewCount',
        'subscriber_count': 'channelSubscriberCount',
        'video_count': 'channelVideoCount',
        'channel_id': 'channelId',
    },
    inplace=True,
)

In [None]:
# Attach the channel-level columns to every video via the shared channelId
data = df_copy.merge(df_channel, on='channelId')

In [None]:
# Preview the first rows of the cleaned video frame.
df_copy.head()

In [None]:
# Preview the first rows of the channel frame.
df_channel.head()

In [None]:
# Renumber the rows from 0 and discard the old index
data.reset_index(inplace=True, drop=True)

In [None]:
# Summary statistics of the merged frame.
data.describe()

In [None]:
# Write the merged frame to data001.csv without the index column
data.to_csv(path_or_buf='data001.csv', index=False)