# Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
from dotenv import load_dotenv
from googleapiclient.discovery import build

# Silence every warning so library deprecation notices do not clutter the notebook output.
warnings.filterwarnings(action="ignore")
# Allow up to 500 rows to be printed before pandas truncates a displayed frame.
pd.options.display.max_rows = 500

## Get environment variables for API key and channel id(s)
* *To analyse several channels at once, use the commented-out `channel_ids` code in place of `channel_id` and store the ids in the `channel_ids` environment variable.*

In [2]:
# Pull the key/value pairs of a local .env file into the process environment.
load_dotenv()

# Credentials and the target channel are read from the environment rather than
# being written into the notebook; either value is None when its variable is unset.
api_key = os.environ.get('api_key')
channel_id = os.environ.get('channel_id')
#channel_ids = os.getenv('channel_ids')

# Client object that every API request below goes through.
youtube = build(serviceName='youtube', version='v3', developerKey=api_key)

## Function to retrieve channel statistics

In [3]:
def get_channel_stats(youtube, channel_id):
    """Fetch name, headline counters and uploads-playlist id for channels.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_id: A single channel id string, or a list/tuple of ids.
            A string is forwarded unchanged; any other iterable is joined
            with commas into one ``id`` argument.

    Returns:
        A list with one dict per item in the response, holding the keys
        ``channel_name``, ``subscribers``, ``views``, ``total_videos`` and
        ``playlist_id``. Counters are passed through exactly as the response
        delivers them; a counter missing from the response is ``None``.
        The list is empty when the response contains no ``items``.
    """
    if channel_id is None or isinstance(channel_id, str):
        ids = channel_id
    else:
        ids = ','.join(channel_id)

    response = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=ids,
    ).execute()

    all_data = []
    # A response without 'items' (no channel matched) yields an empty result
    # instead of a KeyError.
    for item in response.get('items', []):
        statistics = item.get('statistics', {})
        all_data.append(dict(
            channel_name=item['snippet']['title'],
            # .get: a counter can be absent (e.g. a hidden subscriber count).
            subscribers=statistics.get('subscriberCount'),
            views=statistics.get('viewCount'),
            total_videos=statistics.get('videoCount'),
            playlist_id=item['contentDetails']['relatedPlaylists']['uploads'],
        ))
    return all_data

In [4]:
# One API call for the configured channel; the result is a list of per-channel dicts.
channel_stats = get_channel_stats(youtube, channel_id)

## Load channel stats into pandas dataframe
### Data cleaning and processing

In [5]:
# One row per channel dict, one column per dict key.
channel_data = pd.DataFrame(channel_stats)

In [6]:
channel_data

In [7]:
channel_data.dtypes

In [8]:
# Cast the three counter columns to numeric dtypes in one column-wise pass.
counter_columns = ['subscribers', 'views', 'total_videos']
channel_data[counter_columns] = channel_data[counter_columns].apply(pd.to_numeric)
channel_data.dtypes

In [9]:
channel_data

In [10]:
# Select the uploads playlist of the channel named 'Mathilification'
# (the first matching row is used).
is_target_channel = channel_data['channel_name'] == 'Mathilification'
playlist_id = channel_data.loc[is_target_channel, 'playlist_id'].iloc[0]

## Functions to retrieve video IDs and video statistics.

In [11]:
def get_video_ids(youtube, playlist_id):
    """Collect the video ids of every item in a playlist, following pagination.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_id: Id of the playlist to walk.

    Returns:
        A list of video id strings in the order the pages deliver them.
        Empty when the first page carries no ``items``.
    """
    video_ids = []
    next_page_token = None

    while True:
        params = dict(part='contentDetails',
                      playlistId=playlist_id,
                      maxResults=50)
        # The first request carries no pageToken at all; later requests pass
        # the token handed back by the previous page.
        if next_page_token is not None:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        video_ids.extend(item['contentDetails']['videoId']
                         for item in response.get('items', []))

        # A page without 'nextPageToken' is the last one.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return video_ids

In [12]:
# Walk the selected uploads playlist and gather the ids of all its videos.
video_ids = get_video_ids(youtube, playlist_id)

In [13]:
#video_ids

In [14]:
def get_video_details(youtube, video_ids):
    """Fetch title, publish timestamp and counters for the given videos.

    The ids are requested in batches of 50 per call.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: Sequence of video id strings.

    Returns:
        A list with one dict per item in the responses, holding the keys
        ``title``, ``upload_date``, ``view_count``, ``like_count`` and
        ``comment_count``. Values are passed through as delivered; a counter
        that is missing from a video's statistics is ``None`` rather than
        aborting the whole fetch with a ``KeyError``.
    """
    batch_size = 50
    all_video_stats = []

    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        response = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(batch),
        ).execute()

        for video in response.get('items', []):
            snippet = video['snippet']
            statistics = video.get('statistics', {})
            all_video_stats.append(dict(
                title=snippet['title'],
                upload_date=snippet['publishedAt'],
                # .get: individual counters can be absent (e.g. hidden likes
                # or disabled comments).
                view_count=statistics.get('viewCount'),
                like_count=statistics.get('likeCount'),
                comment_count=statistics.get('commentCount'),
            ))
    return all_video_stats

In [15]:
# Fetch per-video details for every id gathered from the playlist.
video_details = get_video_details(youtube, video_ids)

## Load video statistics into pandas dataframe
### Data cleaning and processing

In [16]:
# One row per video dict, one column per dict key.
video_data = pd.DataFrame(video_details)

In [17]:
video_data

In [18]:
video_data.dtypes

In [19]:
# Reduce the publish timestamp to a plain calendar date ...
video_data['upload_date'] = pd.to_datetime(video_data['upload_date']).dt.date
# ... and cast each counter column to a numeric dtype.
for count_column in ('view_count', 'like_count', 'comment_count'):
    video_data[count_column] = pd.to_numeric(video_data[count_column])
video_data

In [20]:
# Ten most-viewed videos, highest view count first.
top10_by_views = (
    video_data
    .sort_values('view_count', ascending=False)
    .iloc[:10]
)
top10_by_views

## Graphical Analysis

In [21]:
# Horizontal bars: one bar per video title, bar length = view count.
bar1 = sns.barplot(x='view_count', y='title', data=top10_by_views)
bar1.set(xlabel="View Count", ylabel="Video Title")
bar1.set_title("Top 10 Videos by View Count", fontsize=16)
plt.show()

In [22]:
# Parse the upload dates once, then derive the three calendar columns from
# the same parsed series.
upload_dates = pd.to_datetime(video_data['upload_date'])
video_data['Month'] = upload_dates.dt.strftime('%b')
video_data['Day'] = upload_dates.dt.strftime('%d')
video_data['Year'] = upload_dates.dt.strftime('%Y')
video_data

In [23]:
# Count uploads per month abbreviation, pooled across all years.
# The bar plot of this frame reads the count from its 'size' column.
videos_per_month = video_data.groupby('Month', as_index = False).size()
videos_per_month

In [24]:
# Calendar order of the month abbreviations, used to order the monthly table
# and the hue levels of the scatter plot.
# NOTE(review): the 'Month' column is produced with strftime('%b'), whose output
# depends on the active locale; these English names assume an English/C locale.
sort_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [25]:
# Index the rows by an ordered categorical built from the 'Month' column so that
# sort_index() puts them in calendar order rather than alphabetical order.
# The 'Month' column itself is kept for plotting.
videos_per_month.index = pd.CategoricalIndex(videos_per_month['Month'], categories = sort_order, ordered = True)
videos_per_month = videos_per_month.sort_index()

In [26]:
videos_per_month

In [27]:
# One bar per month, in the row order of the sorted frame.
bar2 = sns.barplot(x='Month', y='size', data=videos_per_month)
bar2.set_title('Videos per Month', fontsize=16)
bar2.set(ylabel='Number of Videos')
plt.show()

In [28]:
# Save the cleaned per-video table (including the Month/Day/Year columns) to the
# working directory; with default arguments the row index is written as the
# first, unnamed column.
video_data.to_csv('Mathil Youtube.csv')

In [29]:
# Ten most-liked videos, highest like count first.
top10_by_likes = (
    video_data
    .sort_values('like_count', ascending=False)
    .iloc[:10]
)
top10_by_likes

In [30]:
# Horizontal bars: one bar per video title, bar length = like count.
bar3 = sns.barplot(x='like_count', y='title', data=top10_by_likes)
bar3.set(xlabel="Like Count", ylabel="Video Title")
bar3.set_title("Top 10 Videos by Likes", fontsize=16)
plt.show()

In [31]:
# Count uploads per year; 'Year' stays a regular column and the bar plot of
# this frame reads the count from its 'size' column.
videos_per_year = video_data.groupby('Year', as_index = False).size()
videos_per_year

In [32]:
# One bar per year of uploads.
bar4 = sns.barplot(x='Year', y='size', data=videos_per_year)
bar4.set_title('Videos per Year', fontsize=16)
bar4.set(ylabel='Number of Videos')
plt.show()

In [33]:
# Count uploads for every (year, month) combination that occurs in the data;
# the scatter plot of this frame reads the count from its 'size' column.
df = video_data.groupby(['Year', 'Month'], as_index = False).size()
df

In [34]:
# One point per (year, month) pair: x = year, y = upload count, colour = month,
# with the colour levels listed in calendar order.
scat1 = sns.scatterplot(x='Year', y='size', hue='Month', hue_order=sort_order, data=df)
scat1.set_title("Videos per Year by Month", fontsize=16)
scat1.set(ylabel="Number of Videos")
# Anchor the legend beyond the right edge of the axes so it does not cover points.
scat1.legend(bbox_to_anchor=(1.2, 0.5), loc='center right')
scat1.grid(color='grey', linewidth=0.1, which='major')
plt.show()