## NLP Project for The Pat McAfee Show Livestream




### Import required libraries and set up the YouTube Data API client

In [4]:
import re
from pathlib import Path

import googleapiclient.discovery
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import creds

# Playlist that the data-collection cells below read from.
playlist_id = "PLeBO0zY8b9XkbJgjfxGlCVCGc0ve8wQj2"

# YouTube Data API v3 client; the API key comes from the local creds module.
youtube = googleapiclient.discovery.build(
    serviceName='youtube',
    version='v3',
    developerKey=creds.api_key,
)

### Data Collection using the YouTube Data API

In [5]:
def get_video_list(youtube, playlist_id):
    """
    Collect the unique videoIds of every item in a playlist.

    Walks the playlistItems listing page by page (requesting up to 50 items
    per page) until a response no longer carries a 'nextPageToken'.

    Inputs:
        youtube - youtube instance to connect to Youtube Data API
        playlist_id - unique id for any given playlist on Youtube
    Outputs:
        video_list - list of all available videoId in given playlist_id,
                     in first-seen order with duplicates removed
    """
    video_list = []
    seen_ids = set()  # O(1) duplicate check; video_list alone preserves the order
    page_token = None

    while True:
        request_kwargs = dict(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50
        )
        # only send pageToken once the API has handed one back
        if page_token is not None:
            request_kwargs['pageToken'] = page_token

        response = youtube.playlistItems().list(**request_kwargs).execute()

        # a page without 'items' is treated as empty instead of raising KeyError
        for video in response.get('items', []):
            video_id = video['contentDetails']['videoId']

            if video_id not in seen_ids:
                seen_ids.add(video_id)
                video_list.append(video_id)

        # more pages?
        page_token = response.get('nextPageToken')
        if page_token is None:
            break

    return video_list

In [6]:
# Collect the videoId of every video in The Pat McAfee Show playlist.
pms_live_list = get_video_list(youtube=youtube, playlist_id=playlist_id)

# Report how many videos were found.
print(len(pms_live_list))

740


In [8]:
def video_details(youtube, video_list):
    """
    Fetch title, publish timestamp, view count and like count for each video.

    The ids are sent to videos().list in slices of at most 50 per request.
    A video whose response has no 'statistics' entry, or lacks 'viewCount' /
    'likeCount' inside it, gets None for the missing value instead of
    aborting the whole collection with a KeyError.

    Inputs:
        youtube - youtube instance to connect to Youtube Data API
        video_list - list of all available videoId in given playlist_id
    Outputs:
        video_stats - a list of dictionaries including title, date/time when video was published, view count, and like count of all videos in given video_list
                      (view_count / like_count are None when the API response omits them)
    """
    batch_size = 50  # ids sent per videos().list request
    video_stats = []

    for i in range(0, len(video_list), batch_size):
        request = youtube.videos().list(
            part = 'snippet,statistics',
            id = video_list[i:i+batch_size] # first pass covers indices 0-49, the next 50-99, and so on
        )

        data = request.execute()

        for video in data.get('items', []):
            snippet = video['snippet']
            # NOTE(review): presumably a statistic is omitted when the owner hides it
            # (e.g. likes) - confirm against the videos resource reference
            statistics = video.get('statistics', {})

            video_stats.append(dict(
                title = snippet['title'],
                publish = snippet['publishedAt'],
                view_count = statistics.get('viewCount'),
                like_count = statistics.get('likeCount')
            ))

    return video_stats

In [9]:
# Pull title, publish time, view count and like count for every collected videoId.
pms_live_daily_data = video_details(youtube=youtube, video_list=pms_live_list)

### Data Cleaning Pt.1

In [25]:
# Build the frame and convert the collected fields in one chained step:
# the two counts become numeric and the publish timestamp becomes a datetime.
pms_df = pd.DataFrame(pms_live_daily_data).assign(
    view_count=lambda frame: pd.to_numeric(frame['view_count']),
    like_count=lambda frame: pd.to_numeric(frame['like_count']),
    publish=lambda frame: pd.to_datetime(frame['publish']),
)

pms_df.head()


Unnamed: 0,title,publish,view_count,like_count
0,The Pat McAfee Show | Wednesday August 24th 2022,2022-08-24 13:45:33+00:00,42420,2090
1,The Pat McAfee Show | Tuesday August 23rd 2022,2022-08-23 19:26:20+00:00,249983,4972
2,The Pat McAfee Show | Monday August 22nd 2022,2022-08-22 20:04:16+00:00,317445,5730
3,The Pat McAfee Show | Friday August 19th 2022,2022-08-19 19:11:01+00:00,280875,4550
4,The Pat McAfee Show | Thursday August 18th 2022,2022-08-18 19:55:45+00:00,326126,5426


In [26]:
# Print the frame's column dtypes and non-null counts.
pms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   title       740 non-null    object             
 1   publish     740 non-null    datetime64[ns, UTC]
 2   view_count  740 non-null    int64              
 3   like_count  740 non-null    int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 23.2+ KB


In [27]:
# Derive weekday features from the publish timestamp:
# a numeric index (Monday=0) and the weekday's name.
pms_df = pms_df.assign(
    day_of_week=lambda frame: frame['publish'].dt.day_of_week,
    day_name=lambda frame: frame['publish'].dt.day_name(),
)

pms_df.head()

Unnamed: 0,title,publish,view_count,like_count,day_of_week,day_name
0,The Pat McAfee Show | Wednesday August 24th 2022,2022-08-24 13:45:33+00:00,42420,2090,2,Wednesday
1,The Pat McAfee Show | Tuesday August 23rd 2022,2022-08-23 19:26:20+00:00,249983,4972,1,Tuesday
2,The Pat McAfee Show | Monday August 22nd 2022,2022-08-22 20:04:16+00:00,317445,5730,0,Monday
3,The Pat McAfee Show | Friday August 19th 2022,2022-08-19 19:11:01+00:00,280875,4550,4,Friday
4,The Pat McAfee Show | Thursday August 18th 2022,2022-08-18 19:55:45+00:00,326126,5426,3,Thursday


In [28]:
# Create the output folder first so the export also works on a fresh checkout
# where data/ does not exist yet (to_csv does not create missing directories).
Path('data').mkdir(parents=True, exist_ok=True)
pms_df.to_csv('data/pms_daily_cleaned_df.csv')

### Data Visualization Pt.1 - The Pat McAfee Show (Daily YouTube Livestream and SiriusXM Radio Channel 82)