In [1]:
#####
##### Author: Jonathan Chan
##### 
##### ELI5: Access the google API to search pages of results, get stats_df, 
##### Refactor the youtube build variables
#####



In [2]:
import json
import os

import pandas as pd
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
# import matplotlib.pyplot as plt
# from youtube_transcript_api import YouTubeTranscriptApi

import credentials

# --- YouTube Data API connection settings -----------------------------------
# The API key is read from the local `credentials` module.
DEVELOPER_KEY = credentials.GOOGLE_API_KEY
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# --- Search settings ---------------------------------------------------------
query = "why I quit my tech job"
max_results = 500
order = 'relevance'

# Alternative settings for a full run (uses up Google API credits):
# max_results=1000
# data_folder = "data"

# --- Output location ---------------------------------------------------------
# Folder the scraped data is saved into.
data_folder = "data"
# out_filepath =f"{data_folder}/01_youtubeStatsTranscripts.json"

# --- Category lookup ---------------------------------------------------------
# Maps the integer YouTube category id to its display name.
cat_dict = {
    1: "Film & Animation",
    2: "Autos & Vehicles",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    18: "Short Movies",
    19: "Travel & Events",
    20: "Gaming",
    21: "Videoblogging",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Anime/Animation",
    32: "Action/Adventure",
    33: "Classics",
    34: "Comedy",
    35: "Documentary",
    36: "Drama",
    37: "Family",
    38: "Foreign",
    39: "Horror",
    40: "Sci-Fi/Fantasy",
    41: "Thriller",
    42: "Shorts",
    43: "Shows",
    44: "Trailers",
}



In [3]:
# #### Setup the YouTube API access using the variables listed above
# youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,developerKey=DEVELOPER_KEY)



In [4]:
#### FUNCTIONAL VERSION OF v2

def scrape_youtube_df(input_query="why I quit my job", result_limit=None):
    """
    Search YouTube for videos matching ``input_query`` and return the hits as a DataFrame.

    The API client is built inside this function from the module-level
    YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION and DEVELOPER_KEY settings.
    Result pages are followed through ``nextPageToken`` until either the
    requested number of results has been asked for or the API reports that
    there is no further page.

    Parameters
    ----------
    input_query : str
        Search phrase sent to the API.
    result_limit : int, optional
        Maximum number of results to request. When omitted, the module-level
        ``max_results`` setting is used.

    Returns
    -------
    pandas.DataFrame
        One row per video with the columns ``video_id``, ``video_title``,
        ``channel_id`` and ``published_at``. The columns are present even when
        the search returns nothing.
    """
    page_size = 50  # search.list serves at most 50 items per page
    columns = ["video_id", "video_title", "channel_id", "published_at"]
    limit = max_results if result_limit is None else result_limit

    #### Setup the YouTube API access using the module-level settings
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

    # Round up so a limit that is not a multiple of the page size is still honoured.
    n_iterations = max((limit + page_size - 1) // page_size, 0)
    print(f"n iterations: {n_iterations}")

    video_list = []
    next_token = None
    for page_index in range(n_iterations):
        # The last page may need fewer than a full page to respect the limit.
        remaining = limit - page_index * page_size
        search_response = youtube.search().list(
            q=input_query, pageToken=next_token, maxResults=min(page_size, remaining),
            type="video", order="relevance", part="id,snippet").execute()

        for curr_vid in search_response.get('items', []):
            snippet = curr_vid['snippet']
            video_list.append({
                "video_id": curr_vid['id']['videoId'],
                "video_title": snippet['title'],
                "channel_id": snippet['channelId'],
                'published_at': snippet['publishedAt']
            })

        # No nextPageToken means this was the last page. Stop here: a request
        # without a page token would just return the first page again.
        next_token = search_response.get('nextPageToken')
        if next_token is None:
            break

    results_df = pd.DataFrame(video_list, columns=columns)
    print(f"LENGTH OF VIDEO LIST: {len(video_list)}")
    print(f"LENGTH OF VIDEO df: {results_df.shape}")

    return results_df

# Run the search. The query is passed here as a literal string; the
# module-level `query` setting is not used by this call.
results_df = scrape_youtube_df("Why I quit my analyst job")
# Preview the first rows of the search results.
results_df.head()

n iterations: 10
LENGTH OF VIDEO LIST: 500
LENGTH OF VIDEO df: (500, 4)


Unnamed: 0,video_id,video_title,channel_id,published_at
0,bVJfQAe-UP4,Why I Quit my 125k Analytics Job,UC7cs8q-gJRlGwj4A8OmCmXg,2022-12-20T12:00:33Z
1,-ayEjbg0ZEc,I QUIT MY JOB as a Financial Analyst to START ...,UChpxXPOP1xIq65mpNv-DclQ,2021-07-22T14:00:01Z
2,C9h0uhjdsOI,The REAL Reason I Quit My 6-Figure Data Analys...,UC0GmdVKZhMM3Rmielp4oVAA,2022-03-07T17:43:04Z
3,sd5F1uR3tvA,I quit my 6-figure Data Analyst job in 4 month...,UCNXDU-8M1KFAWpFxmev8C-g,2024-03-23T11:22:46Z
4,M8md7_gyBy4,"I QUIT my $170,000 Tech Job After Learning 3 L...",UC0GmdVKZhMM3Rmielp4oVAA,2022-01-17T10:58:08Z


In [6]:
def _video_stats_row(video_id, item):
    """
    Flatten one ``videos.list`` item into the row dict used by create_stats_df.

    Fields the response leaves out are replaced with placeholders:
    "Not available" for favourite/view/like/dislike counts, 0 for the comment
    count, 'No Tags' / "No Description" for the text fields, and "Unknown" for
    a category id that is missing or not present in ``cat_dict``.
    """
    snippet = item['snippet']
    statistics = item.get('statistics', {})

    # categoryId arrives as a numeric string; cat_dict is keyed by int.
    raw_category = snippet.get('categoryId')
    if raw_category is None:
        category = "Unknown"
    else:
        category = cat_dict.get(int(raw_category), "Unknown")

    return {
        "video_id": video_id,
        "published_at": snippet['publishedAt'],
        "channel_id": snippet['channelId'],
        "channel_name": snippet['channelTitle'],
        "Category": category,
        "favourites": statistics.get('favoriteCount', "Not available"),
        "title": snippet['title'],
        "views": statistics.get('viewCount', "Not available"),
        "likes": statistics.get('likeCount', "Not available"),
        'dislikes': statistics.get('dislikeCount', "Not available"),
        'comment_count': statistics.get('commentCount', 0),
        "description": snippet.get('description', "No Description"),
        "tags": snippet.get('tags', 'No Tags')
    }


def create_stats_df(input_df):
    """
    Build a new DataFrame of statistics and metadata for the videos in ``input_df``.

    ``input_df`` is only read: its 'video_id' column (ex: 'bVJfQAe-UP4')
    supplies the ids to look up. The ids are sent to ``videos.list`` in
    comma-separated batches rather than one request per video.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide a 'video_id' column of YouTube video id strings.

    Returns
    -------
    pandas.DataFrame
        One row per input id, in input order, with the columns video_id,
        published_at, channel_id, channel_name, Category, favourites, title,
        views, likes, dislikes, comment_count, description and tags.
        Ids for which the API returns no item are left out and their number
        is printed.
    """
    batch_size = 50  # ids per videos.list request
    columns = [
        "video_id", "published_at", "channel_id", "channel_name", "Category",
        "favourites", "title", "views", "likes", "dislikes", "comment_count",
        "description", "tags",
    ]

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

    video_ids = list(input_df['video_id'])
    # Ask for each id only once, keeping first-seen order.
    unique_ids = list(dict.fromkeys(video_ids))

    items_by_id = {}
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        response = youtube.videos().list(
            part='statistics,snippet',
            id=",".join(batch)).execute()
        for item in response.get('items', []):
            items_by_id[item['id']] = item

    # Rebuild the rows in the caller's order; ids with no returned item are
    # skipped instead of failing on an empty 'items' list.
    stats_list = [
        _video_stats_row(curr_id, items_by_id[curr_id])
        for curr_id in video_ids
        if curr_id in items_by_id
    ]

    n_missing = len(video_ids) - len(stats_list)
    if n_missing:
        print(f"SKIPPED {n_missing} VIDEO IDS - NO STATS RETURNED")

    stats_df = pd.DataFrame(stats_list, columns=columns)
    return stats_df

# Look up statistics and metadata for every video id in results_df.
stats_df = create_stats_df(results_df)

# Preview the first rows of the statistics table.
stats_df.head()

Unnamed: 0,video_id,published_at,channel_id,channel_name,Category,favourites,title,views,likes,dislikes,comment_count,description,tags
0,bVJfQAe-UP4,2022-12-20T12:00:33Z,UC7cs8q-gJRlGwj4A8OmCmXg,Alex The Analyst,Education,0,Why I Quit my 125k Analytics Job,158087,8715,Not available,858,"This is not where I saw my career going, but h...","[Data Analyst, Data Analyst job, Data Analyst ..."
1,-ayEjbg0ZEc,2021-07-22T14:00:01Z,UChpxXPOP1xIq65mpNv-DclQ,NFTs Simplified,People & Blogs,0,I QUIT MY JOB as a Financial Analyst to START ...,3968,211,Not available,61,I quit my job as a financial analyst to start ...,"[should you quit your job, should i quit my jo..."
2,C9h0uhjdsOI,2022-03-07T17:43:04Z,UC0GmdVKZhMM3Rmielp4oVAA,Stefanovic,People & Blogs,0,The REAL Reason I Quit My 6-Figure Data Analys...,70501,3421,Not available,204,Save 25% off Datacamp here:\nhttps://www.datac...,No Tags
3,sd5F1uR3tvA,2024-03-23T11:22:46Z,UCNXDU-8M1KFAWpFxmev8C-g,Jess Ramos,Entertainment,0,I quit my 6-figure Data Analyst job in 4 month...,91,3,Not available,0,,No Tags
4,M8md7_gyBy4,2022-01-17T10:58:08Z,UC0GmdVKZhMM3Rmielp4oVAA,Stefanovic,People & Blogs,0,"I QUIT my $170,000 Tech Job After Learning 3 L...",1011039,41165,Not available,1875,Save 25% off Datacamp here:\nhttps://datacamp....,"[i quit my job, passive income, passive income..."


In [7]:
# Save the statistics table as CSV inside data_folder.
out_filepath = f"{data_folder}/01_youtube_stats_df.csv"
# to_csv does not create missing directories, so make sure the output folder
# exists first (this is a no-op when it is already there).
os.makedirs(data_folder, exist_ok=True)
stats_df.to_csv(out_filepath)

In [None]:
### Everything below has already been moved into the functions above

In [None]:
# # IGNORE - ALREADY ADDED TO FUNCTION
# ### ### VERSION 2 - MAIN SEARCH FOR PAGES OF RESULTS (Google API)
# #### Use youtube.Search().list() to iterate through pages to get a list of responses
# #### Test the iteration through the search results
# #### 50 is the number of search results for each page - get number of iterations to go through


# n_results = 0
# n_iterations = max_results // 50 # get whole number of times we need to go through pages of youtube searches -
# print(f"n iterations: {n_iterations}")

# ### Initialize list of pages of results prior to searches
# results_list = []
# next_token = None
# prev_token = None

# for i in range(0, n_iterations):
#     ### TEST OUTPUT - NEXT 2 LINES
#     # print(f"iteration {i}")
#     # print(f"TOKENS BEFORE: {next_token} and {prev_token}")

#     #Main Google API search - go through current page's results, append to results_list
#     #individual results are stored in results_list[i]['items']
#     search_response = youtube.search().list(
#         q=query, pageToken=next_token,
#         maxResults=50, #only 50 results per page - verify?
#         type="video", order = "relevance", part="id, snippet", location=None,locationRadius=None
#     ).execute()
#     results_list.append(search_response)
    
#     #Update next_token and prev_token variables for next search iteration
#     if 'nextPageToken' in search_response.keys():
#         next_token=search_response['nextPageToken']
#     else:
#         next_token=None
#     if 'prevPageToken' in search_response.keys():
#         prev_token=search_response['prevPageToken']
#     else:
#         prev_token=None
        
#     ### TEST OUTPUT - NEXT 4 LINES
#     # print(f"TOKENS AFTER: {next_token} and {prev_token}")
#     # print(f"Iteration: {i}")
#     # print(f"response length: {len(results_list)}")
#     # print("----")
# results_list[0]['items']

In [None]:
### EXAMPLE OF RESULTS: 

### First page: results_list[0]['items']
# [{'kind': 'youtube#searchResult',
#   'etag': 'ZV8Ot8a1VIuZnYJCLRbqu8hZvOM',
#   'id': {'kind': 'youtube#video', 'videoId': 'NErXzkS_qBc'},
#   'snippet': {'publishedAt': '2024-01-29T16:00:41Z',
#    'channelId': 'UCLCW9rn6lwZOHDnPnrWGTDA',
#    'title': 'Today I Quit My Job... A Rant On Mental Heath In A Toxic Workplace...',
#    'description': 'Today I quit my job. In this raw and honest video, I share the experience that led to this decision, where my manager dismissed the ...',
#    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/NErXzkS_qBc/default.jpg',
#      'width': 120,
#      'height': 90},
#     'medium': {'url': 'https://i.ytimg.com/vi/NErXzkS_qBc/mqdefault.jpg',
#      'width': 320,
#      'height': 180},
#     'high': {'url': 'https://i.ytimg.com/vi/NErXzkS_qBc/hqdefault.jpg',
#      'width': 480,
#      'height': 360}},
#    'channelTitle': 'Mims Films',
#    'liveBroadcastContent': 'none',
#    'publishTime': '2024-01-29T16:00:41Z'}},
#  {'kind': 'youtube#searchResult',
#   'etag': 'lGeetA0UG8yWph-kCyj3zkxnADY',
#   'id': {'kind': 'youtube#video', 'videoId': 'UxC9AyKQ65w'},
# .............


### Last result of first page: results_list[0]['items'][49]
# {'kind': 'youtube#searchResult',
#  'etag': 'P_4ewemOuR5GKLgPmbwsKH11yQk',
#  'id': {'kind': 'youtube#video', 'videoId': 'N94EpLXFbEM'},
#  'snippet': {'publishedAt': '2024-12-15T16:00:25Z',
#   'channelId': 'UC9lApmbeXQD_Uu_XNN5Rqsw',
#   'title': 'I quit my job at age 55 after learning this...',
#   'description': "At 55, I made the hardest and most freeing decision of my life: I quit my corporate job. In this video, I'll share the eye-opening ...",
#   'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/N94EpLXFbEM/default.jpg',
#     'width': 120,
#     'height': 90},
#    'medium': {'url': 'https://i.ytimg.com/vi/N94EpLXFbEM/mqdefault.jpg',
#     'width': 320,
#     'height': 180},
#    'high': {'url': 'https://i.ytimg.com/vi/N94EpLXFbEM/hqdefault.jpg',
#     'width': 480,
#     'height': 360}},
#   'channelTitle': 'reggi sweat',
#   'liveBroadcastContent': 'none',
#   'publishTime': '2024-12-15T16:00:25Z'}}




In [None]:
### OBSOLETE

#response list is a list of dictionaries - 'items' stores individual video info
#responses_list[0]['items'][0]
# responses_list[0]
# responses_list[0]['items'][0]

In [None]:
### OBSOLETE
# for response in responses_list:
#     print(response.keys())
#     for key in response.keys():

#         print(response['items'])
#         if key =="items":
#             item_list = response['items']
#             for item in item_list:
#                 video_dict = {}
#     #             print(item)
#                 title=item['snippet']['title']

#                 desc = item['snippet']['description']
#                 video_id = item['id']['videoId']
#                 channel_id = items['snippet']['channelId']
#                 print(desc)

#         print("____")

In [None]:
# # IGNORE - ALREADY ADDED TO FUNCTION
# ### VERSION 2 - GET RESULTS INTO DF
# ### Is this necessary? just need the video IDs to create the next df
# ### Or combine with youtube stats df
# ### Create list of video information: video title, video id, channel ID, published time
# ### Output: list of dictionaries containing video title, id, channel id, publish date of each 
# video_list = []

# for result in results_list:
#     curr_page = result['items']
#     for curr_vid in curr_page:
#         video_title=curr_vid['snippet']['title']
#         video_id = curr_vid['id']['videoId']
#         channel_id = curr_vid['snippet']['channelId']
#         published_at = curr_vid['snippet']['publishedAt']
#         video_dict = {
#             "video_id": video_id,
#             "video_title": video_title,
#             "channel_id": channel_id,
#             'published_at': published_at
#         }
        
#         video_list.append(video_dict)


# results_df = pd.DataFrame.from_dict(video_list)
# print(f"LENGTH OF VIDEO LIST: {len(video_list)}")
# print(f"LENGTH OF VIDEO df: {results_df.shape}")
# results_df.head()

In [None]:
# #IGNORE - ADDED TO FUNCTIONS
# ### VERSION 2: WORKING
# ### Creates the stats_df: replaces the video_df by including view count info, tags, etc

# i = 0
# stats_list = []
# for curr_id in results_df['video_id']:
#     # print(curr_id)
    
#     stats = youtube.videos().list(
#         part='statistics, snippet',
#         id=curr_id).execute()
#     # print(stats['items'][0]['snippet'].keys())
#     # print(stats['items'][0]['statistics'].keys())
#     curr_channel_id = stats['items'][0]['snippet']['channelId']
#     curr_channel_name = stats['items'][0]['snippet']['channelTitle']
#     curr_cat = int(stats['items'][0]['snippet']['categoryId'])
#     curr_favs= stats['items'][0]['statistics']['favoriteCount']
#     #Check for view count, dislike count, like count, comment count integers 
#     try:
#         curr_views = stats['items'][0]['statistics']['viewCount']
#     except:
#         curr_views = "Not available"
#     try:
#         curr_likes = stats['items'][0]['statistics']['likeCount']
#     except:
#         curr_likes = "Not available"
#     try:
#         curr_dislikes = stats['items'][0]['statistics']['dislikeCount']    
#     except:
#         curr_dislikes = "Not available"
        
#     if 'commentCount' in stats['items'][0]['statistics'].keys():
#         curr_comments = stats['items'][0]['statistics']['commentCount']
#     else:
#         curr_comments = 0

    
#     #Check for tags, description, comments strings - If statements since they won't show up in snippet keys if not available
#     if 'tags' in stats['items'][0]['snippet'].keys():
#         curr_tags = stats['items'][0]['snippet']['tags']
#     else:
#         curr_tags = 'No Tags'
#     if 'description' in stats['items'][0]['snippet'].keys():
#         curr_description = stats['items'][0]['snippet']['description']
#     else:
#         curr_description = "No Description"

#     #Write final dictionary
#     stats_dict = {
#         "video_id": curr_id,
#         "published_at": stats['items'][0]['snippet']['publishedAt'],
        
#         "channel_id": curr_channel_id,
#         "channel_name": curr_channel_name,
#         "Category": cat_dict[curr_cat], #cat is returned as an ind, put into the cat_dict
#         "favourites": curr_favs,
#         "title": stats['items'][0]['snippet']['title'],
#         "views": curr_views,
#         "likes": curr_likes,
#         'dislikes': curr_dislikes,
#         'comment_count': curr_comments,
#         "description": curr_description,
#         "tags": curr_tags

#         # "tags": stats['items'][0]['snippet']['tags'],
#         # "category_id": stats['items'[0]['snippet']['categoryId'],
#         # "view_count": stats['items'][0]['statistics']['viewCount']
#     }
#     stats_list.append(stats_dict)
#     #### CHECK PRINTS
#     # print(stats_dict.keys())
#     # print("---")
#     # i += 1
#     # if i == 5:
#     #     break
# stats_list
# stats_df = pd.DataFrame.from_dict(stats_list)
# stats_df.tail()


In [None]:
# stats_df.shape

In [None]:
#### OBSOLETE

# #Create individual lists and store items from video_list dictionaries

# title = []
# channelId = []
# channelTitle = []
# categoryId = []
# publishedAt=[]
# videoId = []
# viewCount = []
# likeCount = []
# dislikeCount = []
# commentCount = []
# favoriteCount = []
# category = []
# tags = []
# videos = []
# descriptions=[]
# i = 0
# for video in video_list:
#     if i % 100 == 0:
#         print("VIDEOS PROCESSED: ", i)
#     videoId.append(video['videoId'])
#     title.append(video['videoTitle'])
#     publishedAt.append(video['publishedAt'])
#     stats = youtube.videos().list(
#         part='statistics, snippet',
#         id=video['videoId']).execute()
#     channelId.append(stats['items'][0]['snippet']['channelId']) 
#     channelTitle.append(stats['items'][0]['snippet']['channelTitle']) 
#     categoryId.append(stats['items'][0]['snippet']['categoryId']) 
#     favoriteCount.append(stats['items'][0]['statistics']['favoriteCount'])
#     try:
#         viewCount.append(stats['items'][0]['statistics']['viewCount']) 
#     except:
#         viewCount.append("Not available") 
#     #Not every video has likes/dislikes enabled so they won't appear in JSON response
#     try:
#         likeCount.append(stats['items'][0]['statistics']['likeCount'])
#     except:
#    #Good to be aware of Channels that turn off their Likes
# #         print("Video titled {0}, on Channel {1} Likes Count is not available".format(stats['items'][0]['snippet']['title'],
# #                                                                                      stats['items'][0]['snippet']['channelTitle']))
# #         print(stats['items'][0]['statistics'].keys())
#     #Appends "Not Available" to keep dictionary values aligned
#         likeCount.append("Not available")

#     try:
#         dislikeCount.append(stats['items'][0]['statistics']['dislikeCount'])     
#     except:
#         #Good to be aware of Channels that turn off their Likes
# #         print("Video titled {0}, on Channel {1} Dislikes Count is not available".format(stats['items'][0]['snippet']['title'],
# #                                                                                         stats['items'][0]['snippet']['channelTitle']))
# #         print(stats['items'][0]['statistics'].keys())
#         dislikeCount.append("Not available")

#     if 'commentCount' in stats['items'][0]['statistics'].keys():
#         commentCount.append(stats['items'][0]['statistics']['commentCount'])
#     else:
#         commentCount.append(0)

#     if 'tags' in stats['items'][0]['snippet'].keys():
#         tags.append(stats['items'][0]['snippet']['tags'])
#     else:
#         tags.append("No Tags")
        
#     if 'description' in stats['items'][0]['snippet'].keys():
#         descriptions.append(stats['items'][0]['snippet']['description'])
#     else:
#         #I'm not a fan of empty fields
#         tags.append("No Description")
    
#     #a given video is equivalent to stats['items'][0]
#     i += 1

In [None]:
### OBSOLETE - 

# #create list of ids to iterate through - remove the layer of lists given for individual queries of youtube.search()

# video_list = []
# for response in responses_list:
# #     print(response.keys())
#     videos_list = response['items']
#     for item in videos_list:
        
#         video_title=item['snippet']['title']
# #         desc = item['snippet']['description'] #only gives snippet of desc - add when getting transcripts
#         video_id = item['id']['videoId']
#         channel_id = item['snippet']['channelId']
#         published_at = item['snippet']['publishedAt']
#         video_dict = {
#             "videoId": video_id,
#             "videoTitle": video_title,
#             "channelId": channel_id,
#             'publishedAt': published_at
#         }
        
#         video_list.append(video_dict)
# len(video_list)

        

In [None]:
### OBSOLETE

# #write JSON to store video information for all
# youtube_dict = {'tags':tags,'channelId': channelId,'channelTitle': channelTitle,
#                 'categoryId':categoryId,'publishedAt': publishedAt,'title':title,'videoId':videoId,
#                 'viewCount':viewCount,'likeCount':likeCount,'dislikeCount':dislikeCount,
#                 'commentCount':commentCount,'favoriteCount':favoriteCount,
#                "description": descriptions}

# # for key in youtube_dict.keys():
# #     print(key)
# #     print("LENGTH: ", len(youtube_dict[key]))
# #     print(youtube_dict[key][:5])
    
# #     print("---")
# youtube_dict.items()

In [None]:
# stats_df.head()

In [None]:
# out_filepath = f"{data_folder}/01_youtube_stats_df.csv"
# stats_df.to_csv(out_filepath)

In [None]:
# test_id = "PgfzN1xfgLc"
# YouTubeTranscriptApi.get_transcript(test_id,languages=['en'])

# for i, video_id in enumerate(youtube_dict['videoId']):
#      print(video_id)

In [None]:
# pip freeze | grep youtube_transcript_api

In [None]:
### MOVE TRANSCRIPT SCRAPE TO NEXT ONE?

In [None]:
### TESTING - IGNORE
# test_id = "sNVcTgLAX4g"
# from youtube_transcript_api import YouTubeTranscriptApi

# ytt_api = YouTubeTranscriptApi()
# eng_transcript = ytt_api.get_transcript(test_id, languages = ['en'])
# transcript_sentlist = [str(x['text']).replace("\xa0", "") for x in eng_transcript]
# " ".join(transcript_sentlist)

In [None]:
#### WORKING - MOVED TO 02 ALREADY
##Get transcripts using get_transcript() of YoutubeTranscriptApi for each transcript
### Get as dictionary of dictionary: youtube_id : {}
# transcript_dict = {}
# yt_api = YouTubeTranscriptApi()

# for i, video_id in enumerate(stats_df['video_id']):
#     transcript_sents = None #create new list for individual sentences
#     try:
#         curr_result = yt_api.get_transcript(video_id,languages=['en'])
#         transcript_sents = [str(x['text']).replace("\xa0", "") for x in curr_result]
#         transcript_joined = " ".join(transcript_sentlist)
#         # print(f"FIRST SENTS FOR {video_id} TRANSCRIPT: {transcript_sents[:5]}")
        
        
#     except:
#         print(f"NO TRANSCRIPT FOR {video_id} - SKIPPED")
#     transcript_dict[video_id] = {
#                                     "joined": transcript_joined,
#                                     "sents": transcript_sents
        
#                                 }
    
#     if i % 50 == 0:
#         print(f"PROCESSED {i} TRANSCRIPTS")
#         print(f"LENGTH OF DICTS")

# print(len(transcript_dict))
    
# for curr_key in transcript_dict.keys():
#     try:
#         print(f"youtube_id: {curr_key}, number of sents: {len(transcript_dict[curr_key]['sents'])}")
#     except:
#         print(f"TRANSCRIPT MISSING FOR {curr_key}")  
    

In [None]:
# ## OBSOLETE 
# transcripts_sents = []
# transcripts_strings = []
# transcripts_parsed = []
# for i, video_id in enumerate(youtube_dict['videoId']):
# #     print(video_id)
#     transcript_sentlist = []
#     try:
#         eng_transcript = YouTubeTranscriptApi.get_transcript(video_id,languages=['en'])
# #         print(eng_transcript[0:5])
#         transcript_sentlist = [str(x['text']).replace("\xa0", "") for x in eng_transcript]
#         transcript_joined = " ".join(transcript_sentlist)
#         #transcript_sentlist is broken up by timestamp - use spaCy to break up using dependency parse
# #         document = nlp(transcript_joined)
# #         transcript_parsed = [x for x in document.sents]
#     except:
# #         print("NONE")
#         transtript_sentlist = None
#         transcript_joined = None
# #         transcript_parsed = None
# #         pass
#     transcripts_sents.append(transcript_sentlist)
#     transcripts_strings.append(transcript_joined)
# #     transcripts_parsed.append(transcript_parsed)
#     if i % 25 ==0:
#         print(f"PROCESSED: {i}")
# #         break
# print(f"COMPLETED")
# youtube_dict['transcripts_raw'] = transcripts_sents
# youtube_dict['transcript_strings'] = transcripts_strings
# # youtube_dict['transcript_parsed'] = transcripts_parsed

[{'text': "and I've tried asking God I said God", 'start': 0.0, 'duration': 4.74}, {'text': 'what exactly is this next season gonna', 'start': 2.159, 'duration': 5.22}, {'text': 'look like can you give me a Five-Year', 'start': 4.74, 'duration': 5.399}, {'text': "Plan of How It's all gonna play out and", 'start': 7.379, 'duration': 4.501}, {'text': "God said girl you know that's not how it", 'start': 10.139, 'duration': 3.721}]
tq4No1cOwZM
[{'text': 'so in 2020 I quite quit my job and it', 'start': 0.0, 'duration': 5.16}, {'text': 'was without a doubt the best decision of', 'start': 2.58, 'duration': 4.38}, {'text': "my life and since then I've created", 'start': 5.16, 'duration': 3.479}, {'text': 'several online businesses and became', 'start': 6.96, 'duration': 3.36}, {'text': 'financially free but before I tell you', 'start': 8.639, 'duration': 3.781}]
nj2c5mKGTtQ
[{'text': 'Translator: Valérie ESPANET\nReviewer: Zsófia Herczeg', 'start': 0.0, 'duration': 7.0}, {'text': 'Imagine for

In [None]:
### OBSOLETE:
### search Youtube, iterate through pages and get list of responses
###OUT: response_list: list of dictionaries
#use youtube.search().list(part='id, snippet')


# location=None
# location_radius=None

# responses_list =[]

# for i in range(0,max_results,50):
# #     print(i)
    
#     if i % 100 == 0:
#         print("VIDEO STATS PROCESSED: ", i)
#     if i ==0:
#         next_token=None
#         prev_token = None
#     search_response = youtube.search().list(
#         q=query,
#         type="video",
#         pageToken=next_token,
#         order = order,
#         part="id,snippet",
#         maxResults=max_results,
#         location=location,
#         locationRadius=location_radius).execute()
    
#     responses_list.append(search_response)
#     #GET NEXT PAGE OF RESULTS
#     if 'nextPageToken' in search_response.keys():
#         next_token=search_response['nextPageToken']
#     else:
#         next_token=None
        
#     if 'prevPageToken' in search_response.keys():
#         prev_token=search_response['prevPageToken']
#     else:
#         prev_token=None

# #     print("----")


# # print(responses_list[-1])
# print(len(responses_list[0]['items']))

In [None]:
# youtube_dict

In [None]:
# new_descs = [str(x) for x in youtube_dict['description']]
# print(new_descs)
# youtube_dict['description'] = new_descs

# trans_parsed = []
# for transcript_parsed in youtube_dict['transcript_parsed']:
#     print(transcript_parsed)
#     if transcript_parsed:
#         new_parsed = [str(x) for x in transcript_parsed]
#     else:
#         new_parsed = None
#     trans_parsed.append(new_parsed)
# youtube_dict['transcript_parsed'] = trans_parsed
# print(len(trans_parsed))

In [None]:
### CHECK VALUES CELL - COMMENT OUT WHEN RUNNING


#     dict_item = youtube_dict[key][0]
#     print(key)
#     print(dict_item)
#     print(type(dict_item))
#     print("----")

In [None]:
# ###ADD IN 02_PROCESS TRANSCRIPTS CODE
# all_transcripts = []

# id_list = youtube_dict['videoId']

# for i, video_id in enumerate(youtube_dict['videoId']):
    
#     try:
#         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
#     except:
#         transtript_list = None
#         pass
    
#     all_transcripts.append(transcript_list)
#     if not transcript_list:
#         print("MISSING TRANSCRIPT:", youtube_dict["title"][i])
    
#     if i % 100 == 0:
#         print("PROCESSED: ",i)
        
# youtube_dict['transcript_api'] = all_transcripts

In [None]:

# with open(out_filepath, 'w', encoding='utf-8') as f:
#     json.dump(youtube_dict, f, ensure_ascii=False, indent=4)


In [None]:
# type(youtube_dict)

In [None]:
# #REDO WITH ID AND TRANSCRIPT API IN ONE PLACE

# transcript_dict_all = {}
# transcript_dict_man = {}

# for i, video_id in enumerate(youtube_dict['videoId']):
    
#     transcript_api = youtube_dict['transcript_api'][i]
#     try:
#         en_transcript = transcript_api.find_transcript(['en']).fetch()
#         transcript_dict_all[video_id] = en_transcript
#     except:
#         pass
    
# #     #try to find a manual transcript if it exists in transcript_list - not often
# #     try:
# #         en_transcript_manual = transcript_api.find_manually_created_transcript(['en']).fetch()
# #         transcript_dict_man[video_id] = en_transcript_manual
# #     except:
# #         pass
    
#     if i % 200 == 0:
#         print("PROCESSED: ", i)
#         print("---")

In [None]:
# Disabled: transcript_dict_all is only assigned inside commented-out code in
# this notebook, so evaluating this on a fresh kernel raises NameError.
# len(transcript_dict_all.keys())

In [None]:
# def process_transcripts(raw_dict_in, outfile='processed_transcripts.json'):
    
#     """Writes a json file from the raw transcript dict returned from scrape_transcripts()
    
#     Returns processed_transcripts with video_ids as keys and the following items:
#         list: list of sentence strings from 'text' field of raw_dict
#         str: combined string from 'text' field of raw_dict
    
#     """

#     processed_transcripts = {}
#     for video_id, transcript in raw_dict_in.items():
#         processed_transcripts[video_id] = {
#             "list": [],
#             "str": ""
#         }
#         transcript_list = []
#         transcript_str = ""
#         for item in transcript:
#             text_line = item['text']
#             transcript_list.append(text_line)
#             transcript_str += text_line 
#             transcript_str += " "
#         processed_transcripts[video_id]['list'] = transcript_list
#         processed_transcripts[video_id]['str'] = transcript_str
        
#         return processed_transcripts


# #     with open(outfile, 'w', encoding='utf-8') as f:
# #         json.dump(processed_transcripts, f, ensure_ascii=False, indent=4)

In [None]:
# # process_transcripts(transcript_dict_man, "processed_MAN.json")
# process_transcripts(transcript_dict_all, "processed_ALL.json")

In [None]:
#use youtube.videos().list(part='statistics, snippet')