# In [7]:
######################################################################################
#           PYTHON CODE IN THIS FILE READS YOUTUBE LINKS FROM THE 'youtube_links.csv' FILE,
#           EXTRACTS VIDEO METADATA (INCLUDING THE TRANSCRIPT TEXT, WITH PUNCTUATION ADDED), AND
#           SAVES IT TO 'youtube_links_with_metadata.csv'
######################################################################################

from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import nltk
import requests

#####################################################################################
#           FUNCTION TO GET PUNCTUATED TEXT
# Stanford-openie can generate RDF triples better if punctuated text is used.
# POST to http://bark.phon.ioc.ee/punctuator to get punctuated text.  
# The above API is based on this paper https://www.researchgate.net/publication/307889284_Bidirectional_Recurrent_Neural_Network_with_Attention_Mechanism_for_Punctuation_Restoration
##################################################################################### 
def get_punctuated_text(text, timeout=60):
    """Return *text* with punctuation restored by the bark.phon.ioc.ee service.

    Args:
        text: Unpunctuated text (e.g. a raw subtitle transcript).
        timeout: Seconds to wait for the service before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        The response body of the punctuator service. Empty or
        whitespace-only input is returned unchanged without a network call.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Nothing to punctuate: skip the round trip to the remote service.
    if not text or not text.strip():
        return text

    url = 'http://bark.phon.ioc.ee/punctuator'
    response = requests.post(url, data={'text': text}, timeout=timeout)
    # Fail loudly instead of storing an HTTP error page as the transcript.
    response.raise_for_status()
    return response.text

#####################################################################################
#           FUNCTION TO GET YOUTUBE VIDEO METADATA AND TRANSCRIPT
# Note: the transcript is a list of subtitle phrases; their texts are joined
#       into a single string before punctuation is restored.
#####################################################################################
def get_video_metadata(link):
    """Collect metadata and the punctuated transcript for one YouTube video.

    Args:
        link: URL of the YouTube video.

    Returns:
        A tuple ``(link, title, author, publish_date, views, length,
        rating, punctuated_text)`` where ``publish_date`` is formatted as
        ``YYYY-MM-DD``, or is ``None`` when pytube reports no publish date.
    """
    yt = YouTube(link)
    # Use the id pytube parsed from the URL. Splitting on the last '/' only
    # works for bare 'youtu.be/<id>' links and yields 'watch?v=<id>' (or an
    # id with trailing query parameters) for other URL shapes.
    transcript = YouTubeTranscriptApi.get_transcript(yt.video_id)
    text = ' '.join(segment['text'] for segment in transcript)
    punctuated_text = get_punctuated_text(text)

    # Guard the formatting: publish_date may be None.
    publish_date = yt.publish_date
    if publish_date is not None:
        publish_date = publish_date.strftime("%Y-%m-%d")

    return (
        link,
        yt.title,
        yt.author,
        publish_date,
        yt.views,
        yt.length,
        yt.rating,
        punctuated_text,
    )



# Load the YouTube links to process from the input CSV (expects a 'link' column).
v_df = pd.read_csv(r'youtube_links.csv')

# Build one metadata row (including the punctuated transcript) per link.
result = list(map(get_video_metadata, v_df['link']))
v_df = pd.DataFrame(
    result,
    columns=[
        'link',
        'title',
        'author',
        'publish_date',
        'views',
        'length_seconds',
        'rating',
        'transcript',
    ],
)

# Write the collected metadata and transcripts to the output CSV.
v_df.to_csv(r"youtube_links_with_metadata.csv")


#References:  
# https://pypi.org/project/stanford-openie/
# Tip - runs only with protobuf-3.20.1 
# pip install protobuf==3.20.1
#    Uninstalling protobuf-4.21.7:
#    Successfully uninstalled protobuf-4.21.7
#    Successfully installed protobuf-3.20.1
# Install JDK1.8 from https://docs.aws.amazon.com/corretto/latest/corretto-8-ug/downloads-list.html for stanfordnlp to work.