In [1]:
# Install/upgrade the Google API client and the Google Cloud client libraries.
# NOTE(review): google.cloud.storage is imported by a later cell but is not
# installed explicitly here — presumably it is preinstalled; confirm.
!pip install --upgrade google-api-python-client
!pip install --upgrade google-cloud-speech google-cloud-bigtable google-cloud-core google-api-core
# NOTE(review): moviepy is not imported in the cells visible here — confirm it is still needed.
!pip install moviepy
# pytube provides the YouTube class used by download_audio.
!pip install pytube
# ffmpeg is the command-line tool that adjust_audio shells out to.
# NOTE(review): the recorded output of this cell ends with "Some packages could
# not be installed" — verify that ffmpeg is actually available before running.
!sudo apt-get install ffmpeg -y

Collecting google-api-python-client
  Downloading google_api_python_client-2.127.0-py2.py3-none-any.whl.metadata (6.7 kB)
Downloading google_api_python_client-2.127.0-py2.py3-none-any.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: google-api-python-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.126.0
    Uninstalling google-api-python-client-2.126.0:
      Successfully uninstalled google-api-python-client-2.126.0
Successfully installed google-api-python-client-2.127.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of

In [2]:
import os
import subprocess
from pytube import YouTube
from google.cloud import speech_v1 as speech
from google.cloud import storage
import pandas as pd

#define all helper functions, written so that when a problem occurs we can tell exactly which step failed
#download the audio from a given YouTube URL using pytube
def download_audio(url):
    """Download the audio track of a YouTube video with pytube.

    The video's audio-only streams are ordered by ``abr`` in descending order
    and the first one is downloaded using pytube's default output location.

    Args:
        url: URL of the YouTube video.

    Returns:
        Whatever the stream's ``download()`` call returns (the downloaded
        file's path), or ``None`` when the download could not be performed.
        Failures are printed instead of raised so callers can carry on with
        the next URL.
    """
    try:
        audio_streams = YouTube(url).streams.filter(only_audio=True)
        best_stream = audio_streams.order_by('abr').desc().first()
        if best_stream is None:
            # first() yields None when the query is empty; report that
            # explicitly instead of failing later with an opaque
            # "'NoneType' object has no attribute 'download'".
            print(f"Failed to download audio from {url}. Reason: no audio-only stream available")
            return None
        return best_stream.download()
    except Exception as e:
        print(f"Failed to download audio from {url}. Reason: {e}")
        return None

#adjust audio to same format which can be used for speech-to-text api (adjust sample rate, channels and output path)
def adjust_audio(input_audio_path, output_audio_path, target_sample_rate=16000, channels=1):
    """Re-encode an audio file with ffmpeg to a given sample rate and channel count.

    Args:
        input_audio_path: Path of the audio file to convert.
        output_audio_path: Desired output path. If it does not end in
            ``.wav`` its extension is replaced with ``.wav``.
        target_sample_rate: Output sample rate passed to ffmpeg's ``-ar``.
        channels: Number of output channels passed to ffmpeg's ``-ac``.

    Returns:
        The path of the written ``.wav`` file, or ``None`` if ffmpeg exited
        with a non-zero status (its stderr is printed in that case).
    """
    if not output_audio_path.endswith('.wav'):
        output_audio_path = os.path.splitext(output_audio_path)[0] + '.wav'

    command = [
        "ffmpeg",
        "-i", input_audio_path,
        "-ar", str(target_sample_rate),
        "-ac", str(channels),
        "-y",  # overwrite an existing output file without prompting
        output_audio_path,
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # ffmpeg's stderr may contain bytes that are not valid UTF-8; a strict
        # decode would raise UnicodeDecodeError here and hide the real error.
        details = (e.stderr or b"").decode(errors="replace")
        print(f"Failed to adjust audio properties: {details}")
        return None
    return output_audio_path


# upload file to google cloud storage
def upload_to_gcs(local_file_path, bucket_name, gcs_file_path):
    """Copy a local file to Google Cloud Storage by running ``gsutil cp``.

    Args:
        local_file_path: Path of the file to upload.
        bucket_name: Name of the destination bucket (without ``gs://``).
        gcs_file_path: Object path inside the bucket.

    Returns:
        None. If gsutil exits with a non-zero status the failure is printed
        (including gsutil's stderr) rather than raised.
    """
    destination = f"gs://{bucket_name}/{gcs_file_path}"
    try:
        subprocess.run(
            ["gsutil", "cp", local_file_path, destination],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        # Decode leniently: a strict decode of unexpected bytes would raise
        # UnicodeDecodeError inside this handler and mask the upload failure.
        details = (e.stderr or b"").decode(errors="replace")
        print(f"Failed to upload {local_file_path} to GCS: {details}")

#using speech-to-text api to get the transcript results
def transcribe_audio_gcs(gcs_uri, sample_rate_hertz=16000, language_code="en-US", timeout=1000):
    """Transcribe an audio file stored in GCS with the Speech-to-Text API.

    A long-running recognition request is submitted for LINEAR16 audio and
    the call waits for the operation's result.

    Args:
        gcs_uri: ``gs://`` URI of the audio file to transcribe.
        sample_rate_hertz: Sample rate declared in the recognition config.
            Defaults to 16000, matching adjust_audio's default output rate.
        language_code: Language of the audio. Defaults to ``"en-US"``.
        timeout: Value passed to ``operation.result(timeout=...)`` while
            waiting for the transcription to finish. Defaults to 1000.

    Returns:
        The first alternative's transcript of every result, joined with
        single spaces. Results without any alternative are skipped.
    """
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)
    # Guard against results with an empty alternatives list, which would
    # otherwise raise IndexError and discard the whole transcript.
    transcripts = ' '.join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )
    return transcripts

# upload text content to a google cloud storage
def upload_text_to_gcs(bucket_name, blob_name, content):
    """Write a string to a Google Cloud Storage object.

    Args:
        bucket_name: Name of the destination bucket.
        blob_name: Name of the object to create or overwrite in the bucket.
        content: Text to store in the object.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(blob_name)
    target_blob.upload_from_string(content)

In [5]:
#usage for multiple videos

#to process audio using defined functions
def process_audio(url, bucket_name):
    """Run the whole pipeline for one video and return its transcript.

    Steps: download the audio, convert it to a 16 kHz mono WAV, upload the
    WAV to the bucket, transcribe it from GCS, and upload the transcript as a
    ``.txt`` object next to the audio.

    Args:
        url: URL of the YouTube video.
        bucket_name: GCS bucket that receives the audio and the transcript.

    Returns:
        The transcript text, or ``''`` if the download or the audio
        conversion failed (those helpers print the reason themselves).
    """
    text = ''  # returned as-is when a step fails before transcription

    audio_path = download_audio(url)
    if not audio_path:
        return text

    # Object names are derived from the downloaded file's name (sans extension).
    base_name = os.path.basename(audio_path)
    video_id = os.path.splitext(base_name)[0]

    # adjust_audio returns None on failure; continuing would upload a missing
    # file and could transcribe a stale object left in the bucket by an
    # earlier run, so stop here instead.
    adjusted_audio_path = adjust_audio(audio_path, f'{video_id}_adjusted.wav', 16000, 1)
    if not adjusted_audio_path:
        print(f"Skipping {url}: audio conversion failed")
        return text

    gcs_audio_filename = f'{video_id}.wav'
    upload_to_gcs(adjusted_audio_path, bucket_name, gcs_audio_filename)
    gcs_uri = f'gs://{bucket_name}/{gcs_audio_filename}'

    text = transcribe_audio_gcs(gcs_uri)
    # Build the transcript name from the stem rather than str.replace, which
    # would also rewrite any ".wav" occurring inside the name itself.
    text_blob_name = f'{video_id}.txt'
    upload_text_to_gcs(bucket_name, text_blob_name, text)
    return text

#to handle each url in the dataframe
def process_audio_for_dataframe(url, bucket_name):
    """Transcribe one URL without letting a failure abort the whole batch.

    Delegates to process_audio; any exception it raises is printed together
    with the offending URL and an empty string is returned in its place.
    """
    transcript = ""
    try:
        transcript = process_audio(url, bucket_name)
    except Exception as err:
        print(f"Error processing {url}: {err}")
    return transcript
    
if __name__ == '__main__':
    # Project id and bucket name. PROJECT_ID is not referenced again in this
    # cell; it is kept in case other cells rely on it.
    PROJECT_ID = 'st446-project'
    BUCKET_NAME = 'st446-project'

    # Read the CSV of URLs produced by the previous part (URLs Extracting.ipynb).
    # The path is built from BUCKET_NAME so that input and output always refer
    # to the same bucket when the constant is changed.
    df = pd.read_csv(f'gs://{BUCKET_NAME}/urls_combined.csv')

    # Transcribe every URL; a failed URL yields "" instead of aborting the run.
    # The transcripts are stored in a new 'text' column.
    df['text'] = df['URL'].apply(lambda url: process_audio_for_dataframe(url, BUCKET_NAME))

    # Save the final dataframe containing the transcripts back to the bucket.
    df.to_csv(f'gs://{BUCKET_NAME}/urls_with_transcripts.csv', index=False)