# Generating Subtitles 

The code in this project shows the steps to generate subtitles for your video file.

Note:
    1. Python audio libraries mostly work with the WAV format, so the code converts the video to audio (.wav extension).
    2. Google Cloud APIs work properly on locally stored files only if the audio is shorter than 1 minute.
    (Otherwise you need to store the file in a Cloud Storage bucket.)
    So, while converting the audio here, the code also trims it to a 50-second duration.

In [1]:
import os
import json
import io
from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
from google.cloud.speech_v1 import types
from pydub.utils import mediainfo
import datetime
import subprocess
import srt

In [21]:
#Path to the video you want subtitles for.

path = "C:/DummyPath/"
video =  path + "C4W1L01 Computer Vision.mp4" 
#This should be set to the path of your service account credentials JSON file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'Cloud_AIML.json'

In [3]:
# Show which credentials file the environment variable currently points to.
print(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])

Cloud_AIML.json


### Get the sample rate, bit rate and no. of channels from the video

In [4]:
def video_info(video_filepath):
    """Return the audio parameters that ``mediainfo`` reports for a media file.

    Args:
        video_filepath: Path of the media file to inspect.

    Returns:
        A ``(channels, bit_rate, sample_rate)`` tuple read from the
        ``"channels"``, ``"bit_rate"`` and ``"sample_rate"`` entries of the
        ``mediainfo`` result. The values are passed through unchanged.

    Raises:
        KeyError: If one of the three entries is missing from the result.
    """
    info = mediainfo(video_filepath)
    return info["channels"], info["bit_rate"], info["sample_rate"]

In [5]:
# Read the audio parameters of the input video; the ffmpeg conversion reuses them.
channels, bit_rate, sample_rate = video_info(video)

In [6]:
# Display the sample rate reported for the video.
sample_rate

'44100'

### Convert video to audio

In [7]:
# Split the video path into everything before the extension and the extension.
[input_name, input_type] = os.path.splitext(video)

# Keep only the file name (no directory) to name the generated audio file.
# os.path.basename honours every separator valid on the current platform
# (including "\\" on Windows), which a manual split on "/" does not.
output_name = os.path.basename(input_name)

In [16]:
# ffmpeg does not create missing output folders, so make sure it exists.
audio_dir = path + "audio/"
os.makedirs(audio_dir, exist_ok=True)

audio_path = audio_dir + output_name + ".wav"
video_path = video

# Pass the arguments as a list (no shell) so file names containing spaces,
# quotes or shell metacharacters reach ffmpeg unchanged.
command = [
    "ffmpeg",
    "-y",                     # overwrite an earlier extraction so the cell can be re-run
    "-i", video_path,
    "-b:a", str(bit_rate),    # audio bit rate reported for the video
    "-ac", str(channels),     # channel count reported for the video
    "-ar", str(sample_rate),  # sample rate reported for the video
    "-t", "00:00:50.0",       # keep only the first 50 seconds
    "-vn",                    # drop the video stream
    audio_path,
]

# The return code is the cell's output.
subprocess.call(command)

# A return code of 0 means success; any non-zero value means the conversion failed.

0

In [22]:
# Location of the WAV file written by the conversion step.
audio = "".join([path, "audio/", output_name, ".wav"])

### Transcribing the audio file 

In [20]:
def long_running_recognize(local_file_path, sample_rate, channels):
    """Transcribe a local audio file with a long-running Speech-to-Text request.

    The whole file is read into memory and sent inline with the request. The
    function then blocks until the operation has finished.

    Args:
        local_file_path: Path of the audio file to transcribe. The request
            declares LINEAR16 encoding for it.
        sample_rate: Sample rate of the audio in hertz; converted with ``int()``.
        channels: Number of audio channels; converted with ``int()``.

    Returns:
        The value returned by ``operation.result()``. Word time offsets are
        requested in the configuration.
    """
    client = speech_v1.SpeechClient()

    config = {
        "language_code": "en-US",
        "sample_rate_hertz": int(sample_rate),
        "encoding": enums.RecognitionConfig.AudioEncoding.LINEAR16,
        "audio_channel_count": int(channels),
        "enable_word_time_offsets": True,  # provides per-word timing information
        "model": "video",
        "enable_automatic_punctuation": True,
    }

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result()
    return response

In [11]:
# Send the extracted audio for transcription and wait for the result.
response = long_running_recognize(
    local_file_path=audio,
    sample_rate=sample_rate,
    channels=channels,
)

Waiting for operation to complete...


### Generating Subtitles

In [17]:
def subtitle_generation(speech_to_text_response, bin_size=4):
    """Group the recognized words into time bins and render them as SRT text.

    Every result of the response is walked word by word. A word is added to
    the current subtitle while its end time (whole seconds) lies before the
    end of the current bin. The first word that does not fit closes the
    current subtitle at the previous word's end time and opens a new bin that
    starts at that word.

    Args:
        speech_to_text_response: Response object whose ``results`` each carry
            ``alternatives[0].words``; every word exposes ``word``,
            ``start_time`` and ``end_time``, and the times expose ``seconds``
            and ``nanos``.
        bin_size: Length of a bin in seconds. With the default of 4, all the
            words spoken within a 4-second interval are grouped together.

    Returns:
        The subtitles as a single string produced by ``srt.compose``.
    """

    def _to_timedelta(timestamp):
        # 0.001 converts nanoseconds to microseconds.
        return datetime.timedelta(0, timestamp.seconds, timestamp.nanos * 0.001)

    transcriptions = []

    # Use the response that was passed in, not a notebook-level global.
    for result in speech_to_text_response.results:
        # Results without alternatives or without words have nothing to show.
        if len(result.alternatives) == 0:
            continue
        words = list(result.alternatives[0].words)
        if not words:
            continue

        first_word = words[0]
        if first_word.start_time.seconds:
            bin_start = _to_timedelta(first_word.start_time)
        else:
            # A result that begins within the first second is shown from 0:00.
            bin_start = datetime.timedelta(0)
        bin_end_sec = first_word.start_time.seconds + bin_size
        bin_words = [first_word.word]

        previous_word = first_word
        for current_word in words[1:]:
            if current_word.end_time.seconds < bin_end_sec:
                bin_words.append(current_word.word)
            else:
                # Close the current bin at the end of the previous word.
                transcriptions.append(
                    srt.Subtitle(
                        len(transcriptions) + 1,
                        bin_start,
                        _to_timedelta(previous_word.end_time),
                        " ".join(bin_words),
                    )
                )
                # Open a new bin that starts with the current word.
                bin_start = _to_timedelta(current_word.start_time)
                bin_end_sec = current_word.start_time.seconds + bin_size
                bin_words = [current_word.word]
            previous_word = current_word

        # Flush the last (possibly only) bin of this result.
        transcriptions.append(
            srt.Subtitle(
                len(transcriptions) + 1,
                bin_start,
                _to_timedelta(words[-1].end_time),
                " ".join(bin_words),
            )
        )

    # Turn the list of subtitles into SRT-formatted text.
    subtitles = srt.compose(transcriptions)
    return subtitles

In [18]:
# Convert the transcription response into SRT-formatted text.
subtitles = subtitle_generation(speech_to_text_response=response)

### Store subtitles in a new .srt file

In [19]:
# Write the subtitles as UTF-8 so non-ASCII characters in the transcript do
# not depend on the platform's default encoding. The file is only written,
# so plain "w" mode is enough.
with open("subtitles.srt", "w", encoding="utf-8") as f:
    f.write(subtitles)