In [None]:
import os  # Access to environment variables
import subprocess  # To make subprocess call using terminal

import ffmpeg  # Library to work with audio and video files, here used to extract audio
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource

In [None]:
# Extracts audio from video
# ffmpeg -i input_filename -ab bitrate -ar frequency -vn output_filename

# TODO: Change input_filename.mp4 to file to be converted
input_filename = 'input_filename.mp4'
# NOTE: the recognition cell further down opens 'output_filename.wav' by name,
# so keep the two in sync if this is changed.
output_filename = 'output_filename.wav'

# The arguments are passed as a list and no shell is involved, so file names
# containing spaces or shell metacharacters reach ffmpeg unchanged.
command = [
    'ffmpeg',
    '-i', input_filename,
    '-ab', '160k',    # audio bitrate
    '-ar', '44100',   # audio sampling frequency
    '-vn',            # drop the video stream
    output_filename,
]

# The cell output is ffmpeg's exit status (0 means success).
# A FileNotFoundError here means the ffmpeg executable could not be found,
# i.e. ffmpeg has likely not been correctly installed or is not on PATH.
subprocess.call(command)

In [None]:
# TODO: Provide the api_key and url listed on your IBM Cloud account (Watson Speech to Text service).
# They are read from the SPEECH_TO_TEXT_APIKEY and SPEECH_TO_TEXT_URL environment
# variables so the secrets do not have to be saved inside the notebook.
# Both fall back to an empty string when the variable is not set.
api_key = os.environ.get('SPEECH_TO_TEXT_APIKEY', '')
url = os.environ.get('SPEECH_TO_TEXT_URL', '')

In [None]:
# Sets up the Speech to Text service client

# Fail early with a clear message when the credentials were left blank,
# instead of surfacing a less obvious error from the SDK or the first request.
if not api_key or not url:
    raise ValueError(
        'api_key and url must both be set to the values shown for your '
        'Watson Speech to Text service on IBM Cloud'
    )

authenticator = IAMAuthenticator(api_key)
stt = SpeechToTextV1(authenticator=authenticator)
stt.set_service_url(url)

In [None]:
# Model list: https://cloud.ibm.com/apidocs/speech-to-text#listmodels
# Japanese models: ja-JP_BroadbandModel, ja-JP_NarrowbandModel
# JP_Broadband seems to capture the speech more accurately

# NOTE: the former `continuous=True` argument is deliberately not passed.
# It is not a documented parameter of SpeechToTextV1.recognize, so it would only
# travel through **kwargs, where it has no recognition effect and may be rejected.

# If 'Error: Payload length', please use video-to-text-longfiles.py
with open('output_filename.wav', 'rb') as audio_file:
    res = stt.recognize(
        audio=audio_file,
        content_type='audio/wav',
        model='ja-JP_BroadbandModel',
    ).get_result()

In [None]:
res  # Displays the raw speech-to-text result obtained from stt.recognize(...).get_result()

In [None]:
# For every recognition result, take its first alternative's transcript,
# trim trailing whitespace and terminate it with '.\n'.
text = []
for result in res['results']:
    first_transcript = result['alternatives'][0]['transcript']
    text.append(first_transcript.rstrip() + '.\n')

In [None]:
# Tokens that are replaced as a whole, mapped to their punctuated form.
_EXACT_REPLACEMENTS = {
    'ですね': 'ですね。',
    'はい': 'はい、',
    'ねえ': 'ね。',
}

# Endings after which a full stop is appended.
_SENTENCE_ENDINGS = ('ます', 'ましょう')


def _punctuate_token(token, previous):
    """Apply the light text-cleaning rules to a single token.

    Args:
        token: The token to clean (trailing '.' already removed).
        previous: The token that precedes it in the transcript, or None
            when ``token`` is the very first one.

    Returns:
        The cleaned token; an empty string for tokens starting with 'D_'.
    """
    # Dropoffs in speech. Often in places like ええと and まあー
    if token.startswith('D_'):
        return ''
    if token in _EXACT_REPLACEMENTS:
        return _EXACT_REPLACEMENTS[token]
    # 'ね' only closes a sentence when it directly follows 'です'.
    if token == 'ね' and previous == 'です':
        return 'ね。'
    # Append the full stop to the ending only; str.replace would also rewrite
    # every earlier occurrence of 'ます' inside the same token.
    if token.endswith(_SENTENCE_ENDINGS):
        return token + '。'
    return token


# Split the transcript into whitespace-separated tokens and drop the '.'
# that terminates each recognition result.
stripped_tokens = [
    token[:-1] if token.endswith('.') else token
    for token in ''.join(text).split()
]

# The first token has no predecessor; passing None avoids index -1 wrapping
# around to the last token of the transcript.
transcript_list = [
    _punctuate_token(token, stripped_tokens[index - 1] if index > 0 else None)
    for index, token in enumerate(stripped_tokens)
]

In [None]:
# Concatenate the cleaned tokens without separators, then turn the full stop
# that ends up inside 'ます。ので' into a comma placed after 'ので'.
joined_tokens = ''.join(transcript_list)
transcript = joined_tokens.replace(u'ます。ので', 'ますので、')
transcript

In [None]:
# Saves transcript variable as 'トランスクリプト.txt'
# The encoding is set explicitly: without it open() uses the platform default,
# which may be unable to encode Japanese text or may produce a non-UTF-8 file.
with open('トランスクリプト.txt', 'w', encoding='utf-8') as out:
    # transcript is a single string, so write() is used rather than writelines()
    out.write(transcript)