In [23]:
from __future__ import print_function
import time
import boto3
import json
from sagemaker import get_execution_role

# AWS region handed to the module-level boto3 clients created further down.
REGION = 'us-west-2'

# Execution role returned by the SageMaker SDK; displayed so the ARN is
# visible when attaching policies.
ROLE = get_execution_role()
display(ROLE)
# NOTE: IAM policies need to be attached to this role (the next cell lists them).

'arn:aws:iam::688520471316:role/service-role/AmazonSageMaker-ExecutionRole-20200522T134110'

In [20]:
## IAM managed policies to attach to the execution role (ROLE):
##   ComprehendFullAccess
##   AmazonSageMakerFullAccess
##   AmazonS3FullAccess
##   AmazonAugmentedAIFullAccess

In [16]:
# Module-level AWS service clients, both pinned to REGION.
# NOTE: the name `transcribe` is rebound by the `def transcribe(...)` cell
# further down; after that cell runs, this client is no longer reachable
# under this name.
transcribe = boto3.client("transcribe", region_name=REGION)
s3 = boto3.client("s3", region_name=REGION)

# Inputs for the first transcription job: job name, media file URI, and the
# bucket that receives the output JSON.
job_name_1 = "AWS-sage-1"
job_uri_1 = "https://jashuang-sagemaker-5-22.s3-us-west-2.amazonaws.com/transcribe-bucket/Fully-Managed+Notebook+Instances+with+Amazon+SageMaker+-+a+Deep+Dive.mp4"
out_bucket = "jashuang-sagemaker-5-22"

In [None]:
# NOTE(review): `job_name` and `job_uri` are not defined in any visible cell
# (only `job_name_1` / `job_uri_1` are), and this cell has no execution count.
# Confirm whether it is a leftover that should be deleted; as written it would
# raise NameError on a fresh kernel.
# The call itself submits an en-US mp4 transcription job without specifying an
# output bucket, and only works while `transcribe` is still the boto3 client
# (i.e. before the `def transcribe(...)` cell has run).
transcribe.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': job_uri},
    MediaFormat='mp4',
    LanguageCode='en-US'
)

In [18]:
def transcribe(job_name, job_uri, out_bucket, format="mp4", region=None, poll_interval=5):
    """Start an Amazon Transcribe job and block until it finishes.

    The job is submitted with language code ``en-US`` and then polled until
    its status is ``COMPLETED`` or ``FAILED``. Progress messages and the final
    ``get_transcription_job`` response are printed; nothing is returned.

    Args:
        job_name (str): the name of the job that you specify;
                        the output json will be job_name.json
        job_uri (str): input path (in s3) to the file being transcribed
        out_bucket (str): s3 bucket name that you want the output json
                          to be placed in
        format (str): input media format, one of mp3, mp4, wav or flac;
                      defaults to mp4
        region (str | None): AWS region for the Transcribe client; when None
                             the boto3 default region resolution is used
        poll_interval (float): seconds to sleep between status checks;
                               defaults to 5

    Returns:
        None. An unsupported ``format`` prints "Invalid format" and returns
        without contacting AWS. Any exception raised while starting or polling
        the job is caught and printed rather than propagated.
    """

    if format not in ['mp3', 'mp4', 'wav', 'flac']:
        print("Invalid format")
        return

    try:
        # Use a distinct local name: the original rebound `transcribe` inside
        # its own body, shadowing the function with the client.
        if region is None:
            client = boto3.client("transcribe")
        else:
            client = boto3.client("transcribe", region_name=region)

        print("------" + format)
        client.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={"MediaFileUri": job_uri},
            MediaFormat=format,
            LanguageCode="en-US",
            OutputBucketName=out_bucket,
        )

        # Poll until the job reaches a terminal state.
        while True:
            status = client.get_transcription_job(TranscriptionJobName=job_name)
            if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
                break
            print("Not ready yet...")
            time.sleep(poll_interval)
        print(status)

    except Exception as e:
        # Deliberate best-effort: report the failure in the notebook output
        # instead of raising.
        print(e)

In [9]:
def _group_items_into_sentences(items):
    """Group Transcribe result items into sentences with their start times.

    Args:
        items (list[dict]): entries of ``results.items`` from a Transcribe
            output document; each has a ``type`` ("pronunciation" or
            "punctuation") and ``alternatives[0]["content"]``. Pronunciation
            items also carry ``start_time``.
    Returns:
        list[dict]: one ``{"time": float, "sentence": str}`` per sentence.
    """
    sentences = []
    current = ""
    start_time = 0
    is_first_item = True       # a sentence is never closed by its first item
    needs_start_time = True    # no word seen yet in the current sentence

    for item in items:
        content = item["alternatives"][0]["content"]
        is_punctuation = item["type"] == "punctuation"

        # Punctuation attaches directly to the preceding word; words are
        # separated by a single space.
        if is_punctuation:
            current = current.strip() + content + " "
        else:
            current = current + content + " "

        # The sentence's timestamp is the start of its first spoken word,
        # even when the sentence happens to open with a punctuation item.
        if needs_start_time and item["type"] == "pronunciation":
            start_time = float(item["start_time"])
            needs_start_time = False

        if is_first_item:
            is_first_item = False
            continue

        # Any punctuation other than a comma ends the sentence.
        if is_punctuation and content != ",":
            sentences.append({"time": start_time, "sentence": current.strip()})
            current = ""
            is_first_item = True
            needs_start_time = True

    # Keep a trailing sentence that has words but no terminal punctuation
    # instead of silently dropping it.
    if current.strip() and not needs_start_time:
        sentences.append({"time": start_time, "sentence": current.strip()})

    return sentences


def get_transcript_text_and_timestamps(bucket_name, file_name):
    """Load a Transcribe output JSON from S3 and split it into sentences.

    Reads the object through the module-level ``s3`` client.

    Args:
        bucket_name (str): name of s3 bucket
        file_name (str): key of the Transcribe output json in that bucket
    Returns:
        tuple: ``(transcripts, sentences)`` where ``transcripts`` is the
        document's ``results.transcripts`` value unchanged (a list of
        ``{"transcript": str}`` dicts) and ``sentences`` is a list of
        ``{"time": float, "sentence": str}`` dicts, ``time`` being the start
        time in seconds of the sentence's first word.
    """
    s3_clientobj = s3.get_object(Bucket=bucket_name, Key=file_name)
    s3_clientdata = s3_clientobj["Body"].read().decode("utf-8")

    results = json.loads(s3_clientdata)["results"]
    sentences_and_times = _group_items_into_sentences(results["items"])
    entire_transcript = results["transcripts"]

    return entire_transcript, sentences_and_times

In [19]:
# Submit the first job and wait for it to finish (progress is printed below).
transcribe(job_name=job_name_1, job_uri=job_uri_1, out_bucket=out_bucket)

------mp4
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'AWS-sage-1', 'TranscriptionJobStatus': 'COMPLETED', 'LanguageCode': 'en-US', 'MediaSampleRateHertz': 44100, 'MediaFormat': 'mp4', 'Media': {'MediaFileUri': 'https://jashuang-sagemaker-5-22.s3-us-west-2.amazonaws.com/transcribe-bucket/Fully-Managed+Notebook+Instances+with+Amazon+SageMaker+-+a+Deep+Dive.mp4'}, 'Transcript': {'TranscriptFileUri': 'https://s3.us-west-2.amazonaws.com/jashuang-sagemaker-5-22/AWS-sage-1.json'}, 'StartTime': datetime.dat

In [24]:
# Fetch the finished job's output JSON from S3 and split it into sentences.
entire_transcript_1, sentences_and_times_1 = get_transcript_text_and_timestamps(
    bucket_name="jashuang-sagemaker-5-22",
    file_name="AWS-sage-1.json",
)

In [25]:
# Show the raw `results.transcripts` value returned for job 1
# (a list holding a dict with the full transcript text).
entire_transcript_1

[{'transcript': "Hi. My name is Emily Weber. I'm a machine learning specialist at Amazon Web services on today. We're gonna talk about insolence. Age maker Comes on Stage Maker is a fully managed machine learning service that developers and data scientists can use to build, train and deploy machine learning models. Today, we're gonna talk about notebook instances and this is ready. Dive. So with the notebook instances on stage maker, it all starts with a notebook, right? And within the notebook, it starts with your easy to instance. You're easy to instance that's your elastic compute cloud. That's your virtual machine that's going to spin up and let us do all of our processing. This is a managed, easy to instance. That means that even though we're turning it on and off, it's not gonna show up. Under are easy to console, and we're not gonna have ssh access to this machine. It's gonna be fully managed by Amazon. We want to pick the right family. You're easy to instances gonna come in man

In [26]:
# Sanity check: the per-sentence result is a list (of dicts, see the loop below).
type(sentences_and_times_1)

list

In [32]:
# Print every sentence prefixed by its start time in seconds.
for entry in sentences_and_times_1:
    print("{} -- {}".format(entry['time'], entry['sentence']))

0.54 -- Hi.
1.36 -- My name is Emily Weber.
2.74 -- I'm a machine learning specialist at Amazon Web services on today.
5.99 -- We're gonna talk about insolence.
7.03 -- Age maker Comes on Stage Maker is a fully managed machine learning service that developers and data scientists can use to build, train and deploy machine learning models.
17.89 -- Today, we're gonna talk about notebook instances and this is ready.
21.2 -- Dive.
23.94 -- So with the notebook instances on stage maker, it all starts with a notebook, right?
28.69 -- And within the notebook, it starts with your easy to instance.
31.82 -- You're easy to instance that's your elastic compute cloud.
34.36 -- That's your virtual machine that's going to spin up and let us do all of our processing.
38.48 -- This is a managed, easy to instance.
41.09 -- That means that even though we're turning it on and off, it's not gonna show up.
44.84 -- Under are easy to console, and we're not gonna have ssh access to this machine.
49.76 -- It'