### Transcription from URI

In [1]:
# Imports the Google Cloud client library
from google.cloud import speech

In [2]:
# Create the Speech-to-Text client used by the `client.recognize(...)` call in this section.
# NOTE(review): no credentials are passed here, so authentication presumably
# comes from the environment — confirm before running on a new machine.
client = speech.SpeechClient()

In [3]:
# Cloud Storage URI (gs://<bucket>/<object>) of the audio to transcribe.
gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

In [4]:
# Recognition settings for the sample: LINEAR16 encoding, 16 kHz, US English.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)

# Point the request at the audio by URI instead of sending the bytes inline.
audio = speech.RecognitionAudio(uri=gcs_uri)

# Send the recognition request and wait for the response.
response = client.recognize(config=config, audio=audio)

# Print the first alternative of every result in the response.
for result in response.results:
    print(f"Transcript: {result.alternatives[0].transcript}")

Transcript: how old is the Brooklyn Bridge


### Transcribing short audio files

In [39]:
def transcribe_file(speech_file, sample_rate_hertz=44100, language_code="en-US"):
    """Transcribe a local audio file and print one transcript line per result.

    The whole file is read into memory and sent inline in a single
    ``recognize`` request, declared as LINEAR16 audio.

    Args:
        speech_file: Path to the audio file to transcribe.
        sample_rate_hertz: Sample rate declared in the request. Defaults to
            44100, the value that was previously hard-coded.
        language_code: Language of the speech. Defaults to "en-US".

    Returns:
        None. Transcripts are printed, not returned.
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    # The builtin open() is sufficient; io.open is only an alias for it.
    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )

    response = client.recognize(config=config, audio=audio)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print("Transcript: {}".format(result.alternatives[0].transcript))


In [40]:
# Transcribe a local recording; the relative path is resolved against the
# kernel's current working directory.
transcribe_file('../../records/2023-02-14-00-22-45/audio.wav')

Transcript: I'm honored to be with you today for your commencement from one of the finest universities in the world
Transcript:  truth be told
Transcript:  I never graduated from college and this is the closest I've ever gotten to a college graduation
Transcript:  today I want to tell you three stories for my life that's it no big deal


### Transcribing long files

In [46]:
def transcribe_gcs(gcs_uri, timeout=90, sample_rate_hertz=44100, language_code="en-US"):
    """Transcribe audio stored in Cloud Storage via a long-running operation.

    The request is submitted with ``long_running_recognize`` and this function
    then blocks on ``operation.result`` until the transcription is available
    or ``timeout`` elapses. Each result's transcript and confidence are
    printed.

    Args:
        gcs_uri: URI of the audio object, e.g. "gs://bucket/audio.wav".
        timeout: Maximum number of seconds to wait for the operation.
            Defaults to 90, the value that was previously hard-coded; longer
            recordings may need more.
        sample_rate_hertz: Sample rate declared in the request. Defaults to
            44100.
        language_code: Language of the speech. Defaults to "en-US".

    Returns:
        None. Transcripts and confidences are printed, not returned.
    """
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=timeout)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print("Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))


In [47]:
# Transcribe the recording stored as `audio.wav` in the `audio-recorder` bucket.
transcribe_gcs('gs://audio-recorder/audio.wav')

Waiting for operation to complete...
Transcript: this program is brought to you by Stanford University please visit us at stanford.edu
Confidence: 0.9744933843612671
Transcript:  thank you
Confidence: 0.9544791579246521
Transcript:  I'm honored to be with you today for your commencement from one of the finest universities in the world
Confidence: 0.9675921201705933
Transcript:  truth be told
Confidence: 0.9876291155815125
Transcript:  I never graduated from college and this is the closest I've ever gotten to a college graduation
Confidence: 0.9876018762588501
Transcript:  today I want to tell you three stories for my life that's it no big deal just three stories
Confidence: 0.9687941670417786
Transcript:  the first story is about connecting the dots
Confidence: 0.84505295753479
Transcript:  I dropped out of college after the first 6 months but then stayed around as a drop in for another 18 months or so before I really quit
Confidence: 0.9099323153495789
Transcript:  so why did I drop o

In [8]:
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        source_file_name: Path of the file to upload, e.g. "local/path/to/file".
        destination_blob_name: ID of the GCS object to write,
            e.g. "storage-object-name".
    """
    # Resolve client -> bucket -> blob in one chain; only the blob handle is needed.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    confirmation = f"File {source_file_name} uploaded to {destination_blob_name}."
    print(confirmation)


In [54]:
# Upload a local recording to the `audio-recorder` bucket as
# `conversation_sample.wav` so it can be referenced by a gs:// URI.
upload_blob('audio-recorder', '../../records/2023-02-14-01-13-18/audio.wav', 'conversation_sample.wav')

File ../../records/2023-02-14-01-13-18/audio.wav uploaded to conversation_sample.wav.


### Diarization

In [76]:
def diarization_gcs(gcs_uri, min_speaker_count=2, max_speaker_count=2, timeout=900):
    """Transcribe audio in Cloud Storage with speaker diarization and print each word's speaker tag.

    Uses the ``speech_v1p1beta1`` client, submits a long-running recognition
    request with diarization enabled, waits for it to finish, and prints one
    line per word of the last result's first alternative.

    Args:
        gcs_uri: URI of the audio object, e.g. "gs://bucket/audio.wav".
        min_speaker_count: Lower bound on the number of speakers. Defaults to
            2, the value that was previously hard-coded.
        max_speaker_count: Upper bound on the number of speakers. Defaults to
            2, the value that was previously hard-coded.
        timeout: Maximum number of seconds to wait for the operation.
            Defaults to 900.

    Returns:
        None. Output is printed, not returned.
    """
    # Imported locally under the name `speech` so the beta module does not
    # replace a module-level `speech` import.
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=min_speaker_count,
        max_speaker_count=max_speaker_count,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        diarization_config=diarization_config,
    )
    print("Waiting for operation to complete...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)

    # Guard against an empty response: indexing results[-1] (or
    # alternatives[0]) on an empty sequence would raise IndexError.
    if not response.results or not response.results[-1].alternatives:
        print("No speech was recognized.")
        return

    # Only the last result is read for the speaker-tagged words.
    # NOTE(review): this relies on the last result carrying the words for the
    # whole recording when diarization is enabled — confirm against the
    # diarization documentation for the API version in use.
    words_info = response.results[-1].alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(
            "word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
        )

In [73]:
# Diarize `conversation_sample.wav` from the `audio-recorder` bucket.
diarization_gcs('gs://audio-recorder/conversation_sample.wav')

Waiting for operation to complete...
word: 'hello', speaker_tag: 1
word: 'hi', speaker_tag: 1
word: 'what', speaker_tag: 1
word: 'can', speaker_tag: 1
word: 'I', speaker_tag: 1
word: 'get', speaker_tag: 1
word: 'you', speaker_tag: 1
word: 'coffee', speaker_tag: 1
word: 'milk', speaker_tag: 1
word: 'and', speaker_tag: 1
word: 'sugar', speaker_tag: 1
word: 'no', speaker_tag: 1
word: 'milk', speaker_tag: 1
word: 'would', speaker_tag: 2
word: 'you', speaker_tag: 2
word: 'like', speaker_tag: 2
word: 'any', speaker_tag: 2
word: 'a', speaker_tag: 1
word: 'slice', speaker_tag: 1
word: 'of', speaker_tag: 1
word: 'chocolate', speaker_tag: 1
word: 'cake', speaker_tag: 1
word: 'some', speaker_tag: 1
word: 'pastries', speaker_tag: 1
word: 'no', speaker_tag: 1
word: 'thanks', speaker_tag: 1
word: 'Justin', speaker_tag: 1
word: 'amazing', speaker_tag: 1
word: 'I', speaker_tag: 1
word: 'mean', speaker_tag: 1
word: 'what', speaker_tag: 1
word: 'Egypt', speaker_tag: 1
word: 'credible', speaker_tag: 1
wo

In [74]:
# Upload a second local recording to the `audio-recorder` bucket as
# `meeting_log_sample.wav`.
upload_blob('audio-recorder', '../../records/2023-02-13-15-56-37/audio.wav', 'meeting_log_sample.wav')

File ../../records/2023-02-13-15-56-37/audio.wav uploaded to meeting_log_sample.wav.


In [None]:
# Diarize `meeting_log_sample.wav` from the `audio-recorder` bucket.
# NOTE(review): the saved output for this cell stops after the "Waiting..."
# line and the cell has no execution count, so this run apparently never
# finished — confirm whether the wait timeout is sufficient for this file.
diarization_gcs('gs://audio-recorder/meeting_log_sample.wav')

Waiting for operation to complete...


## Diarization with MFCC features and k-means clustering

In [1]:
import librosa
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [16]:
# Load the recording. sr=None is passed explicitly; the sampling rate that
# librosa reports back is kept in `sr` and reused for feature extraction.
audio_path = "../../../result/family-conversation/family-conversation.wav"
audio, sr = librosa.load(audio_path, sr=None)

# Duration in seconds: number of samples divided by samples per second.
print(len(audio) / sr)

91.74204081632654


In [7]:
# Compute MFCC features from the loaded waveform at its sampling rate.
# The result is a 2-D array; it is transposed before clustering further down.
mfccs = librosa.feature.mfcc(y=audio, sr=sr)

In [9]:
# Inspect the feature matrix dimensions.
# NOTE(review): presumably (n_mfcc, n_frames), i.e. one column per analysis
# frame — confirm against the librosa documentation.
mfccs.shape

(20, 7903)

In [10]:
# Standardize the features. `mfccs` is transposed so that each row passed to
# the scaler and to k-means is one column of the original feature matrix.
scaler = StandardScaler()
mfccs_scaled = scaler.fit_transform(mfccs.T)

# A fixed random_state makes the clustering reproducible: k-means starts from
# random centroids, so without a seed the cluster ids (and possibly the
# assignments) can differ between runs of this cell.
kmeans = KMeans(n_clusters=2, random_state=0)  # Adjust based on the expected number of speakers
speaker_labels = kmeans.fit_predict(mfccs_scaled)

# NOTE(review): this prints one line per row of `mfccs_scaled` (thousands of
# lines for this recording); consider summarizing the labels instead.
# NOTE(review): the labels are cluster ids, not verified speaker identities.
for i, label in enumerate(speaker_labels):
    print(f"Time Segment {i}: Speaker {label}")

Time Segment 0: Speaker 1
Time Segment 1: Speaker 1
Time Segment 2: Speaker 1
Time Segment 3: Speaker 1
Time Segment 4: Speaker 1
Time Segment 5: Speaker 1
Time Segment 6: Speaker 1
Time Segment 7: Speaker 1
Time Segment 8: Speaker 1
Time Segment 9: Speaker 1
Time Segment 10: Speaker 1
Time Segment 11: Speaker 1
Time Segment 12: Speaker 1
Time Segment 13: Speaker 1
Time Segment 14: Speaker 1
Time Segment 15: Speaker 1
Time Segment 16: Speaker 1
Time Segment 17: Speaker 1
Time Segment 18: Speaker 1
Time Segment 19: Speaker 1
Time Segment 20: Speaker 1
Time Segment 21: Speaker 1
Time Segment 22: Speaker 1
Time Segment 23: Speaker 1
Time Segment 24: Speaker 1
Time Segment 25: Speaker 1
Time Segment 26: Speaker 1
Time Segment 27: Speaker 1
Time Segment 28: Speaker 1
Time Segment 29: Speaker 1
Time Segment 30: Speaker 1
Time Segment 31: Speaker 1
Time Segment 32: Speaker 1
Time Segment 33: Speaker 1
Time Segment 34: Speaker 1
Time Segment 35: Speaker 1
Time Segment 36: Speaker 1
Time Segmen