# Record audio from your microphone

The microphone-recording code is adapted from [this Colab notebook](https://colab.research.google.com/gist/ricardodeazambuja/03ac98c31e87caf284f7b06286ebf7fd/microphone-to-numpy-array-from-your-browser-in-colab.ipynb).



In [None]:
# Install ffmpeg-python, the package behind the `ffmpeg` module imported by the recording cell.
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
# Install (or upgrade) the Google Cloud Speech-to-Text client used by the transcription cells.
!pip install --upgrade google-cloud-speech

Collecting google-cloud-speech
  Downloading google_cloud_speech-2.23.0-py2.py3-none-any.whl (274 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/274.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/274.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.5/274.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-cloud-speech
Successfully installed google-cloud-speech-2.23.0


In [None]:
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet rendered by get_audio().
# It asks for microphone access, records with MediaRecorder until the button
# is pressed, and exposes a global Promise named `data`. The promise resolves
# with the recording as a base64 data URL as soon as the FileReader has
# finished encoding it (instead of guessing with a fixed delay), and rejects
# if the microphone cannot be opened or the recording cannot be read.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_btn = document.createElement("BUTTON");

my_btn.appendChild(document.createTextNode("Waiting for microphone access..."));
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

// The Promise executor runs synchronously, so resolveData/rejectData are
// assigned before any of the asynchronous callbacks below can fire.
var resolveData, rejectData;
var data = new Promise(function(resolve, reject) {
  resolveData = resolve;
  rejectData = reject;
});

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    // Register the handler before starting the read, and hand the result to
    // Python only once the encoding has actually finished.
    reader.onloadend = function() {
      if (reader.error) {
        rejectData(new Error("Could not read the recording: " + reader.error));
        return;
      }
      base64data = reader.result;
      resolveData(base64data.toString());
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
  // Only claim to be recording once the recorder has really started.
  recordButton.innerText = "Recording... press to stop";
};

var handleFailure = function(err) {
  recordButton.innerText = "Could not access the microphone";
  rejectData(new Error("Could not access the microphone: " + err));
};

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess).catch(handleFailure);

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

recordButton.onclick = toggleRecording;

</script>
"""

def get_audio():
  """Record audio through the browser widget and return it decoded.

  Displays ``AUDIO_HTML``, reads the JavaScript ``data`` value through
  ``eval_js`` (expected to be a base64 data URL of the recording), converts
  the decoded bytes to WAV by piping them through ffmpeg, and parses the
  result with ``scipy.io.wavfile.read``.

  Returns:
    A tuple ``(audio, sr)``: the sample data and the sample rate, as
    returned by ``wav_read``.

  Raises:
    RuntimeError: if the browser returned no recording, or if ffmpeg failed
      to convert it to WAV.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A data URL looks like "data:<mime>;base64,<payload>". Without the comma
  # there is no payload (the page's placeholder value is "0").
  _, comma, payload = str(data).partition(',')
  if not comma:
    raise RuntimeError(
        "No recording was captured (browser returned {!r})".format(data))
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Surface conversion failures here instead of as an obscure WAV parse error.
  if process.returncode != 0 or len(output) < 8:
    details = err.decode('utf-8', errors='replace') if err else ''
    raise RuntimeError("ffmpeg could not convert the recording to WAV:\n" + details)

  # Replace bytes 4:8 with the actual size of the RIFF chunk, stored as a
  # 4-byte little-endian integer (presumably ffmpeg cannot fill this in when
  # writing to a pipe). The mask keeps only the low 32 bits.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [None]:
# Show the recorder widget and capture one recording: the sample data and its sample rate.
audio, sr = get_audio()

In [None]:
# `import scipy` alone does not guarantee that the `scipy.io.wavfile`
# submodule is loaded; import it explicitly so this cell works on its own.
import scipy.io.wavfile

# Save the recording to disk so the transcription cells can read it back.
scipy.io.wavfile.write('test_2.wav', sr, audio)

In [None]:
from itertools import groupby

from google.cloud import speech_v1p1beta1 as speech

# Transcribe test_2.wav with speaker diarization (2-3 speakers) and print the
# transcript as one paragraph per consecutive run of the same speaker.
client = speech.SpeechClient.from_service_account_json('stt_key.json')

speech_file = "test_2.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()
audio = speech.RecognitionAudio(content=content)

diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=3,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=sr,
    language_code="en-US",
    diarization_config=diarization_config,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# Fail with a readable message instead of an IndexError when nothing was recognized.
if not response.results:
    raise RuntimeError(f"Speech-to-Text returned no results for {speech_file!r}")

# With diarization enabled, the last result carries the speaker-tagged words.
wanted_result = response.results[-1]

# Group consecutive words by speaker so every segment is printed, including
# the final one (a print only on speaker change would drop the last segment).
for speaker_tag, speaker_words in groupby(
    wanted_result.alternatives[0].words, key=lambda w: w.speaker_tag
):
    line = "Speaker {}: {}".format(
        speaker_tag, " ".join(w.word for w in speaker_words)
    )
    print(line + '\n')

Waiting for operation to complete...
Speaker 2: hello my name is Brandon years



In [None]:
# Display the raw final result (per-word timings and speaker tags) for inspection.
wanted_result

alternatives {
  words {
    start_time {
    }
    end_time {
      seconds: 1
      nanos: 600000000
    }
    word: "hello"
    speaker_tag: 2
  }
  words {
    start_time {
      seconds: 1
      nanos: 600000000
    }
    end_time {
      seconds: 1
      nanos: 900000000
    }
    word: "my"
    speaker_tag: 2
  }
  words {
    start_time {
      seconds: 1
      nanos: 900000000
    }
    end_time {
      seconds: 2
    }
    word: "name"
    speaker_tag: 2
  }
  words {
    start_time {
      seconds: 2
    }
    end_time {
      seconds: 2
      nanos: 300000000
    }
    word: "is"
    speaker_tag: 2
  }
  words {
    start_time {
      seconds: 2
      nanos: 300000000
    }
    end_time {
      seconds: 2
      nanos: 600000000
    }
    word: "Brandon"
    speaker_tag: 2
  }
  words {
    start_time {
      seconds: 2
      nanos: 600000000
    }
    end_time {
      seconds: 6
      nanos: 200000000
    }
    word: "years"
    speaker_tag: 2
  }
  words {
    start_time {

In [None]:
from google.cloud import speech_v1p1beta1 as speech

# Transcribe test_2.wav with exactly two speakers and list every recognized
# word together with the speaker tag assigned to it.
client = speech.SpeechClient.from_service_account_json('stt_key.json')

speech_file = "test_2.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()
audio = speech.RecognitionAudio(content=content)

diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=2,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=sr,
    language_code="en-US",
    diarization_config=diarization_config,
    enable_word_time_offsets=True,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# Fail with a readable message instead of an IndexError when nothing was recognized.
if not response.results:
    raise RuntimeError(f"Speech-to-Text returned no results for {speech_file!r}")

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
    print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")

Waiting for operation to complete...
word: 'excuse', speaker_tag: 1
word: 'me', speaker_tag: 1
word: 'is', speaker_tag: 1
word: 'this', speaker_tag: 1
word: 'seat', speaker_tag: 1
word: 'free', speaker_tag: 1
word: 'I', speaker_tag: 1
word: 'didn't', speaker_tag: 1
word: 'think', speaker_tag: 1
word: 'the', speaker_tag: 1
word: 'auditorium', speaker_tag: 1
word: 'would', speaker_tag: 1
word: 'fill', speaker_tag: 1
word: 'up', speaker_tag: 1
word: 'so', speaker_tag: 1
word: 'fast', speaker_tag: 1
word: 'of', speaker_tag: 1
word: 'course', speaker_tag: 1
word: 'by', speaker_tag: 1
word: 'the', speaker_tag: 1
word: 'way', speaker_tag: 1
word: 'nice', speaker_tag: 1
word: 'to', speaker_tag: 1
word: 'meet', speaker_tag: 1
word: 'you', speaker_tag: 1
word: 'nice', speaker_tag: 1
word: 'to', speaker_tag: 1
word: 'meet', speaker_tag: 1
word: 'you', speaker_tag: 1
word: 'Kevin', speaker_tag: 1
word: 'I'm', speaker_tag: 1
word: 'Carmen', speaker_tag: 1
word: 'Sanchez', speaker_tag: 1
word: 'and'

In [None]:
import io
import argparse
import json
import os
from itertools import groupby

from google.cloud import speech_v1p1beta1 as speech

# Transcribe a two-speaker recording with the long-running API and print the
# transcript as one paragraph per consecutive run of the same speaker.

# The client created below picks up the service-account key from this variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "stt_key.json"

client = speech.SpeechClient()
speech_file = "test_2 (1).wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()
audio = speech.RecognitionAudio(content=content)

# `enable_speaker_diarization` / `diarization_speaker_count` on
# RecognitionConfig are deprecated; use the dedicated config message, as the
# other cells in this notebook do.
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=2,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    language_code='en-US',
    diarization_config=diarization_config,
    enable_automatic_punctuation=True,
    model='phone_call',
)

operation = client.long_running_recognize(config=config, audio=audio)
print("Waiting for operation to complete..")
response = operation.result(timeout=90)

# Fail with a readable message instead of an IndexError when nothing was recognized.
if not response.results:
    raise RuntimeError(f"Speech-to-Text returned no results for {speech_file!r}")

# With diarization enabled, the last result carries the speaker-tagged words.
wanted_result = response.results[-1]

# One printed paragraph per run of consecutive words from the same speaker.
for speaker_tag, speaker_words in groupby(
    wanted_result.alternatives[0].words, key=lambda w: w.speaker_tag
):
    line = "Speaker {}: {}".format(
        speaker_tag, " ".join(w.word for w in speaker_words)
    )
    print(line + '\n')

Waiting for operation to complete..
Speaker 2: Excuse me. Is this seat free? I didn't think the auditorium would fill up so fast, of course, my

Speaker 1: name is Kevin more by the way North to make you

Speaker 2: nice to meet you Kevin. I'm Carmen Sanchez.

Speaker 1: Are you the common finches leaving a networking Workshop later on?

Speaker 2: That's right. I've been holding one at this event for the last few years. How do you hear about it?

Speaker 1: A friend of mine recommended it to me. So I saw that for he mentioned something about there being no better place to networks in a networking Workshop.



In [None]:
# Walk every result in the response and print its first alternative's
# transcript, each preceded by a 20-dash rule and the result index.
for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print(
        "-" * 20,
        f"First alternative of result {i}",
        f"Transcript: {alternative.transcript}",
        sep="\n",
    )

--------------------
First alternative of result 0
Transcript: a sterile mind if I join you. How's your day going crazy busy you know how it is how about you same here I need even the new project by the way did you get my email about the client meeting tomorrow I'll review it tonight and send over my treatment anything else no that's it for now let's eat lunch tomorrow to sing before the meeting sounds good what do you think about new place downtown I see you tomorrow then
--------------------
First alternative of result 1
Transcript: 


In [None]:
# Keep the first result of the response and its first alternative for later cells.
result = response.results[0]
alternative = result.alternatives[0]

In [None]:
# Print the transcript text of the current `alternative`.
print(alternative.transcript)

a sterile mind if I join you. How's your day going crazy busy you know how it is how about you same here I need even the new project by the way did you get my email about the client meeting tomorrow I'll review it tonight and send over my treatment anything else no that's it for now let's eat lunch tomorrow to sing before the meeting sounds good what do you think about new place downtown I see you tomorrow then


In [None]:
# Display the per-word entries (timings, word, speaker tag) held in `words_info`.
words_info

[start_time {
  seconds: 1
  nanos: 100000000
}
end_time {
  seconds: 1
  nanos: 500000000
}
word: "a"
speaker_tag: 2
, start_time {
  seconds: 1
  nanos: 500000000
}
end_time {
  seconds: 1
  nanos: 900000000
}
word: "sterile"
speaker_tag: 1
, start_time {
  seconds: 1
  nanos: 900000000
}
end_time {
  seconds: 2
  nanos: 400000000
}
word: "mind"
speaker_tag: 1
, start_time {
  seconds: 2
  nanos: 400000000
}
end_time {
  seconds: 2
  nanos: 500000000
}
word: "if"
speaker_tag: 1
, start_time {
  seconds: 2
  nanos: 500000000
}
end_time {
  seconds: 2
  nanos: 600000000
}
word: "I"
speaker_tag: 1
, start_time {
  seconds: 2
  nanos: 600000000
}
end_time {
  seconds: 3
}
word: "join"
speaker_tag: 1
, start_time {
  seconds: 3
}
end_time {
  seconds: 3
  nanos: 800000000
}
word: "you."
speaker_tag: 1
, start_time {
  seconds: 3
  nanos: 800000000
}
end_time {
  seconds: 5
  nanos: 400000000
}
word: "How\'s"
speaker_tag: 1
, start_time {
  seconds: 5
  nanos: 400000000
}
end_time {
  seco