From 298c1e56e87b092ff58775edd43ae944e1128410 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 15:49:50 +0200 Subject: [PATCH 1/4] feat: add Whispering support --- .../WhisperingTranscriptionService.java | 355 ++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100644 src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java new file mode 100644 index 00000000..001bd19c --- /dev/null +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -0,0 +1,355 @@ +/* + * Jigasi, the Jitsi Gateway to SIP. + * + * Copyright @ 2018 - present 8x8, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.jitsi.jigasi.transcription; + +import org.eclipse.jetty.websocket.api.Session; +import org.eclipse.jetty.websocket.api.annotations.*; +import org.eclipse.jetty.websocket.client.WebSocketClient; +import org.jitsi.utils.logging.Logger; +import org.json.JSONObject; +import org.json.simple.JSONArray; + +import javax.media.format.AudioFormat; +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.CountDownLatch; +import java.util.function.Consumer; + + +/** + * Implements a TranscriptionService which uses + * Whispering, a streaming websocket transcription + * service, based on OpenAI's Whisper. + *

+ * See https://github.com/shirayu/whispering for + * more information about the project. + * + * @author Charles Zablit + */ +public class WhisperingTranscriptionService + implements TranscriptionService { + + /** + * The logger for this class. + */ + private final static Logger logger + = Logger.getLogger(WhisperingTranscriptionService.class); + + /** + * The URL of the websocket service speech-to-text service. + */ + public final static String WEBSOCKET_URL + = "org.jitsi.jigasi.transcription.whispering.websocket_url"; + + //public final static String DEFAULT_WEBSOCKET_URL = "ws://192.168.43.152:8000"; + public final static String DEFAULT_WEBSOCKET_URL = "ws://152.228.167.183:8000"; + + private final static String EOF_MESSAGE = "{\"eof\" : 1}"; + + private final String websocketUrl; + + /** + * Create a TranscriptionService which will send audio to Whispering + * to get a transcription. + */ + public WhisperingTranscriptionService() { + websocketUrl = DEFAULT_WEBSOCKET_URL; + } + + /** + * No configuration required yet. + */ + public boolean isConfiguredProperly() { + return true; + } + + /** + * Sends audio as an array of bytes to Whispering. + * + * @param request the TranscriptionRequest which holds the audio to be sent. + * @param resultConsumer a Consumer which will handle the TranscriptionResult. 
+ */ + @Override + public void sendSingleRequest(final TranscriptionRequest request, + final Consumer resultConsumer) { + // Try to create the client, which can throw an IOException + try { + // Set the sampling rate and encoding of the audio + AudioFormat format = request.getFormat(); + if (!format.getEncoding().equals("LINEAR")) { + throw new IllegalArgumentException("Given AudioFormat" + + "has unexpected" + + "encoding"); + } + + WebSocketClient ws = new WebSocketClient(); + WhisperingWebsocketSession socket = new WhisperingWebsocketSession(request); + ws.start(); + ws.connect(socket, new URI(websocketUrl)); + socket.awaitClose(); + resultConsumer.accept( + new TranscriptionResult( + null, + UUID.randomUUID(), + false, + request.getLocale().toLanguageTag(), + 0, + new TranscriptionAlternative(socket.getResult()))); + } catch (Exception e) { + logger.error("Error sending single req", e); + } + } + + @Override + public StreamingRecognitionSession initStreamingSession(Participant participant) + throws UnsupportedOperationException { + try { + WhisperingWebsocketStreamingSession streamingSession = new WhisperingWebsocketStreamingSession( + participant.getDebugName()); + streamingSession.transcriptionTag = participant.getTranslationLanguage(); + if (streamingSession.transcriptionTag == null) { + streamingSession.transcriptionTag = participant.getSourceLanguage(); + } + return streamingSession; + } catch (Exception e) { + throw new UnsupportedOperationException("Failed to create streaming session", e); + } + } + + @Override + public boolean supportsFragmentTranscription() { + return true; + } + + @Override + public boolean supportsStreamRecognition() { + return true; + } + + /** + * A Transcription session for transcribing streams, handles + * the lifecycle of websocket + */ + @WebSocket(maxBinaryMessageSize = 1024 * 1024 * 1024) + public class WhisperingWebsocketStreamingSession + implements StreamingRecognitionSession { + private Session session; + /* The name of 
the participant */ + private final String debugName; + /* Transcription language requested by the user who requested the transcription */ + private String transcriptionTag = "en-US"; + + /** + * List of TranscriptionListeners which will be notified when a + * result comes in + */ + private final List listeners = new ArrayList<>(); + + WhisperingWebsocketStreamingSession(String debugName) + throws Exception { + logger.info("STARTING WHISPERING WEBSOCKET."); + this.debugName = debugName; + WebSocketClient ws = new WebSocketClient(); + ws.start(); + ws.connect(this, new URI(websocketUrl)); + } + + @OnWebSocketClose + public void onClose(int statusCode, String reason) { + logger.info("CLOSED WHISPERING WEBSOCKET."); + this.session = null; + } + + @OnWebSocketConnect + public void onConnect(Session session) { + logger.info("CONNECTED TO WHISPERING WEBSOCKET."); + this.session = session; + try { + WhisperingContext ctx = new WhisperingContext(0.0); + session.getRemote().sendString(ctx.toJSON().toString()); + } catch (Exception e) { + logger.error("Error while sending context to Whispering server " + debugName, e); + } + } + + @OnWebSocketMessage + public void onMessage(String msg) { + if (logger.isDebugEnabled()) + logger.debug(debugName + "Received response: " + msg); + + JSONObject jsonData = new JSONObject(msg); + logger.info("YOU'VE GOT MAIL " + jsonData.optString("text", "not working")); + for (TranscriptionListener l : listeners) { + l.notify(new TranscriptionResult( + null, + UUID.randomUUID(), + false, + transcriptionTag, + 1.0, + new TranscriptionAlternative(jsonData.optString("text", "not working")))); + } + } + + @OnWebSocketError + public void onError(Throwable cause) { + logger.error("Error while streaming audio data to transcription service", cause); + } + + public void sendRequest(TranscriptionRequest request) { + logger.info("SENDING REQUEST"); + logger.info(request.getFormat().getSampleRate()); + logger.info(request.getDurationInMs()); + try { + //if 
(sampleRate < 0) + //{ + // sampleRate = request.getFormat().getSampleRate(); + // session.getRemote().sendString("{\"config\" : {\"sample_rate\" : " + sampleRate + " }}"); + //} + ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); + session.getRemote().sendBytes(audioBuffer); + } catch (Exception e) { + logger.error("Error to send websocket request for participant " + debugName, e); + } + } + + public void addTranscriptionListener(TranscriptionListener listener) { + listeners.add(listener); + } + + public void end() { + try { + session.getRemote().sendString(EOF_MESSAGE); + } catch (Exception e) { + logger.error("Error to finalize websocket connection for participant " + debugName, e); + } + } + + public boolean ended() { + return session == null; + } + } + + /** + * Session to send websocket data and receive results. Non-streaming version + */ + @WebSocket + public class WhisperingWebsocketSession { + /* Signal for the end of operation */ + private final CountDownLatch closeLatch; + + /* Request we need to process */ + private final TranscriptionRequest request; + + /* Collect results*/ + private StringBuilder result; + + WhisperingWebsocketSession(TranscriptionRequest request) { + this.closeLatch = new CountDownLatch(1); + this.request = request; + this.result = new StringBuilder(); + } + + @OnWebSocketClose + public void onClose(int statusCode, String reason) { + this.closeLatch.countDown(); // trigger latch + } + + @OnWebSocketConnect + public void onConnect(Session session) { + try { + AudioFormat format = request.getFormat(); + WhisperingContext ctx = new WhisperingContext(0.0); + session.getRemote().sendString(ctx.toJSON().toString()); + ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); + session.getRemote().sendBytes(audioBuffer); + session.getRemote().sendString(EOF_MESSAGE); + } catch (IOException e) { + logger.error("Error to transcribe audio", e); + } + } + + @OnWebSocketMessage + public void onMessage(String msg) { + 
result.append(msg); + result.append('\n'); + } + + @OnWebSocketError + public void onError(Throwable cause) { + logger.error("Websocket connection error", cause); + } + + public String getResult() { + return result.toString(); + } + + void awaitClose() + throws InterruptedException { + closeLatch.await(); + } + } + + /** + * Represent the Whispering Context to instantiate the transcription + * service. + */ + public class WhisperingContext { + /* Starting timestamp of the transcription service. */ + double timestamp; + + WhisperingContext(double timestamp) { + this.timestamp = timestamp; + } + + public JSONObject toJSON() { + JSONObject ctx = new JSONObject(); + ctx.put("timestamp", this.timestamp); + ctx.put("buffer_tokens", new JSONArray()); + ctx.put("buffer_mel", JSONObject.NULL); + ctx.put("vad", true); + JSONArray temperatures = new JSONArray(); + for (int i = 0; i <= 10; i += 2) { + temperatures.add(i / 10); + } + ctx.put("temperatures", temperatures); + ctx.put("allow_padding", true); + ctx.put("patience", JSONObject.NULL); + ctx.put("compression_ratio_threshold", 2.4); + ctx.put("logprob_threshold", -1.0); + ctx.put("no_captions_threshold", 0.6); + ctx.put("best_of", 5); + ctx.put("beam_size", 5); + ctx.put("no_speech_threshold", 0.6); + ctx.put("buffer_threshold", 0.5); + ctx.put("vad_threshold", 0.5); + + JSONObject res = new JSONObject(); + res.put("context", ctx); + + return res; + } + + public String toString() { + return this.toJSON().toString(); + } + } +} From d537a6c020de4ae6dcba55aa07064bb0382d0e5e Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 16:10:39 +0200 Subject: [PATCH 2/4] chore: edit README --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index bac2c60e..79aab690 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,21 @@ Then configure the transcription class with the following properly in `~/jigasi/ 
org.jitsi.jigasi.transcription.customService=org.jitsi.jigasi.transcription.VoskTranscriptionService ``` +Whispering configuration +================== + +To use [Whispering](https://github.com/shirayu/whispering) follow the setup instructions in the repo's [README](https://github.com/shirayu/whispering/#example-of-web-socket). + +``` +whispering --language en --model tiny --host 0.0.0.0 --port 8000 +``` + +Then configure the transcription class with the following property in `~/jigasi/jigasi-home/sip-communicator.properties`: + +``` +org.jitsi.jigasi.transcription.customService=org.jitsi.jigasi.transcription.WhisperingTranscriptionService +``` + Transcription options ===================== From 7e1a4ee52183d2315d0f370eee9feb21c19cadc3 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 16:15:52 +0200 Subject: [PATCH 3/4] chore: fix checkstyle violations --- .../WhisperingTranscriptionService.java | 48 +++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java index 001bd19c..1b69905b 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -92,10 +92,12 @@ public boolean isConfiguredProperly() { public void sendSingleRequest(final TranscriptionRequest request, final Consumer resultConsumer) { // Try to create the client, which can throw an IOException - try { + try + { // Set the sampling rate and encoding of the audio AudioFormat format = request.getFormat(); - if (!format.getEncoding().equals("LINEAR")) { + if (!format.getEncoding().equals("LINEAR")) + { throw new IllegalArgumentException("Given AudioFormat" + "has unexpected" + "encoding"); @@ -114,7 +116,9 @@ public void sendSingleRequest(final TranscriptionRequest request,
request.getLocale().toLanguageTag(), 0, new TranscriptionAlternative(socket.getResult()))); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error sending single req", e); } } @@ -122,15 +126,19 @@ public void sendSingleRequest(final TranscriptionRequest request, @Override public StreamingRecognitionSession initStreamingSession(Participant participant) throws UnsupportedOperationException { - try { + try + { WhisperingWebsocketStreamingSession streamingSession = new WhisperingWebsocketStreamingSession( participant.getDebugName()); streamingSession.transcriptionTag = participant.getTranslationLanguage(); - if (streamingSession.transcriptionTag == null) { + if (streamingSession.transcriptionTag == null) + { streamingSession.transcriptionTag = participant.getSourceLanguage(); } return streamingSession; - } catch (Exception e) { + } + catch (Exception e) + { throw new UnsupportedOperationException("Failed to create streaming session", e); } } @@ -183,10 +191,13 @@ public void onClose(int statusCode, String reason) { public void onConnect(Session session) { logger.info("CONNECTED TO WHISPERING WEBSOCKET."); this.session = session; - try { + try + { WhisperingContext ctx = new WhisperingContext(0.0); session.getRemote().sendString(ctx.toJSON().toString()); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error while sending context to Whispering server " + debugName, e); } } @@ -218,7 +229,8 @@ public void sendRequest(TranscriptionRequest request) { logger.info("SENDING REQUEST"); logger.info(request.getFormat().getSampleRate()); logger.info(request.getDurationInMs()); - try { + try + { //if (sampleRate < 0) //{ // sampleRate = request.getFormat().getSampleRate(); @@ -226,7 +238,9 @@ public void sendRequest(TranscriptionRequest request) { //} ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); session.getRemote().sendBytes(audioBuffer); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error to send 
websocket request for participant " + debugName, e); } } @@ -236,9 +250,12 @@ public void addTranscriptionListener(TranscriptionListener listener) { } public void end() { - try { + try + { session.getRemote().sendString(EOF_MESSAGE); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error to finalize websocket connection for participant " + debugName, e); } } @@ -275,14 +292,17 @@ public void onClose(int statusCode, String reason) { @OnWebSocketConnect public void onConnect(Session session) { - try { + try + { AudioFormat format = request.getFormat(); WhisperingContext ctx = new WhisperingContext(0.0); session.getRemote().sendString(ctx.toJSON().toString()); ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); session.getRemote().sendBytes(audioBuffer); session.getRemote().sendString(EOF_MESSAGE); - } catch (IOException e) { + } + catch (IOException e) + { logger.error("Error to transcribe audio", e); } } From d8f88aeb2d176b91662aadb2f17edd6d5016dba1 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Mon, 17 Oct 2022 13:49:27 +0200 Subject: [PATCH 4/4] chore: adapt new Whispering context --- .../jigasi/transcription/WhisperingTranscriptionService.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java index 1b69905b..1d6a580b 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -361,6 +361,7 @@ public JSONObject toJSON() { ctx.put("no_speech_threshold", 0.6); ctx.put("buffer_threshold", 0.5); ctx.put("vad_threshold", 0.5); + ctx.put("data_type", "int64"); JSONObject res = new JSONObject(); res.put("context", ctx);