From 298c1e56e87b092ff58775edd43ae944e1128410 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 15:49:50 +0200 Subject: [PATCH 1/4] feat: add Whispering support --- .../WhisperingTranscriptionService.java | 355 ++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100644 src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java new file mode 100644 index 00000000..001bd19c --- /dev/null +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -0,0 +1,355 @@ +/* + * Jigasi, the Jitsi Gateway to SIP. + * + * Copyright @ 2018 - present 8x8, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.jitsi.jigasi.transcription; + +import org.eclipse.jetty.websocket.api.Session; +import org.eclipse.jetty.websocket.api.annotations.*; +import org.eclipse.jetty.websocket.client.WebSocketClient; +import org.jitsi.utils.logging.Logger; +import org.json.JSONObject; +import org.json.simple.JSONArray; + +import javax.media.format.AudioFormat; +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.CountDownLatch; +import java.util.function.Consumer; + + +/** + * Implements a TranscriptionService which uses + * Whispering, a streaming websocket transcription + * service, based on OpenAI's Whisper. + *

+ * See https://github.com/shirayu/whispering for + * more information about the project. + * + * @author Charles Zablit + */ +public class WhisperingTranscriptionService + implements TranscriptionService { + + /** + * The logger for this class. + */ + private final static Logger logger + = Logger.getLogger(WhisperingTranscriptionService.class); + + /** + * The URL of the websocket service speech-to-text service. + */ + public final static String WEBSOCKET_URL + = "org.jitsi.jigasi.transcription.whispering.websocket_url"; + + //public final static String DEFAULT_WEBSOCKET_URL = "ws://192.168.43.152:8000"; + public final static String DEFAULT_WEBSOCKET_URL = "ws://152.228.167.183:8000"; + + private final static String EOF_MESSAGE = "{\"eof\" : 1}"; + + private final String websocketUrl; + + /** + * Create a TranscriptionService which will send audio to Whispering + * to get a transcription. + */ + public WhisperingTranscriptionService() { + websocketUrl = DEFAULT_WEBSOCKET_URL; + } + + /** + * No configuration required yet. + */ + public boolean isConfiguredProperly() { + return true; + } + + /** + * Sends audio as an array of bytes to Whispering. + * + * @param request the TranscriptionRequest which holds the audio to be sent. + * @param resultConsumer a Consumer which will handle the TranscriptionResult. 
+ */ + @Override + public void sendSingleRequest(final TranscriptionRequest request, + final Consumer resultConsumer) { + // Try to create the client, which can throw an IOException + try { + // Set the sampling rate and encoding of the audio + AudioFormat format = request.getFormat(); + if (!format.getEncoding().equals("LINEAR")) { + throw new IllegalArgumentException("Given AudioFormat" + + "has unexpected" + + "encoding"); + } + + WebSocketClient ws = new WebSocketClient(); + WhisperingWebsocketSession socket = new WhisperingWebsocketSession(request); + ws.start(); + ws.connect(socket, new URI(websocketUrl)); + socket.awaitClose(); + resultConsumer.accept( + new TranscriptionResult( + null, + UUID.randomUUID(), + false, + request.getLocale().toLanguageTag(), + 0, + new TranscriptionAlternative(socket.getResult()))); + } catch (Exception e) { + logger.error("Error sending single req", e); + } + } + + @Override + public StreamingRecognitionSession initStreamingSession(Participant participant) + throws UnsupportedOperationException { + try { + WhisperingWebsocketStreamingSession streamingSession = new WhisperingWebsocketStreamingSession( + participant.getDebugName()); + streamingSession.transcriptionTag = participant.getTranslationLanguage(); + if (streamingSession.transcriptionTag == null) { + streamingSession.transcriptionTag = participant.getSourceLanguage(); + } + return streamingSession; + } catch (Exception e) { + throw new UnsupportedOperationException("Failed to create streaming session", e); + } + } + + @Override + public boolean supportsFragmentTranscription() { + return true; + } + + @Override + public boolean supportsStreamRecognition() { + return true; + } + + /** + * A Transcription session for transcribing streams, handles + * the lifecycle of websocket + */ + @WebSocket(maxBinaryMessageSize = 1024 * 1024 * 1024) + public class WhisperingWebsocketStreamingSession + implements StreamingRecognitionSession { + private Session session; + /* The name of 
the participant */ + private final String debugName; + /* Transcription language requested by the user who requested the transcription */ + private String transcriptionTag = "en-US"; + + /** + * List of TranscriptionListeners which will be notified when a + * result comes in + */ + private final List listeners = new ArrayList<>(); + + WhisperingWebsocketStreamingSession(String debugName) + throws Exception { + logger.info("STARTING WHISPERING WEBSOCKET."); + this.debugName = debugName; + WebSocketClient ws = new WebSocketClient(); + ws.start(); + ws.connect(this, new URI(websocketUrl)); + } + + @OnWebSocketClose + public void onClose(int statusCode, String reason) { + logger.info("CLOSED WHISPERING WEBSOCKET."); + this.session = null; + } + + @OnWebSocketConnect + public void onConnect(Session session) { + logger.info("CONNECTED TO WHISPERING WEBSOCKET."); + this.session = session; + try { + WhisperingContext ctx = new WhisperingContext(0.0); + session.getRemote().sendString(ctx.toJSON().toString()); + } catch (Exception e) { + logger.error("Error while sending context to Whispering server " + debugName, e); + } + } + + @OnWebSocketMessage + public void onMessage(String msg) { + if (logger.isDebugEnabled()) + logger.debug(debugName + "Received response: " + msg); + + JSONObject jsonData = new JSONObject(msg); + logger.info("YOU'VE GOT MAIL " + jsonData.optString("text", "not working")); + for (TranscriptionListener l : listeners) { + l.notify(new TranscriptionResult( + null, + UUID.randomUUID(), + false, + transcriptionTag, + 1.0, + new TranscriptionAlternative(jsonData.optString("text", "not working")))); + } + } + + @OnWebSocketError + public void onError(Throwable cause) { + logger.error("Error while streaming audio data to transcription service", cause); + } + + public void sendRequest(TranscriptionRequest request) { + logger.info("SENDING REQUEST"); + logger.info(request.getFormat().getSampleRate()); + logger.info(request.getDurationInMs()); + try { + //if 
(sampleRate < 0) + //{ + // sampleRate = request.getFormat().getSampleRate(); + // session.getRemote().sendString("{\"config\" : {\"sample_rate\" : " + sampleRate + " }}"); + //} + ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); + session.getRemote().sendBytes(audioBuffer); + } catch (Exception e) { + logger.error("Error to send websocket request for participant " + debugName, e); + } + } + + public void addTranscriptionListener(TranscriptionListener listener) { + listeners.add(listener); + } + + public void end() { + try { + session.getRemote().sendString(EOF_MESSAGE); + } catch (Exception e) { + logger.error("Error to finalize websocket connection for participant " + debugName, e); + } + } + + public boolean ended() { + return session == null; + } + } + + /** + * Session to send websocket data and receive results. Non-streaming version + */ + @WebSocket + public class WhisperingWebsocketSession { + /* Signal for the end of operation */ + private final CountDownLatch closeLatch; + + /* Request we need to process */ + private final TranscriptionRequest request; + + /* Collect results*/ + private StringBuilder result; + + WhisperingWebsocketSession(TranscriptionRequest request) { + this.closeLatch = new CountDownLatch(1); + this.request = request; + this.result = new StringBuilder(); + } + + @OnWebSocketClose + public void onClose(int statusCode, String reason) { + this.closeLatch.countDown(); // trigger latch + } + + @OnWebSocketConnect + public void onConnect(Session session) { + try { + AudioFormat format = request.getFormat(); + WhisperingContext ctx = new WhisperingContext(0.0); + session.getRemote().sendString(ctx.toJSON().toString()); + ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); + session.getRemote().sendBytes(audioBuffer); + session.getRemote().sendString(EOF_MESSAGE); + } catch (IOException e) { + logger.error("Error to transcribe audio", e); + } + } + + @OnWebSocketMessage + public void onMessage(String msg) { + 
result.append(msg); + result.append('\n'); + } + + @OnWebSocketError + public void onError(Throwable cause) { + logger.error("Websocket connection error", cause); + } + + public String getResult() { + return result.toString(); + } + + void awaitClose() + throws InterruptedException { + closeLatch.await(); + } + } + + /** + * Represent the Whispering Context to instantiate the transcription + * service. + */ + public class WhisperingContext { + /* Starting timestamp of the transcription service. */ + double timestamp; + + WhisperingContext(double timestamp) { + this.timestamp = timestamp; + } + + public JSONObject toJSON() { + JSONObject ctx = new JSONObject(); + ctx.put("timestamp", this.timestamp); + ctx.put("buffer_tokens", new JSONArray()); + ctx.put("buffer_mel", JSONObject.NULL); + ctx.put("vad", true); + JSONArray temperatures = new JSONArray(); + for (int i = 0; i <= 10; i += 2) { + temperatures.add(i / 10); + } + ctx.put("temperatures", temperatures); + ctx.put("allow_padding", true); + ctx.put("patience", JSONObject.NULL); + ctx.put("compression_ratio_threshold", 2.4); + ctx.put("logprob_threshold", -1.0); + ctx.put("no_captions_threshold", 0.6); + ctx.put("best_of", 5); + ctx.put("beam_size", 5); + ctx.put("no_speech_threshold", 0.6); + ctx.put("buffer_threshold", 0.5); + ctx.put("vad_threshold", 0.5); + + JSONObject res = new JSONObject(); + res.put("context", ctx); + + return res; + } + + public String toString() { + return this.toJSON().toString(); + } + } +} From d537a6c020de4ae6dcba55aa07064bb0382d0e5e Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 16:10:39 +0200 Subject: [PATCH 2/4] chore: edit README --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index bac2c60e..79aab690 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,21 @@ Then configure the transcription class with the following properly in `~/jigasi/ 
org.jitsi.jigasi.transcription.customService=org.jitsi.jigasi.transcription.VoskTranscriptionService ``` +Whispering configuration +================== + +To use [Whispering](https://github.com/shirayu/whispering) follow the setup instructions in the repo's [README](https://github.com/shirayu/whispering/#example-of-web-socket). + +``` +whispering --language en --model tiny --host 0.0.0.0 --port 8000 +``` + +Then configure the transcription class with the following property in `~/jigasi/jigasi-home/sip-communicator.properties`: + +``` +org.jitsi.jigasi.transcription.customService=org.jitsi.jigasi.transcription.WhisperingTranscriptionService +``` + Transcription options ===================== From 7e1a4ee52183d2315d0f370eee9feb21c19cadc3 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Fri, 14 Oct 2022 16:15:52 +0200 Subject: [PATCH 3/4] chore: fix checkstyle violations --- .../WhisperingTranscriptionService.java | 48 +++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java index 001bd19c..1b69905b 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -92,10 +92,12 @@ public boolean isConfiguredProperly() { public void sendSingleRequest(final TranscriptionRequest request, final Consumer resultConsumer) { // Try to create the client, which can throw an IOException - try { + try + { // Set the sampling rate and encoding of the audio AudioFormat format = request.getFormat(); - if (!format.getEncoding().equals("LINEAR")) { + if (!format.getEncoding().equals("LINEAR")) + { throw new IllegalArgumentException("Given AudioFormat" + "has unexpected" + "encoding"); @@ -114,7 +116,9 @@ public void sendSingleRequest(final TranscriptionRequest request,
request.getLocale().toLanguageTag(), 0, new TranscriptionAlternative(socket.getResult()))); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error sending single req", e); } } @@ -122,15 +126,19 @@ public void sendSingleRequest(final TranscriptionRequest request, @Override public StreamingRecognitionSession initStreamingSession(Participant participant) throws UnsupportedOperationException { - try { + try + { WhisperingWebsocketStreamingSession streamingSession = new WhisperingWebsocketStreamingSession( participant.getDebugName()); streamingSession.transcriptionTag = participant.getTranslationLanguage(); - if (streamingSession.transcriptionTag == null) { + if (streamingSession.transcriptionTag == null) + { streamingSession.transcriptionTag = participant.getSourceLanguage(); } return streamingSession; - } catch (Exception e) { + } + catch (Exception e) + { throw new UnsupportedOperationException("Failed to create streaming session", e); } } @@ -183,10 +191,13 @@ public void onClose(int statusCode, String reason) { public void onConnect(Session session) { logger.info("CONNECTED TO WHISPERING WEBSOCKET."); this.session = session; - try { + try + { WhisperingContext ctx = new WhisperingContext(0.0); session.getRemote().sendString(ctx.toJSON().toString()); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error while sending context to Whispering server " + debugName, e); } } @@ -218,7 +229,8 @@ public void sendRequest(TranscriptionRequest request) { logger.info("SENDING REQUEST"); logger.info(request.getFormat().getSampleRate()); logger.info(request.getDurationInMs()); - try { + try + { //if (sampleRate < 0) //{ // sampleRate = request.getFormat().getSampleRate(); @@ -226,7 +238,9 @@ public void sendRequest(TranscriptionRequest request) { //} ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); session.getRemote().sendBytes(audioBuffer); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error to send 
websocket request for participant " + debugName, e); } } @@ -236,9 +250,12 @@ public void addTranscriptionListener(TranscriptionListener listener) { } public void end() { - try { + try + { session.getRemote().sendString(EOF_MESSAGE); - } catch (Exception e) { + } + catch (Exception e) + { logger.error("Error to finalize websocket connection for participant " + debugName, e); } } @@ -275,14 +292,17 @@ public void onClose(int statusCode, String reason) { @OnWebSocketConnect public void onConnect(Session session) { - try { + try + { AudioFormat format = request.getFormat(); WhisperingContext ctx = new WhisperingContext(0.0); session.getRemote().sendString(ctx.toJSON().toString()); ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio()); session.getRemote().sendBytes(audioBuffer); session.getRemote().sendString(EOF_MESSAGE); - } catch (IOException e) { + } + catch (IOException e) + { logger.error("Error to transcribe audio", e); } } From d8f88aeb2d176b91662aadb2f17edd6d5016dba1 Mon Sep 17 00:00:00 2001 From: charles-zablit Date: Mon, 17 Oct 2022 13:49:27 +0200 Subject: [PATCH 4/4] chore: adapt new Whispering context --- .../jigasi/transcription/WhisperingTranscriptionService.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java index 1b69905b..1d6a580b 100644 --- a/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java +++ b/src/main/java/org/jitsi/jigasi/transcription/WhisperingTranscriptionService.java @@ -361,6 +361,7 @@ public JSONObject toJSON() { ctx.put("no_speech_threshold", 0.6); ctx.put("buffer_threshold", 0.5); ctx.put("vad_threshold", 0.5); + ctx.put("data_type", "int64"); JSONObject res = new JSONObject(); res.put("context", ctx);