Skip to content

Commit 4cae3e1

Browse files
authored
classify audio attachments (#29197)
* classify audio attachments * compute amps * better audio attachment support * x * x
1 parent 0a255e2 commit 4cae3e1

16 files changed

Lines changed: 483 additions & 125 deletions

File tree

go/bind/keybase.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ package keybase
55

66
import (
77
"context"
8+
"encoding/binary"
89
"encoding/json"
910
"errors"
1011
"fmt"
12+
"math"
1113
"net"
1214
"os"
1315
"path/filepath"
@@ -102,6 +104,9 @@ type PushNotifier interface {
102104
// NativeVideoHelper groups the per-file media queries exposed through this
// package. Every method takes the name of the file to inspect.
type NativeVideoHelper interface {
	// Thumbnail returns a byte slice for filename.
	// NOTE(review): presumably an encoded preview image; confirm the format
	// with the native implementations.
	Thumbnail(filename string) []byte

	// Duration returns an integer duration for filename.
	// NOTE(review): the unit is not stated here; confirm against callers.
	Duration(filename string) int

	// AudioAmps returns IEEE 754 float32 samples encoded as little-endian bytes,
	// representing RMS amplitude values in [0,1] for waveform visualization.
	AudioAmps(filename string) []byte
}
106111

107112
// ShareIntentDonator is implemented by the native iOS layer to donate INSendMessageIntent
@@ -189,6 +194,19 @@ func (v videoHelper) ThumbnailAndDuration(ctx context.Context, filename string)
189194
return v.nvh.Thumbnail(filename), v.nvh.Duration(filename), nil
190195
}
191196

197+
func (v videoHelper) AudioAmps(ctx context.Context, filename string) ([]float64, error) {
198+
data := v.nvh.AudioAmps(filename)
199+
if len(data) == 0 || len(data)%4 != 0 {
200+
return nil, nil
201+
}
202+
amps := make([]float64, len(data)/4)
203+
for i := range amps {
204+
bits := binary.LittleEndian.Uint32(data[i*4:])
205+
amps[i] = float64(math.Float32frombits(bits))
206+
}
207+
return amps, nil
208+
}
209+
192210
// ExternalDNSNSFetcher is a single-method interface for obtaining server data
// from outside this package.
type ExternalDNSNSFetcher interface {
	// GetServers returns the servers as a byte slice.
	// NOTE(review): the encoding of the returned bytes is not visible here;
	// confirm against the implementations before relying on a format.
	GetServers() []byte
}

go/chat/attachments/preprocess.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,8 @@ func PreprocessAsset(ctx context.Context, g *globals.Context, log utils.DebugLab
278278
p.PreviewDim = &Dimension{Width: previewRes.PreviewWidth, Height: previewRes.PreviewHeight}
279279
}
280280
p.BaseDurationMs = previewRes.BaseDurationMs
281+
p.BaseIsAudio = previewRes.BaseIsAudio
282+
p.PreviewAudioAmps = previewRes.AudioAmps
281283
p.PreviewDurationMs = previewRes.PreviewDurationMs
282284
}
283285

go/chat/attachments/preview.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ type PreviewRes struct {
3737
BaseWidth int
3838
BaseHeight int
3939
BaseDurationMs int
40+
BaseIsAudio bool
41+
AudioAmps []float64
4042
PreviewWidth int
4143
PreviewHeight int
4244
PreviewDurationMs int

go/chat/attachments/preview_android.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@ func previewVideo(ctx context.Context, log utils.DebugLabeler, src io.Reader,
2121
return res, err
2222
}
2323
log.Debug(ctx, "previewVideo: size: %d duration: %d", len(dat), duration)
24+
if len(dat) == 0 && duration > 1 && isAudioExtension(basename) {
25+
amps, ampErr := nvh.AudioAmps(ctx, basename)
26+
if ampErr != nil {
27+
log.Debug(ctx, "previewVideo: AudioAmps failed: %v", ampErr)
28+
}
29+
return previewAudio(duration, amps)
30+
}
2431
if len(dat) == 0 {
2532
log.Debug(ctx, "failed to generate preview from native, using blank image")
2633
return previewVideoBlank(ctx, log, src, basename)

go/chat/attachments/preview_darwin.go

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ package attachments
1212
#include <CoreFoundation/CoreFoundation.h>
1313
#include <Foundation/Foundation.h>
1414
#include <ImageIO/ImageIO.h>
15+
#include <math.h>
1516
#include <UniformTypeIdentifiers/UniformTypeIdentifiers.h>
1617
#if TARGET_OS_IPHONE
1718
#include <MobileCoreServices/MobileCoreServices.h>
@@ -31,6 +32,11 @@ typedef struct {
3132
int imageLength;
3233
} ImageConversionResult;
3334
35+
// AudioAmpsResult carries an amplitude array across the C boundary.
// A result of {NULL, 0} means no amplitudes were produced.
typedef struct {
  float *data;  // heap-allocated amplitude values, or NULL; freed by the receiver
  int length;   // number of floats stored in data
} AudioAmpsResult;
39+
3440
VideoPreviewResult MakeVideoThumbnail(const char* inFilename) {
3541
VideoPreviewResult result = {NULL, 0, 0};
3642
NSString* filename = [NSString stringWithUTF8String:inFilename];
@@ -71,6 +77,118 @@ VideoPreviewResult MakeVideoThumbnail(const char* inFilename) {
7177
return result;
7278
}
7379
80+
// GetAudioAmplitudes decodes the first audio track of the file at inFilename to
// mono 32-bit float PCM via AVAssetReader and reduces it to numSamples RMS
// amplitude values, each clamped to [0,1].
//
// The expected number of decoded samples is estimated from the asset duration
// and the track's sample rate (44100 Hz when the track does not report one).
// Each decoded sample is assigned to one of numSamples equal-width buckets
// along that timeline and the RMS of every bucket is reported. Buckets that
// receive no samples are 0.
//
// Returns {NULL, 0} when numSamples <= 0, when inFilename is NULL or cannot be
// decoded as UTF-8, when the file has no audio track or no finite positive
// duration, when the reader cannot be set up, or when allocation fails.
// On success result.length == numSamples and the caller must free result.data.
AudioAmpsResult GetAudioAmplitudes(const char* inFilename, int numSamples) {
  AudioAmpsResult result = {NULL, 0};
  if (numSamples <= 0 || inFilename == NULL) return result;

  // This function is called from Go through cgo, so the calling thread may not
  // have an autorelease pool; without one the autoreleased AVFoundation objects
  // created below would never be drained.
  @autoreleasepool {
    // stringWithUTF8String: yields nil for bytes that are not valid UTF-8, and
    // fileURLWithPath: raises on nil, so bail out before building the URL.
    NSString* filename = [NSString stringWithUTF8String:inFilename];
    if (filename == nil) return result;
    NSURL* url = [NSURL fileURLWithPath:filename];
    AVURLAsset* asset = [AVURLAsset URLAssetWithURL:url options:nil];

    NSArray<AVAssetTrack*>* audioTracks = [asset tracksWithMediaType:AVMediaTypeAudio];
    if (audioTracks.count == 0) return result;
    AVAssetTrack* audioTrack = audioTracks[0];

    // Rejects zero, negative, NaN and infinite durations; the last two would
    // make the llround() below meaningless.
    Float64 durationSeconds = CMTimeGetSeconds(asset.duration);
    if (!isfinite(durationSeconds) || !(durationSeconds > 0)) return result;

    // Use the first positive sample rate the track advertises.
    Float64 sampleRate = 0;
    for (id formatDescription in audioTrack.formatDescriptions) {
      CMAudioFormatDescriptionRef desc = (__bridge CMAudioFormatDescriptionRef)formatDescription;
      const AudioStreamBasicDescription* asbd =
          CMAudioFormatDescriptionGetStreamBasicDescription(desc);
      if (asbd && asbd->mSampleRate > 0) {
        sampleRate = asbd->mSampleRate;
        break;
      }
    }
    if (sampleRate <= 0) {
      sampleRate = 44100;
    }

    // Estimated length of the decoded stream; never smaller than the bucket
    // count so the bucket index computation below cannot divide by zero.
    long long totalSamples = llround(durationSeconds * sampleRate);
    if (totalSamples < numSamples) {
      totalSamples = numSamples;
    }

    // Cocoa convention: success is signalled by the return value, not by the
    // NSError out-parameter.
    NSError* error = nil;
    AVAssetReader* reader = [AVAssetReader assetReaderWithAsset:asset error:&error];
    if (!reader) return result;

    NSDictionary* outputSettings = @{
      AVFormatIDKey: @(kAudioFormatLinearPCM),
      AVLinearPCMBitDepthKey: @32,
      AVLinearPCMIsFloatKey: @YES,
      AVLinearPCMIsNonInterleaved: @NO,
      AVNumberOfChannelsKey: @1,
    };

    AVAssetReaderTrackOutput* output =
        [AVAssetReaderTrackOutput assetReaderTrackOutputWithTrack:audioTrack
                                                   outputSettings:outputSettings];
    output.alwaysCopiesSampleData = NO;

    if (![reader canAddOutput:output]) return result;
    [reader addOutput:output];
    if (![reader startReading]) return result;

    // Accumulate in double: a long file puts millions of squared samples into
    // one bucket, which is more than a float sum can track accurately.
    double* sumSq = (double*)calloc((size_t)numSamples, sizeof(double));
    unsigned long long* counts =
        (unsigned long long*)calloc((size_t)numSamples, sizeof(unsigned long long));
    float* amps = (float*)calloc((size_t)numSamples, sizeof(float));
    if (!sumSq || !counts || !amps) {
      free(sumSq);
      free(counts);
      free(amps);
      return result;
    }

    long long sampleIndex = 0;  // position of the next decoded sample in the stream
    while (reader.status == AVAssetReaderStatusReading) {
      CMSampleBufferRef sampleBuffer = [output copyNextSampleBuffer];
      if (!sampleBuffer) break;

      CMBlockBufferRef blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer);
      size_t length = blockBuffer ? CMBlockBufferGetDataLength(blockBuffer) : 0;
      if (length >= sizeof(float)) {
        // Copy out of the block buffer, which is not guaranteed to be contiguous.
        float* chunk = (float*)malloc(length);
        if (chunk &&
            CMBlockBufferCopyDataBytes(blockBuffer, 0, length, chunk) == kCMBlockBufferNoErr) {
          size_t floatCount = length / sizeof(float);
          for (size_t i = 0; i < floatCount; i++) {
            float s = chunk[i];
            // A single NaN/Inf sample would otherwise poison its whole bucket.
            if (!isfinite(s)) continue;
            long long bucket =
                ((sampleIndex + (long long)i) * (long long)numSamples) / totalSamples;
            // The duration-based estimate can undershoot; fold any overrun into
            // the final bucket.
            if (bucket >= numSamples) {
              bucket = numSamples - 1;
            }
            sumSq[bucket] += (double)s * (double)s;
            counts[bucket]++;
          }
          sampleIndex += (long long)floatCount;
        }
        free(chunk);
      }
      CFRelease(sampleBuffer);
    }
    // Best effort: if the reader stopped early, whatever was decoded so far is
    // still reported and the remaining buckets stay at 0.

    for (int i = 0; i < numSamples; i++) {
      if (counts[i] > 0) {
        float rms = (float)sqrt(sumSq[i] / (double)counts[i]);
        // Float PCM may exceed full scale; keep the documented [0,1] range.
        amps[i] = rms > 1.0f ? 1.0f : rms;
      }
    }
    free(sumSq);
    free(counts);

    result.data = amps;
    result.length = numSamples;
  }
  return result;
}
191+
74192
#if TARGET_OS_IPHONE
75193
ImageConversionResult HEICToJPEG(const char* inFilename) {
76194
ImageConversionResult result = {NULL, 0};
@@ -121,6 +239,22 @@ import (
121239
"github.com/keybase/client/go/chat/utils"
122240
)
123241

242+
func getAudioAmps(basename string) []float64 {
243+
cbasename := C.CString(basename)
244+
defer C.free(unsafe.Pointer(cbasename))
245+
result := C.GetAudioAmplitudes(cbasename, C.int(audioAmpsCount))
246+
if result.length == 0 || result.data == nil {
247+
return nil
248+
}
249+
defer C.free(unsafe.Pointer(result.data))
250+
amps := make([]float64, int(result.length))
251+
cData := (*[1 << 20]C.float)(unsafe.Pointer(result.data))[:int(result.length):int(result.length)]
252+
for i, v := range cData {
253+
amps[i] = float64(v)
254+
}
255+
return amps
256+
}
257+
124258
func previewVideo(ctx context.Context, log utils.DebugLabeler, src io.Reader,
125259
basename string, nvh types.NativeVideoHelper,
126260
) (res *PreviewRes, err error) {
@@ -137,6 +271,12 @@ func previewVideo(ctx context.Context, log utils.DebugLabeler, src io.Reader,
137271
}
138272
log.Debug(ctx, "previewVideo: length: %d duration: %ds", result.imageLength, duration)
139273
if result.imageLength == 0 {
274+
// Audio-only files (e.g. M4A) have no video track so no thumbnail, but AVFoundation
275+
// can still read their duration. Extract amplitude data for the waveform visualization.
276+
if duration > 1 && isAudioExtension(basename) {
277+
amps := getAudioAmps(basename)
278+
return previewAudio(duration, amps)
279+
}
140280
return res, errors.New("no data returned from native")
141281
}
142282
localDat := make([]byte, result.imageLength)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//go:build darwin || android
2+
// +build darwin android
3+
4+
package attachments
5+
6+
import (
7+
"math"
8+
"path/filepath"
9+
"strings"
10+
)
11+
12+
// audioAmpsCount is the number of amplitude values requested from the native
// extractor and the length of the all-zero fallback used when none are available.
const audioAmpsCount = 60
13+
14+
// isAudioExtension reports whether basename ends in one of the recognised
// audio file extensions. The comparison ignores letter case.
func isAudioExtension(basename string) bool {
	ext := strings.ToLower(filepath.Ext(basename))
	for _, known := range []string{
		".m4a", ".mp3", ".aac", ".ogg", ".flac", ".wav", ".opus", ".aiff", ".caf",
	} {
		if ext == known {
			return true
		}
	}
	return false
}
21+
22+
func normalizeAudioAmps(amps []float64) []float64 {
23+
if len(amps) == 0 {
24+
return make([]float64, audioAmpsCount)
25+
}
26+
return amps
27+
}
28+
29+
// previewAudio generates a waveform preview image and packages amplitude data
30+
// for an audio-only file. amps are linear RMS values in [0,1].
31+
func previewAudio(duration int, amps []float64) (*PreviewRes, error) {
32+
amps = normalizeAudioAmps(amps)
33+
// audioVisualizer expects dB values; convert from linear RMS.
34+
dbAmps := make([]float64, len(amps))
35+
for i, a := range amps {
36+
if a <= 0 {
37+
dbAmps[i] = -80
38+
} else {
39+
dbAmps[i] = 20 * math.Log10(a)
40+
}
41+
}
42+
v := newAudioVisualizer(dbAmps)
43+
dat, width := v.visualize()
44+
return &PreviewRes{
45+
Source: dat,
46+
ContentType: "image/png",
47+
BaseWidth: width,
48+
BaseHeight: v.height,
49+
BaseDurationMs: duration,
50+
BaseIsAudio: true,
51+
AudioAmps: amps,
52+
PreviewWidth: width,
53+
PreviewHeight: v.height,
54+
}, nil
55+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
//go:build darwin || android
2+
// +build darwin android
3+
4+
package attachments
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
)
11+
12+
func TestPreviewAudioEmptyAmps(t *testing.T) {
13+
res, err := previewAudio(1234, nil)
14+
require.NoError(t, err)
15+
require.NotNil(t, res)
16+
require.True(t, res.BaseIsAudio)
17+
require.Equal(t, 1234, res.BaseDurationMs)
18+
require.Len(t, res.AudioAmps, audioAmpsCount)
19+
require.NotEmpty(t, res.Source)
20+
require.Positive(t, res.PreviewWidth)
21+
require.Positive(t, res.PreviewHeight)
22+
}

go/chat/types/interfaces.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ type AttachmentUploader interface {
444444

445445
// NativeVideoHelper is the chat-side abstraction over per-file media queries.
type NativeVideoHelper interface {
	// ThumbnailAndDuration returns a byte slice and an integer duration for
	// filename, or an error.
	// NOTE(review): presumably the bytes are an encoded thumbnail image;
	// confirm the format and the duration unit with the implementations.
	ThumbnailAndDuration(ctx context.Context, filename string) ([]byte, int, error)

	// AudioAmps returns amplitude values for filename, or an error.
	// Implementations may return a nil slice with a nil error when they have
	// no amplitudes to offer.
	AudioAmps(ctx context.Context, filename string) ([]float64, error)
}
448449

449450
// ShareConversation holds data for donating a conversation to the iOS share sheet.

go/chat/types/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,10 @@ func (d DummyNativeVideoHelper) ThumbnailAndDuration(ctx context.Context, filena
563563
return nil, 0, nil
564564
}
565565

566+
func (d DummyNativeVideoHelper) AudioAmps(ctx context.Context, filename string) ([]float64, error) {
567+
return nil, nil
568+
}
569+
566570
type UnfurlerTaskStatus int
567571

568572
const (

0 commit comments

Comments
 (0)