AVCodecDecoder: use swresample to interleave audio channels.

Some codecs always output audio in planar format, no matter which sample format we request. This is the case, for example, with the AAC audio used by YouTube. We now use swresample to convert from planar to packed (interleaved) format.

Note that since swresample does its own buffering, we could probably remove some of the code that handled buffering before, making the audio pipeline simpler and faster.

This fixes audio on YouTube, but the video now plays at 2x speed. It seems something is wrong with the timestamps. Possible things to investigate:
* Why do we use the packet dts instead of the pts from the frames anyway?
* The pts and pkt_dts are in "stream time_base units". We seem to assume microseconds for audio, but this is probably not the case. Or did I miss where the conversion is done?
haiku · Nov 19, 2015 · 856cc59 · 856cc59
1 parent 235725e
commit 856cc59
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 41 deletions.
diff --git a/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp b/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp
@@ -5,12 +5,14 @@
  * Copyright (C) 2004 Marcus Overhagen
  * Copyright (C) 2009 Stephan Amßus <superstippi@gmx.de>
  * Copyright (C) 2014 Colin Günther <coling@gmx.de>
+ * Copyright (C) 2015 Adrien Destugues <pulkomandy@pulkomandy.tk>
  *
  * All rights reserved. Distributed under the terms of the MIT License.
  */
 
 //! libavcodec based decoder for Haiku
 
+
 #include "AVCodecDecoder.h"
 
 #include <new>
@@ -95,6 +97,7 @@ AVCodecDecoder::AVCodecDecoder()
 	fIsAudio(false),
 	fCodec(NULL),
 	fContext(avcodec_alloc_context3(NULL)),
+	fResampleContext(NULL),
 	fDecodedData(NULL),
 	fDecodedDataSizeInBytes(0),
 	fPostProcessedDecodedPicture(avcodec_alloc_frame()),
@@ -123,7 +126,6 @@ AVCodecDecoder::AVCodecDecoder()
 	fAudioDecodeError(false),
 
 	fDecodedDataBuffer(avcodec_alloc_frame()),
-	fDecodedDataBufferOffset(0),
 	fDecodedDataBufferSize(0)
 {
 	TRACE("AVCodecDecoder::AVCodecDecoder()\n");
@@ -152,6 +154,7 @@ AVCodecDecoder::~AVCodecDecoder()
 	if (fCodecInitDone)
 		avcodec_close(fContext);
 
+	swr_free(&fResampleContext);
 	free(fChunkBuffer);
 	free(fDecodedData);
 
@@ -237,8 +240,7 @@ AVCodecDecoder::Setup(media_format* ioEncodedFormat, const void* infoBuffer,
 		} else {
 			if (fIsAudio) {
 				fBlockAlign
-					= ioEncodedFormat->u.encoded_audio.output
-						.buffer_size;
+					= ioEncodedFormat->u.encoded_audio.output.buffer_size;
 				TRACE("  using buffer_size as block align: %d\n",
 					fBlockAlign);
 			}
@@ -278,7 +280,6 @@ AVCodecDecoder::SeekedTo(int64 frame, bigtime_t time)
 	free(fChunkBuffer);
 	fChunkBuffer = NULL;
 	fChunkBufferSize = 0;
-	fDecodedDataBufferOffset = 0;
 	fDecodedDataBufferSize = 0;
 	fDecodedDataSizeInBytes = 0;
 
@@ -366,7 +367,6 @@ AVCodecDecoder::_NegotiateAudioOutputFormat(media_format* inOutFormat)
 	fChunkBuffer = NULL;
 	fChunkBufferSize = 0;
 	fAudioDecodeError = false;
-	fDecodedDataBufferOffset = 0;
 	fDecodedDataBufferSize = 0;
 
 	_ResetTempPacket();
@@ -413,6 +413,13 @@ AVCodecDecoder::_NegotiateAudioOutputFormat(media_format* inOutFormat)
 	if (fRawDecodedAudio->opaque == NULL)
 		return B_NO_MEMORY;
 
+	fResampleContext = swr_alloc_set_opts(NULL,
+		fContext->channel_layout, fContext->request_sample_fmt,
+		fContext->sample_rate,
+		fContext->channel_layout, fContext->sample_fmt, fContext->sample_rate,
+		0, NULL);
+	swr_init(fResampleContext);
+
 	TRACE("  bit_rate = %d, sample_rate = %d, channels = %d, "
 		"output frame size: %d, count: %ld, rate: %.2f\n",
 		fContext->bit_rate, fContext->sample_rate, fContext->channels,
@@ -902,24 +909,20 @@ AVCodecDecoder::_MoveAudioFramesToRawDecodedAudioAndUpdateStartTimes()
 	assert(fRawDecodedAudio->nb_samples < fOutputFrameCount);
 	assert(fOutputFrameRate > 0);
 
-	int32 frames = min_c(fOutputFrameCount - fRawDecodedAudio->nb_samples,
-		fDecodedDataBufferSize / fOutputFrameSize);
-	if (frames == 0)
-		debugger("fDecodedDataBufferSize not multiple of frame size!");
-
-	size_t remainingSize = frames * fOutputFrameSize;
-
-#if 0
 	// Some decoders do not support format conversion on themselves, or use
 	// "planar" audio (each channel separated instead of interleaved samples).
-	// If this is a problem we will need to use swresample to convert the data
-	// here, instead of directly copying it.
-	swr_convert(fResampleContext, fRawDecodedAudio->data,
-		fDecodedDataBuffer->data + fDecodedDataBufferOffset, frames);
-#else
-	memcpy(fRawDecodedAudio->data[0], fDecodedDataBuffer->data[0]
-		+ fDecodedDataBufferOffset, remainingSize);
-#endif
+	// In that case, we use swresample to convert the data (and it is
+	// smart enough to do just a copy, when possible)
+	int32 frames = swr_convert(fResampleContext, fRawDecodedAudio->data,
+		fOutputFrameCount - fRawDecodedAudio->nb_samples,
+		(const uint8_t**)fDecodedDataBuffer->data,
+		fDecodedDataBuffer->nb_samples);
+	if (frames < 0)
+		debugger("resampling failed");
+	size_t remainingSize = frames * fOutputFrameSize;
+
+	// libswresample handles all the buffering for us, how nice of them!
+	fDecodedDataBufferSize = 0;
 
 	bool firstAudioFramesCopiedToRawDecodedAudio
 		= fRawDecodedAudio->data[0] != fDecodedData;
@@ -936,19 +939,6 @@ AVCodecDecoder::_MoveAudioFramesToRawDecodedAudioAndUpdateStartTimes()
 	fRawDecodedAudio->data[0] += remainingSize;
 	fRawDecodedAudio->linesize[0] += remainingSize;
 	fRawDecodedAudio->nb_samples += frames;
-
-	fDecodedDataBufferOffset += remainingSize;
-	fDecodedDataBufferSize -= remainingSize;
-
-	// Update start times accordingly
-	bigtime_t framesTimeInterval = static_cast<bigtime_t>(
-		(1000000LL * frames) / fOutputFrameRate);
-	fDecodedDataBuffer->pkt_dts += framesTimeInterval;
-		// Start time of buffer is updated in case that it contains
-		// more audio frames to move.
-	fTempPacket.dts += framesTimeInterval;
-		// Start time of fTempPacket is updated in case the fTempPacket
-		// contains more audio frames to decode.
 }
 
 
@@ -970,8 +960,7 @@ AVCodecDecoder::_MoveAudioFramesToRawDecodedAudioAndUpdateStartTimes()
 	After this function returns successfully the caller can safely make the
 	following assumptions:
 		1. fDecodedDataBufferSize is greater than zero.
-		2. fDecodedDataBufferOffset is set to zero.
-		3. fDecodedDataBuffer contains audio frames.
+		2. fDecodedDataBuffer contains audio frames.
 
 	\returns B_OK on successfully decoding one audio frame chunk.
 	\returns B_LAST_BUFFER_ERROR No more audio frame chunks available. From
@@ -983,7 +972,7 @@ AVCodecDecoder::_DecodeNextAudioFrameChunk()
 {
 	assert(fDecodedDataBufferSize == 0);
 
-	while(fDecodedDataBufferSize == 0) {
+	while (fDecodedDataBufferSize == 0) {
 		status_t loadingChunkStatus
 			= _LoadNextChunkIfNeededAndAssignStartTime();
 		if (loadingChunkStatus != B_OK)
@@ -1030,7 +1019,6 @@ AVCodecDecoder::_DecodeNextAudioFrameChunk()
 		   Also see "Note" below.
 		2. fTempPacket was updated to exclude the data chunk that was consumed
 		   by avcodec_decode_audio4().
-		3. fDecodedDataBufferOffset is set to zero.
 
 	When this function failed to decode at least one audio frame due to a
 	decoding error the caller can safely make the following assumptions:
@@ -1057,7 +1045,6 @@ AVCodecDecoder::_DecodeSomeAudioFramesIntoEmptyDecodedDataBuffer()
 	assert(fTempPacket.size > 0);
 
 	avcodec_get_frame_defaults(fDecodedDataBuffer);
-	fDecodedDataBufferOffset = 0;
 	int gotAudioFrame = 0;
 
 	int encodedDataSizeInBytes = avcodec_decode_audio4(fContext,

diff --git a/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.h b/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2001 Axel Dörfler.
  * Copyright (C) 2004 Marcus Overhagen.
  * Copyright (C) 2009 Stephan Aßmus <superstippi@gmx.de>.
+ * Copyright (C) 2015 Adrien Destugues <pulkomandy@pulkomandy.tk>.
  *
  * All rights reserved. Distributed under the terms of the MIT License.
  */
@@ -12,13 +13,16 @@
 
 //! libavcodec based decoder for Haiku
 
+
 #include <MediaFormats.h>
 
+
 extern "C" {
 	#include "avcodec.h"
-	#include "swscale.h"
+	#include "swresample.h"
 }
 
+
 #include "DecoderPlugin.h"
 #include "ReaderPlugin.h"
 
@@ -93,6 +97,7 @@ class AVCodecDecoder : public Decoder {
 			// FFmpeg related members
 			AVCodec*			fCodec;
 			AVCodecContext*		fContext;
+			SwrContext*			fResampleContext;
 			uint8_t*			fDecodedData;
 			size_t				fDecodedDataSizeInBytes;
 			AVFrame*			fPostProcessedDecodedPicture;
@@ -102,7 +107,9 @@ class AVCodecDecoder : public Decoder {
 			bool 				fCodecInitDone;
 
 			gfx_convert_func	fFormatConversionFunc;
+#if USE_SWS_FOR_COLOR_SPACE_CONVERSION
 			SwsContext*			fSwsContext;
+#endif
 
 			char*				fExtraData;
 			int					fExtraDataSize;
@@ -119,7 +126,6 @@ class AVCodecDecoder : public Decoder {
 			bool				fAudioDecodeError;
 
 			AVFrame*			fDecodedDataBuffer;
-			int32				fDecodedDataBufferOffset;
 			int32				fDecodedDataBufferSize;
 
 			AVPacket			fTempPacket;

diff --git a/src/add-ons/media/plugins/ffmpeg/Jamfile b/src/add-ons/media/plugins/ffmpeg/Jamfile
@@ -47,6 +47,7 @@ for architectureObject in [ MultiArchSubDirSetup ] {
 		UseHeaders [ FDirName $(ffmpegHeaders) libavformat ] ;
 		UseHeaders [ FDirName $(ffmpegHeaders) libavutil ] ;
 		UseHeaders [ FDirName $(ffmpegHeaders) libswscale ] ;
+		UseHeaders [ FDirName $(ffmpegHeaders) libswresample ] ;
 
 		Addon [ MultiArchDefaultGristFiles ffmpeg ] :
 			$(sources)