Permalink
Browse files

Merge branch 'kaudio' into lsmash

  • Loading branch information...
2 parents 96092fd + faeb40c commit 54929feb3363ccd9fc7b9e0ee8b85461363a0a9a @VFR-maniac VFR-maniac committed May 26, 2012
Showing with 240 additions and 68 deletions.
  1. +9 −0 common/common.c
  2. +4 −0 common/common.h
  3. +3 −3 common/deblock.c
  4. +2 −2 common/frame.c
  5. +4 −0 common/macroblock.c
  6. +2 −2 common/threadpool.c
  7. +34 −6 encoder/encoder.c
  8. +151 −48 encoder/slicetype.c
  9. +26 −5 filters/video/depth.c
  10. +2 −0 x264.c
  11. +3 −2 x264.h
View
9 common/common.c
@@ -50,6 +50,7 @@ void x264_param_default( x264_param_t *param )
/* CPU autodetect */
param->cpu = x264_cpu_detect();
param->i_threads = X264_THREADS_AUTO;
+ param->i_lookahead_threads = X264_THREADS_AUTO;
param->b_deterministic = 1;
param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
@@ -632,6 +633,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
else
p->i_threads = atoi(value);
}
+ OPT("lookahead-threads")
+ {
+ if( !strcmp(value, "auto") )
+ p->i_lookahead_threads = X264_THREADS_AUTO;
+ else
+ p->i_lookahead_threads = atoi(value);
+ }
OPT("sliced-threads")
p->b_sliced_threads = atobool(value);
OPT("sync-lookahead")
@@ -1285,6 +1293,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
s += sprintf( s, " threads=%d", p->i_threads );
+ s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads );
s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
if( p->i_slice_count )
s += sprintf( s, " slices=%d", p->i_slice_count );
View
4 common/common.h
@@ -56,6 +56,7 @@ do {\
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
+#define X264_LOOKAHEAD_THREAD_MAX 16
#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
@@ -469,13 +470,15 @@ struct x264_t
x264_param_t param;
x264_t *thread[X264_THREAD_MAX+1];
+ x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
int b_thread_active;
int i_thread_phase; /* which thread to use for the next frame */
int i_thread_idx; /* which thread this is */
int i_threadslice_start; /* first row in this thread slice */
int i_threadslice_end; /* row after the end of this thread slice */
int i_threadslice_pass; /* which pass of encoding we are on */
x264_threadpool_t *threadpool;
+ x264_threadpool_t *lookaheadpool;
x264_pthread_mutex_t mutex;
x264_pthread_cond_t cv;
@@ -915,6 +918,7 @@ struct x264_t
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+ void *scratch_buffer2; /* if the first one's already in use */
pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* Deblock strength values are stored for each 4x4 partition. In MBAFF
* there are four extra values that need to be stored, located in [4][i]. */
View
6 common/deblock.c
@@ -506,9 +506,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
/* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
* But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
* So reset their effective QP to max, to indicate that lack of guarantee. */
- if( h->fdec->mb_info && M32( bs[0][0] ) )
+ if( h->fenc->mb_info && M32( bs[0][0] ) )
{
-#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
+#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fenc->mb_info[xy] & X264_MBINFO_CONSTANT);
RESET_EFFECTIVE_QP(mb_xy);
RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
}
@@ -561,7 +561,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int intra_deblock = intra_cur || intra_top;
/* This edge has been modified, reset effective qp to max. */
- if( h->fdec->mb_info && M32( bs[1][0] ) )
+ if( h->fenc->mb_info && M32( bs[1][0] ) )
{
RESET_EFFECTIVE_QP(mb_xy);
RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
View
4 common/frame.c
@@ -357,8 +357,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
dst->i_pic_struct = src->i_pic_struct;
dst->extra_sei = src->extra_sei;
dst->opaque = src->opaque;
- dst->mb_info = src->prop.mb_info;
- dst->mb_info_free = src->prop.mb_info_free;
+ dst->mb_info = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
+ dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
uint8_t *pix[3];
int stride[3];
View
4 common/macroblock.c
@@ -401,6 +401,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
else
h->scratch_buffer = NULL;
+ int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
+ CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+
return 0;
fail:
return -1;
@@ -418,6 +421,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
x264_free( h->intra_border_backup[i][j] - 16 );
}
x264_free( h->scratch_buffer );
+ x264_free( h->scratch_buffer2 );
}
void x264_macroblock_slice_init( x264_t *h )
View
4 common/threadpool.c
@@ -66,7 +66,7 @@ static void x264_threadpool_thread( x264_threadpool_t *pool )
x264_pthread_mutex_unlock( &pool->run.mutex );
if( !job )
continue;
- job->ret = job->func( job->arg ); /* execute the function */
+ job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */
x264_sync_frame_list_push( &pool->done, (void*)job );
}
}
@@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
pool->init_func = init_func;
pool->init_arg = init_arg;
- pool->threads = X264_MIN( threads, X264_THREAD_MAX );
+ pool->threads = threads;
CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
View
40 encoder/encoder.c
@@ -395,6 +395,15 @@ static void x264_encoder_thread_init( x264_t *h )
x264_cpu_mask_misalign_sse();
#endif
}
+
+static void x264_lookahead_thread_init( x264_t *h )
+{
+#if HAVE_MMX
+ /* Misalign mask has to be set separately for each thread. */
+ if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+ x264_cpu_mask_misalign_sse();
+#endif
+}
#endif
/****************************************************************************
@@ -494,6 +503,9 @@ static int x264_validate_parameters( x264_t *h, int b_open )
if( h->param.i_threads == X264_THREADS_AUTO )
h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
+ if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+ h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6);
+ int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
if( h->param.i_threads > 1 )
{
#if !HAVE_THREAD
@@ -503,14 +515,15 @@ static int x264_validate_parameters( x264_t *h, int b_open )
/* Avoid absurdly small thread slices as they can reduce performance
* and VBV compliance. Capped at an arbitrary 4 rows per thread. */
if( h->param.b_sliced_threads )
- {
- int max_threads = (h->param.i_height+15)/16 / 4;
- h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
- }
+ h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
}
h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+ h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
if( h->param.i_threads == 1 )
+ {
h->param.b_sliced_threads = 0;
+ h->param.i_lookahead_threads = 1;
+ }
h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
if( h->i_thread_frames > 1 )
h->param.nalu_process = NULL;
@@ -1271,10 +1284,19 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( h->param.i_threads > 1 &&
x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
goto fail;
+ if( h->param.i_lookahead_threads > 1 &&
+ x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+ goto fail;
h->thread[0] = h;
for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+ if( h->param.i_lookahead_threads > 1 )
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ {
+ CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) );
+ *h->lookahead_thread[i] = *h;
+ }
for( int i = 0; i < h->param.i_threads; i++ )
{
@@ -3199,8 +3221,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
x264_emms();
- if( h->fdec->mb_info_free )
- h->fdec->mb_info_free( h->fdec->mb_info );
+ if( h->fenc->mb_info_free )
+ h->fenc->mb_info_free( h->fenc->mb_info );
/* generate buffering period sei and insert it into place */
if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present )
@@ -3459,6 +3481,8 @@ void x264_encoder_close ( x264_t *h )
x264_threadpool_wait_all( h );
if( h->param.i_threads > 1 )
x264_threadpool_delete( h->threadpool );
+ if( h->param.i_lookahead_threads > 1 )
+ x264_threadpool_delete( h->lookaheadpool );
if( h->i_thread_frames > 1 )
{
for( int i = 0; i < h->i_thread_frames; i++ )
@@ -3768,6 +3792,10 @@ void x264_encoder_close ( x264_t *h )
if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
x264_frame_delete( h->thread[i]->fref[0][j] );
+ if( h->param.i_lookahead_threads > 1 )
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ x264_free( h->lookahead_thread[i] );
+
for( int i = h->param.i_threads - 1; i >= 0; i-- )
{
x264_frame_t **frame;
View
199 encoder/slicetype.c
@@ -424,9 +424,21 @@ static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *r
}
}
+/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
+ * in multithreaded lookahead. */
+#define PAD_SIZE 32
+/* cost_est, cost_est_aq, intra_mbs, num rows */
+#define NUM_INTS 4
+#define COST_EST 0
+#define COST_EST_AQ 1
+#define INTRA_MBS 2
+#define NUM_ROWS 3
+#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start))
+
static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
- int dist_scale_factor, int do_search[2], const x264_weight_t *w )
+ int dist_scale_factor, int do_search[2], const x264_weight_t *w,
+ int *output_inter, int *output_intra )
{
x264_frame_t *fref0 = frames[p0];
x264_frame_t *fref1 = frames[p1];
@@ -571,7 +583,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->mb.i_mb_width - 1 )
MVC( fenc_mv[1] );
- if( i_mb_y < h->mb.i_mb_height - 1 )
+ if( i_mb_y < h->i_threadslice_end - 1 )
{
MVC( fenc_mv[i_mb_stride] );
if( i_mb_x > 0 )
@@ -653,11 +665,11 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
int i_icost_aq = i_icost;
if( h->param.rc.i_aq_mode )
i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
- fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
+ output_intra[ROW_SATD] += i_icost_aq;
if( b_frame_score_mb )
{
- fenc->i_cost_est[0][0] += i_icost;
- fenc->i_cost_est_aq[0][0] += i_icost_aq;
+ output_intra[COST_EST] += i_icost;
+ output_intra[COST_EST_AQ] += i_icost_aq;
}
}
i_bcost += lowres_penalty;
@@ -674,7 +686,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
list_used = 0;
}
if( b_frame_score_mb )
- fenc->i_intra_mbs[b-p0] += b_intra;
+ output_inter[INTRA_MBS] += b_intra;
}
/* In an I-frame, we've already added the results above in the intra section. */
@@ -683,12 +695,12 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
int i_bcost_aq = i_bcost;
if( h->param.rc.i_aq_mode )
i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
- fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
+ output_inter[ROW_SATD] += i_bcost_aq;
if( b_frame_score_mb )
{
/* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
- fenc->i_cost_est[b-p0][p1-b] += i_bcost;
- fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
+ output_inter[COST_EST] += i_bcost;
+ output_inter[COST_EST_AQ] += i_bcost_aq;
}
}
@@ -701,92 +713,183 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
(h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
h->mb.i_mb_width * h->mb.i_mb_height)
+typedef struct
+{
+ x264_t *h;
+ x264_mb_analysis_t *a;
+ x264_frame_t **frames;
+ int p0;
+ int p1;
+ int b;
+ int dist_scale_factor;
+ int *do_search;
+ const x264_weight_t *w;
+ int *output_inter;
+ int *output_intra;
+} x264_slicetype_slice_t;
+
+static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s )
+{
+ x264_t *h = s->h;
+
+ /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
+ * This considerably improves MV prediction overall. */
+
+ /* The edge mbs seem to reduce the predictive quality of the
+ * whole frame's score, but are needed for a spatial distribution. */
+ int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
+
+ int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges );
+ int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges );
+ int start_x = h->mb.i_mb_width - 2 + do_edges;
+ int end_x = 1 - do_edges;
+
+ for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- )
+ for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- )
+ x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
+ s->do_search, s->w, s->output_inter, s->output_intra );
+}
+
static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
x264_frame_t **frames, int p0, int p1, int b,
int b_intra_penalty )
{
int i_score = 0;
int do_search[2];
const x264_weight_t *w = x264_weight_none;
+ x264_frame_t *fenc = frames[b];
+
/* Check whether we already evaluated this frame
* If we have tried this frame as P, then we have also tried
* the preceding frames as B. (is this still true?) */
/* Also check that we already calculated the row SATDs for the current frame. */
- if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
- i_score = frames[b]->i_cost_est[b-p0][p1-b];
+ if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
+ i_score = fenc->i_cost_est[b-p0][p1-b];
else
{
int dist_scale_factor = 128;
- int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
- int *row_satd_intra = frames[b]->i_row_satds[0][0];
/* For each list, check to see whether we have lowres motion-searched this reference frame before. */
- do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
- do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+ do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+ do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
if( do_search[0] )
{
if( h->param.analyse.i_weighted_pred && b == p1 )
{
x264_emms();
- x264_weights_analyse( h, frames[b], frames[p0], 1 );
- w = frames[b]->weight[0];
+ x264_weights_analyse( h, fenc, frames[p0], 1 );
+ w = fenc->weight[0];
}
- frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+ fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
}
- if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+ if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
- if( b == p1 )
- frames[b]->i_intra_mbs[b-p0] = 0;
- if( !frames[b]->b_intra_calculated )
- {
- frames[b]->i_cost_est[0][0] = 0;
- frames[b]->i_cost_est_aq[0][0] = 0;
- }
if( p1 != p0 )
dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
- frames[b]->i_cost_est[b-p0][p1-b] = 0;
- frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
-
- /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
- * This considerably improves MV prediction overall. */
+ int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
+ int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
+ int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
+ output_inter[0] = h->scratch_buffer2;
+ output_intra[0] = output_inter[0] + output_buf_size;
- /* The edge mbs seem to reduce the predictive quality of the
- * whole frame's score, but are needed for a spatial distribution. */
- if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
- h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
+ if( h->param.i_lookahead_threads > 1 )
{
- for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+ x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
{
- row_satd[h->mb.i_mb_y] = 0;
- if( !frames[b]->b_intra_calculated )
- row_satd_intra[h->mb.i_mb_y] = 0;
- for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
- x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+ x264_t *t = h->lookahead_thread[i];
+
+ /* FIXME move this somewhere else */
+ t->mb.i_me_method = h->mb.i_me_method;
+ t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+ t->mb.b_chroma_me = h->mb.b_chroma_me;
+
+ s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+ output_inter[i], output_intra[i] };
+
+ t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+ t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+
+ int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+ int thread_output_size = thread_height + NUM_INTS;
+ memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+ memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+ output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+
+ output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+ output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+
+ x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
}
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ x264_threadpool_wait( h->lookaheadpool, &s[i] );
}
else
{
- for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
- for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
- x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+ h->i_threadslice_start = 0;
+ h->i_threadslice_end = h->mb.i_mb_height;
+ memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+ memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+ output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+ x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+ output_inter[0], output_intra[0] };
+ x264_slicetype_slice_cost( &s );
+ }
+
+ /* Sum up accumulators */
+ if( b == p1 )
+ fenc->i_intra_mbs[b-p0] = 0;
+ if( !fenc->b_intra_calculated )
+ {
+ fenc->i_cost_est[0][0] = 0;
+ fenc->i_cost_est_aq[0][0] = 0;
+ }
+ fenc->i_cost_est[b-p0][p1-b] = 0;
+ fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+
+ int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+ int *row_satd_intra = fenc->i_row_satds[0][0];
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ {
+ if( b == p1 )
+ fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+ if( !fenc->b_intra_calculated )
+ {
+ fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+ fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+ }
+
+ fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+ fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+ if( h->param.rc.i_vbv_buffer_size )
+ {
+ int row_count = output_inter[i][NUM_ROWS];
+ memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+ if( !fenc->b_intra_calculated )
+ memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+ row_satd_inter += row_count;
+ row_satd_intra += row_count;
+ }
}
- i_score = frames[b]->i_cost_est[b-p0][p1-b];
+ i_score = fenc->i_cost_est[b-p0][p1-b];
if( b != p1 )
i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
else
- frames[b]->b_intra_calculated = 1;
+ fenc->b_intra_calculated = 1;
- frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+ fenc->i_cost_est[b-p0][p1-b] = i_score;
x264_emms();
}
if( b_intra_penalty )
{
// arbitrary penalty for I-blocks after B-frames
int nmb = NUM_MBS;
- i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
+ i_score += (uint64_t)i_score * fenc->i_intra_mbs[b-p0] / (nmb * 8);
}
return i_score;
}
View
31 filters/video/depth.c
@@ -50,13 +50,19 @@ static int depth_filter_csp_is_supported( int csp )
csp_mask == X264_CSP_YV16 ||
csp_mask == X264_CSP_YV24 ||
csp_mask == X264_CSP_NV12 ||
- csp_mask == X264_CSP_NV16;
+ csp_mask == X264_CSP_NV16 ||
+ csp_mask == X264_CSP_BGR ||
+ csp_mask == X264_CSP_RGB ||
+ csp_mask == X264_CSP_BGRA;
}
static int csp_num_interleaved( int csp, int plane )
{
int csp_mask = csp & X264_CSP_MASK;
- return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1;
+ return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 :
+ csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 :
+ csp_mask == X264_CSP_BGRA ? 4 :
+ 1;
}
/* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been
@@ -86,6 +92,8 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int
DITHER_PLANE( 1 )
DITHER_PLANE( 2 )
+DITHER_PLANE( 3 )
+DITHER_PLANE( 4 )
static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf )
{
@@ -100,15 +108,28 @@ static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf
dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/sizeof(pixel), \
((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf )
- if( num_interleaved == 1 )
+ if( num_interleaved == 4 )
{
- CALL_DITHER_PLANE( 1, 0 );
+ CALL_DITHER_PLANE( 4, 0 );
+ CALL_DITHER_PLANE( 4, 1 );
+ CALL_DITHER_PLANE( 4, 2 );
+ CALL_DITHER_PLANE( 4, 3 ); //we probably can skip this one
}
- else
+ else if( num_interleaved == 3 )
+ {
+ CALL_DITHER_PLANE( 3, 0 );
+ CALL_DITHER_PLANE( 3, 1 );
+ CALL_DITHER_PLANE( 3, 2 );
+ }
+ else if( num_interleaved == 2 )
{
CALL_DITHER_PLANE( 2, 0 );
CALL_DITHER_PLANE( 2, 1 );
}
+ else //if( num_interleaved == 1 )
+ {
+ CALL_DITHER_PLANE( 1, 0 );
+ }
}
}
View
2 x264.c
@@ -849,6 +849,7 @@ static void help( x264_param_t *defaults, int longhelp )
H1( " --psnr Enable PSNR computation\n" );
H1( " --ssim Enable SSIM computation\n" );
H1( " --threads <integer> Force a specific number of threads\n" );
+ H2( " --lookahead-threads <integer> Force a specific number of lookahead threads\n" );
H2( " --sliced-threads Low-latency but lower-efficiency threading\n" );
H2( " --thread-input Run Avisynth in its own thread\n" );
H2( " --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
@@ -1043,6 +1044,7 @@ static struct option long_options[] =
{ "zones", required_argument, NULL, 0 },
{ "qpfile", required_argument, NULL, OPT_QPFILE },
{ "threads", required_argument, NULL, 0 },
+ { "lookahead-threads", required_argument, NULL, 0 },
{ "sliced-threads", no_argument, NULL, 0 },
{ "no-sliced-threads", no_argument, NULL, 0 },
{ "slice-max-size", required_argument, NULL, 0 },
View
5 x264.h
@@ -41,7 +41,7 @@
#include "x264_config.h"
-#define X264_BUILD 124
+#define X264_BUILD 125
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
@@ -254,7 +254,8 @@ typedef struct x264_param_t
{
/* CPU flags */
unsigned int cpu;
- int i_threads; /* encode multiple frames in parallel */
+ int i_threads; /* encode multiple frames in parallel */
+ int i_lookahead_threads; /* multiple threads for lookahead analysis */
int b_sliced_threads; /* Whether to use slice-based threading. */
int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
int b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */

0 comments on commit 54929fe

Please sign in to comment.