diff --git a/common/common.c b/common/common.c index d03201d8f..3f40e66f1 100644 --- a/common/common.c +++ b/common/common.c @@ -50,6 +50,7 @@ void x264_param_default( x264_param_t *param ) /* CPU autodetect */ param->cpu = x264_cpu_detect(); param->i_threads = X264_THREADS_AUTO; + param->i_lookahead_threads = X264_THREADS_AUTO; param->b_deterministic = 1; param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO; @@ -632,6 +633,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) else p->i_threads = atoi(value); } + OPT("lookahead-threads") + { + if( !strcmp(value, "auto") ) + p->i_lookahead_threads = X264_THREADS_AUTO; + else + p->i_lookahead_threads = atoi(value); + } OPT("sliced-threads") p->b_sliced_threads = atobool(value); OPT("sync-lookahead") @@ -1285,6 +1293,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip ); s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset ); s += sprintf( s, " threads=%d", p->i_threads ); + s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads ); s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads ); if( p->i_slice_count ) s += sprintf( s, " slices=%d", p->i_slice_count ); diff --git a/common/common.h b/common/common.h index 5e3421291..04ac11dae 100644 --- a/common/common.h +++ b/common/common.h @@ -56,6 +56,7 @@ do {\ #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 +#define X264_LOOKAHEAD_THREAD_MAX 16 #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16) #define X264_LOOKAHEAD_MAX 250 #define QP_BD_OFFSET (6*(BIT_DEPTH-8)) @@ -469,6 +470,7 @@ struct x264_t x264_param_t param; x264_t *thread[X264_THREAD_MAX+1]; + x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX]; int b_thread_active; int i_thread_phase; /* which thread to use for the next frame */ int i_thread_idx; /* which thread this is */ @@ -476,6 +478,7 @@ struct x264_t int i_threadslice_end; /* row after the end of this thread slice */ int i_threadslice_pass; /* which pass of encoding we are on */ x264_threadpool_t *threadpool; + x264_threadpool_t *lookaheadpool; x264_pthread_mutex_t mutex; x264_pthread_cond_t cv; @@ -915,6 +918,7 @@ struct x264_t /* Buffers that are allocated per-thread even in sliced threads. */ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ + void *scratch_buffer2; /* if the first one's already in use */ pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ /* Deblock strength values are stored for each 4x4 partition. In MBAFF * there are four extra values that need to be stored, located in [4][i]. */ diff --git a/common/deblock.c b/common/deblock.c index 7603a69c6..6022ead0f 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -506,9 +506,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP. * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there. * So reset their effective QP to max, to indicate that lack of guarantee. */ - if( h->fdec->mb_info && M32( bs[0][0] ) ) + if( h->fenc->mb_info && M32( bs[0][0] ) ) { -#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT); +#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fenc->mb_info[xy] & X264_MBINFO_CONSTANT); RESET_EFFECTIVE_QP(mb_xy); RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]); } @@ -561,7 +561,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int intra_deblock = intra_cur || intra_top; /* This edge has been modified, reset effective qp to max. */ - if( h->fdec->mb_info && M32( bs[1][0] ) ) + if( h->fenc->mb_info && M32( bs[1][0] ) ) { RESET_EFFECTIVE_QP(mb_xy); RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy); diff --git a/common/frame.c b/common/frame.c index fed227712..b0694f6ec 100644 --- a/common/frame.c +++ b/common/frame.c @@ -357,8 +357,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) dst->i_pic_struct = src->i_pic_struct; dst->extra_sei = src->extra_sei; dst->opaque = src->opaque; - dst->mb_info = src->prop.mb_info; - dst->mb_info_free = src->prop.mb_info_free; + dst->mb_info = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL; + dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL; uint8_t *pix[3]; int stride[3]; diff --git a/common/macroblock.c b/common/macroblock.c index 11c3e75e7..f25df8ee8 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -401,6 +401,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) else h->scratch_buffer = NULL; + int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2; + CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads ); + return 0; fail: return -1; @@ -418,6 +421,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead ) x264_free( h->intra_border_backup[i][j] - 16 ); } x264_free( h->scratch_buffer ); + x264_free( h->scratch_buffer2 ); } void x264_macroblock_slice_init( x264_t *h ) diff --git a/common/threadpool.c b/common/threadpool.c index f7a95fcce..a11bf9d25 100644 --- a/common/threadpool.c +++ b/common/threadpool.c @@ -66,7 +66,7 @@ static void x264_threadpool_thread( x264_threadpool_t *pool ) x264_pthread_mutex_unlock( &pool->run.mutex ); if( !job ) continue; - job->ret = job->func( job->arg ); /* execute the function */ + job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */ x264_sync_frame_list_push( &pool->done, (void*)job ); } } @@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads, pool->init_func = init_func; pool->init_arg = init_arg; - pool->threads = X264_MIN( threads, X264_THREAD_MAX ); + pool->threads = threads; CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 7603068d0..3387657ad 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -395,6 +395,15 @@ static void x264_encoder_thread_init( x264_t *h ) x264_cpu_mask_misalign_sse(); #endif } + +static void x264_lookahead_thread_init( x264_t *h ) +{ +#if HAVE_MMX + /* Misalign mask has to be set separately for each thread. */ + if( h->param.cpu&X264_CPU_SSE_MISALIGN ) + x264_cpu_mask_misalign_sse(); +#endif +} #endif /**************************************************************************** @@ -494,6 +503,9 @@ static int x264_validate_parameters( x264_t *h, int b_open ) if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; + if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) + h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6); + int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { #if !HAVE_THREAD @@ -503,14 +515,15 @@ static int x264_validate_parameters( x264_t *h, int b_open ) /* Avoid absurdly small thread slices as they can reduce performance * and VBV compliance. Capped at an arbitrary 4 rows per thread. */ if( h->param.b_sliced_threads ) - { - int max_threads = (h->param.i_height+15)/16 / 4; - h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); - } + h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); + h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( h->param.i_threads == 1 ) + { h->param.b_sliced_threads = 0; + h->param.i_lookahead_threads = 1; + } h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads; if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; @@ -1271,10 +1284,19 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( h->param.i_threads > 1 && x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) ) goto fail; + if( h->param.i_lookahead_threads > 1 && + x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) ) + goto fail; h->thread[0] = h; for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); + if( h->param.i_lookahead_threads > 1 ) + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + { + CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) ); + *h->lookahead_thread[i] = *h; + } for( int i = 0; i < h->param.i_threads; i++ ) { @@ -3199,8 +3221,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, x264_emms(); - if( h->fdec->mb_info_free ) - h->fdec->mb_info_free( h->fdec->mb_info ); + if( h->fenc->mb_info_free ) + h->fenc->mb_info_free( h->fenc->mb_info ); /* generate buffering period sei and insert it into place */ if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present ) @@ -3459,6 +3481,8 @@ void x264_encoder_close ( x264_t *h ) x264_threadpool_wait_all( h ); if( h->param.i_threads > 1 ) x264_threadpool_delete( h->threadpool ); + if( h->param.i_lookahead_threads > 1 ) + x264_threadpool_delete( h->lookaheadpool ); if( h->i_thread_frames > 1 ) { for( int i = 0; i < h->i_thread_frames; i++ ) @@ -3768,6 +3792,10 @@ void x264_encoder_close ( x264_t *h ) if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate ) x264_frame_delete( h->thread[i]->fref[0][j] ); + if( h->param.i_lookahead_threads > 1 ) + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + x264_free( h->lookahead_thread[i] ); + for( int i = h->param.i_threads - 1; i >= 0; i-- ) { x264_frame_t **frame; diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 1aa489135..352de0418 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -424,9 +424,21 @@ static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *r } } +/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines + * in multithreaded lookahead. */ +#define PAD_SIZE 32 +/* cost_est, cost_est_aq, intra_mbs, num rows */ +#define NUM_INTS 4 +#define COST_EST 0 +#define COST_EST_AQ 1 +#define INTRA_MBS 2 +#define NUM_ROWS 3 +#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start)) + static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b, - int dist_scale_factor, int do_search[2], const x264_weight_t *w ) + int dist_scale_factor, int do_search[2], const x264_weight_t *w, + int *output_inter, int *output_intra ) { x264_frame_t *fref0 = frames[p0]; x264_frame_t *fref1 = frames[p1]; @@ -571,7 +583,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; } if( i_mb_x < h->mb.i_mb_width - 1 ) MVC( fenc_mv[1] ); - if( i_mb_y < h->mb.i_mb_height - 1 ) + if( i_mb_y < h->i_threadslice_end - 1 ) { MVC( fenc_mv[i_mb_stride] ); if( i_mb_x > 0 ) @@ -653,11 +665,11 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, int i_icost_aq = i_icost; if( h->param.rc.i_aq_mode ) i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8; - fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq; + output_intra[ROW_SATD] += i_icost_aq; if( b_frame_score_mb ) { - fenc->i_cost_est[0][0] += i_icost; - fenc->i_cost_est_aq[0][0] += i_icost_aq; + output_intra[COST_EST] += i_icost; + output_intra[COST_EST_AQ] += i_icost_aq; } } i_bcost += lowres_penalty; @@ -674,7 +686,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, list_used = 0; } if( b_frame_score_mb ) - fenc->i_intra_mbs[b-p0] += b_intra; + output_inter[INTRA_MBS] += b_intra; } /* In an I-frame, we've already added the results above in the intra section. */ @@ -683,12 +695,12 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, int i_bcost_aq = i_bcost; if( h->param.rc.i_aq_mode ) i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8; - fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq; + output_inter[ROW_SATD] += i_bcost_aq; if( b_frame_score_mb ) { /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */ - fenc->i_cost_est[b-p0][p1-b] += i_bcost; - fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq; + output_inter[COST_EST] += i_bcost; + output_inter[COST_EST_AQ] += i_bcost_aq; } } @@ -701,6 +713,43 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\ h->mb.i_mb_width * h->mb.i_mb_height) +typedef struct +{ + x264_t *h; + x264_mb_analysis_t *a; + x264_frame_t **frames; + int p0; + int p1; + int b; + int dist_scale_factor; + int *do_search; + const x264_weight_t *w; + int *output_inter; + int *output_intra; +} x264_slicetype_slice_t; + +static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s ) +{ + x264_t *h = s->h; + + /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. + * This considerably improves MV prediction overall. */ + + /* The edge mbs seem to reduce the predictive quality of the + * whole frame's score, but are needed for a spatial distribution. */ + int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2; + + int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges ); + int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges ); + int start_x = h->mb.i_mb_width - 2 + do_edges; + int end_x = 1 - do_edges; + + for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- ) + for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- ) + x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor, + s->do_search, s->w, s->output_inter, s->output_intra ); +} + static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b, int b_intra_penalty ) @@ -708,77 +757,131 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, int i_score = 0; int do_search[2]; const x264_weight_t *w = x264_weight_none; + x264_frame_t *fenc = frames[b]; + /* Check whether we already evaluated this frame * If we have tried this frame as P, then we have also tried * the preceding frames as B. (is this still true?) */ /* Also check that we already calculated the row SATDs for the current frame. */ - if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) ) - i_score = frames[b]->i_cost_est[b-p0][p1-b]; + if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) ) + i_score = fenc->i_cost_est[b-p0][p1-b]; else { int dist_scale_factor = 128; - int *row_satd = frames[b]->i_row_satds[b-p0][p1-b]; - int *row_satd_intra = frames[b]->i_row_satds[0][0]; /* For each list, check to see whether we have lowres motion-searched this reference frame before. */ - do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; - do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; + do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; + do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; if( do_search[0] ) { if( h->param.analyse.i_weighted_pred && b == p1 ) { x264_emms(); - x264_weights_analyse( h, frames[b], frames[p0], 1 ); - w = frames[b]->weight[0]; + x264_weights_analyse( h, fenc, frames[p0], 1 ); + w = fenc->weight[0]; } - frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; + fenc->lowres_mvs[0][b-p0-1][0][0] = 0; } - if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; + if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0; - if( b == p1 ) - frames[b]->i_intra_mbs[b-p0] = 0; - if( !frames[b]->b_intra_calculated ) - { - frames[b]->i_cost_est[0][0] = 0; - frames[b]->i_cost_est_aq[0][0] = 0; - } if( p1 != p0 ) dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); - frames[b]->i_cost_est[b-p0][p1-b] = 0; - frames[b]->i_cost_est_aq[b-p0][p1-b] = 0; - - /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. - * This considerably improves MV prediction overall. */ + int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads; + int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1]; + int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1]; + output_inter[0] = h->scratch_buffer2; + output_intra[0] = output_inter[0] + output_buf_size; - /* The edge mbs seem to reduce the predictive quality of the - * whole frame's score, but are needed for a spatial distribution. */ - if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || - h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 ) + if( h->param.i_lookahead_threads > 1 ) { - for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- ) + x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; + + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { - row_satd[h->mb.i_mb_y] = 0; - if( !frames[b]->b_intra_calculated ) - row_satd_intra[h->mb.i_mb_y] = 0; - for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- ) - x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w ); + x264_t *t = h->lookahead_thread[i]; + + /* FIXME move this somewhere else */ + t->mb.i_me_method = h->mb.i_me_method; + t->mb.i_subpel_refine = h->mb.i_subpel_refine; + t->mb.b_chroma_me = h->mb.b_chroma_me; + + s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[i], output_intra[i] }; + + t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); + + int thread_height = t->i_threadslice_end - t->i_threadslice_start; + int thread_output_size = thread_height + NUM_INTS; + memset( output_inter[i], 0, thread_output_size * sizeof(int) ); + memset( output_intra[i], 0, thread_output_size * sizeof(int) ); + output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; + + output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; + output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; + + x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] ); } + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + x264_threadpool_wait( h->lookaheadpool, &s[i] ); } else { - for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- ) - for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- ) - x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w ); + h->i_threadslice_start = 0; + h->i_threadslice_end = h->mb.i_mb_height; + memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); + memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); + output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; + x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, + output_inter[0], output_intra[0] }; + x264_slicetype_slice_cost( &s ); + } + + /* Sum up accumulators */ + if( b == p1 ) + fenc->i_intra_mbs[b-p0] = 0; + if( !fenc->b_intra_calculated ) + { + fenc->i_cost_est[0][0] = 0; + fenc->i_cost_est_aq[0][0] = 0; + } + fenc->i_cost_est[b-p0][p1-b] = 0; + fenc->i_cost_est_aq[b-p0][p1-b] = 0; + + int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; + int *row_satd_intra = fenc->i_row_satds[0][0]; + for( int i = 0; i < h->param.i_lookahead_threads; i++ ) + { + if( b == p1 ) + fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; + if( !fenc->b_intra_calculated ) + { + fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; + fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; + } + + fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; + fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; + + if( h->param.rc.i_vbv_buffer_size ) + { + int row_count = output_inter[i][NUM_ROWS]; + memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); + if( !fenc->b_intra_calculated ) + memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); + row_satd_inter += row_count; + row_satd_intra += row_count; + } } - i_score = frames[b]->i_cost_est[b-p0][p1-b]; + i_score = fenc->i_cost_est[b-p0][p1-b]; if( b != p1 ) i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); else - frames[b]->b_intra_calculated = 1; + fenc->b_intra_calculated = 1; - frames[b]->i_cost_est[b-p0][p1-b] = i_score; + fenc->i_cost_est[b-p0][p1-b] = i_score; x264_emms(); } @@ -786,7 +889,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, { // arbitrary penalty for I-blocks after B-frames int nmb = NUM_MBS; - i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8); + i_score += (uint64_t)i_score * fenc->i_intra_mbs[b-p0] / (nmb * 8); } return i_score; } diff --git a/filters/video/depth.c b/filters/video/depth.c index e01aaae4f..9465f177d 100644 --- a/filters/video/depth.c +++ b/filters/video/depth.c @@ -50,13 +50,19 @@ static int depth_filter_csp_is_supported( int csp ) csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || - csp_mask == X264_CSP_NV16; + csp_mask == X264_CSP_NV16 || + csp_mask == X264_CSP_BGR || + csp_mask == X264_CSP_RGB || + csp_mask == X264_CSP_BGRA; } static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1; + return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : + csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : + csp_mask == X264_CSP_BGRA ? 4 : + 1; } /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been @@ -86,6 +92,8 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int DITHER_PLANE( 1 ) DITHER_PLANE( 2 ) +DITHER_PLANE( 3 ) +DITHER_PLANE( 4 ) static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf ) { @@ -100,15 +108,28 @@ static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/sizeof(pixel), \ ((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf ) - if( num_interleaved == 1 ) + if( num_interleaved == 4 ) { - CALL_DITHER_PLANE( 1, 0 ); + CALL_DITHER_PLANE( 4, 0 ); + CALL_DITHER_PLANE( 4, 1 ); + CALL_DITHER_PLANE( 4, 2 ); + CALL_DITHER_PLANE( 4, 3 ); //we probably can skip this one } - else + else if( num_interleaved == 3 ) + { + CALL_DITHER_PLANE( 3, 0 ); + CALL_DITHER_PLANE( 3, 1 ); + CALL_DITHER_PLANE( 3, 2 ); + } + else if( num_interleaved == 2 ) { CALL_DITHER_PLANE( 2, 0 ); CALL_DITHER_PLANE( 2, 1 ); } + else //if( num_interleaved == 1 ) + { + CALL_DITHER_PLANE( 1, 0 ); + } } } diff --git a/x264.c b/x264.c index b67c0c50c..dcf9a80fa 100644 --- a/x264.c +++ b/x264.c @@ -849,6 +849,7 @@ static void help( x264_param_t *defaults, int longhelp ) H1( " --psnr Enable PSNR computation\n" ); H1( " --ssim Enable SSIM computation\n" ); H1( " --threads Force a specific number of threads\n" ); + H2( " --lookahead-threads Force a specific number of lookahead threads\n" ); H2( " --sliced-threads Low-latency but lower-efficiency threading\n" ); H2( " --thread-input Run Avisynth in its own thread\n" ); H2( " --sync-lookahead Number of buffer frames for threaded lookahead\n" ); @@ -1043,6 +1044,7 @@ static struct option long_options[] = { "zones", required_argument, NULL, 0 }, { "qpfile", required_argument, NULL, OPT_QPFILE }, { "threads", required_argument, NULL, 0 }, + { "lookahead-threads", required_argument, NULL, 0 }, { "sliced-threads", no_argument, NULL, 0 }, { "no-sliced-threads", no_argument, NULL, 0 }, { "slice-max-size", required_argument, NULL, 0 }, diff --git a/x264.h b/x264.h index 06e91c3f5..8a152e6d1 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 124 +#define X264_BUILD 125 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -254,7 +254,8 @@ typedef struct x264_param_t { /* CPU flags */ unsigned int cpu; - int i_threads; /* encode multiple frames in parallel */ + int i_threads; /* encode multiple frames in parallel */ + int i_lookahead_threads; /* multiple threads for lookahead analysis */ int b_sliced_threads; /* Whether to use slice-based threading. */ int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */ int b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */