From 1c97f3570fba02f768fbf649b9f7d48beb720048 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Sat, 12 May 2012 13:57:49 +0400
Subject: [PATCH 1/3] Fix some bugs in mb_info code

---
 common/deblock.c  | 6 +++---
 common/frame.c    | 4 ++--
 encoder/encoder.c | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index 7603a69c6..6022ead0f 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -506,9 +506,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                 /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP.
                  * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there.
                  * So reset their effective QP to max, to indicate that lack of guarantee. */
-                if( h->fdec->mb_info && M32( bs[0][0] ) )
+                if( h->fenc->mb_info && M32( bs[0][0] ) )
                 {
-#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT);
+#define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fenc->mb_info[xy] & X264_MBINFO_CONSTANT);
                     RESET_EFFECTIVE_QP(mb_xy);
                     RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]);
                 }
@@ -561,7 +561,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                 int intra_deblock = intra_cur || intra_top;
 
                 /* This edge has been modified, reset effective qp to max. */
-                if( h->fdec->mb_info && M32( bs[1][0] ) )
+                if( h->fenc->mb_info && M32( bs[1][0] ) )
                 {
                     RESET_EFFECTIVE_QP(mb_xy);
                     RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy);
diff --git a/common/frame.c b/common/frame.c
index fed227712..b0694f6ec 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -357,8 +357,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
     dst->i_pic_struct = src->i_pic_struct;
     dst->extra_sei  = src->extra_sei;
     dst->opaque     = src->opaque;
-    dst->mb_info    = src->prop.mb_info;
-    dst->mb_info_free = src->prop.mb_info_free;
+    dst->mb_info    = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL;
+    dst->mb_info_free = h->param.analyse.b_mb_info ? src->prop.mb_info_free : NULL;
 
     uint8_t *pix[3];
     int stride[3];
diff --git a/encoder/encoder.c b/encoder/encoder.c
index b42d5dc82..4f233ff76 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -3199,8 +3199,8 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     x264_emms();
 
-    if( h->fdec->mb_info_free )
-        h->fdec->mb_info_free( h->fdec->mb_info );
+    if( h->fenc->mb_info_free )
+        h->fenc->mb_info_free( h->fenc->mb_info );
 
     /* generate buffering period sei and insert it into place */
     if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present )

From ecfbf9d8025e39783bc4262dc1972ca742d8a993 Mon Sep 17 00:00:00 2001
From: Anton Mitrofanov <BugMaster@narod.ru>
Date: Fri, 4 May 2012 17:18:12 +0400
Subject: [PATCH 2/3] Add support for RGB formats in bit-depth conversion
 filter

---
 filters/video/depth.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/filters/video/depth.c b/filters/video/depth.c
index e01aaae4f..9465f177d 100644
--- a/filters/video/depth.c
+++ b/filters/video/depth.c
@@ -50,13 +50,19 @@ static int depth_filter_csp_is_supported( int csp )
            csp_mask == X264_CSP_YV16 ||
            csp_mask == X264_CSP_YV24 ||
            csp_mask == X264_CSP_NV12 ||
-           csp_mask == X264_CSP_NV16;
+           csp_mask == X264_CSP_NV16 ||
+           csp_mask == X264_CSP_BGR ||
+           csp_mask == X264_CSP_RGB ||
+           csp_mask == X264_CSP_BGRA;
 }
 
 static int csp_num_interleaved( int csp, int plane )
 {
     int csp_mask = csp & X264_CSP_MASK;
-    return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1;
+    return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 :
+           csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 :
+           csp_mask == X264_CSP_BGRA ? 4 :
+           1;
 }
 
 /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been
@@ -86,6 +92,8 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int
 
 DITHER_PLANE( 1 )
 DITHER_PLANE( 2 )
+DITHER_PLANE( 3 )
+DITHER_PLANE( 4 )
 
 static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf )
 {
@@ -100,15 +108,28 @@ static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf
         dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/sizeof(pixel), \
                 ((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf )
 
-        if( num_interleaved == 1 )
+        if( num_interleaved == 4 )
         {
-            CALL_DITHER_PLANE( 1, 0 );
+            CALL_DITHER_PLANE( 4, 0 );
+            CALL_DITHER_PLANE( 4, 1 );
+            CALL_DITHER_PLANE( 4, 2 );
+            CALL_DITHER_PLANE( 4, 3 ); //we probably can skip this one
         }
-        else
+        else if( num_interleaved == 3 )
+        {
+            CALL_DITHER_PLANE( 3, 0 );
+            CALL_DITHER_PLANE( 3, 1 );
+            CALL_DITHER_PLANE( 3, 2 );
+        }
+        else if( num_interleaved == 2 )
         {
             CALL_DITHER_PLANE( 2, 0 );
             CALL_DITHER_PLANE( 2, 1 );
         }
+        else //if( num_interleaved == 1 )
+        {
+            CALL_DITHER_PLANE( 1, 0 );
+        }
     }
 }
 

From 999b753ff0f4dc872077f4fa90d465e948cbe656 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Tue, 8 May 2012 15:42:56 -0700
Subject: [PATCH 3/3] Threaded lookahead

Split each lookahead frame analysis call into multiple threads.  Has a small
impact on quality, but does not seem to be consistently any worse.

This helps alleviate bottlenecks with many cores and frame threads. In many
cases, this massively increases performance on many-core systems.  For example,
over 100% faster 1080p encoding with --preset veryfast on a 12-core i7 system.
Realtime 1080p30 at --preset slow should now be feasible on real systems.

For sliced-threads, this patch should be faster regardless of settings (~10%).

By default, the number of lookahead threads is 1/6 of the number of regular
threads.  This ratio isn't exact, but it seems to work well for all presets on
real systems.  With sliced-threads, it's the same as the number of encoding
threads.
---
 common/common.c     |   9 ++
 common/common.h     |   4 +
 common/macroblock.c |   4 +
 common/threadpool.c |   4 +-
 encoder/encoder.c   |  36 +++++++-
 encoder/slicetype.c | 199 +++++++++++++++++++++++++++++++++-----------
 x264.c              |   2 +
 x264.h              |   5 +-
 8 files changed, 207 insertions(+), 56 deletions(-)

diff --git a/common/common.c b/common/common.c
index d03201d8f..3f40e66f1 100644
--- a/common/common.c
+++ b/common/common.c
@@ -50,6 +50,7 @@ void x264_param_default( x264_param_t *param )
     /* CPU autodetect */
     param->cpu = x264_cpu_detect();
     param->i_threads = X264_THREADS_AUTO;
+    param->i_lookahead_threads = X264_THREADS_AUTO;
     param->b_deterministic = 1;
     param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
 
@@ -632,6 +633,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         else
             p->i_threads = atoi(value);
     }
+    OPT("lookahead-threads")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_lookahead_threads = X264_THREADS_AUTO;
+        else
+            p->i_lookahead_threads = atoi(value);
+    }
     OPT("sliced-threads")
         p->b_sliced_threads = atobool(value);
     OPT("sync-lookahead")
@@ -1285,6 +1293,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
     s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
     s += sprintf( s, " threads=%d", p->i_threads );
+    s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads );
     s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
     if( p->i_slice_count )
         s += sprintf( s, " slices=%d", p->i_slice_count );
diff --git a/common/common.h b/common/common.h
index 5e3421291..04ac11dae 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,6 +56,7 @@ do {\
 #define X264_BFRAME_MAX 16
 #define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
+#define X264_LOOKAHEAD_THREAD_MAX 16
 #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
 #define X264_LOOKAHEAD_MAX 250
 #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
@@ -469,6 +470,7 @@ struct x264_t
     x264_param_t    param;
 
     x264_t          *thread[X264_THREAD_MAX+1];
+    x264_t          *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
     int             i_thread_idx;   /* which thread this is */
@@ -476,6 +478,7 @@ struct x264_t
     int             i_threadslice_end; /* row after the end of this thread slice */
     int             i_threadslice_pass; /* which pass of encoding we are on */
     x264_threadpool_t *threadpool;
+    x264_threadpool_t *lookaheadpool;
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t cv;
 
@@ -915,6 +918,7 @@ struct x264_t
 
     /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+    void *scratch_buffer2; /* if the first one's already in use */
     pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
     /* Deblock strength values are stored for each 4x4 partition. In MBAFF
      * there are four extra values that need to be stored, located in [4][i]. */
diff --git a/common/macroblock.c b/common/macroblock.c
index 11c3e75e7..f25df8ee8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -401,6 +401,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
     else
         h->scratch_buffer = NULL;
 
+    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
+    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+
     return 0;
 fail:
     return -1;
@@ -418,6 +421,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
                 x264_free( h->intra_border_backup[i][j] - 16 );
     }
     x264_free( h->scratch_buffer );
+    x264_free( h->scratch_buffer2 );
 }
 
 void x264_macroblock_slice_init( x264_t *h )
diff --git a/common/threadpool.c b/common/threadpool.c
index f7a95fcce..a11bf9d25 100644
--- a/common/threadpool.c
+++ b/common/threadpool.c
@@ -66,7 +66,7 @@ static void x264_threadpool_thread( x264_threadpool_t *pool )
         x264_pthread_mutex_unlock( &pool->run.mutex );
         if( !job )
             continue;
-        job->ret = job->func( job->arg ); /* execute the function */
+        job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */
         x264_sync_frame_list_push( &pool->done, (void*)job );
     }
 }
@@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
 
     pool->init_func = init_func;
     pool->init_arg  = init_arg;
-    pool->threads   = X264_MIN( threads, X264_THREAD_MAX );
+    pool->threads   = threads;
 
     CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
 
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 4f233ff76..104df1ad5 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -395,6 +395,15 @@ static void x264_encoder_thread_init( x264_t *h )
         x264_cpu_mask_misalign_sse();
 #endif
 }
+
+static void x264_lookahead_thread_init( x264_t *h )
+{
+#if HAVE_MMX
+    /* Misalign mask has to be set separately for each thread. */
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+}
 #endif
 
 /****************************************************************************
@@ -494,6 +503,9 @@ static int x264_validate_parameters( x264_t *h, int b_open )
 
     if( h->param.i_threads == X264_THREADS_AUTO )
         h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
+    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+        h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6);
+    int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
     if( h->param.i_threads > 1 )
     {
 #if !HAVE_THREAD
@@ -503,14 +515,15 @@ static int x264_validate_parameters( x264_t *h, int b_open )
         /* Avoid absurdly small thread slices as they can reduce performance
          * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
         if( h->param.b_sliced_threads )
-        {
-            int max_threads = (h->param.i_height+15)/16 / 4;
-            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
-        }
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
     }
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
     if( h->param.i_threads == 1 )
+    {
         h->param.b_sliced_threads = 0;
+        h->param.i_lookahead_threads = 1;
+    }
     h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
     if( h->i_thread_frames > 1 )
         h->param.nalu_process = NULL;
@@ -1271,10 +1284,19 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( h->param.i_threads > 1 &&
         x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
         goto fail;
+    if( h->param.i_lookahead_threads > 1 &&
+        x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+        goto fail;
 
     h->thread[0] = h;
     for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) );
+            *h->lookahead_thread[i] = *h;
+        }
 
     for( int i = 0; i < h->param.i_threads; i++ )
     {
@@ -3457,6 +3479,8 @@ void    x264_encoder_close  ( x264_t *h )
         x264_threadpool_wait_all( h );
     if( h->param.i_threads > 1 )
         x264_threadpool_delete( h->threadpool );
+    if( h->param.i_lookahead_threads > 1 )
+        x264_threadpool_delete( h->lookaheadpool );
     if( h->i_thread_frames > 1 )
     {
         for( int i = 0; i < h->i_thread_frames; i++ )
@@ -3766,6 +3790,10 @@ void    x264_encoder_close  ( x264_t *h )
                 if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
                     x264_frame_delete( h->thread[i]->fref[0][j] );
 
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+            x264_free( h->lookahead_thread[i] );
+
     for( int i = h->param.i_threads - 1; i >= 0; i-- )
     {
         x264_frame_t **frame;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 1aa489135..352de0418 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -424,9 +424,21 @@ static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *r
     }
 }
 
+/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
+ * in multithreaded lookahead. */
+#define PAD_SIZE 32
+/* cost_est, cost_est_aq, intra_mbs, num rows */
+#define NUM_INTS 4
+#define COST_EST 0
+#define COST_EST_AQ 1
+#define INTRA_MBS 2
+#define NUM_ROWS 3
+#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start))
+
 static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                     x264_frame_t **frames, int p0, int p1, int b,
-                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w )
+                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w,
+                                    int *output_inter, int *output_intra )
 {
     x264_frame_t *fref0 = frames[p0];
     x264_frame_t *fref1 = frames[p1];
@@ -571,7 +583,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
 #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
             if( i_mb_x < h->mb.i_mb_width - 1 )
                 MVC( fenc_mv[1] );
-            if( i_mb_y < h->mb.i_mb_height - 1 )
+            if( i_mb_y < h->i_threadslice_end - 1 )
             {
                 MVC( fenc_mv[i_mb_stride] );
                 if( i_mb_x > 0 )
@@ -653,11 +665,11 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         int i_icost_aq = i_icost;
         if( h->param.rc.i_aq_mode )
             i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
+        output_intra[ROW_SATD] += i_icost_aq;
         if( b_frame_score_mb )
         {
-            fenc->i_cost_est[0][0] += i_icost;
-            fenc->i_cost_est_aq[0][0] += i_icost_aq;
+            output_intra[COST_EST] += i_icost;
+            output_intra[COST_EST_AQ] += i_icost_aq;
         }
     }
     i_bcost += lowres_penalty;
@@ -674,7 +686,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             list_used = 0;
         }
         if( b_frame_score_mb )
-            fenc->i_intra_mbs[b-p0] += b_intra;
+            output_inter[INTRA_MBS] += b_intra;
     }
 
     /* In an I-frame, we've already added the results above in the intra section. */
@@ -683,12 +695,12 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         int i_bcost_aq = i_bcost;
         if( h->param.rc.i_aq_mode )
             i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
+        output_inter[ROW_SATD] += i_bcost_aq;
         if( b_frame_score_mb )
         {
             /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
-            fenc->i_cost_est[b-p0][p1-b] += i_bcost;
-            fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
+            output_inter[COST_EST] += i_bcost;
+            output_inter[COST_EST_AQ] += i_bcost_aq;
         }
     }
 
@@ -701,6 +713,43 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
    (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
     h->mb.i_mb_width * h->mb.i_mb_height)
 
+typedef struct
+{
+    x264_t *h;
+    x264_mb_analysis_t *a;
+    x264_frame_t **frames;
+    int p0;
+    int p1;
+    int b;
+    int dist_scale_factor;
+    int *do_search;
+    const x264_weight_t *w;
+    int *output_inter;
+    int *output_intra;
+} x264_slicetype_slice_t;
+
+static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s )
+{
+    x264_t *h = s->h;
+
+    /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
+     * This considerably improves MV prediction overall. */
+
+    /* The edge mbs seem to reduce the predictive quality of the
+     * whole frame's score, but are needed for a spatial distribution. */
+    int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
+
+    int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges );
+    int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges );
+    int start_x = h->mb.i_mb_width - 2 + do_edges;
+    int end_x = 1 - do_edges;
+
+    for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- )
+        for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- )
+            x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
+                                    s->do_search, s->w, s->output_inter, s->output_intra );
+}
+
 static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                       x264_frame_t **frames, int p0, int p1, int b,
                                       int b_intra_penalty )
@@ -708,77 +757,131 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     int i_score = 0;
     int do_search[2];
     const x264_weight_t *w = x264_weight_none;
+    x264_frame_t *fenc = frames[b];
+
     /* Check whether we already evaluated this frame
      * If we have tried this frame as P, then we have also tried
      * the preceding frames as B. (is this still true?) */
     /* Also check that we already calculated the row SATDs for the current frame. */
-    if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+    if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
+        i_score = fenc->i_cost_est[b-p0][p1-b];
     else
     {
         int dist_scale_factor = 128;
-        int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
-        int *row_satd_intra = frames[b]->i_row_satds[0][0];
 
         /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
-        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
-        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+        do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+        do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
         if( do_search[0] )
         {
             if( h->param.analyse.i_weighted_pred && b == p1 )
             {
                 x264_emms();
-                x264_weights_analyse( h, frames[b], frames[p0], 1 );
-                w = frames[b]->weight[0];
+                x264_weights_analyse( h, fenc, frames[p0], 1 );
+                w = fenc->weight[0];
             }
-            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+            fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
         }
-        if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+        if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
 
-        if( b == p1 )
-            frames[b]->i_intra_mbs[b-p0] = 0;
-        if( !frames[b]->b_intra_calculated )
-        {
-            frames[b]->i_cost_est[0][0] = 0;
-            frames[b]->i_cost_est_aq[0][0] = 0;
-        }
         if( p1 != p0 )
             dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
 
-        frames[b]->i_cost_est[b-p0][p1-b] = 0;
-        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
-
-        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
-         * This considerably improves MV prediction overall. */
+        int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
+        int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
+        int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
+        output_inter[0] = h->scratch_buffer2;
+        output_intra[0] = output_inter[0] + output_buf_size;
 
-        /* The edge mbs seem to reduce the predictive quality of the
-         * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
-            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
+        if( h->param.i_lookahead_threads > 1 )
         {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+            x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
             {
-                row_satd[h->mb.i_mb_y] = 0;
-                if( !frames[b]->b_intra_calculated )
-                    row_satd_intra[h->mb.i_mb_y] = 0;
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+                x264_t *t = h->lookahead_thread[i];
+
+                /* FIXME move this somewhere else */
+                t->mb.i_me_method = h->mb.i_me_method;
+                t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+                t->mb.b_chroma_me = h->mb.b_chroma_me;
+
+                s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                 output_inter[i], output_intra[i] };
+
+                t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+                t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+
+                int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+                int thread_output_size = thread_height + NUM_INTS;
+                memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+                memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+                output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+
+                output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+                output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+
+                x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
             }
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+                x264_threadpool_wait( h->lookaheadpool, &s[i] );
         }
         else
         {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+            h->i_threadslice_start = 0;
+            h->i_threadslice_end = h->mb.i_mb_height;
+            memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+            x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                                 output_inter[0], output_intra[0] };
+            x264_slicetype_slice_cost( &s );
+        }
+
+        /* Sum up accumulators */
+        if( b == p1 )
+            fenc->i_intra_mbs[b-p0] = 0;
+        if( !fenc->b_intra_calculated )
+        {
+            fenc->i_cost_est[0][0] = 0;
+            fenc->i_cost_est_aq[0][0] = 0;
+        }
+        fenc->i_cost_est[b-p0][p1-b] = 0;
+        fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+
+        int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+        int *row_satd_intra = fenc->i_row_satds[0][0];
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            if( b == p1 )
+                fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+            if( !fenc->b_intra_calculated )
+            {
+                fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+                fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+            }
+
+            fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+            fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+            if( h->param.rc.i_vbv_buffer_size )
+            {
+                int row_count = output_inter[i][NUM_ROWS];
+                memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+                if( !fenc->b_intra_calculated )
+                    memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+                row_satd_inter += row_count;
+                row_satd_intra += row_count;
+            }
         }
 
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+        i_score = fenc->i_cost_est[b-p0][p1-b];
         if( b != p1 )
             i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
         else
-            frames[b]->b_intra_calculated = 1;
+            fenc->b_intra_calculated = 1;
 
-        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+        fenc->i_cost_est[b-p0][p1-b] = i_score;
         x264_emms();
     }
 
@@ -786,7 +889,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     {
         // arbitrary penalty for I-blocks after B-frames
         int nmb = NUM_MBS;
-        i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
+        i_score += (uint64_t)i_score * fenc->i_intra_mbs[b-p0] / (nmb * 8);
     }
     return i_score;
 }
diff --git a/x264.c b/x264.c
index 94c31fca7..55198100c 100644
--- a/x264.c
+++ b/x264.c
@@ -797,6 +797,7 @@ static void help( x264_param_t *defaults, int longhelp )
     H1( "      --psnr                  Enable PSNR computation\n" );
     H1( "      --ssim                  Enable SSIM computation\n" );
     H1( "      --threads <integer>     Force a specific number of threads\n" );
+    H2( "      --lookahead-threads <integer> Force a specific number of lookahead threads\n" );
     H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
     H2( "      --thread-input          Run Avisynth in its own thread\n" );
     H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
@@ -965,6 +966,7 @@ static struct option long_options[] =
     { "zones",       required_argument, NULL, 0 },
     { "qpfile",      required_argument, NULL, OPT_QPFILE },
     { "threads",     required_argument, NULL, 0 },
+    { "lookahead-threads", required_argument, NULL, 0 },
     { "sliced-threads",    no_argument, NULL, 0 },
     { "no-sliced-threads", no_argument, NULL, 0 },
     { "slice-max-size",    required_argument, NULL, 0 },
diff --git a/x264.h b/x264.h
index eb2b3b719..b6c258ecc 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 124
+#define X264_BUILD 125
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -254,7 +254,8 @@ typedef struct x264_param_t
 {
     /* CPU flags */
     unsigned int cpu;
-    int         i_threads;       /* encode multiple frames in parallel */
+    int         i_threads;           /* encode multiple frames in parallel */
+    int         i_lookahead_threads; /* multiple threads for lookahead analysis */
     int         b_sliced_threads;  /* Whether to use slice-based threading. */
     int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
     int         b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */