From e2ac7886626a494e1c9bbfb083b7a21971fbb283 Mon Sep 17 00:00:00 2001
From: Keith Winstein <keithw@mit.edu>
Date: Wed, 24 Feb 2010 21:14:23 -0500
Subject: [PATCH] Initial import of Ahab 0.1

---
 Makefile                  |   27 +
 ahab.cpp                  |  109 +++
 ahab_fragment_program.hpp |   52 ++
 attributes.h              |   42 +
 bitreader.cpp             |   30 +
 bitreader.hpp             |   27 +
 colorimetry.hpp           |   17 +
 config.h                  |  182 ++++
 controller.cpp            |   93 ++
 controller.hpp            |   35 +
 decoder.cpp               |   70 ++
 decoder.hpp               |   42 +
 decoderop.cpp             |   31 +
 decoderop.hpp             |   42 +
 decoderopq.cpp            |    3 +
 decoderopq.hpp            |    9 +
 displayop.cpp             |   34 +
 displayop.hpp             |   80 ++
 displayopq.cpp            |    3 +
 displayopq.hpp            |    9 +
 es.cpp                    |  161 ++++
 es.hpp                    |   62 ++
 exceptions.cpp            |   25 +
 exceptions.hpp            |   76 ++
 extensions.cpp            |  133 +++
 file.cpp                  |   65 ++
 file.hpp                  |   40 +
 framebuffer.cpp           |  252 ++++++
 framebuffer.hpp           |  111 +++
 idct_mmx.cpp              | 1338 +++++++++++++++++++++++++++++
 libmpeg2.h                |    5 +
 mmx.h                     |  292 +++++++
 motion_comp_mmx.cpp       | 1005 ++++++++++++++++++++++
 mpeg2.h                   |  206 +++++
 mpeg2_internal.h          |  324 +++++++
 mpegheader.cpp            |   83 ++
 mpegheader.hpp            |  360 ++++++++
 mpegtables.hpp            |   46 +
 mutexobj.hpp              |   21 +
 ogl.cpp                   |  433 ++++++++++
 ogl.hpp                   |   78 ++
 opq.cpp                   |  246 ++++++
 opq.hpp                   |   50 ++
 picture.cpp               |  333 ++++++++
 sequence.cpp              |  183 ++++
 slice.cpp                 |   43 +
 slicedecode.cpp           | 1697 +++++++++++++++++++++++++++++++++++++
 startfinder.cpp           |  118 +++
 vlc.h                     |  464 ++++++++++
 xeventloop.cpp            |   58 ++
 xeventloop.hpp            |   25 +
 51 files changed, 9270 insertions(+)
 create mode 100644 Makefile
 create mode 100644 ahab.cpp
 create mode 100644 ahab_fragment_program.hpp
 create mode 100644 attributes.h
 create mode 100644 bitreader.cpp
 create mode 100644 bitreader.hpp
 create mode 100644 colorimetry.hpp
 create mode 100644 config.h
 create mode 100644 controller.cpp
 create mode 100644 controller.hpp
 create mode 100644 decoder.cpp
 create mode 100644 decoder.hpp
 create mode 100644 decoderop.cpp
 create mode 100644 decoderop.hpp
 create mode 100644 decoderopq.cpp
 create mode 100644 decoderopq.hpp
 create mode 100644 displayop.cpp
 create mode 100644 displayop.hpp
 create mode 100644 displayopq.cpp
 create mode 100644 displayopq.hpp
 create mode 100644 es.cpp
 create mode 100644 es.hpp
 create mode 100644 exceptions.cpp
 create mode 100644 exceptions.hpp
 create mode 100644 extensions.cpp
 create mode 100644 file.cpp
 create mode 100644 file.hpp
 create mode 100644 framebuffer.cpp
 create mode 100644 framebuffer.hpp
 create mode 100644 idct_mmx.cpp
 create mode 100644 libmpeg2.h
 create mode 100644 mmx.h
 create mode 100644 motion_comp_mmx.cpp
 create mode 100644 mpeg2.h
 create mode 100644 mpeg2_internal.h
 create mode 100644 mpegheader.cpp
 create mode 100644 mpegheader.hpp
 create mode 100644 mpegtables.hpp
 create mode 100644 mutexobj.hpp
 create mode 100644 ogl.cpp
 create mode 100644 ogl.hpp
 create mode 100644 opq.cpp
 create mode 100644 opq.hpp
 create mode 100644 picture.cpp
 create mode 100644 sequence.cpp
 create mode 100644 slice.cpp
 create mode 100644 slicedecode.cpp
 create mode 100644 startfinder.cpp
 create mode 100644 vlc.h
 create mode 100644 xeventloop.cpp
 create mode 100644 xeventloop.hpp

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1fb6d56
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,27 @@
+source = ahab.cpp es.cpp mpegheader.cpp bitreader.cpp startfinder.cpp sequence.cpp extensions.cpp exceptions.cpp picture.cpp slice.cpp file.cpp ogl.cpp slicedecode.cpp idct_mmx.cpp motion_comp_mmx.cpp framebuffer.cpp opq.cpp displayop.cpp controller.cpp displayopq.cpp decoder.cpp decoderop.cpp decoderopq.cpp xeventloop.cpp
+objects = es.o mpegheader.o bitreader.o startfinder.o sequence.o extensions.o exceptions.o picture.o slice.o file.o ogl.o slicedecode.o idct_mmx.o motion_comp_mmx.o framebuffer.o opq.o displayop.o controller.o displayopq.o decoder.o decoderop.o decoderopq.o xeventloop.o
+executables = ahab
+
+CPP = g++
+CPPFLAGS = -g -O3 -Wall -fno-implicit-templates -pipe -pthread -D_FILE_OFFSET_BITS=64 -D_XOPEN_SOURCE=500 -DGL_GLEXT_PROTOTYPES -DGLX_GLXEXT_PROTOTYPES `pkg-config gtkmm-2.4 --cflags`
+LIBS = -lX11 -lGL -lGLU `pkg-config gtkmm-2.4 --libs`
+
+all: $(executables)
+
+controller.o: controller.cpp
+	$(CPP) $(CPPFLAGS) -frepo -c -o $@ $<
+
+ahab: ahab.o $(objects)
+	$(CPP) $(CPPFLAGS) -o $@ $+ $(LIBS)
+
+%.o: %.cpp
+	$(CPP) $(CPPFLAGS) -c -o $@ $<
+
+include depend
+# Dependency scan uses the compile flags: INCLUDES was never defined, so the pkg-config -I paths and -D macros were missing.
+depend: $(source)
+	$(CPP) $(CPPFLAGS) -MM $(source) > depend
+
+.PHONY: clean
+clean:
+	-rm -f $(executables) depend *.o *.rpo
diff --git a/ahab.cpp b/ahab.cpp
new file mode 100644
index 0000000..57f3beb
--- /dev/null
+++ b/ahab.cpp
@@ -0,0 +1,109 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libmpeg2.h"
+
+#include "controller.hpp"
+#include "file.hpp"
+#include "es.hpp"
+#include "ogl.hpp"
+#include "framebuffer.hpp"
+#include "colorimetry.hpp"
+#include "decoder.hpp"
+#include "xeventloop.hpp"
+
+#include <sys/time.h>
+#include <math.h>
+
+void progress_bar( off_t size, off_t location );
+/* Entry point: open argv[1], build the ES object over it, then create the display, controller, decoder and X event loop and wait for the decoder thread to finish. */
+int main( int argc, char *argv[] )
+{
+  File *file;
+  ES *stream;
+  Sequence *seq;
+  OpenGLDisplay *display;
+  Controller *controller;
+  Decoder *decoder;
+  XEventLoop *xevents;
+
+  if ( argc != 2 ) {
+    fprintf( stderr, "USAGE: %s FILENAME\n", argv[ 0 ] );
+    exit( 1 );
+  }
+
+  fprintf( stderr, "Opening file..." );
+  file = new File( argv[ 1 ] );
+  fprintf( stderr, " done.\n" );
+  /* progress_bar is the ES progress callback. NOTE(review): this handler catches pointers only, while es.cpp throws SequenceNotFound() and MPEGInvalid() by value -- confirm those are meant to escape. */
+  fprintf( stderr, "Constructing elementary stream object...      " );
+  try {
+    stream = new ES( file, &progress_bar );
+  } catch ( AhabException *e ) {
+    fprintf( stderr, "Caught exception.\n" );
+    if ( UnixError *ue = dynamic_cast<UnixError *>( e ) ) {
+      fprintf( stderr, "UnixError( %d )\n", ue->err );
+    }
+    return 1;
+  }
+
+  fprintf( stderr, "\b\b\b\b\b\b\b done. \n" );
+
+  seq = stream->get_sequence();
+  /* Display geometry: 16 * the macroblock counts, followed by the sequence's horizontal and vertical size. */
+  display = new OpenGLDisplay( (char *)NULL, seq->get_sar(),
+			       16 * seq->get_mb_width(),
+			       16 * seq->get_mb_height(),
+			       seq->get_horizontal_size(),
+			       seq->get_vertical_size() );
+
+  fprintf( stderr, "Pictures: %d, duration: %.3f seconds.\n",
+	   stream->get_num_pictures(), stream->get_duration() );
+
+  controller = new Controller( stream->get_num_pictures() );
+
+  decoder = new Decoder( stream, display->get_queue() );
+
+  xevents = new XEventLoop( display );
+  /* hookup() presumably routes operations from the controller and X event queues into the decoder's queue -- confirm in opq.cpp. */
+  controller->get_queue()->hookup( decoder->get_queue() );
+  xevents->get_queue()->hookup( decoder->get_queue() );
+  /* Block until the decoder thread exits (Decoder::wait_shutdown joins it). */
+  try {
+    decoder->wait_shutdown();
+  } catch ( UnixAssertError *e ) {
+    e->print();
+  }
+  /* Tear everything down; a UnixAssertError raised while deleting is printed rather than propagated. */
+  try {
+    delete xevents;
+    delete decoder;
+    delete controller;
+    delete display;
+    delete stream;
+    delete file;
+  } catch ( UnixAssertError *e ) {
+    e->print();
+  }
+
+  return 0;
+}
+/* Progress callback: redraw "[NN%]" on stderr whenever the first two characters of the formatted percentage change. */
+void progress_bar( off_t size, off_t location )
+{
+  static char shown[ 20 ] = "";          /* first two characters of the last value seen */
+  char current[ 20 ] = "";
+  const char erase[ 6 ] = "\b\b\b\b\b";  /* five backspaces written before each redraw */
+
+  snprintf( current, sizeof( current ), "%2.0f", 100.0 * location / (double)size );
+
+  const bool changed = (shown[ 0 ] != current[ 0 ]) || (shown[ 1 ] != current[ 1 ]);
+  if ( changed ) {
+    fprintf( stderr, "%s[%s%%]", erase, current );
+  }
+
+  shown[ 0 ] = current[ 0 ];
+  shown[ 1 ] = current[ 1 ];
+}
diff --git a/ahab_fragment_program.hpp b/ahab_fragment_program.hpp
new file mode 100644
index 0000000..feaea62
--- /dev/null
+++ b/ahab_fragment_program.hpp
@@ -0,0 +1,52 @@
+#ifndef AHAB_FRAGMENT_PROGRAM_HPP
+#define AHAB_FRAGMENT_PROGRAM_HPP
+
+/*
+  Matlab inversion of BT.709 matrix:
+
+  >> 255 * inv([219*[.7154 .0721 .2125]' 224*[-.386 .5 -.115]' 224*[-.454 -.046 .5]']')
+
+ans =
+
+   1.164165557121523  -0.213138349939461  -0.532748200973066
+   1.166544220758321   2.112430116393991   0.001144179685436
+   1.164384394176109   0.000813948963217   1.793155612333230
+*/
+
+/* Matrix inversion of SMPTE 170M matrix:
+
+   >> 255 * inv([219*[.587 .114 .299]' 224*[-.331 .500 -.169]' 224*[-.419 -.081 .5]']')
+
+ans =
+
+   1.164383561643836  -0.391260370716072  -0.813004933873461
+   1.164383561643836   2.017414758970775   0.001127259960693
+   1.164383561643836  -0.001054999706803   1.595670195813386
+*/
+/* Source text of an ARB fragment program ("!!ARBfp1.0") that turns three sampled planes into an output color. */
+static char ahab_fragment_program[] = {
+  "!!ARBfp1.0\n"
+
+  "ATTRIB where_Y  = fragment.texcoord[0];\n"
+  "ATTRIB where_Cb = fragment.texcoord[1];\n"
+  "ATTRIB where_Cr = fragment.texcoord[2];\n"
+  "OUTPUT out      = result.color;\n"
+
+  "TEMP YPBPR;\n"
+  /* Fetch Y, Cb and Cr from texture units 0, 1 and 2 (RECT targets). */
+  "TEX YPBPR.x, where_Y, texture[0], RECT;\n"
+  "TEX YPBPR.y, where_Cb, texture[1], RECT;\n"
+  "TEX YPBPR.z, where_Cr, texture[2], RECT;\n"
+  /* Offsets removed before the dot products: 16/255 for Y, 128/255 for Cb and Cr. */
+  "PARAM CTOP = { .06274509803921568627, .50196078431372549019, .50196078431372549019 };\n"
+
+  "SUB YPBPR, YPBPR, CTOP;\n"
+  /* One dot product per channel; the rows are read from program.local[0..2] in green, blue, red order. */
+  "DP3   out.g, YPBPR, program.local[ 0 ];\n"
+  "DP3   out.b, YPBPR, program.local[ 1 ];\n"
+  "DP3   out.r, YPBPR, program.local[ 2 ];\n"
+
+  "END\n"
+};
+
+#endif
diff --git a/attributes.h b/attributes.h
new file mode 100644
index 0000000..e005eef
--- /dev/null
+++ b/attributes.h
@@ -0,0 +1,42 @@
+/*
+ * attributes.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBMPEG2_ATTRIBUTES_H
+#define LIBMPEG2_ATTRIBUTES_H
+
+/* use gcc attribs to align critical data structures */
+#ifdef ATTRIBUTE_ALIGNED_MAX
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align)))
+#else
+#define ATTR_ALIGN(align)
+#endif
+
+#ifdef HAVE_BUILTIN_EXPECT
+#define likely(x) __builtin_expect ((x) != 0, 1)
+#define unlikely(x) __builtin_expect ((x) != 0, 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+
+#endif /* LIBMPEG2_ATTRIBUTES_H */
diff --git a/bitreader.cpp b/bitreader.cpp
new file mode 100644
index 0000000..c4be0c8
--- /dev/null
+++ b/bitreader.cpp
@@ -0,0 +1,30 @@
+#include <stdint.h>
+#include <stdio.h>
+
+#include "bitreader.hpp"
+#include "exceptions.hpp"
+/* Return the bit at bit_offset without advancing; bits are numbered MSB-first within each octet. */
+inline bool BitReader::thisbit( void )
+{
+  const uint byte_index = bit_offset >> 3;
+  const uint shift = 7 - (bit_offset & 7);
+  const uint8_t mask = (uint8_t)(1u << shift);
+  return (buf[ byte_index ] & mask) != 0;
+}
+/* Read the next n bits MSB-first and advance; throws NeedBits (consuming nothing) if they run past the buffer. n == 0 yields 0. */
+uint32_t BitReader::readbits( uint n )
+{
+  uint32_t val = 0;
+
+  /* 64-bit compare: no wraparound when n == 0 (the old "n - 1" underflowed at offset 0) or when the sum exceeds 32 bits. */
+  if ( (uint64_t)bit_offset + n > (uint64_t)len * 8 ) {
+    throw NeedBits();
+  }
+  for ( uint i = 0; i < n; i++ ) {
+    val <<= 1;
+    val |= thisbit();
+    bit_offset++;
+  }
+
+  return val;
+}
diff --git a/bitreader.hpp b/bitreader.hpp
new file mode 100644
index 0000000..4bdeac0
--- /dev/null
+++ b/bitreader.hpp
@@ -0,0 +1,27 @@
+#ifndef BITREADER_HPP
+#define BITREADER_HPP
+
+#include <stdint.h>
+#include <stdlib.h>
+/* Bit cursor over a caller-supplied byte buffer; the pointer is stored as-is and this class never frees it. */
+class BitReader {
+private:
+  uint8_t *buf;
+  uint len;
+
+  uint bit_offset;
+
+  bool thisbit( void );
+
+public:
+  BitReader( uint8_t *s_buf, uint s_len )
+    : buf( s_buf ),
+      len( s_len ),
+      bit_offset( 0 )
+  {}
+
+  uint32_t readbits( uint n );
+  void reset( void ) { bit_offset = 0; }
+};
+
+#endif
diff --git a/colorimetry.hpp b/colorimetry.hpp
new file mode 100644
index 0000000..e53cf21
--- /dev/null
+++ b/colorimetry.hpp
@@ -0,0 +1,17 @@
+#ifndef COLORIMETRY_HPP
+#define COLORIMETRY_HPP
+
+#include "displayop.hpp"
+/* Rows of the SMPTE 170M conversion matrix, one per output channel (green, blue, red). */
+static double smpte170m_green[ 3 ] = { 1.164383561643836, -0.391260370716072, -0.813004933873461 };
+static double smpte170m_blue[ 3 ] = { 1.164383561643836,  2.017414758970775,  0.001127259960693 };
+static double smpte170m_red[ 3 ] = { 1.164383561643836, -0.001054999706803,  1.595670195813386 };
+/* Rows of the ITU 709 conversion matrix, in the same channel order. */
+static double itu709_green[ 3 ] = { 1.164165557121523,  -0.213138349939461,  -0.532748200973066 };
+static double itu709_blue[ 3 ] = { 1.166544220758321,   2.112430116393991,   0.001144179685436 };
+static double itu709_red[ 3 ] = { 1.164384394176109,   0.000813948963217,   1.793155612333230 };
+/* Ready-made operations that load each matrix; being static in a header, every including file gets its own copy. */
+static LoadMatrixCoefficients smpte170m( smpte170m_green, smpte170m_blue, smpte170m_red );
+static LoadMatrixCoefficients itu709( itu709_green, itu709_blue, itu709_red );
+
+#endif
diff --git a/config.h b/config.h
new file mode 100644
index 0000000..67ace81
--- /dev/null
+++ b/config.h
@@ -0,0 +1,182 @@
+/* Ahab defines */
+
+#define mpeg2_idct_copy mpeg2_idct_copy_sse2
+#define mpeg2_idct_add mpeg2_idct_add_sse2
+#define mpeg2_mc mpeg2_mc_mmxext
+
+/* include/config.h.  Generated from config.h.in by configure.  */
+/* include/config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* autodetect accelerations */
+/* #undef ACCEL_DETECT  */
+
+/* alpha architecture */
+/* #undef ARCH_ALPHA */
+
+/* ARM architecture */
+/* #undef ARCH_ARM */
+
+/* ppc architecture */
+/* #undef ARCH_PPC */
+
+/* sparc architecture */
+/* #undef ARCH_SPARC */
+
+/* x86 architecture */
+#define ARCH_X86 
+
+/* maximum supported data alignment */
+#define ATTRIBUTE_ALIGNED_MAX 64
+
+/* debug mode configuration */
+/* #undef DEBUG */
+
+/* Define to 1 if you have the <altivec.h> header. */
+/* #undef HAVE_ALTIVEC_H */
+
+/* Define if you have the `__builtin_expect' function. */
+#define HAVE_BUILTIN_EXPECT 
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `ftime' function. */
+#define HAVE_FTIME 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <io.h> header file. */
+/* #undef HAVE_IO_H */
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if the system has the type `struct timeval'. */
+#define HAVE_STRUCT_TIMEVAL 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/timeb.h> header file. */
+#define HAVE_SYS_TIMEB_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <time.h> header file. */
+#define HAVE_TIME_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* libvo DirectX support */
+/* #undef LIBVO_DX */
+
+/* libvo SDL support */
+#define LIBVO_SDL 
+
+/* libvo X11 support */
+#define LIBVO_X11 
+
+/* libvo Xv support */
+#define LIBVO_XV 
+
+/* mpeg2dec profiling */
+/* #undef MPEG2DEC_GPROF */
+
+/* Name of package */
+#define PACKAGE "libmpeg2"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT ""
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libmpeg2"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "libmpeg2 0.5.1"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "libmpeg2"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "0.5.1"
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* The size of `char', as computed by sizeof. */
+/* #undef SIZEOF_CHAR */
+
+/* The size of `int', as computed by sizeof. */
+/* #undef SIZEOF_INT */
+
+/* The size of `long', as computed by sizeof. */
+/* #undef SIZEOF_LONG */
+
+/* The size of `short', as computed by sizeof. */
+/* #undef SIZEOF_SHORT */
+
+/* The size of `void*', as computed by sizeof. */
+/* #undef SIZEOF_VOIDP */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#define TIME_WITH_SYS_TIME 1
+
+/* Version number of package */
+#define VERSION "0.5.1"
+
+/* Define to 1 if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). */
+/* #undef WORDS_BIGENDIAN */
+
+/* Define to 1 if the X Window System is missing or not being used. */
+/* #undef X_DISPLAY_MISSING */
+
+/* Number of bits in a file offset, on hosts where this is settable. */
+/* #define _FILE_OFFSET_BITS 64 */
+
+/* Define for large files, on AIX-style hosts. */
+/* #undef _LARGE_FILES */
+
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#define inline __attribute__ ((__always_inline__))
+#endif
+
+/* Define as `__restrict' if that's what the C compiler calls it, or to
+   nothing if it is not supported. */
+#define restrict __restrict__
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* Define to empty if the keyword `volatile' does not work. Warning: valid
+   code using `volatile' can become incorrect without. Disable with care. */
+/* #undef volatile */
diff --git a/controller.cpp b/controller.cpp
new file mode 100644
index 0000000..60d2da8
--- /dev/null
+++ b/controller.cpp
@@ -0,0 +1,93 @@
+#include "controller.hpp"
+#include "exceptions.hpp"
+#include "decoderop.hpp"
+#include "mutexobj.hpp"
+
+#include <stdio.h>
+/* pthread entry point: recover the Controller passed as the thread argument and run its loop(). */
+static void *thread_helper( void *controller )
+{
+  Controller *me = static_cast<Controller *>( controller );
+  ahabassert( me );
+  me->loop();
+  return NULL;
+}
+/* Start the GUI thread. The mutex guarding quit_signal is initialised first, since loop() and the destructor both lock it. */
+Controller::Controller( uint s_num_frames )
+  : quit_signal( NULL ), opq( 0 ),
+    num_frames( s_num_frames )
+{
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+  pthread_create( &thread_handle, NULL, thread_helper, this );
+}
+/* GUI thread body: build a window with a frame slider, run the GTK main loop, then enqueue a DecoderShutDown and delete the widgets. */
+void Controller::loop( void )
+{
+  main = new Gtk::Main( 0, NULL );
+  window = new Gtk::Window;
+  /* quit_signal is written under the mutex because ~Controller reads it under the same mutex. */
+  {
+    MutexLock x( &mutex );
+    quit_signal = new Glib::Dispatcher;
+  }
+
+  window->set_default_size( 600, 50 );
+  window->set_title( "Ahab Controller" );
+  /* Slider constructed with arguments 0, num_frames, 1. */
+  scale = new Gtk::HScale( 0, num_frames, 1 );
+  window->add( *scale );
+
+  scale->set_update_policy( Gtk::UPDATE_CONTINUOUS );
+  scale->set_digits( 0 );
+  scale->set_value_pos( Gtk::POS_TOP );
+  scale->set_draw_value();
+
+  scale->signal_change_value().connect( sigc::mem_fun( this, &Controller::on_changed_value ) );
+  /* Emitting quit_signal runs shutdown(), which calls main->quit(). */
+  quit_signal->connect( sigc::mem_fun( this, &Controller::shutdown ) );
+
+  scale->show();
+
+  main->run( *window );
+  /* run() has returned: retire quit_signal so ~Controller no longer emits it. */
+  {
+    MutexLock x( &mutex );
+    delete quit_signal;
+    quit_signal = NULL;
+  }
+
+  DecoderShutDown *op = new DecoderShutDown();
+  /* A UnixAssertError from enqueue is swallowed. NOTE(review): op and the exception object are not freed on that path. */
+  try {
+    opq.enqueue( op );
+  } catch ( UnixAssertError *e ) {}
+
+  delete scale;
+  delete window;
+  delete main;
+}
+/* Slider handler: round to a frame index, clamp it, then flush_type() and enqueue a SetPictureNumber. Always returns true. */
+bool Controller::on_changed_value( Gtk::ScrollType scroll, double new_value )
+{
+  int frame = lround( new_value );
+
+  frame = ( frame < 0 ) ? 0 : frame;
+  frame = ( frame >= num_frames ) ? num_frames - 1 : frame;
+
+  SetPictureNumber *request = new SetPictureNumber( frame );
+  opq.flush_type( request );
+  opq.enqueue( request );
+
+  return true;
+}
+/* Ask the GUI thread to quit, if quit_signal currently exists, then join that thread. */
+Controller::~Controller()
+{
+  {
+    MutexLock x( &mutex );
+    if ( quit_signal ) { /* NULL before loop() creates it and again after loop() deletes it */
+      (*quit_signal)();
+    }
+  }
+  unixassert( pthread_join( thread_handle, NULL ) );
+}
diff --git a/controller.hpp b/controller.hpp
new file mode 100644
index 0000000..4cd6412
--- /dev/null
+++ b/controller.hpp
@@ -0,0 +1,35 @@
+#ifndef CONTROLLER_HPP
+#define CONTROLLER_HPP
+
+#include <gtkmm-2.4/gtkmm.h>
+#include <pthread.h>
+
+#include "decoderop.hpp"
+/* GTK slider window that runs on its own pthread and turns slider moves into SetPictureNumber operations on opq. */
+class Controller {
+private:
+  Gtk::Main *main;
+  Gtk::Window *window;
+  Gtk::HScale *scale;
+  /* mutex guards quit_signal, which is touched by both loop() and the destructor. */
+  pthread_mutex_t mutex;
+  pthread_t thread_handle;
+
+  bool on_changed_value( Gtk::ScrollType scroll, double new_value );
+  void shutdown( void ) { main->quit(); }
+  /* Created inside loop() and reset to NULL there once the GTK main loop has returned. */
+  Glib::Dispatcher *quit_signal;
+
+  OperationQueue<DecoderOperation> opq;
+
+  int num_frames;
+
+public:
+  Controller( uint s_num_frames );
+  ~Controller();
+
+  void loop( void );
+  OperationQueue<DecoderOperation> *get_queue() { return &opq; }
+};
+
+#endif
diff --git a/decoder.cpp b/decoder.cpp
new file mode 100644
index 0000000..f726227
--- /dev/null
+++ b/decoder.cpp
@@ -0,0 +1,70 @@
+#include <pthread.h>
+
+#include "decoder.hpp"
+#include "exceptions.hpp"
+
+#include "mpegheader.hpp"
+#include "displayop.hpp"
+#include "decoderop.hpp"
+/* pthread entry point: recover the Decoder passed as the thread argument and run its loop(). */
+static void *thread_helper( void *decoder )
+{
+  Decoder *me = static_cast<Decoder *>( decoder );
+  ahabassert( me );
+  me->loop();
+  return NULL;
+}
+/* Fill in every DecoderState field, then start the decoder thread (which runs loop()). */
+Decoder::Decoder( ES *s_stream,
+		  OperationQueue<DisplayOperation> *s_oglq )
+  : opq( 0 ),
+    stream( s_stream )
+{
+  state.current_picture = 0;
+  state.fullscreen = false;
+  state.live = true;
+  state.display = NULL; /* was left indeterminate before */
+  state.oglq = s_oglq;
+  pthread_create( &thread_handle, NULL, thread_helper, this );
+}
+/* Empty: the thread is joined by wait_shutdown(), not by the destructor. */
+Decoder::~Decoder() {}
+/* Decode the picture selected by state.current_picture and queue a DrawAndUnlockFrame for it on the display queue. */
+void Decoder::decode_and_display( void )
+{
+  Picture *picture = stream->get_picture_displayed( state.current_picture );
+  picture->lock_and_decodeall();
+  DrawAndUnlockFrame *draw = new DrawAndUnlockFrame( picture->get_framehandle() );
+  state.oglq->flush_type( draw );
+  state.oglq->enqueue( draw );
+}
+/* Decoder thread body: show the initial picture, then clamp the requested picture, redraw if it changed, and execute queued operations until one clears state.live. */
+void Decoder::loop( void )
+{
+  decode_and_display();
+
+  int picture_displayed = state.current_picture;
+
+  while ( state.live ) {
+    if ( state.current_picture < 0 ) {
+      state.current_picture = 0;
+    } else if ( (uint)state.current_picture >= stream->get_num_pictures() ) {
+      state.current_picture = stream->get_num_pictures() - 1;
+    }
+    /* Decode again only when the clamped request differs from the picture last shown. */
+    if ( state.current_picture != picture_displayed ) {
+      decode_and_display();
+    }
+
+    picture_displayed = state.current_picture;
+    /* dequeue( true ) presumably blocks until an operation is available -- confirm in opq.cpp. */
+    DecoderOperation *op = opq.dequeue( true );
+    op->execute( state );
+    delete op;
+  }
+}
+/* Join the decoder thread; the pthread_join result is checked with unixassert. */
+void Decoder::wait_shutdown( void )
+{
+  unixassert( pthread_join( thread_handle, NULL ) );
+}
diff --git a/decoder.hpp b/decoder.hpp
new file mode 100644
index 0000000..28c45ca
--- /dev/null
+++ b/decoder.hpp
@@ -0,0 +1,42 @@
+#ifndef DECODER_HPP
+#define DECODER_HPP
+
+#include <pthread.h>
+
+#include "es.hpp"
+
+class OpenGLDisplay;
+class DecoderOperation;
+class DisplayOperation;
+
+#include "opq.hpp"
+/* Mutable state passed by reference to every DecoderOperation::execute() call. */
+class DecoderState {
+public:
+  int current_picture; /* requested picture index; Decoder::loop clamps it before use */
+  bool fullscreen; /* toggled by the 'f' key in XKey::execute */
+  bool live; /* Decoder::loop keeps running while this is true */
+  OpenGLDisplay *display;
+  OperationQueue<DisplayOperation> *oglq; /* display queue handed to the Decoder constructor */
+};
+/* Owns the decoder thread and its operation queue; the constructor starts the thread. */
+class Decoder {
+private:
+  DecoderState state;
+  OperationQueue<DecoderOperation> opq;
+  pthread_t thread_handle;
+
+  ES *stream;
+
+  void decode_and_display( void );
+
+public:
+  Decoder( ES *s_stream, OperationQueue<DisplayOperation> *s_oglq );
+  ~Decoder();
+  /* loop() is the thread body; wait_shutdown() joins the thread. */
+  void loop();
+  OperationQueue<DecoderOperation> *get_queue() { return &opq; }
+  void wait_shutdown( void );
+};
+
+#endif 
diff --git a/decoderop.cpp b/decoderop.cpp
new file mode 100644
index 0000000..29365d4
--- /dev/null
+++ b/decoderop.cpp
@@ -0,0 +1,31 @@
+#include "decoderop.hpp"
+#include "displayop.hpp"
+
+#include <stdio.h>
+/* Key handling: '@' queues a Repaint, 'f' toggles fullscreen and queues a FullScreenMode, 'q' clears state.live; other keys do nothing. */
+void XKey::execute( DecoderState &state )
+{
+  switch ( key ) {
+  case '@':
+    {
+      Repaint *op = new Repaint();
+      state.oglq->enqueue( op );
+    }
+    break;
+  case 'f':
+    state.fullscreen = !state.fullscreen;
+    {
+      FullScreenMode *op = new FullScreenMode( state.fullscreen );
+      state.oglq->leapfrog_enqueue( op, (DrawAndUnlockFrame*)NULL ); /* NOTE(review): the typed NULL presumably names the op type to jump ahead of -- confirm in opq.cpp */
+    }
+    break;
+  case 'q':
+    state.live = false; /* ends the while loop in Decoder::loop */
+    break;
+  }
+}
+/* Clear state.live, which ends the while loop in Decoder::loop. */
+void DecoderShutDown::execute( DecoderState &state )
+{
+  state.live = false;
+}
diff --git a/decoderop.hpp b/decoderop.hpp
new file mode 100644
index 0000000..750105c
--- /dev/null
+++ b/decoderop.hpp
@@ -0,0 +1,42 @@
+#ifndef DECODEROP_HPP
+#define DECODEROP_HPP
+
+class DecoderOperation;
+class DecoderState;
+
+#include "decoder.hpp"
+/* Abstract base for operations applied to a DecoderState; Decoder::loop deletes each one through this base type after executing it. */
+class DecoderOperation {
+public:
+  virtual void execute( DecoderState &state ) = 0;
+  virtual ~DecoderOperation() {}
+};
+/* Operation that stores a fixed picture number into DecoderState::current_picture. */
+class SetPictureNumber : public DecoderOperation {
+private:
+  int picture_number;
+  /* Not range-checked here; Decoder::loop clamps current_picture before using it. */
+public:
+  SetPictureNumber( int s_picture_number ) : picture_number( s_picture_number ) {}
+  ~SetPictureNumber() {}
+  void execute( DecoderState &state ) { state.current_picture = picture_number; }
+};
+/* Operation carrying one key code; execute() (decoderop.cpp) acts on '@', 'f' and 'q'. */
+class XKey : public DecoderOperation {
+private:
+  int key;
+
+public:
+  XKey( int s_key ) : key( s_key ) {}
+  ~XKey() {}
+  void execute ( DecoderState &state );
+};
+/* Operation whose execute() clears DecoderState::live. */
+class DecoderShutDown : public DecoderOperation {
+public:
+  DecoderShutDown() {}
+  ~DecoderShutDown() {}
+  void execute( DecoderState &state );
+};
+
+#endif
diff --git a/decoderopq.cpp b/decoderopq.cpp
new file mode 100644
index 0000000..904d846
--- /dev/null
+++ b/decoderopq.cpp
@@ -0,0 +1,3 @@
+#include "decoderopq.hpp"
+
+template class OperationQueue<DecoderOperation>;
diff --git a/decoderopq.hpp b/decoderopq.hpp
new file mode 100644
index 0000000..9983ba9
--- /dev/null
+++ b/decoderopq.hpp
@@ -0,0 +1,9 @@
+#ifndef DECODEROPQ_HPP
+#define DECODEROPQ_HPP
+
+#include "opq.hpp"
+#include "decoderop.hpp"
+
+#include "opq.cpp"
+
+#endif
diff --git a/displayop.cpp b/displayop.cpp
new file mode 100644
index 0000000..5433046
--- /dev/null
+++ b/displayop.cpp
@@ -0,0 +1,34 @@
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#include "displayop.hpp"
+/* Pass the buffer of the handle's frame to state.draw(). */
+void DrawAndUnlockFrame::execute( OpcodeState &state )
+{
+  state.draw( handle->get_frame()->get_buf() );
+}
+/* Forward the stored green, blue and red rows to state.load_matrix_coefficients(). */
+void LoadMatrixCoefficients::execute( OpcodeState &state )
+{
+  state.load_matrix_coefficients( green, blue, red );
+}
+/* Terminate the calling thread with pthread_exit; state is not used. */
+void ShutDown::execute( OpcodeState &state )
+{
+  pthread_exit( NULL );
+}
+/* Enter or leave fullscreen according to the flag captured at construction. */
+void FullScreenMode::execute( OpcodeState &state )
+{
+  if ( !fullscreen ) {
+    state.unfullscreen();
+    return;
+  }
+  state.dofullscreen();
+}
+/* Call state.paint(). */
+void Repaint::execute( OpcodeState &state )
+{
+  state.paint();
+}
diff --git a/displayop.hpp b/displayop.hpp
new file mode 100644
index 0000000..59946e1
--- /dev/null
+++ b/displayop.hpp
@@ -0,0 +1,80 @@
+#ifndef DISPLAYOPERATION_HPP
+#define DISPLAYOPERATION_HPP
+
+class DisplayOperation;
+class OpenGLDisplay;
+class OpcodeState;
+
+#include "framebuffer.hpp"
+#include "ogl.hpp"
+/* Abstract base for operations executed against an OpcodeState; the virtual destructor allows deletion through a base pointer. */
+class DisplayOperation {
+public:
+  virtual void execute( OpcodeState &state ) = 0;
+  virtual ~DisplayOperation() {}
+};
+/* Operation whose execute() (displayop.cpp) calls OpcodeState::paint(). */
+class Repaint : public DisplayOperation {
+public:
+  Repaint() {}
+  ~Repaint() {}
+  void execute( OpcodeState &state );
+};
+/* Display op that draws one frame; destroying the op calls decrement_lockcount() on its frame handle. */
+class DrawAndUnlockFrame : public DisplayOperation {
+private:
+  FrameHandle *handle;
+
+  static void load_tex( GLenum tnum, GLuint tex, uint width, uint height,
+			uint8_t *data );
+
+public:
+  DrawAndUnlockFrame( FrameHandle *s_handle ) : handle( s_handle ) {}
+  ~DrawAndUnlockFrame() { handle->decrement_lockcount(); }
+  void execute( OpcodeState &state );
+};
+/* Display op carrying private copies of three 3-element coefficient rows (green, blue, red). */
+class LoadMatrixCoefficients : public DisplayOperation {
+private:
+  double green[ 3 ], blue[ 3 ], red[ 3 ];
+
+public:
+  LoadMatrixCoefficients( double s_green[ 3 ],
+			  double s_blue[ 3 ],
+			  double s_red[ 3 ] ) {
+    /* Copy each row so that the op keeps its own values and does not
+       refer to the caller's arrays after construction. */
+    for ( int k = 0; k < 3; ++k ) green[ k ] = s_green[ k ];
+    for ( int k = 0; k < 3; ++k ) blue[ k ] = s_blue[ k ];
+    for ( int k = 0; k < 3; ++k ) red[ k ] = s_red[ k ];
+  }
+
+  ~LoadMatrixCoefficients() {}
+  void execute( OpcodeState &state );
+};
+/* Operation whose execute() (displayop.cpp) calls pthread_exit on the executing thread. */
+class ShutDown : public DisplayOperation {
+public:
+  ShutDown() {}
+  ~ShutDown() {}
+  void execute( OpcodeState &state );
+};
+/* Display op that switches fullscreen on (true) or off (false) when executed. */
+class FullScreenMode : public DisplayOperation {
+private:
+  bool fullscreen;
+
+public:
+  FullScreenMode( bool s_fullscreen ) : fullscreen( s_fullscreen ) {}
+  ~FullScreenMode() {}
+  void execute( OpcodeState &state );
+};
+/* Operation whose execute() has an empty body. */
+class NullOperation : public DisplayOperation {
+public:
+  NullOperation( void ) {}
+  ~NullOperation( void ) {}
+  void execute( OpcodeState &state ) {}
+};
+
+#endif
diff --git a/displayopq.cpp b/displayopq.cpp
new file mode 100644
index 0000000..7350731
--- /dev/null
+++ b/displayopq.cpp
@@ -0,0 +1,3 @@
+#include "displayopq.hpp"
+
+template class OperationQueue<DisplayOperation>;
diff --git a/displayopq.hpp b/displayopq.hpp
new file mode 100644
index 0000000..6bda773
--- /dev/null
+++ b/displayopq.hpp
@@ -0,0 +1,9 @@
+#ifndef DISPLAYOPQ_HPP
+#define DISPLAYOPQ_HPP
+
+#include "opq.hpp"
+#include "displayop.hpp"
+
+#include "opq.cpp"
+
+#endif
diff --git a/es.cpp b/es.cpp
new file mode 100644
index 0000000..0e9bec7
--- /dev/null
+++ b/es.cpp
@@ -0,0 +1,161 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <typeinfo>
+
+#include "es.hpp"
+#include "mpegheader.hpp"
+#include "exceptions.hpp"
+#include "bitreader.hpp"
+#include "framebuffer.hpp"
+
+const uint pool_slots = 50;
+
+ES::ES( File *s_file, void (*progress)( off_t size, off_t location ) )
+{ /* Index the stream: find headers, prepend a ghost sequence header, link, number, time. */
+  file = s_file;
+  first_header = last_header = NULL;
+
+  /* Find first sequence header */
+  /* We can't decode pictures before this, in general,
+     because we don't know the quantization matrices. */
+  off_t start = startfinder( 0, progress, &ES::first_sequence );
+
+  if ( start == -1 ) {
+    throw SequenceNotFound();
+  }
+
+  /* Ingest every start code, including before "start" */
+  seq = NULL;
+  startfinder( 0, progress, &ES::add_header );
+  if ( !seq ) {
+    throw SequenceNotFound();
+  }
+
+  /* Make ghost sequence header at start */
+  MPEGHeader *real_first_header = first_header;
+  /* The header after seq must be a SequenceExtension; verify the cast before copying from it. */
+  SequenceExtension *first_extension = dynamic_cast<SequenceExtension *>( seq->get_next() );
+  if ( first_extension == NULL ) {
+    fprintf( stderr, "Problem assembling ghost sequence or extension header.\n" );
+    throw MPEGInvalid();
+  }
+
+  Sequence *ghost_sequence = new Sequence( *seq );
+  SequenceExtension *ghost_extension = new SequenceExtension( *first_extension );
+
+  first_header = ghost_sequence;
+  ghost_sequence->override_next( ghost_extension );
+  ghost_extension->override_next( real_first_header );
+
+  ghost_sequence->set_unknown_quantiser_flags();
+
+  /* Link headers to one another as appropriate */
+  for ( MPEGHeader *hdr = first_header; hdr != NULL; hdr = hdr->get_next() ) {
+      hdr->link();
+  }
+
+  pool = new BufferPool( pool_slots, 16 * seq->get_mb_width(),
+			 16 * seq->get_mb_height() );
+
+  /* Figure out the display order of each picture and link each
+     from the coded_picture and displayed_picture arrays */
+  number_pictures();
+
+  /* Count up duration in seconds */
+  duration_numer = 0;
+  duration_denom = 2 * seq->get_frame_rate_numerator();
+  uint ticks = seq->get_frame_rate_denominator();
+  for ( uint i = 0; i < get_num_pictures(); i++ ) {
+    displayed_picture[ i ]->set_time( duration_numer / (double)duration_denom );
+    duration_numer += ticks * displayed_picture[ i ]->num_fields();
+  }
+  duration = duration_numer / (double)duration_denom;
+}
+
+ES::~ES()
+{
+  /* Walk the header chain, fetching each successor before its owner is deleted. */
+  for ( MPEGHeader *cur = first_header; cur != NULL; ) {
+    MPEGHeader *following = cur->get_next();
+    delete cur;
+    cur = following;
+  }
+  /* Release the frame pool and the two picture index arrays. */
+  delete pool;
+  delete[] coded_picture;
+  delete[] displayed_picture;
+}
+
+void ES::number_pictures( void )
+{
+  /* Pass 1: give every picture a coded and a display index; pass 2 below fills the arrays */
+  Picture *oanchor = NULL;
+  Picture *nanchor = NULL;
+  uint coded_order = 0;
+  uint display_order = 0;
+
+  for ( MPEGHeader *hdr = first_header; hdr != NULL; hdr = hdr->get_next() ) {
+    if ( typeid( *hdr ) == typeid( Picture ) ) {
+      Picture *tp = static_cast<Picture *>( hdr );
+      tp->set_coded( coded_order++ );
+
+      switch ( tp->get_type() ) {
+      case B:
+	tp->set_display( display_order++ );
+	tp->set_forward( oanchor );
+	tp->set_backward( nanchor );
+	if ( (oanchor == NULL) || (nanchor == NULL)
+	     || (oanchor->problem()) || (nanchor->problem()) ) tp->set_broken( true );
+	break;
+
+      case P:
+	tp->set_forward( nanchor );
+	if ( (nanchor == NULL) || nanchor->problem() ) tp->set_broken( true );
+	/* don't break */
+      case I:
+	if ( nanchor ) nanchor->set_display( display_order++ );
+	oanchor = nanchor;
+	nanchor = tp;
+	break;
+      }
+    } else if ( typeid( *hdr ) == typeid( SequenceEnd ) ) {
+      if ( nanchor ) nanchor->set_display( display_order++ );
+      nanchor = NULL;
+    }
+  }
+
+  if ( nanchor && (nanchor->get_display() == -1) ) {
+    nanchor->set_display( display_order++ );
+    nanchor->set_unclean( true );
+    nanchor = NULL;
+  }
+
+  ahabassert( coded_order == display_order );
+  num_pictures = coded_order;
+
+  /* Allocate memory */
+  coded_picture = new Picture *[ num_pictures ];
+  displayed_picture = new Picture *[ num_pictures ];
+
+  /* Make pointers in order */
+  for ( MPEGHeader *hdr = first_header; hdr != NULL; hdr = hdr->get_next() ) {
+    if ( typeid( *hdr ) == typeid( Picture ) ) {
+      Picture *tp = static_cast<Picture *>( hdr );
+      uint this_coded = tp->get_coded();
+      uint this_display = tp->get_display();
+
+      tp->init_fh( pool );
+
+      ahabassert( (this_coded >= 0) && (this_coded < num_pictures) );
+      ahabassert( (this_display >= 0) && (this_display < num_pictures) ); /* bound the display index, not the coded one again */
+
+      coded_picture[ this_coded ] = tp;
+      displayed_picture[ this_display ] = tp;
+    }
+  }
+}
diff --git a/es.hpp b/es.hpp
new file mode 100644
index 0000000..116f89d
--- /dev/null
+++ b/es.hpp
@@ -0,0 +1,62 @@
+#ifndef ES_HPP
+#define ES_HPP
+
+/* MPEG-2 Video Elementary Stream */
+
+#include <stdint.h>
+
+#include "mpegheader.hpp"
+#include "bitreader.hpp"
+#include "file.hpp"
+
+const int BLOCK = 65536;
+const int LARGEST_HEADER = 260;
+const int START_CODE_LENGTH = 3;
+
+class MPEGHeader;
+
+class ES {
+private:
+  File *file;
+
+  off_t startfinder( off_t start,
+		     void (*progress)( off_t size, off_t location ),
+		     bool (ES::*todo)( uint8_t *buffer, off_t location,
+				       size_t len ) );
+
+  bool first_sequence( uint8_t *buf, off_t location, size_t len );
+  bool add_header( uint8_t *buf, off_t location, size_t len );
+
+  MPEGHeader *first_header;
+  MPEGHeader *last_header;
+
+  Sequence *seq;
+
+  void number_pictures( void );
+
+  uint num_pictures;
+  Picture **coded_picture;
+  Picture **displayed_picture;
+
+  uint64_t duration_numer, duration_denom;
+  double duration;
+
+  BufferPool *pool;
+
+public:
+  ES( File *s_file, void (*progress)( off_t size, off_t location ) );
+  ~ES();
+
+  uint get_num_pictures( void ) { return num_pictures; }
+  double get_duration( void ) { return duration; }
+
+  Picture *get_picture_displayed( uint n ) { ahabassert( n < num_pictures ); return displayed_picture[ n ]; }
+  Picture *get_picture_coded( uint n ) { ahabassert( n < num_pictures ); return coded_picture[ n ]; }
+
+  Sequence *get_sequence( void ) { return seq; }
+  File *get_file( void ) { return file; }
+
+  BufferPool *get_pool( void ) { return pool; }
+};
+
+#endif
diff --git a/exceptions.cpp b/exceptions.cpp
new file mode 100644
index 0000000..1f71104
--- /dev/null
+++ b/exceptions.cpp
@@ -0,0 +1,25 @@
+#include <stdio.h>
+
+#include "exceptions.hpp"
+
+void failure( const char *assertion, const char *file, int line, const char *function, AhabException *e )
+{
+  fprintf( stderr, "\nAssertion \"%s\" failed in file %s, function %s(), line #%d\n",
+	   assertion, file, function, line );
+
+  throw e;
+}
+
+void warn( const char *assertion, const char *file, int line, const char *function )
+{
+  fprintf( stderr, "\nAssertion \"%s\" failed in file %s, function %s(), line #%d\n",
+	   assertion, file, function, line );
+}
+
+void UnixAssertError::print( void )
+{
+  fprintf( stderr, "Statement \"%s\" failed in file %s, function %s(), line #%d\n",
+	   expr, file, function, line );
+  fprintf( stderr, "Return value was %d, errno = %d (%s).\n",
+	   result, errnumber, strerror( errnumber ) );
+}
diff --git a/exceptions.hpp b/exceptions.hpp
new file mode 100644
index 0000000..7ba44b3
--- /dev/null
+++ b/exceptions.hpp
@@ -0,0 +1,76 @@
+#ifndef EXCEPTIONS_HPP
+#define EXCEPTIONS_HPP
+
+#include <assert.h>
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+class AhabException { public: virtual ~AhabException() {} };
+class SequenceNotFound : public AhabException {};
+class ConformanceLimitExceeded : public AhabException {};
+class UnixError : public AhabException
+{
+public:
+  int err;
+  UnixError( int s_errno ) { err = s_errno; }
+};
+class NotMPEGES : public AhabException {};
+class InternalError : public AhabException {};
+class MPEGInvalid : public AhabException {};
+class NeedBits : public AhabException {};
+class OutOfFrames : public AhabException {};
+class DisplayError : public AhabException {};
+class UnixAssertError : public AhabException
+{
+private:
+  char *expr, *file, *function;
+  int line, result, errnumber;
+  
+public:
+  UnixAssertError( const char *s_expr, const char *s_file, int s_line, const char *s_function,
+		   int s_result, int s_errnumber )
+  {
+    expr = strdup( s_expr );
+    file = strdup( s_file );
+    line = s_line;
+    function = strdup( s_function );
+    result = s_result;
+    errnumber = s_errnumber;
+  }
+
+  void print( void );
+};
+
+void failure( const char *assertion, const char *file, int line, const char *function, AhabException *e );
+void warn( const char *assertion, const char *file, int line, const char *function );
+
+#define checkbound(ptr,check) \
+do { \
+if ( (ptr) > (check) ) { /* pointer difference is ptrdiff_t, not int: cast to match %ld */ \
+  fprintf( stderr, "%s:%d -- Pointer exceeds bound by %ld bytes.\n", __FILE__, __LINE__, (long)((ptr) - (check)) ); \
+  } \
+} while( 0 )
+
+#define unixassert(expr) do {\
+   int assertion_result = (expr);\
+   if ( assertion_result != 0 ) {\
+     throw new UnixAssertError( __STRING(expr), __FILE__, __LINE__, __PRETTY_FUNCTION__, assertion_result, errno );\
+   } } while ( 0 )
+
+#define ahabassert(expr)                                                 \
+  ((expr)                                                                \
+   ? (void)0							         \
+   : failure (__STRING(expr), __FILE__, __LINE__, __PRETTY_FUNCTION__, new InternalError() ))
+
+#define ahabcomplain(expr)                                               \
+  ((expr)                                                                \
+   ? (void)0							         \
+   : warn (__STRING(expr), __FILE__, __LINE__, __PRETTY_FUNCTION__ ))
+
+#define mpegassert(expr)                                                 \
+  ((expr)                                                                \
+   ? (void)0							         \
+   : failure (__STRING(expr), __FILE__, __LINE__, __PRETTY_FUNCTION__, new MPEGInvalid() ))
+
+#endif
diff --git a/extensions.cpp b/extensions.cpp
new file mode 100644
index 0000000..43828eb
--- /dev/null
+++ b/extensions.cpp
@@ -0,0 +1,133 @@
+#include "mpegheader.hpp"
+#include "mpegtables.hpp"
+
+SequenceExtension::SequenceExtension( BitReader &hdr )
+{
+  init();
+  hdr.reset();
+
+  ahabassert( hdr.readbits( 32 ) == 0x000001b5 );
+  ahabassert( hdr.readbits( 4 ) == 1 );
+
+  escape_bit = hdr.readbits( 1 );
+  profile = hdr.readbits( 3 );
+  level = hdr.readbits( 4 );
+  progressive_sequence = hdr.readbits( 1 );
+  chroma_format = hdr.readbits( 2 );
+  horizontal_size_extension = hdr.readbits( 2 );
+  vertical_size_extension = hdr.readbits( 2 );
+  bit_rate_extension = hdr.readbits( 12 );
+
+  mpegassert( hdr.readbits( 1 ) == 1 );
+  
+  vbv_buffer_size_extension = hdr.readbits( 8 );
+  low_delay = hdr.readbits( 1 );
+  frame_rate_extension_n = hdr.readbits( 2 );
+  frame_rate_extension_d = hdr.readbits( 5 );
+
+  /* Check conformance limits */
+  if ( escape_bit ) {
+    throw ConformanceLimitExceeded();
+  }
+
+  /* Simple profile or Main profile */
+  if ( (profile != 5) && (profile != 4) ) {
+    throw ConformanceLimitExceeded();
+  }
+
+  /* 4:2:0 only */
+  if ( chroma_format != 1 ) {
+    throw ConformanceLimitExceeded();
+  }
+}
+
+bool SequenceExtension::operator==(const SequenceExtension &o) const {
+  return ((escape_bit == o.escape_bit)
+	  && (profile == o.profile)
+	  && (level == o.level)
+	  && (progressive_sequence == o.progressive_sequence)
+	  && (chroma_format == o.chroma_format)
+	  && (horizontal_size_extension == o.horizontal_size_extension)
+	  && (vertical_size_extension == o.vertical_size_extension)
+	  && (bit_rate_extension == o.bit_rate_extension)
+	  && (vbv_buffer_size_extension == o.vbv_buffer_size_extension)
+	  && (low_delay == o.low_delay)
+	  && (frame_rate_extension_n == o.frame_rate_extension_n)
+	  && (frame_rate_extension_d == o.frame_rate_extension_d));
+}
+
+void SequenceExtension::link( void )
+{
+  /* Sequence parameters must not change: compare with the next SequenceExtension, if any */
+  for ( MPEGHeader *cursor = get_next(); cursor != NULL; cursor = cursor->get_next() ) {
+    SequenceExtension *ts = dynamic_cast<SequenceExtension *>( cursor );
+    if ( ts != NULL ) {
+      mpegassert( *this == *ts );
+      return;
+    }
+    /* Not a sequence extension: keep scanning. */
+  }
+  /* Reaching here means no later SequenceExtension was found; nothing to compare. */
+}
+
+QuantMatrixExtension::QuantMatrixExtension( BitReader &hdr )
+{
+  init();
+  hdr.reset();
+  /* Expect the extension start code followed by the 4-bit extension id 3 */
+  ahabassert( hdr.readbits( 32 ) == 0x000001b5 );
+  ahabassert( hdr.readbits( 4 ) == 3 );
+  /* Intra matrix: read 64 bytes when the load flag is set, otherwise use the defaults */
+  load_intra_quantiser_matrix = hdr.readbits( 1 );
+  for ( int i = 0; i < 64; i++ ) {
+    intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] =
+      load_intra_quantiser_matrix
+      ? hdr.readbits( 8 )
+      : default_intra_quantiser_matrix[ i ];
+    mpegassert( intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] != 0 );
+  }
+  /* Non-intra matrix: same scheme with a flat default of 16; validate the entry just stored */
+  load_non_intra_quantiser_matrix = hdr.readbits( 1 );
+  for ( int i = 0; i < 64; i++ ) {
+    non_intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] =
+      load_non_intra_quantiser_matrix ? hdr.readbits( 8 ) : 16;
+    mpegassert( non_intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] != 0 );
+  }
+}
+
+PictureCodingExtension::PictureCodingExtension( BitReader &hdr )
+{
+  init();
+  hdr.reset();
+
+  ahabassert( hdr.readbits( 32 ) == 0x000001b5 );
+  ahabassert( hdr.readbits( 4 ) == 8 );
+  
+  f_code_fh = hdr.readbits( 4 );
+  f_code_fv = hdr.readbits( 4 );
+  f_code_bh = hdr.readbits( 4 );
+  f_code_bv = hdr.readbits( 4 );
+
+  intra_dc_precision = hdr.readbits( 2 );
+  picture_structure = hdr.readbits( 2 );
+
+  top_field_first = hdr.readbits( 1 );
+  frame_pred_frame_dct = hdr.readbits( 1 );
+  concealment_motion_vectors = hdr.readbits( 1 );
+  q_scale_type = hdr.readbits( 1 );
+  intra_vlc_format = hdr.readbits( 1 );
+  alternate_scan = hdr.readbits( 1 );
+  repeat_first_field = hdr.readbits( 1 );
+  chroma_420_type = hdr.readbits( 1 );
+  progressive_frame = hdr.readbits( 1 );
+
+  if ( picture_structure != 3 ) {
+    fprintf( stderr, "Ahab does not support field pictures.\n" );
+    throw ConformanceLimitExceeded();
+  }
+
+  mpegassert( ((f_code_fh >= 1) && (f_code_fh <= 9)) || (f_code_fh == 15) );
+  mpegassert( ((f_code_fv >= 1) && (f_code_fv <= 9)) || (f_code_fv == 15) );
+  mpegassert( ((f_code_bh >= 1) && (f_code_bh <= 9)) || (f_code_bh == 15) );
+  mpegassert( ((f_code_bv >= 1) && (f_code_bv <= 9)) || (f_code_bv == 15) );
+}
diff --git a/file.cpp b/file.cpp
new file mode 100644
index 0000000..f476c4f
--- /dev/null
+++ b/file.cpp
@@ -0,0 +1,65 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "file.hpp"
+#include "exceptions.hpp"
+
+File::File( char *filename )
+{
+  /* Open file */
+  fd = open( filename, O_RDONLY );
+  if ( fd < 0 ) {
+    perror( "open" );
+    throw UnixError( errno );
+  }
+
+  /* Get size of file */
+  struct stat thestat;
+
+  if ( fstat( fd, &thestat ) < 0 ) {
+    perror( "fstat" );
+    throw UnixError( errno );
+  }
+
+  filesize = thestat.st_size;
+}
+
+File::~File()
+{
+  if ( close( fd ) < 0 ) {
+    /* Report only: an exception escaping a destructor can terminate the program. */
+    perror( "close" );
+  }
+}
+
+MapHandle *File::map( off_t offset, size_t len )
+{
+  /* Round the requested offset down to a page boundary and map the extra
+     leading bytes too; the caller's view starts inside the mapping. */
+  const long page_size = sysconf( _SC_PAGE_SIZE );
+  const off_t aligned_start = offset & ~(page_size - 1);
+  const size_t total_len = len + offset - aligned_start;
+
+  void *mapping = mmap( NULL, total_len, PROT_READ, MAP_SHARED, fd, aligned_start );
+  if ( mapping == MAP_FAILED ) {
+    perror( "mmap" );
+    throw UnixError( errno );
+  }
+
+  /* Hand back both the user-visible pointer and the real mapping base/length. */
+  uint8_t *base = (uint8_t *)mapping;
+  return new MapHandle( base + offset - aligned_start, base, total_len, len );
+}
+
+MapHandle::~MapHandle()
+{
+  if ( munmap( mmap_buf, maplen ) < 0 ) {
+    /* Report only: an exception escaping a destructor can terminate the program. */
+    perror( "munmap" );
+  }
+}
diff --git a/file.hpp b/file.hpp
new file mode 100644
index 0000000..75e12fa
--- /dev/null
+++ b/file.hpp
@@ -0,0 +1,40 @@
+#ifndef FILE_HPP
+#define FILE_HPP
+
+#include <stdint.h>
+#include <sys/mman.h>
+
+class MapHandle {
+  friend class File;
+
+private:
+  uint8_t *user_buf;
+  uint8_t *mmap_buf;
+  size_t maplen;
+  size_t userlen;
+  MapHandle( uint8_t *s_user, uint8_t *s_mmap, size_t s_maplen, size_t s_userlen )
+  {
+    user_buf = s_user; mmap_buf = s_mmap; maplen = s_maplen; userlen = s_userlen;
+  }
+
+public:
+  ~MapHandle();
+
+  uint8_t *get_buf( void ) { return user_buf; }
+  size_t get_len( void ) { return userlen; }
+};
+
+class File {
+private:
+  int fd;
+  off_t filesize;
+
+public:
+  File( char *filename );
+  ~File();
+
+  MapHandle *map( off_t offset, size_t len );
+  off_t get_filesize( void ) { return filesize; }
+};
+
+#endif
diff --git a/framebuffer.cpp b/framebuffer.cpp
new file mode 100644
index 0000000..4bbacc5
--- /dev/null
+++ b/framebuffer.cpp
@@ -0,0 +1,252 @@
+#include "framebuffer.hpp"
+#include "mutexobj.hpp"
+
+FrameQueue::FrameQueue( void )
+{
+  first = last = NULL;
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+}
+
+void FrameQueue::add( Frame *frame )
+{
+  /* Append frame at the tail of the doubly linked list while holding the queue mutex. */
+  MutexLock guard( &mutex );
+  Frame *old_tail = last;
+  frame->next = NULL;
+  frame->prev = old_tail;
+  last = frame;
+  if ( old_tail == NULL ) {
+    first = frame;
+  } else {
+    old_tail->next = frame;
+  }
+}
+
+Frame *FrameQueue::remove( void )
+{
+  /* Pop and return the head of the queue, or NULL when the queue is empty. */
+  MutexLock guard( &mutex );
+
+  Frame *head = first;
+  if ( !head ) {
+    return NULL;
+  }
+
+  /* Advance the head pointer, then repair whichever end pointer is affected. */
+  Frame *successor = head->next;
+  first = successor;
+  if ( successor == NULL ) {
+    last = NULL;
+  } else {
+    successor->prev = NULL;
+  }
+  /* Detach the popped frame completely before handing it back. */
+  head->prev = head->next = NULL;
+  return head;
+}
+
+void FrameQueue::remove_specific( Frame *frame )
+{
+  /* Unlink frame; a missing neighbour means it was the head and/or the tail. */
+  MutexLock guard( &mutex );
+  Frame *before = frame->prev;
+  Frame *after = frame->next;
+  if ( before == NULL ) {
+    first = after;
+  } else {
+    before->next = after;
+  }
+  if ( after == NULL ) {
+    last = before;
+  } else {
+    after->prev = before;
+  }
+  frame->prev = frame->next = NULL;
+}
+
+BufferPool::BufferPool( uint s_num_frames, uint s_width, uint s_height )
+{
+  num_frames = s_num_frames;
+  width = s_width;
+  height = s_height;
+
+  frames = new Frame *[ num_frames ];
+  for ( uint i = 0; i < num_frames; i++ ) {
+    frames[ i ] = new Frame( width, height );
+    free.add( frames[ i ] );
+  }
+
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+}
+
+BufferPool::~BufferPool()
+{
+  for ( uint i = 0; i < num_frames; i++ ) {
+    Frame *frame = frames[ i ];
+    delete frame;
+  }
+  delete[] frames;
+
+  unixassert( pthread_mutex_destroy( &mutex ) );
+}
+
+Frame::Frame( uint s_width, uint s_height )
+{
+  width = s_width;
+  height = s_height;
+  buf = new uint8_t[ sizeof( uint8_t ) * (3 * width * height / 2) ];
+  state = FREE;
+  handle = NULL;
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+}
+
+Frame::~Frame()
+{
+  delete[] buf;
+  unixassert( pthread_mutex_destroy( &mutex ) );
+}
+
+void Frame::lock( FrameHandle *s_handle )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( handle == NULL );
+  ahabassert( state == FREE );
+  handle = s_handle;
+  state = LOCKED;
+}
+
+void Frame::set_rendered( void )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( state == LOCKED );
+  state = RENDERED;
+}
+
+void Frame::relock( void )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( state == FREEABLE );
+  state = RENDERED;
+}
+
+void Frame::set_freeable( void )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( state == RENDERED );
+  state = FREEABLE;
+}
+
+void Frame::free_locked( void )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( state == LOCKED );
+  /* handle->set_frame( NULL ); */ /* handle takes care of this */
+  handle = NULL;
+  state = FREE;
+}
+
+void Frame::free( void )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( state == FREEABLE );
+  handle->set_frame( NULL );
+  handle = NULL;
+  state = FREE;
+}
+
+FrameHandle::FrameHandle( BufferPool *s_pool, Picture *s_pic )
+{
+  pool = s_pool;
+  pic = s_pic;
+  frame = NULL;
+  locks = 0;
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+}
+
+void FrameHandle::increment_lockcount( void )
+{
+  MutexLock guard( &mutex );
+  /* With a frame already attached, the first lock pulls it back off the freeable queue. */
+  if ( frame != NULL ) {
+    if ( !locks ) {
+      ahabassert( frame->get_state() == FREEABLE );
+      pool->remove_from_freeable( frame );
+      frame->relock();
+    }
+    /* the count itself is bumped once, after both branches */
+  } else {
+    ahabassert( locks == 0 );
+    frame = pool->get_free_frame();
+    frame->lock( this );
+  }
+  locks++;
+}
+
+void FrameHandle::decrement_lockcount( void )
+{
+  MutexLock guard( &mutex );
+  /* Drop one lock; when the count reaches zero the frame is handed back to the pool. */
+  ahabassert( locks > 0 );
+  if ( --locks != 0 ) return;
+  switch ( frame->get_state() ) {
+  case RENDERED: /* goes onto the pool's freeable queue */
+    pool->make_freeable( frame );
+    frame->set_freeable();
+    break;
+  case LOCKED: /* goes onto the pool's free queue and is detached from this handle */
+    pool->make_free( frame );
+    frame->free_locked();
+    frame = NULL;
+    break;
+  default: throw AhabException();
+  }
+}
+
+Frame *BufferPool::get_free_frame( void )
+{
+  MutexLock guard( &mutex );
+
+  /* Prefer a frame from the free queue; otherwise recycle one from the freeable queue. */
+  Frame *candidate = free.remove();
+  if ( candidate == NULL ) {
+    candidate = freeable.remove();
+    if ( candidate == NULL ) {
+      throw OutOfFrames();
+    }
+    /* Frame::free() clears the frame pointer held by the handle that still referenced it. */
+    candidate->free();
+  }
+
+  return candidate;
+}
+
+void BufferPool::make_freeable( Frame *frame )
+{
+  MutexLock x( &mutex );
+  freeable.add( frame );
+}
+
+void BufferPool::make_free( Frame *frame )
+{
+  MutexLock x( &mutex );
+  free.add( frame );
+}
+
+void BufferPool::remove_from_freeable( Frame *frame )
+{
+  MutexLock x( &mutex );
+  freeable.remove_specific( frame );
+}
+
+void FrameHandle::set_frame( Frame *s_frame )
+{
+  MutexLock x( &mutex );
+  ahabassert( locks == 0 );
+  frame = s_frame;
+}
diff --git a/framebuffer.hpp b/framebuffer.hpp
new file mode 100644
index 0000000..5d438be
--- /dev/null
+++ b/framebuffer.hpp
@@ -0,0 +1,111 @@
+#ifndef FRAMEBUFFER_HPP
+#define FRAMEBUFFER_HPP
+
+#include "mpegheader.hpp"
+#include "exceptions.hpp"
+
+#include <stdint.h>
+#include <pthread.h>
+
+class Frame;
+class BufferPool;
+
+class FrameHandle
+{
+  friend class Frame;
+
+private:
+  BufferPool *pool;
+  Picture *pic;
+  Frame *frame;
+  int locks;
+
+  pthread_mutex_t mutex;
+
+  void set_frame( Frame *s_frame );
+
+public:
+  void increment_lockcount( void );
+  void decrement_lockcount( void );
+
+  Frame *get_frame( void ) { return frame; }
+  Picture *get_picture( void ) { return pic; }
+
+  FrameHandle( BufferPool *s_pool, Picture *s_pic );
+  ~FrameHandle() { unixassert( pthread_mutex_destroy( &mutex ) ); }
+};
+
+class FrameQueue
+{
+private:
+  Frame *first, *last;
+
+  pthread_mutex_t mutex;
+
+public:
+  FrameQueue( void );
+  void add( Frame *frame );
+  Frame *remove( void );
+  void remove_specific( Frame *frame );
+  ~FrameQueue() { unixassert( pthread_mutex_destroy( &mutex ) ); };
+};
+
+class BufferPool
+{
+private:
+  uint num_frames, width, height;
+  Frame **frames;
+
+  FrameQueue free;
+  FrameQueue freeable;
+
+  pthread_mutex_t mutex;
+
+public:
+  BufferPool( uint s_num_frames, uint s_width, uint s_height );
+  ~BufferPool();
+
+  FrameHandle *make_handle( Picture *pic ) { return new FrameHandle( this, pic ); }
+  Frame *get_free_frame( void );
+  void make_freeable( Frame *frame );
+  void make_free( Frame *frame );
+  void remove_from_freeable( Frame *frame );
+};
+
+enum FrameState { FREE, LOCKED, RENDERED, FREEABLE };
+
+class Frame
+{
+  friend class FrameQueue;
+
+private:
+  uint width, height;
+  uint8_t *buf;
+  FrameState state;
+
+  FrameHandle *handle;
+
+  Frame *prev, *next;
+
+  pthread_mutex_t mutex;
+
+public:
+  Frame( uint s_width, uint s_height );
+  ~Frame();
+
+  uint8_t *get_buf( void ) { return buf; }
+  uint8_t *get_y( void ) { return buf; }
+  uint8_t *get_cb( void ) { return buf + width * height; }
+  uint8_t *get_cr( void ) { return buf + width * height + width * height / 4; }
+
+  void lock( FrameHandle *s_handle );
+  void set_rendered( void );
+  void set_freeable( void );
+  void relock( void );
+  void free( void );
+  void free_locked( void );
+
+  FrameState get_state( void ) { return state; }
+};
+
+#endif
diff --git a/idct_mmx.cpp b/idct_mmx.cpp
new file mode 100644
index 0000000..4acfee2
--- /dev/null
+++ b/idct_mmx.cpp
@@ -0,0 +1,1338 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+#include "mmx.h"
+
+/* Scan tables reordered for the MMX IDCT: within each row of 8, even columns map to positions 0-3 and odd columns to 4-7 (compare the commented-out originals below) */
+
+const uint8_t mpeg2_scan_norm[ 64 ] ATTR_ALIGN(16) = {
+  0, 4, 8, 16, 12, 1, 5, 9, 20, 24, 32, 28, 17, 13, 2, 6, 10, 21,
+  25, 36, 40, 48, 44, 33, 29, 18, 14, 3, 7, 11, 22, 26, 37, 41, 52,
+  56, 60, 49, 45, 34, 30, 19, 15, 23, 27, 38, 42, 53, 57, 61, 50, 46,
+  35, 31, 39, 43, 54, 58, 62, 51, 47, 55, 59, 63
+};
+
+const uint8_t mpeg2_scan_alt[ 64 ] ATTR_ALIGN(16) = {	/* commented-out mpeg2_scan_alt below with even columns moved to positions 0-3 and odd columns to 4-7 of each row */
+  0, 8, 16, 24, 4, 12, 1, 9, 20, 28, 32, 40, 48, 56, 60, 52, 44,
+  36, 25, 17, 5, 13, 2, 10, 21, 29, 33, 41, 49, 57, 37, 45, 53,
+  61, 18, 26, 6, 14, 3, 11, 22, 30, 34, 42, 50, 58, 38, 46, 54,
+  62, 19, 27, 7, 15, 23, 31, 35, 43, 51, 59, 39, 47, 55, 63
+};
+
+/*
+uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) = {
+     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = {
+     0, 8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
+    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
+    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
+    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
+};
+*/
+
+#define ROW_SHIFT 15	/* right shift applied to the 32-bit row-pass sums (psrad) */
+#define COL_SHIFT 6	/* right shift applied to the 16-bit column-pass results (psraw) */
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))	/* (bias + 0.5) scaled by 2^ROW_SHIFT; NOTE(review): reuses the name of the C library round() */
+#define rounder(bias) {round (bias), round (bias)}	/* two identical 32-bit lanes: one 64-bit MMX operand */
+#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)}	/* four identical lanes: one 128-bit SSE2 operand */
+
+
+#if 0
+/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+			     int16_t * table, int32_t * rounder)
+{	/* table[1..7] supply C1..C7; row[] is indexed in natural order here, whereas the SIMD loads below expect evens in row[0..3] and odds in row[4..7] */
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;	/* a0..a3: even inputs plus the rounding bias */
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];	/* b0..b3: odd inputs, no bias */
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;	/* outputs 4..7 reuse a3..a0 with b subtracted */
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* SSE2 row IDCT -- sse2_table holds 4 groups of 8 coefficients; SSE2_IDCT_2ROW multiplies them against the (x0,x2), (x4,x6), (x1,x3), (x5,x7) input pairs in that order */
+#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
+					    c4, -c6,  c4, -c2,   \
+					    c4,  c6, -c4, -c2,   \
+					   -c4,  c2,  c4, -c6,   \
+					    c1,  c3,  c3, -c7,   \
+					    c5, -c1,  c7, -c5,   \
+					    c5,  c7, -c1, -c5,   \
+					    c7,  c3,  c3, -c1 }
+
+#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do {               \
+    /* no scheduling: trust in out of order execution */                     \
+    /* based on Intel AP-945 */                                              \
+    /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */       \
+    /* clobbers xmm1-xmm3 and xmm5-xmm7; leaves results in row1 and row2 */  \
+    /* input */                      /* 1: row1= x7 x5 x3 x1  x6 x4 x2 x0 */ \
+    pshufd_r2r   (row1, xmm1, 0);    /* 1: xmm1= x2 x0 x2 x0  x2 x0 x2 x0 */ \
+    pmaddwd_m2r  (table[0], xmm1);   /* 1: xmm1= x2*C + x0*C ...          */ \
+    pshufd_r2r   (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1  x3 x1 x3 x1 */ \
+    pmaddwd_m2r  (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ...          */ \
+    pshufd_r2r   (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4  x6 x4 x6 x4 */ \
+    pshufd_r2r   (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5  x7 x5 x7 x5 */ \
+    pmaddwd_m2r  (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ...          */ \
+    paddd_m2r    (round1, xmm1);     /* 1: xmm1= x2*C + x0*C + round ...  */ \
+    pmaddwd_m2r  (table[3*8], row1); /* 1: row1= x7*C + x5*C ...          */ \
+    pshufd_r2r   (row2, xmm5, 0);    /*    2:                             */ \
+    pshufd_r2r   (row2, xmm6, 0x55); /*    2:                             */ \
+    pmaddwd_m2r  (table[0], xmm5);   /*    2:                             */ \
+    paddd_r2r    (xmm2, xmm1);       /* 1: xmm1= a[]                      */ \
+    movdqa_r2r   (xmm1, xmm2);       /* 1: xmm2= a[]                      */ \
+    pshufd_r2r   (row2, xmm7, 0xaa); /*    2:                             */ \
+    pmaddwd_m2r  (table[1*8], xmm6); /*    2:                             */ \
+    paddd_r2r    (xmm3, row1);       /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \
+    pshufd_r2r   (row2, row2, 0xff); /*    2:                             */ \
+    psubd_r2r    (row1, xmm2);       /* 1: xmm2= a[] - b[]                */ \
+    pmaddwd_m2r  (table[2*8], xmm7); /*    2:                             */ \
+    paddd_r2r    (xmm1, row1);       /* 1: row1= a[] + b[]                */ \
+    psrad_i2r    (ROW_SHIFT, xmm2);  /* 1: xmm2= result 4...7             */ \
+    paddd_m2r    (round2, xmm5);     /*    2:                             */ \
+    pmaddwd_m2r  (table[3*8], row2); /*    2:                             */ \
+    paddd_r2r    (xmm6, xmm5);       /*    2:                             */ \
+    movdqa_r2r   (xmm5, xmm6);       /*    2:                             */ \
+    psrad_i2r    (ROW_SHIFT, row1);  /* 1: row1= result 0...3             */ \
+    pshufd_r2r   (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0]         */ \
+    packssdw_r2r (xmm2, row1);       /* 1: row1= result[]                 */ \
+    paddd_r2r    (xmm7, row2);       /*    2:                             */ \
+    psubd_r2r    (row2, xmm6);       /*    2:                             */ \
+    paddd_r2r    (xmm5, row2);       /*    2:                             */ \
+    psrad_i2r    (ROW_SHIFT, xmm6);  /*    2:                             */ \
+    psrad_i2r    (ROW_SHIFT, row2);  /*    2:                             */ \
+    pshufd_r2r   (xmm6, xmm6, 0x1b); /*    2:                             */ \
+    packssdw_r2r (xmm6, row2);       /*    2:                             */ \
+} while (0)
+
+
+/* MMXEXT row IDCT -- mmxext_table is read in 4-word groups at offsets 0 and 4 (mmxext_row_head/mmxext_row_mid) and 8, 12, 16, 20, 24, 28 (mmxext_row) */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
+						   c4,  c6,  c4,  c6,	\
+						   c1,  c3, -c1, -c5,	\
+						   c5,  c7,  c3, -c7,	\
+						   c4, -c6,  c4, -c6,	\
+						  -c4,  c2,  c4, -c2,	\
+						   c5, -c1,  c3, -c1,	\
+						   c7,  c3,  c7, -c5 }
+
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+				    const int16_t * const table)
+{	/* loads the 8 inputs at row+offset and primes mm0, mm2-mm6 for mmxext_row */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
+}
+
+static inline void mmxext_row (const int16_t * const table,
+			       const int32_t * const rounder)
+{	/* consumes registers primed by mmxext_row_head/mmxext_row_mid; leaves mm1, mm3 already shifted and mm0, mm4 still to be shifted by mmxext_row_tail/mmxext_row_mid */
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
+
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
+
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
+
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
+
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
+
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
+
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
+
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
+
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm4);		/* mm4 = a3 a2 + rounder */
+
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm4);		/* mm4 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmxext_row_tail (int16_t * const row, const int store)
+{	/* finishes the row left in mm0/mm1/mm3/mm4 by mmxext_row: shifts, packs to 16 bit and stores 8 results at row+store */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
+
+    /* slot */
+
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
+}
+
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+				   const int offset,
+				   const int16_t * const table)
+{	/* the steps of mmxext_row_tail (previous row, stored at row+store) interleaved with those of mmxext_row_head (next row, loaded from row+offset) */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
+
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
+
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
+}
+
+
+/* MMX row IDCT -- mmx_table is read in 4-word groups at offsets 0, 4, 8 (mmx_row_head/mmx_row_mid) and 12, 16, 20, 24, 28 (mmx_row) */
+
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
+					   c4,  c6, -c4, -c2,	\
+					   c1,  c3,  c3, -c7,	\
+					   c5,  c7, -c1, -c5,	\
+					   c4, -c6,  c4, -c2,	\
+					  -c4,  c2,  c4, -c6,	\
+					   c5, -c1,  c7, -c5,	\
+					   c7,  c3,  c3, -c1 }
+
+static inline void mmx_row_head (int16_t * const row, const int offset,
+				 const int16_t * const table)
+{	/* loads the 8 inputs at row+offset and primes mm0-mm6 for mmx_row (mm1 is preloaded from table+8) */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
+}
+
+static inline void mmx_row (const int16_t * const table,
+			    const int32_t * const rounder)
+{	/* consumes registers primed by mmx_row_head/mmx_row_mid; leaves mm1, mm3 already shifted and mm0, mm7 still to be shifted by mmx_row_tail/mmx_row_mid */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
+
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
+
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
+
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
+
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
+
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
+
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
+
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
+
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
+
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm7);		/* mm7 = a3 a2 + rounder */
+
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm7);		/* mm7 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmx_row_tail (int16_t * const row, const int store)
+{	/* finishes the row left in mm0/mm1/mm3/mm7 by mmx_row: shifts, packs to 16 bit and stores 8 results at row+store */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm4);		/* mm4 = y6 y7 y4 y5 */
+
+    pslld_i2r (16, mm7);		/* mm7 = y7 0 y5 0 */
+
+    psrld_i2r (16, mm4);		/* mm4 = 0 y6 0 y4 */
+
+    por_r2r (mm4, mm7);			/* mm7 = y7 y6 y5 y4 */
+
+    /* slot */
+
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+}
+
+static inline void mmx_row_mid (int16_t * const row, const int store,
+				const int offset, const int16_t * const table)
+{	/* the steps of mmx_row_tail (previous row, stored at row+store) interleaved with those of mmx_row_head (next row, loaded from row+offset) */
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
+
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
+
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
+
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
+
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm1);		/* mm1 = y6 y7 y4 y5 */
+
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
+
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
+
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
+
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
+
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+}
+
+
+#if 0
+/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_col (int16_t * col, int offset)
+{	/* T1, T2, T3 and C4 used below are #defined just after this disabled block */
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+    col += offset;
+
+    x0 = col[0*8];
+    x1 = col[1*8];
+    x2 = col[2*8];
+    x3 = col[3*8];
+    x4 = col[4*8];
+    x5 = col[5*8];
+    x6 = col[6*8];
+    x7 = col[7*8];
+
+    u04 = S (x0 + x4);
+    v04 = S (x0 - x4);
+    u26 = S (F (T2, x6) + x2);
+    v26 = S (F (T2, x2) - x6);
+
+    a0 = S (u04 + u26);
+    a1 = S (v04 + v26);
+    a2 = S (v04 - v26);
+    a3 = S (u04 - u26);
+
+    u17 = S (F (T1, x7) + x1);
+    v17 = S (F (T1, x1) - x7);
+    u35 = S (F (T3, x5) + x3);
+    v35 = S (F (T3, x3) - x5);
+
+    b0 = S (u17 + u35);
+    b3 = S (v17 - v35);
+    u12 = S (u17 - u35);
+    v12 = S (v17 + v35);
+    u12 = S (2 * F (C4, u12));	/* F (C4, .) scales by C4/65536; the 2* doubles it back (the asm versions double with paddsw reg,reg) */
+    v12 = S (2 * F (C4, v12));
+    b1 = S (u12 + v12);
+    b2 = S (u12 - v12);
+
+    y0 = S (a0 + b0) >> COL_SHIFT;
+    y1 = S (a1 + b1) >> COL_SHIFT;
+    y2 = S (a2 + b2) >> COL_SHIFT;
+    y3 = S (a3 + b3) >> COL_SHIFT;
+
+    y4 = S (a3 - b3) >> COL_SHIFT;
+    y5 = S (a2 - b2) >> COL_SHIFT;
+    y6 = S (a1 - b1) >> COL_SHIFT;
+    y7 = S (a0 - b0) >> COL_SHIFT;
+
+    col[0*8] = y0;
+    col[1*8] = y1;
+    col[2*8] = y2;
+    col[3*8] = y3;
+    col[4*8] = y4;
+    col[5*8] = y5;
+    col[6*8] = y6;
+    col[7*8] = y7;
+}
+#endif
+
+
+#define T1 13036	/* ~= tan(pi/16) * 65536 */
+#define T2 27146	/* ~= tan(2*pi/16) * 65536 */
+#define T3 43790	/* ~= tan(3*pi/16) * 65536; above 32767, so in the short vectors it wraps to T3-65536 (the "T3-1" in the comments). NOTE(review): brace-initialising a short with it is a narrowing conversion under C++11 and later */
+#define C4 23170	/* ~= cos(pi/4) * 32768; pmulhw with it yields half the product (the "C4/2" in the comments) */
+
+
+/* SSE2 column IDCT */
+static inline void sse2_idct_col (int16_t * const col)
+{
+    /* Almost identical to mmxext version:  */
+    /* just do both 4x8 columns in parallel */
+
+    static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1};
+    static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2};
+    static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3};
+    static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4};
+
+#if defined(__x86_64__)
+
+    /* INPUT: block in xmm8 ... xmm15 (rows x0 ... x7); col is not dereferenced in this branch */
+
+    movdqa_m2r (*t1_vector, xmm0);	/* xmm0  = T1 */
+    movdqa_r2r (xmm9, xmm1);		/* xmm1  = x1 */
+
+    movdqa_r2r (xmm0, xmm2);		/* xmm2  = T1 */
+    pmulhw_r2r (xmm1, xmm0);		/* xmm0  = T1*x1 */
+
+    movdqa_m2r (*t3_vector, xmm5);	/* xmm5  = T3 */
+    pmulhw_r2r (xmm15, xmm2);		/* xmm2  = T1*x7 */
+
+    movdqa_r2r (xmm5, xmm7);		/* xmm7  = T3-1 */
+    psubsw_r2r (xmm15, xmm0);		/* xmm0  = v17 */
+
+    movdqa_m2r (*t2_vector, xmm9);	/* xmm9  = T2 */
+    pmulhw_r2r (xmm11, xmm5);		/* xmm5  = (T3-1)*x3 */
+
+    paddsw_r2r (xmm2, xmm1);		/* xmm1  = u17 */
+    pmulhw_r2r (xmm13, xmm7);		/* xmm7  = (T3-1)*x5 */
+
+    movdqa_r2r (xmm9, xmm2);		/* xmm2  = T2 */
+    paddsw_r2r (xmm11, xmm5);		/* xmm5  = T3*x3 */
+
+    pmulhw_r2r (xmm10, xmm9);   	/* xmm9  = T2*x2 */
+    paddsw_r2r (xmm13, xmm7);		/* xmm7  = T3*x5 */
+
+    psubsw_r2r (xmm13, xmm5);		/* xmm5  = v35 */
+    paddsw_r2r (xmm11, xmm7);		/* xmm7  = u35 */
+
+    movdqa_r2r (xmm0, xmm6);		/* xmm6  = v17 */
+    pmulhw_r2r (xmm14, xmm2);		/* xmm2  = T2*x6 */
+
+    psubsw_r2r (xmm5, xmm0);		/* xmm0  = b3 */
+    psubsw_r2r (xmm14, xmm9);		/* xmm9  = v26 */
+
+    paddsw_r2r (xmm6, xmm5);		/* xmm5  = v12 */
+    movdqa_r2r (xmm0, xmm11);		/* xmm11 = b3 */
+
+    movdqa_r2r (xmm1, xmm6);		/* xmm6  = u17 */
+    paddsw_r2r (xmm10, xmm2);		/* xmm2  = u26 */
+
+    paddsw_r2r (xmm7, xmm6);		/* xmm6  = b0 */
+    psubsw_r2r (xmm7, xmm1);		/* xmm1  = u12 */
+
+    movdqa_r2r (xmm1, xmm7);		/* xmm7  = u12 */
+    paddsw_r2r (xmm5, xmm1);		/* xmm1  = u12+v12 */
+
+    movdqa_m2r (*c4_vector, xmm0);	/* xmm0  = C4/2 */
+    psubsw_r2r (xmm5, xmm7);		/* xmm7  = u12-v12 */
+
+    movdqa_r2r (xmm6, xmm4);		/* xmm4  = b0 */
+    pmulhw_r2r (xmm0, xmm1);		/* xmm1  = b1/2 */
+
+    movdqa_r2r (xmm9, xmm6);		/* xmm6  = v26 */
+    pmulhw_r2r (xmm0, xmm7);		/* xmm7  = b2/2 */
+
+    movdqa_r2r (xmm8, xmm10);		/* xmm10 = x0 */
+    movdqa_r2r (xmm8, xmm0);		/* xmm0  = x0 */
+
+    psubsw_r2r (xmm12, xmm10);		/* xmm10 = v04 */
+    paddsw_r2r (xmm12, xmm0);		/* xmm0  = u04 */
+
+    paddsw_r2r (xmm10, xmm9);		/* xmm9  = a1 */
+    movdqa_r2r (xmm0, xmm8);		/* xmm8  = u04 */
+
+    psubsw_r2r (xmm6, xmm10);		/* xmm10 = a2 */
+    paddsw_r2r (xmm2, xmm8);		/* xmm8  = a0 */
+
+    paddsw_r2r (xmm1, xmm1);		/* xmm1  = b1 */
+    psubsw_r2r (xmm2, xmm0);		/* xmm0  = a3 */
+
+    paddsw_r2r (xmm7, xmm7);		/* xmm7  = b2 */
+    movdqa_r2r (xmm10, xmm13);		/* xmm13 = a2 */
+
+    movdqa_r2r (xmm9, xmm14);		/* xmm14 = a1 */
+    paddsw_r2r (xmm7, xmm10);		/* xmm10 = a2+b2 */
+
+    psraw_i2r (COL_SHIFT,xmm10);	/* xmm10 = y2 */
+    paddsw_r2r (xmm1, xmm9);		/* xmm9  = a1+b1 */
+
+    psraw_i2r (COL_SHIFT, xmm9);	/* xmm9  = y1 */
+    psubsw_r2r (xmm1, xmm14);		/* xmm14 = a1-b1 */
+
+    psubsw_r2r (xmm7, xmm13);		/* xmm13 = a2-b2 */
+    psraw_i2r (COL_SHIFT,xmm14);	/* xmm14 = y6 */
+
+    movdqa_r2r (xmm8, xmm15);		/* xmm15 = a0 */
+    psraw_i2r (COL_SHIFT,xmm13);	/* xmm13 = y5 */
+
+    paddsw_r2r (xmm4, xmm8);		/* xmm8  = a0+b0 */
+    psubsw_r2r (xmm4, xmm15);		/* xmm15 = a0-b0 */
+
+    psraw_i2r (COL_SHIFT, xmm8);	/* xmm8  = y0 */
+    movdqa_r2r (xmm0, xmm12);		/* xmm12 = a3 */
+
+    psubsw_r2r (xmm11, xmm12);		/* xmm12 = a3-b3 */
+    psraw_i2r (COL_SHIFT,xmm15);	/* xmm15 = y7 */
+
+    paddsw_r2r (xmm0, xmm11);		/* xmm11 = a3+b3 */
+    psraw_i2r (COL_SHIFT,xmm12);	/* xmm12 = y4 */
+
+    psraw_i2r (COL_SHIFT,xmm11);	/* xmm11 = y3 */
+
+    /* OUTPUT: block in xmm8 ... xmm15 */
+
+#else
+    movdqa_m2r (*t1_vector, xmm0);	/* xmm0 = T1 */
+
+    movdqa_m2r (*(col+1*8), xmm1);	/* xmm1 = x1 */
+    movdqa_r2r (xmm0, xmm2);		/* xmm2 = T1 */
+
+    movdqa_m2r (*(col+7*8), xmm4);	/* xmm4 = x7 */
+    pmulhw_r2r (xmm1, xmm0);		/* xmm0 = T1*x1 */
+
+    movdqa_m2r (*t3_vector, xmm5);	/* xmm5 = T3 */
+    pmulhw_r2r (xmm4, xmm2);		/* xmm2 = T1*x7 */
+
+    movdqa_m2r (*(col+5*8), xmm6);	/* xmm6 = x5 */
+    movdqa_r2r (xmm5, xmm7);		/* xmm7 = T3-1 */
+
+    movdqa_m2r (*(col+3*8), xmm3);	/* xmm3 = x3 */
+    psubsw_r2r (xmm4, xmm0);		/* xmm0 = v17 */
+
+    movdqa_m2r (*t2_vector, xmm4);	/* xmm4 = T2 */
+    pmulhw_r2r (xmm3, xmm5);		/* xmm5 = (T3-1)*x3 */
+
+    paddsw_r2r (xmm2, xmm1);		/* xmm1 = u17 */
+    pmulhw_r2r (xmm6, xmm7);		/* xmm7 = (T3-1)*x5 */
+
+    /* slot */
+
+    movdqa_r2r (xmm4, xmm2);		/* xmm2 = T2 */
+    paddsw_r2r (xmm3, xmm5);		/* xmm5 = T3*x3 */
+
+    pmulhw_m2r (*(col+2*8), xmm4);	/* xmm4 = T2*x2 */
+    paddsw_r2r (xmm6, xmm7);		/* xmm7 = T3*x5 */
+
+    psubsw_r2r (xmm6, xmm5);		/* xmm5 = v35 */
+    paddsw_r2r (xmm3, xmm7);		/* xmm7 = u35 */
+
+    movdqa_m2r (*(col+6*8), xmm3);	/* xmm3 = x6 */
+    movdqa_r2r (xmm0, xmm6);		/* xmm6 = v17 */
+
+    pmulhw_r2r (xmm3, xmm2);		/* xmm2 = T2*x6 */
+    psubsw_r2r (xmm5, xmm0);		/* xmm0 = b3 */
+
+    psubsw_r2r (xmm3, xmm4);		/* xmm4 = v26 */
+    paddsw_r2r (xmm6, xmm5);		/* xmm5 = v12 */
+
+    movdqa_r2m (xmm0, *(col+3*8));	/* save b3 in scratch0 */
+    movdqa_r2r (xmm1, xmm6);		/* xmm6 = u17 */
+
+    paddsw_m2r (*(col+2*8), xmm2);	/* xmm2 = u26 */
+    paddsw_r2r (xmm7, xmm6);		/* xmm6 = b0 */
+
+    psubsw_r2r (xmm7, xmm1);		/* xmm1 = u12 */
+    movdqa_r2r (xmm1, xmm7);		/* xmm7 = u12 */
+
+    movdqa_m2r (*(col+0*8), xmm3);	/* xmm3 = x0 */
+    paddsw_r2r (xmm5, xmm1);		/* xmm1 = u12+v12 */
+
+    movdqa_m2r (*c4_vector, xmm0);	/* xmm0 = C4/2 */
+    psubsw_r2r (xmm5, xmm7);		/* xmm7 = u12-v12 */
+
+    movdqa_r2m (xmm6, *(col+5*8));	/* save b0 in scratch1 */
+    pmulhw_r2r (xmm0, xmm1);		/* xmm1 = b1/2 */
+
+    movdqa_r2r (xmm4, xmm6);		/* xmm6 = v26 */
+    pmulhw_r2r (xmm0, xmm7);		/* xmm7 = b2/2 */
+
+    movdqa_m2r (*(col+4*8), xmm5);	/* xmm5 = x4 */
+    movdqa_r2r (xmm3, xmm0);		/* xmm0 = x0 */
+
+    psubsw_r2r (xmm5, xmm3);		/* xmm3 = v04 */
+    paddsw_r2r (xmm5, xmm0);		/* xmm0 = u04 */
+
+    paddsw_r2r (xmm3, xmm4);		/* xmm4 = a1 */
+    movdqa_r2r (xmm0, xmm5);		/* xmm5 = u04 */
+
+    psubsw_r2r (xmm6, xmm3);		/* xmm3 = a2 */
+    paddsw_r2r (xmm2, xmm5);		/* xmm5 = a0 */
+
+    paddsw_r2r (xmm1, xmm1);		/* xmm1 = b1 */
+    psubsw_r2r (xmm2, xmm0);		/* xmm0 = a3 */
+
+    paddsw_r2r (xmm7, xmm7);		/* xmm7 = b2 */
+    movdqa_r2r (xmm3, xmm2);		/* xmm2 = a2 */
+
+    movdqa_r2r (xmm4, xmm6);		/* xmm6 = a1 */
+    paddsw_r2r (xmm7, xmm3);		/* xmm3 = a2+b2 */
+
+    psraw_i2r (COL_SHIFT, xmm3);	/* xmm3 = y2 */
+    paddsw_r2r (xmm1, xmm4);		/* xmm4 = a1+b1 */
+
+    psraw_i2r (COL_SHIFT, xmm4);	/* xmm4 = y1 */
+    psubsw_r2r (xmm1, xmm6);		/* xmm6 = a1-b1 */
+
+    movdqa_m2r (*(col+5*8), xmm1);	/* xmm1 = b0 */
+    psubsw_r2r (xmm7, xmm2);		/* xmm2 = a2-b2 */
+
+    psraw_i2r (COL_SHIFT, xmm6);	/* xmm6 = y6 */
+    movdqa_r2r (xmm5, xmm7);		/* xmm7 = a0 */
+
+    movdqa_r2m (xmm4, *(col+1*8));	/* save y1 */
+    psraw_i2r (COL_SHIFT, xmm2);	/* xmm2 = y5 */
+
+    movdqa_r2m (xmm3, *(col+2*8));	/* save y2 */
+    paddsw_r2r (xmm1, xmm5);		/* xmm5 = a0+b0 */
+
+    movdqa_m2r (*(col+3*8), xmm4);	/* xmm4 = b3 */
+    psubsw_r2r (xmm1, xmm7);		/* xmm7 = a0-b0 */
+
+    psraw_i2r (COL_SHIFT, xmm5);	/* xmm5 = y0 */
+    movdqa_r2r (xmm0, xmm3);		/* xmm3 = a3 */
+
+    movdqa_r2m (xmm2, *(col+5*8));	/* save y5 */
+    psubsw_r2r (xmm4, xmm3);		/* xmm3 = a3-b3 */
+
+    psraw_i2r (COL_SHIFT, xmm7);	/* xmm7 = y7 */
+    paddsw_r2r (xmm0, xmm4);		/* xmm4 = a3+b3 */
+
+    movdqa_r2m (xmm5, *(col+0*8));	/* save y0 */
+    psraw_i2r (COL_SHIFT, xmm3);	/* xmm3 = y4 */
+
+    movdqa_r2m (xmm6, *(col+6*8));	/* save y6 */
+    psraw_i2r (COL_SHIFT, xmm4);	/* xmm4 = y3 */
+
+    movdqa_r2m (xmm7, *(col+7*8));	/* save y7 */
+
+    movdqa_r2m (xmm3, *(col+4*8));	/* save y4 */
+
+    movdqa_r2m (xmm4, *(col+3*8));	/* save y3 */
+#endif
+}
+
+
+/* MMX column IDCT: transforms 4 adjacent columns in place, starting at col+offset, with rows 8 int16 apart; rows 3 and 5 double as scratch for b3 and b0 */
+static inline void idct_col (int16_t * const col, const int offset)
+{
+    static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+    /* column code adapted from peter gubanov */
+    /* http://www.elecard.com/peter/idct.shtml */
+
+    movq_m2r (*t1_vector, mm0);		/* mm0 = T1 */
+
+    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
+    movq_r2r (mm0, mm2);		/* mm2 = T1 */
+
+    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
+    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
+
+    movq_m2r (*t3_vector, mm5);		/* mm5 = T3 */
+    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
+
+    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
+    movq_r2r (mm5, mm7);		/* mm7 = T3-1 */
+
+    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
+    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
+
+    movq_m2r (*t2_vector, mm4);		/* mm4 = T2 */
+    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
+
+    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
+    pmulhw_r2r (mm6, mm7);		/* mm7 = (T3-1)*x5 */
+
+    /* slot */
+
+    movq_r2r (mm4, mm2);		/* mm2 = T2 */
+    paddsw_r2r (mm3, mm5);		/* mm5 = T3*x3 */
+
+    pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
+    paddsw_r2r (mm6, mm7);		/* mm7 = T3*x5 */
+
+    psubsw_r2r (mm6, mm5);		/* mm5 = v35 */
+    paddsw_r2r (mm3, mm7);		/* mm7 = u35 */
+
+    movq_m2r (*(col+offset+6*8), mm3);	/* mm3 = x6 */
+    movq_r2r (mm0, mm6);		/* mm6 = v17 */
+
+    pmulhw_r2r (mm3, mm2);		/* mm2 = T2*x6 */
+    psubsw_r2r (mm5, mm0);		/* mm0 = b3 */
+
+    psubsw_r2r (mm3, mm4);		/* mm4 = v26 */
+    paddsw_r2r (mm6, mm5);		/* mm5 = v12 */
+
+    movq_r2m (mm0, *(col+offset+3*8));	/* save b3 in scratch0 */
+    movq_r2r (mm1, mm6);		/* mm6 = u17 */
+
+    paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
+    paddsw_r2r (mm7, mm6);		/* mm6 = b0 */
+
+    psubsw_r2r (mm7, mm1);		/* mm1 = u12 */
+    movq_r2r (mm1, mm7);		/* mm7 = u12 */
+
+    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
+    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */
+
+    movq_m2r (*c4_vector, mm0);		/* mm0 = C4/2 */
+    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */
+
+    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
+    pmulhw_r2r (mm0, mm1);		/* mm1 = b1/2 */
+
+    movq_r2r (mm4, mm6);		/* mm6 = v26 */
+    pmulhw_r2r (mm0, mm7);		/* mm7 = b2/2 */
+
+    movq_m2r (*(col+offset+4*8), mm5);	/* mm5 = x4 */
+    movq_r2r (mm3, mm0);		/* mm0 = x0 */
+
+    psubsw_r2r (mm5, mm3);		/* mm3 = v04 */
+    paddsw_r2r (mm5, mm0);		/* mm0 = u04 */
+
+    paddsw_r2r (mm3, mm4);		/* mm4 = a1 */
+    movq_r2r (mm0, mm5);		/* mm5 = u04 */
+
+    psubsw_r2r (mm6, mm3);		/* mm3 = a2 */
+    paddsw_r2r (mm2, mm5);		/* mm5 = a0 */
+
+    paddsw_r2r (mm1, mm1);		/* mm1 = b1 */
+    psubsw_r2r (mm2, mm0);		/* mm0 = a3 */
+
+    paddsw_r2r (mm7, mm7);		/* mm7 = b2 */
+    movq_r2r (mm3, mm2);		/* mm2 = a2 */
+
+    movq_r2r (mm4, mm6);		/* mm6 = a1 */
+    paddsw_r2r (mm7, mm3);		/* mm3 = a2+b2 */
+
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y2 */
+    paddsw_r2r (mm1, mm4);		/* mm4 = a1+b1 */
+
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y1 */
+    psubsw_r2r (mm1, mm6);		/* mm6 = a1-b1 */
+
+    movq_m2r (*(col+offset+5*8), mm1);	/* mm1 = b0 */
+    psubsw_r2r (mm7, mm2);		/* mm2 = a2-b2 */
+
+    psraw_i2r (COL_SHIFT, mm6);		/* mm6 = y6 */
+    movq_r2r (mm5, mm7);		/* mm7 = a0 */
+
+    movq_r2m (mm4, *(col+offset+1*8));	/* save y1 */
+    psraw_i2r (COL_SHIFT, mm2);		/* mm2 = y5 */
+
+    movq_r2m (mm3, *(col+offset+2*8));	/* save y2 */
+    paddsw_r2r (mm1, mm5);		/* mm5 = a0+b0 */
+
+    movq_m2r (*(col+offset+3*8), mm4);	/* mm4 = b3 */
+    psubsw_r2r (mm1, mm7);		/* mm7 = a0-b0 */
+
+    psraw_i2r (COL_SHIFT, mm5);		/* mm5 = y0 */
+    movq_r2r (mm0, mm3);		/* mm3 = a3 */
+
+    movq_r2m (mm2, *(col+offset+5*8));	/* save y5 */
+    psubsw_r2r (mm4, mm3);		/* mm3 = a3-b3 */
+
+    psraw_i2r (COL_SHIFT, mm7);		/* mm7 = y7 */
+    paddsw_r2r (mm0, mm4);		/* mm4 = a3+b3 */
+
+    movq_r2m (mm5, *(col+offset+0*8));	/* save y0 */
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y4 */
+
+    movq_r2m (mm6, *(col+offset+6*8));	/* save y6 */
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y3 */
+
+    movq_r2m (mm7, *(col+offset+7*8));	/* save y7 */
+
+    movq_r2m (mm3, *(col+offset+4*8));	/* save y4 */
+
+    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
+}
+
+
+static const int32_t rounder0[] ATTR_ALIGN(8) =
+    rounder ((1 << (COL_SHIFT - 1)) - 0.5);	/* expands to 1 << 20 per lane; presumably pre-adds the column pass's rounding term, since idct_col shifts by COL_SHIFT without adding one */
+static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);	/* expands to 1 << (ROW_SHIFT - 1) per lane: plain round-to-nearest for the >> ROW_SHIFT */
+static const int32_t rounder1[] ATTR_ALIGN(8) =
+    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+static const int32_t rounder7[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+static const int32_t rounder2[] ATTR_ALIGN(8) =
+    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
+static const int32_t rounder6[] ATTR_ALIGN(8) =
+    rounder (-0.25);		/* C2 * (C6-C2)/2 */
+static const int32_t rounder3[] ATTR_ALIGN(8) =
+    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+static const int32_t rounder5[] ATTR_ALIGN(8) =
+    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+
+
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
+static inline void idct (int16_t * const block)				\
+{	/* rows are processed in the order 0,4,1,7,2,6,3,5; each idct_row_mid stores the previous row and loads the next */	\
+    static const int16_t table04[] ATTR_ALIGN(16) =			\
+	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
+    static const int16_t table17[] ATTR_ALIGN(16) =			\
+	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
+    static const int16_t table26[] ATTR_ALIGN(16) =			\
+	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
+    static const int16_t table35[] ATTR_ALIGN(16) =			\
+	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
+									\
+    idct_row_head (block, 0*8, table04);				\
+    idct_row (table04, rounder0);					\
+    idct_row_mid (block, 0*8, 4*8, table04);				\
+    idct_row (table04, rounder4);					\
+    idct_row_mid (block, 4*8, 1*8, table17);				\
+    idct_row (table17, rounder1);					\
+    idct_row_mid (block, 1*8, 7*8, table17);				\
+    idct_row (table17, rounder7);					\
+    idct_row_mid (block, 7*8, 2*8, table26);				\
+    idct_row (table26, rounder2);					\
+    idct_row_mid (block, 2*8, 6*8, table26);				\
+    idct_row (table26, rounder6);					\
+    idct_row_mid (block, 6*8, 3*8, table35);				\
+    idct_row (table35, rounder3);					\
+    idct_row_mid (block, 3*8, 5*8, table35);				\
+    idct_row (table35, rounder5);					\
+    idct_row_tail (block, 5*8);						\
+									\
+    idct_col (block, 0);	/* columns 0-3 */			\
+    idct_col (block, 4);	/* columns 4-7 */			\
+}
+
+static inline void sse2_idct (int16_t * const block)
+{   /* SSE2 IDCT of the 8x8 int16_t block; movdqa requires block to be 16-byte aligned. */
+    static const int16_t table04[] ATTR_ALIGN(16) =
+	sse2_table (22725, 21407, 19266, 16384, 12873,  8867, 4520);
+    static const int16_t table17[] ATTR_ALIGN(16) =
+	sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270);
+    static const int16_t table26[] ATTR_ALIGN(16) =
+	sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906);
+    static const int16_t table35[] ATTR_ALIGN(16) =
+	sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315);
+    /* 128-bit rounding terms, one per row (same constants as rounder0..7). */
+    static const int32_t rounder0_128[] ATTR_ALIGN(16) =
+	rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5);
+    static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0);
+    static const int32_t rounder1_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
+    static const int32_t rounder7_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
+    static const int32_t rounder2_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (0.60355339059);	/* C2 * (C6+C2)/2 */
+    static const int32_t rounder6_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (-0.25);		/* C2 * (C6-C2)/2 */
+    static const int32_t rounder3_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
+    static const int32_t rounder5_128[] ATTR_ALIGN(16) =
+	rounder_sse2 (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
+    /* Row pass: one SSE2_IDCT_2ROW per pair of rows sharing a table. */
+#if defined(__x86_64__)
+    movdqa_m2r (block[0*8], xmm8);
+    movdqa_m2r (block[4*8], xmm12);
+    SSE2_IDCT_2ROW (table04,  xmm8, xmm12, *rounder0_128, *rounder4_128);
+
+    movdqa_m2r (block[1*8], xmm9);
+    movdqa_m2r (block[7*8], xmm15);
+    SSE2_IDCT_2ROW (table17,  xmm9, xmm15, *rounder1_128, *rounder7_128);
+
+    movdqa_m2r (block[2*8], xmm10);
+    movdqa_m2r (block[6*8], xmm14);
+    SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128);
+
+    movdqa_m2r (block[3*8], xmm11);
+    movdqa_m2r (block[5*8], xmm13);
+    SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128);
+
+    /* OUTPUT: block in xmm8 ... xmm15 */
+
+#else	/* no xmm8..15 here: each row pair is written back to block[] */
+    movdqa_m2r (block[0*8], xmm0);
+    movdqa_m2r (block[4*8], xmm4);
+    SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128);
+    movdqa_r2m (xmm0, block[0*8]);
+    movdqa_r2m (xmm4, block[4*8]);
+
+    movdqa_m2r (block[1*8], xmm0);
+    movdqa_m2r (block[7*8], xmm4);
+    SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128);
+    movdqa_r2m (xmm0, block[1*8]);
+    movdqa_r2m (xmm4, block[7*8]);
+
+    movdqa_m2r (block[2*8], xmm0);
+    movdqa_m2r (block[6*8], xmm4);
+    SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128);
+    movdqa_r2m (xmm0, block[2*8]);
+    movdqa_r2m (xmm4, block[6*8]);
+
+    movdqa_m2r (block[3*8], xmm0);
+    movdqa_m2r (block[5*8], xmm4);
+    SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128);
+    movdqa_r2m (xmm0, block[3*8]);
+    movdqa_r2m (xmm4, block[5*8]);
+#endif
+    /* Column pass; NOTE(review): on x86-64 it presumably consumes xmm8..15 - confirm. */
+    sse2_idct_col (block);
+}
+
+static void sse2_block_copy (int16_t * const block, uint8_t * dest,
+			     const int stride)
+{   /* Write the block to dest as 8 rows of 8 bytes; packuswb saturates each word to 0..255. */
+#if defined(__x86_64__)
+    /* INPUT: block in xmm8 ... xmm15 (the 'block' argument is not read here) */
+    packuswb_r2r (xmm8, xmm8);
+    packuswb_r2r (xmm9, xmm9);
+    movq_r2m (xmm8,  *(dest+0*stride));
+    packuswb_r2r (xmm10, xmm10);
+    movq_r2m (xmm9,  *(dest+1*stride));
+    packuswb_r2r (xmm11, xmm11);
+    movq_r2m (xmm10, *(dest+2*stride));
+    packuswb_r2r (xmm12, xmm12);
+    movq_r2m (xmm11, *(dest+3*stride));
+    packuswb_r2r (xmm13, xmm13);
+    movq_r2m (xmm12, *(dest+4*stride));
+    packuswb_r2r (xmm14, xmm14);
+    movq_r2m (xmm13, *(dest+5*stride));
+    packuswb_r2r (xmm15, xmm15);
+    movq_r2m (xmm14, *(dest+6*stride));
+    movq_r2m (xmm15, *(dest+7*stride));
+#else	/* rows are loaded from block[] (16-byte aligned loads) instead */
+    movdqa_m2r (*(block+0*8), xmm0);
+    movdqa_m2r (*(block+1*8), xmm1);
+    movdqa_m2r (*(block+2*8), xmm2);
+    packuswb_r2r (xmm0, xmm0);
+    movdqa_m2r (*(block+3*8), xmm3);
+    packuswb_r2r (xmm1, xmm1);
+    movdqa_m2r (*(block+4*8), xmm4);
+    packuswb_r2r (xmm2, xmm2);
+    movdqa_m2r (*(block+5*8), xmm5);
+    packuswb_r2r (xmm3, xmm3);
+    movdqa_m2r (*(block+6*8), xmm6);
+    packuswb_r2r (xmm4, xmm4);
+    movdqa_m2r (*(block+7*8), xmm7);
+    movq_r2m (xmm0, *(dest+0*stride));
+    packuswb_r2r (xmm5, xmm5);
+    movq_r2m (xmm1, *(dest+1*stride));
+    packuswb_r2r (xmm6, xmm6);
+    movq_r2m (xmm2, *(dest+2*stride));
+    packuswb_r2r (xmm7, xmm7);
+    movq_r2m (xmm3, *(dest+3*stride));
+    movq_r2m (xmm4, *(dest+4*stride));
+    movq_r2m (xmm5, *(dest+5*stride));
+    movq_r2m (xmm6, *(dest+6*stride));
+    movq_r2m (xmm7, *(dest+7*stride));
+#endif
+}
+
+#define COPY_MMX(offset,r0,r1,r2)	\
+do {	/* load+pack the row at 'offset' into r0; store the row packed one step earlier (r2) */ \
+    movq_m2r (*(block+offset), r0);	\
+    dest += stride;			\
+    movq_m2r (*(block+offset+4), r1);	\
+    movq_r2m (r2, *dest);		\
+    packuswb_r2r (r1, r0);		\
+} while (0)
+
+static inline void block_copy (int16_t * const block, uint8_t * dest,
+			       const int stride)
+{   /* MMX version: write block to dest as 8 rows of 8 bytes, each word saturated to 0..255. */
+    movq_m2r (*(block+0*8), mm0);
+    movq_m2r (*(block+0*8+4), mm1);
+    movq_m2r (*(block+1*8), mm2);
+    packuswb_r2r (mm1, mm0);
+    movq_m2r (*(block+1*8+4), mm3);
+    movq_r2m (mm0, *dest);
+    packuswb_r2r (mm3, mm2);
+    COPY_MMX (2*8, mm0, mm1, mm2);
+    COPY_MMX (3*8, mm2, mm3, mm0);
+    COPY_MMX (4*8, mm0, mm1, mm2);
+    COPY_MMX (5*8, mm2, mm3, mm0);
+    COPY_MMX (6*8, mm0, mm1, mm2);
+    COPY_MMX (7*8, mm2, mm3, mm0);
+    movq_r2m (mm2, *(dest+stride));	/* row 7, packed by the last COPY_MMX */
+}
+
+#define ADD_SSE2_2ROW(op, block0, block1)\
+do {	/* two dest rows += block0, block1, saturated to 0..255; needs xmm0 == 0 */ \
+    movq_m2r (*(dest), xmm1);		\
+    movq_m2r (*(dest+stride), xmm2);	\
+    punpcklbw_r2r (xmm0, xmm1);		\
+    punpcklbw_r2r (xmm0, xmm2);		\
+    paddsw_##op (block0, xmm1);		\
+    paddsw_##op (block1, xmm2);		\
+    packuswb_r2r (xmm1, xmm1);		\
+    packuswb_r2r (xmm2, xmm2);		\
+    movq_r2m (xmm1, *(dest));		\
+    movq_r2m (xmm2, *(dest+stride));	\
+    dest += 2*stride;			\
+} while (0)
+
+static void sse2_block_add (int16_t * const block, uint8_t * dest,
+			    const int stride)
+{   /* Add the 8x8 block to the 8x8 bytes at dest (rows 'stride' apart), clamping to 0..255. */
+    pxor_r2r(xmm0, xmm0);	/* zero register used to widen dest bytes to words */
+#if defined(__x86_64__)
+    /* INPUT: block in xmm8 ... xmm15 */
+    ADD_SSE2_2ROW(r2r, xmm8, xmm9);
+    ADD_SSE2_2ROW(r2r, xmm10, xmm11);
+    ADD_SSE2_2ROW(r2r, xmm12, xmm13);
+    ADD_SSE2_2ROW(r2r, xmm14, xmm15);
+#else	/* rows are taken from block[] as memory operands */
+    ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8));
+    ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8));
+    ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8));
+    ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8));
+#endif
+}
+
+#define ADD_MMX(offset,r1,r2,r3,r4)	\
+do {	/* pack+store the previous row's sums (r3:r4); r1:r2 = next dest row + block[offset..] */ \
+    movq_m2r (*(dest+2*stride), r1);	\
+    packuswb_r2r (r4, r3);		\
+    movq_r2r (r1, r2);			\
+    dest += stride;			\
+    movq_r2m (r3, *dest);		\
+    punpcklbw_r2r (mm0, r1);		\
+    paddsw_m2r (*(block+offset), r1);	\
+    punpckhbw_r2r (mm0, r2);		\
+    paddsw_m2r (*(block+offset+4), r2);	\
+} while (0)
+
+static inline void block_add (int16_t * const block, uint8_t * dest,
+			      const int stride)
+{   /* MMX version: add the 8x8 block to the 8x8 bytes at dest, clamping to 0..255. */
+    movq_m2r (*dest, mm1);
+    pxor_r2r (mm0, mm0);	/* mm0 = 0, used to widen dest bytes to words */
+    movq_m2r (*(dest+stride), mm3);
+    movq_r2r (mm1, mm2);
+    punpcklbw_r2r (mm0, mm1);
+    movq_r2r (mm3, mm4);
+    paddsw_m2r (*(block+0*8), mm1);
+    punpckhbw_r2r (mm0, mm2);
+    paddsw_m2r (*(block+0*8+4), mm2);
+    punpcklbw_r2r (mm0, mm3);
+    paddsw_m2r (*(block+1*8), mm3);
+    packuswb_r2r (mm2, mm1);
+    punpckhbw_r2r (mm0, mm4);
+    movq_r2m (mm1, *dest);
+    paddsw_m2r (*(block+1*8+4), mm4);
+    ADD_MMX (2*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (3*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (4*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (5*8, mm3, mm4, mm1, mm2);
+    ADD_MMX (6*8, mm1, mm2, mm3, mm4);
+    ADD_MMX (7*8, mm3, mm4, mm1, mm2);
+    packuswb_r2r (mm4, mm3);	/* row 7 sums left pending by the last ADD_MMX */
+    movq_r2m (mm3, *(dest+stride));
+}
+
+
+static inline void sse2_block_zero (int16_t * const block)
+{
+    pxor_r2r (xmm0, xmm0);		/* xmm0 = 0 */
+    movdqa_r2m (xmm0, block[0]);	/* eight aligned 16-byte stores clear all 64 coefficients */
+    movdqa_r2m (xmm0, block[8]);
+    movdqa_r2m (xmm0, block[16]);
+    movdqa_r2m (xmm0, block[24]);
+    movdqa_r2m (xmm0, block[32]);
+    movdqa_r2m (xmm0, block[40]);
+    movdqa_r2m (xmm0, block[48]);
+    movdqa_r2m (xmm0, block[56]);
+}
+
+static inline void block_zero (int16_t * const block)
+{
+    pxor_r2r (mm0, mm0);		/* mm0 = 0 */
+    movq_r2m (mm0, block[0]);		/* sixteen 8-byte stores clear all 64 coefficients */
+    movq_r2m (mm0, block[4]);
+    movq_r2m (mm0, block[8]);
+    movq_r2m (mm0, block[12]);
+    movq_r2m (mm0, block[16]);
+    movq_r2m (mm0, block[20]);
+    movq_r2m (mm0, block[24]);
+    movq_r2m (mm0, block[28]);
+    movq_r2m (mm0, block[32]);
+    movq_r2m (mm0, block[36]);
+    movq_r2m (mm0, block[40]);
+    movq_r2m (mm0, block[44]);
+    movq_r2m (mm0, block[48]);
+    movq_r2m (mm0, block[52]);
+    movq_r2m (mm0, block[56]);
+    movq_r2m (mm0, block[60]);
+}
+
+
+#define CPU_MMXEXT 0
+#define CPU_MMX 1
+/* dup4: copy the low 16-bit word of reg into all four words; reads 'cpu' from the caller's scope. */
+#define dup4(reg)			\
+do {					\
+    if (cpu != CPU_MMXEXT) {		\
+	punpcklwd_r2r (reg, reg);	\
+	punpckldq_r2r (reg, reg);	\
+    } else				\
+	pshufw_r2r (reg, reg, 0x00);	\
+} while (0)
+
+static inline void block_add_DC (int16_t * const block, uint8_t * dest,
+				 const int stride, const int cpu)
+{   /* Add the constant (block[0] + 64) >> 7 to the 8x8 bytes at dest, clamping to 0..255. */
+    movd_v2r ((block[0] + 64) >> 7, mm0);
+    pxor_r2r (mm1, mm1);
+    movq_m2r (*dest, mm2);
+    dup4 (mm0);				/* constant in all four words */
+    psubsw_r2r (mm0, mm1);		/* mm1 = -constant */
+    packuswb_r2r (mm0, mm0);		/* mm0 = 8 bytes of max(constant, 0), capped at 255 */
+    paddusb_r2r (mm0, mm2);
+    packuswb_r2r (mm1, mm1);		/* mm1 = 8 bytes of max(-constant, 0), capped at 255 */
+    movq_m2r (*(dest + stride), mm3);
+    psubusb_r2r (mm1, mm2);		/* each row: saturating add of mm0, then saturating subtract of mm1 */
+    block[0] = 0;			/* NOTE(review): presumably block[] must be left all zero - confirm against caller */
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    block[63] = 0;			/* NOTE(review): only block[0] and block[63] are cleared here - confirm no other coefficient can be set */
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *(dest + stride));
+    psubusb_r2r (mm1, mm3);
+    movq_r2m (mm3, *(dest + 2*stride));
+}
+
+void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest,
+			   const int stride)
+{
+    sse2_idct (block);				/* on x86-64 the result stays in xmm8..15 */
+    sse2_block_copy (block, dest, stride);	/* write it to dest as saturated bytes */
+    sse2_block_zero (block);			/* leave block[] all zero */
+}
+
+void mpeg2_idct_add_sse2 (const int last, int16_t * const block,
+			  uint8_t * const dest, const int stride)
+{
+    /* DC-only shortcut: taken when last == 129 and bits 4-6 of block[0] are not 100b. */
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4))
+	return block_add_DC (block, dest, stride, CPU_MMXEXT);
+    sse2_idct (block);
+    sse2_block_add (block, dest, stride);
+    sse2_block_zero (block);
+}
+
+
+declare_idct (mmxext_idct, mmxext_table,
+	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+/* The invocation above defines mmxext_idct() from the mmxext_* row helpers. */
+void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
+			     const int stride)
+{
+    mmxext_idct (block);		/* transform block[] in place */
+    block_copy (block, dest, stride);	/* write it to dest as saturated bytes */
+    block_zero (block);			/* leave block[] all zero */
+}
+
+void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
+			    uint8_t * const dest, const int stride)
+{
+    /* DC-only shortcut: taken when last == 129 and bits 4-6 of block[0] are not 100b. */
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4))
+	return block_add_DC (block, dest, stride, CPU_MMXEXT);
+    mmxext_idct (block);
+    block_add (block, dest, stride);
+    block_zero (block);
+}
+
+
+declare_idct (mmx_idct, mmx_table,
+	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+/* The invocation above defines mmx_idct() from the mmx_* row helpers. */
+void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
+			  const int stride)
+{
+    mmx_idct (block);			/* transform block[] in place */
+    block_copy (block, dest, stride);	/* write it to dest as saturated bytes */
+    block_zero (block);			/* leave block[] all zero */
+}
+
+void mpeg2_idct_add_mmx (const int last, int16_t * const block,
+			 uint8_t * const dest, const int stride)
+{
+    /* DC-only shortcut: taken when last == 129 and bits 4-6 of block[0] are not 100b. */
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4))
+	return block_add_DC (block, dest, stride, CPU_MMX);
+    mmx_idct (block);
+    block_add (block, dest, stride);
+    block_zero (block);
+}
+
+/*
+void mpeg2_idct_mmx_init (void)
+{
+    int i, j;
+
+     the mmx/mmxext idct uses a reordered input, so we patch scan tables 
+
+    for (i = 0; i < 64; i++) {
+	j = mpeg2_scan_norm[i];
+	mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
+	j = mpeg2_scan_alt[i];
+	mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
+    }
+}
+*/
+
+#endif
diff --git a/libmpeg2.h b/libmpeg2.h
new file mode 100644
index 0000000..7b254cb
--- /dev/null
+++ b/libmpeg2.h
@@ -0,0 +1,5 @@
+#include "config.h"
+#include <inttypes.h>
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
diff --git a/mmx.h b/mmx.h
new file mode 100644
index 0000000..c62be1f
--- /dev/null
+++ b/mmx.h
@@ -0,0 +1,292 @@
+/*
+ * mmx.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBMPEG2_MMX_H
+#define LIBMPEG2_MMX_H
+
+/*
+ * The type of a value that fits in an MMX register (note that long
+ * long constant values MUST be suffixed by LL and unsigned long long
+ * values by ULL, lest they be truncated by the compiler)
+ */
+
+typedef	union {
+	long long		q;	/* Quadword (64-bit) value */
+	unsigned long long	uq;	/* Unsigned Quadword */
+	int			d[2];	/* 2 Doubleword (32-bit) values */
+	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
+	short			w[4];	/* 4 Word (16-bit) values */
+	unsigned short		uw[4];	/* 4 Unsigned Word */
+	char			b[8];	/* 8 Byte (8-bit) values */
+	unsigned char		ub[8];	/* 8 Unsigned Byte */
+	float			s[2];	/* 2 Single-precision (32-bit) values */
+} ATTR_ALIGN(8) mmx_t;	/* On an 8-byte (64-bit) boundary */
+
+
+#define	mmx_i2r(op,imm,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* no outputs: reg is spelled into the template, so the compiler is not told it changes */ \
+			      : "i" (imm) )
+/* mmx_m2r: emits "op mem, %reg" (memory operand -> named register). */
+#define	mmx_m2r(op,mem,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* nothing */ \
+			      : "m" (mem))
+/* mmx_r2m: emits "op %reg, mem" (named register -> memory operand). */
+#define	mmx_r2m(op,reg,mem) \
+	__asm__ __volatile__ (#op " %%" #reg ", %0" \
+			      : "=m" (mem) \
+			      : /* nothing */ )
+/* mmx_r2r: emits "op %regs, %regd"; no operand list, hence the single '%'. */
+#define	mmx_r2r(op,regs,regd) \
+	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+
+#define	emms() __asm__ __volatile__ ("emms")
+
+#define	movd_m2r(var,reg)	mmx_m2r (movd, var, reg)
+#define	movd_r2m(reg,var)	mmx_r2m (movd, reg, var)
+#define	movd_v2r(var,reg)	__asm__ __volatile__ ("movd %0, %%" #reg \
+						      : /* nothing */ \
+						      : "rm" (var))
+#define	movd_r2v(reg,var)	__asm__ __volatile__ ("movd %%" #reg ", %0" \
+						      : "=rm" (var) \
+						      : /* nothing */ )
+
+#define	movq_m2r(var,reg)	mmx_m2r (movq, var, reg)
+#define	movq_r2m(reg,var)	mmx_r2m (movq, reg, var)
+#define	movq_r2r(regs,regd)	mmx_r2r (movq, regs, regd)
+
+#define	packssdw_m2r(var,reg)	mmx_m2r (packssdw, var, reg)
+#define	packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define	packsswb_m2r(var,reg)	mmx_m2r (packsswb, var, reg)
+#define	packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+
+#define	packuswb_m2r(var,reg)	mmx_m2r (packuswb, var, reg)
+#define	packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+
+#define	paddb_m2r(var,reg)	mmx_m2r (paddb, var, reg)
+#define	paddb_r2r(regs,regd)	mmx_r2r (paddb, regs, regd)
+#define	paddd_m2r(var,reg)	mmx_m2r (paddd, var, reg)
+#define	paddd_r2r(regs,regd)	mmx_r2r (paddd, regs, regd)
+#define	paddw_m2r(var,reg)	mmx_m2r (paddw, var, reg)
+#define	paddw_r2r(regs,regd)	mmx_r2r (paddw, regs, regd)
+
+#define	paddsb_m2r(var,reg)	mmx_m2r (paddsb, var, reg)
+#define	paddsb_r2r(regs,regd)	mmx_r2r (paddsb, regs, regd)
+#define	paddsw_m2r(var,reg)	mmx_m2r (paddsw, var, reg)
+#define	paddsw_r2r(regs,regd)	mmx_r2r (paddsw, regs, regd)
+
+#define	paddusb_m2r(var,reg)	mmx_m2r (paddusb, var, reg)
+#define	paddusb_r2r(regs,regd)	mmx_r2r (paddusb, regs, regd)
+#define	paddusw_m2r(var,reg)	mmx_m2r (paddusw, var, reg)
+#define	paddusw_r2r(regs,regd)	mmx_r2r (paddusw, regs, regd)
+
+#define	pand_m2r(var,reg)	mmx_m2r (pand, var, reg)
+#define	pand_r2r(regs,regd)	mmx_r2r (pand, regs, regd)
+
+#define	pandn_m2r(var,reg)	mmx_m2r (pandn, var, reg)
+#define	pandn_r2r(regs,regd)	mmx_r2r (pandn, regs, regd)
+
+#define	pcmpeqb_m2r(var,reg)	mmx_m2r (pcmpeqb, var, reg)
+#define	pcmpeqb_r2r(regs,regd)	mmx_r2r (pcmpeqb, regs, regd)
+#define	pcmpeqd_m2r(var,reg)	mmx_m2r (pcmpeqd, var, reg)
+#define	pcmpeqd_r2r(regs,regd)	mmx_r2r (pcmpeqd, regs, regd)
+#define	pcmpeqw_m2r(var,reg)	mmx_m2r (pcmpeqw, var, reg)
+#define	pcmpeqw_r2r(regs,regd)	mmx_r2r (pcmpeqw, regs, regd)
+
+#define	pcmpgtb_m2r(var,reg)	mmx_m2r (pcmpgtb, var, reg)
+#define	pcmpgtb_r2r(regs,regd)	mmx_r2r (pcmpgtb, regs, regd)
+#define	pcmpgtd_m2r(var,reg)	mmx_m2r (pcmpgtd, var, reg)
+#define	pcmpgtd_r2r(regs,regd)	mmx_r2r (pcmpgtd, regs, regd)
+#define	pcmpgtw_m2r(var,reg)	mmx_m2r (pcmpgtw, var, reg)
+#define	pcmpgtw_r2r(regs,regd)	mmx_r2r (pcmpgtw, regs, regd)
+
+#define	pmaddwd_m2r(var,reg)	mmx_m2r (pmaddwd, var, reg)
+#define	pmaddwd_r2r(regs,regd)	mmx_r2r (pmaddwd, regs, regd)
+
+#define	pmulhw_m2r(var,reg)	mmx_m2r (pmulhw, var, reg)
+#define	pmulhw_r2r(regs,regd)	mmx_r2r (pmulhw, regs, regd)
+
+#define	pmullw_m2r(var,reg)	mmx_m2r (pmullw, var, reg)
+#define	pmullw_r2r(regs,regd)	mmx_r2r (pmullw, regs, regd)
+
+#define	por_m2r(var,reg)	mmx_m2r (por, var, reg)
+#define	por_r2r(regs,regd)	mmx_r2r (por, regs, regd)
+
+#define	pslld_i2r(imm,reg)	mmx_i2r (pslld, imm, reg)
+#define	pslld_m2r(var,reg)	mmx_m2r (pslld, var, reg)
+#define	pslld_r2r(regs,regd)	mmx_r2r (pslld, regs, regd)
+#define	psllq_i2r(imm,reg)	mmx_i2r (psllq, imm, reg)
+#define	psllq_m2r(var,reg)	mmx_m2r (psllq, var, reg)
+#define	psllq_r2r(regs,regd)	mmx_r2r (psllq, regs, regd)
+#define	psllw_i2r(imm,reg)	mmx_i2r (psllw, imm, reg)
+#define	psllw_m2r(var,reg)	mmx_m2r (psllw, var, reg)
+#define	psllw_r2r(regs,regd)	mmx_r2r (psllw, regs, regd)
+
+#define	psrad_i2r(imm,reg)	mmx_i2r (psrad, imm, reg)
+#define	psrad_m2r(var,reg)	mmx_m2r (psrad, var, reg)
+#define	psrad_r2r(regs,regd)	mmx_r2r (psrad, regs, regd)
+#define	psraw_i2r(imm,reg)	mmx_i2r (psraw, imm, reg)
+#define	psraw_m2r(var,reg)	mmx_m2r (psraw, var, reg)
+#define	psraw_r2r(regs,regd)	mmx_r2r (psraw, regs, regd)
+
+#define	psrld_i2r(imm,reg)	mmx_i2r (psrld, imm, reg)
+#define	psrld_m2r(var,reg)	mmx_m2r (psrld, var, reg)
+#define	psrld_r2r(regs,regd)	mmx_r2r (psrld, regs, regd)
+#define	psrlq_i2r(imm,reg)	mmx_i2r (psrlq, imm, reg)
+#define	psrlq_m2r(var,reg)	mmx_m2r (psrlq, var, reg)
+#define	psrlq_r2r(regs,regd)	mmx_r2r (psrlq, regs, regd)
+#define	psrlw_i2r(imm,reg)	mmx_i2r (psrlw, imm, reg)
+#define	psrlw_m2r(var,reg)	mmx_m2r (psrlw, var, reg)
+#define	psrlw_r2r(regs,regd)	mmx_r2r (psrlw, regs, regd)
+
+#define	psubb_m2r(var,reg)	mmx_m2r (psubb, var, reg)
+#define	psubb_r2r(regs,regd)	mmx_r2r (psubb, regs, regd)
+#define	psubd_m2r(var,reg)	mmx_m2r (psubd, var, reg)
+#define	psubd_r2r(regs,regd)	mmx_r2r (psubd, regs, regd)
+#define	psubw_m2r(var,reg)	mmx_m2r (psubw, var, reg)
+#define	psubw_r2r(regs,regd)	mmx_r2r (psubw, regs, regd)
+
+#define	psubsb_m2r(var,reg)	mmx_m2r (psubsb, var, reg)
+#define	psubsb_r2r(regs,regd)	mmx_r2r (psubsb, regs, regd)
+#define	psubsw_m2r(var,reg)	mmx_m2r (psubsw, var, reg)
+#define	psubsw_r2r(regs,regd)	mmx_r2r (psubsw, regs, regd)
+
+#define	psubusb_m2r(var,reg)	mmx_m2r (psubusb, var, reg)
+#define	psubusb_r2r(regs,regd)	mmx_r2r (psubusb, regs, regd)
+#define	psubusw_m2r(var,reg)	mmx_m2r (psubusw, var, reg)
+#define	psubusw_r2r(regs,regd)	mmx_r2r (psubusw, regs, regd)
+
+#define	punpckhbw_m2r(var,reg)		mmx_m2r (punpckhbw, var, reg)
+#define	punpckhbw_r2r(regs,regd)	mmx_r2r (punpckhbw, regs, regd)
+#define	punpckhdq_m2r(var,reg)		mmx_m2r (punpckhdq, var, reg)
+#define	punpckhdq_r2r(regs,regd)	mmx_r2r (punpckhdq, regs, regd)
+#define	punpckhwd_m2r(var,reg)		mmx_m2r (punpckhwd, var, reg)
+#define	punpckhwd_r2r(regs,regd)	mmx_r2r (punpckhwd, regs, regd)
+
+#define	punpcklbw_m2r(var,reg) 		mmx_m2r (punpcklbw, var, reg)
+#define	punpcklbw_r2r(regs,regd)	mmx_r2r (punpcklbw, regs, regd)
+#define	punpckldq_m2r(var,reg)		mmx_m2r (punpckldq, var, reg)
+#define	punpckldq_r2r(regs,regd)	mmx_r2r (punpckldq, regs, regd)
+#define	punpcklwd_m2r(var,reg)		mmx_m2r (punpcklwd, var, reg)
+#define	punpcklwd_r2r(regs,regd)	mmx_r2r (punpcklwd, regs, regd)
+
+#define	pxor_m2r(var,reg)	mmx_m2r (pxor, var, reg)
+#define	pxor_r2r(regs,regd)	mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions */
+
+#define pavgusb_m2r(var,reg)	mmx_m2r (pavgusb, var, reg)
+#define pavgusb_r2r(regs,regd)	mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+	__asm__ __volatile__ (#op " %1, %0, %%" #reg \
+			      : /* no outputs; emits "op $imm, mem, %reg" */ \
+			      : "m" (mem), "i" (imm))
+/* mmx_r2ri: emits "op $imm, %regs, %regd". */
+#define mmx_r2ri(op,regs,regd,imm) \
+	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+			      : /* nothing */ \
+			      : "i" (imm) )
+/* mmx_fetch: emits "prefetch<hint> mem". */
+#define	mmx_fetch(mem,hint) \
+	__asm__ __volatile__ ("prefetch" #hint " %0" \
+			      : /* nothing */ \
+			      : "m" (mem))
+
+
+#define	maskmovq(regs,maskreg)		mmx_r2r (maskmovq, maskreg, regs) /* no immediate; AT&T order is mask, then data; the store address is implicit in (E)DI */
+
+#define	movntq_r2m(mmreg,var)		mmx_r2m (movntq, mmreg, var)
+
+#define	pavgb_m2r(var,reg)		mmx_m2r (pavgb, var, reg)
+#define	pavgb_r2r(regs,regd)		mmx_r2r (pavgb, regs, regd)
+#define	pavgw_m2r(var,reg)		mmx_m2r (pavgw, var, reg)
+#define	pavgw_r2r(regs,regd)		mmx_r2r (pavgw, regs, regd)
+
+#define	pextrw_r2r(mmreg,reg,imm)	mmx_r2ri (pextrw, mmreg, reg, imm)
+
+#define	pinsrw_r2r(reg,mmreg,imm)	mmx_r2ri (pinsrw, reg, mmreg, imm)
+
+#define	pmaxsw_m2r(var,reg)		mmx_m2r (pmaxsw, var, reg)
+#define	pmaxsw_r2r(regs,regd)		mmx_r2r (pmaxsw, regs, regd)
+
+#define	pmaxub_m2r(var,reg)		mmx_m2r (pmaxub, var, reg)
+#define	pmaxub_r2r(regs,regd)		mmx_r2r (pmaxub, regs, regd)
+
+#define	pminsw_m2r(var,reg)		mmx_m2r (pminsw, var, reg)
+#define	pminsw_r2r(regs,regd)		mmx_r2r (pminsw, regs, regd)
+
+#define	pminub_m2r(var,reg)		mmx_m2r (pminub, var, reg)
+#define	pminub_r2r(regs,regd)		mmx_r2r (pminub, regs, regd)
+
+#define	pmovmskb(mmreg,reg) /* byte sign mask of mmreg -> general register reg */ \
+	__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
+
+#define	pmulhuw_m2r(var,reg)		mmx_m2r (pmulhuw, var, reg)
+#define	pmulhuw_r2r(regs,regd)		mmx_r2r (pmulhuw, regs, regd)
+
+#define	prefetcht0(mem)			mmx_fetch (mem, t0)
+#define	prefetcht1(mem)			mmx_fetch (mem, t1)
+#define	prefetcht2(mem)			mmx_fetch (mem, t2)
+#define	prefetchnta(mem)		mmx_fetch (mem, nta)
+
+#define	psadbw_m2r(var,reg)		mmx_m2r (psadbw, var, reg)
+#define	psadbw_r2r(regs,regd)		mmx_r2r (psadbw, regs, regd)
+
+
+/* SSE2 */
+
+typedef	union {
+	long long		q[2];	/* 2 Quadword (64-bit) values */
+	unsigned long long	uq[2];	/* 2 Unsigned Quadword */
+	int			d[4];	/* 4 Doubleword (32-bit) values */
+	unsigned int		ud[4];	/* 4 Unsigned Doubleword */
+	short			w[8];	/* 8 Word (16-bit) values */
+	unsigned short		uw[8];	/* 8 Unsigned Word */
+	char			b[16];	/* 16 Byte (8-bit) values */
+	unsigned char		ub[16];	/* 16 Unsigned Byte */
+	float			s[4];	/* 4 Single-precision (32-bit) values */
+} ATTR_ALIGN(16) sse_t;	/* On a 16-byte (128-bit) boundary */
+
+#define	movdqu_m2r(var,reg)	mmx_m2r (movdqu, var, reg)
+#define	movdqu_r2m(reg,var)	mmx_r2m (movdqu, reg, var)
+#define	movdqu_r2r(regs,regd)	mmx_r2r (movdqu, regs, regd)
+#define	movdqa_m2r(var,reg)	mmx_m2r (movdqa, var, reg)
+#define	movdqa_r2m(reg,var)	mmx_r2m (movdqa, reg, var)
+#define	movdqa_r2r(regs,regd)	mmx_r2r (movdqa, regs, regd)
+
+#define	pshufd_r2r(regs,regd,imm)	mmx_r2ri(pshufd, regs, regd, imm)
+
+#define	pshufw_m2r(var,reg,imm)		mmx_m2ri(pshufw, var, reg, imm)
+#define	pshufw_r2r(regs,regd,imm)	mmx_r2ri(pshufw, regs, regd, imm)
+
+#define	sfence() __asm__ __volatile__ ("sfence\n\t")
+
+#endif /* LIBMPEG2_MMX_H */
diff --git a/motion_comp_mmx.cpp b/motion_comp_mmx.cpp
new file mode 100644
index 0000000..fc265f4
--- /dev/null
+++ b/motion_comp_mmx.cpp
@@ -0,0 +1,1005 @@
+/*
+ * motion_comp_mmx.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+#include "mmx.h"
+
+#define CPU_MMXEXT 0	/* passed as `cpu`: makes pavg_r2r/pavg_m2r expand to pavgb */
+#define CPU_3DNOW 1	/* passed as `cpu`: any value != CPU_MMXEXT selects pavgusb */
+
+
+/* MMX code - needs a rewrite */
+
+/*
+ * Motion Compensation frequently needs to average values using the
+ * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
+ * to compute this, but it's been left out of classic MMX.
+ *
+ * We need to be careful of overflows when doing this computation.
+ * Rather than unpacking data to 16-bits, which reduces parallelism,
+ * we use the following formulas:
+ *
+ * (x+y)>>1 == (x&y)+((x^y)>>1)
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ */
+
+/* rounding/masking constants, used below as 64-bit memory operands */
+static mmx_t mask1 = {0xfefefefefefefefeLL};	/* 0xfe in every byte: clears bit 0 of each byte before the >>1 */
+static mmx_t round4 = {0x0002000200020002LL};	/* 2 in every 16-bit word: rounding term added before the >>2 */
+
+/*
+ * This code should probably be compiled with loop unrolling
+ * (i.e., -funroll-loops in gcc) because some of the loops
+ * use a small static number of iterations. This was written
+ * with the assumption the compiler knows best about when
+ * unrolling will help
+ */
+
+static inline void mmx_zero_reg (void)
+{
+    /* clear mm0 (x ^ x == 0); the *_average_4_U8 helpers unpack bytes against mm0 */
+    pxor_r2r (mm0, mm0);
+}
+
+static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2)
+{
+    /* 8 bytes at once: dest[i] = (src1[i] + src2[i] + 1) / 2, computed as (x|y) - ((x^y)>>1) */
+
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    pxor_r2r (mm1, mm3);	/* mm3 = src1 ^ src2 */
+    pand_m2r (mask1, mm3);	/* clear bit 0 of each byte so the 64-bit shift cannot carry into the byte below */
+    psrlq_i2r (1, mm3);		/* mm3 = (src1 ^ src2) >> 1, per byte */
+    por_r2r (mm2, mm4);		/* mm4 = src1 | src2 */
+    psubb_r2r (mm3, mm4);	/* mm4 = (x|y) - ((x^y)>>1) */
+    movq_r2m (mm4, *dest);	/* store 8 result bytes in dest */
+}
+
+static inline void mmx_interp_average_2_U8 (uint8_t * dest,
+					    const uint8_t * src1,
+					    const uint8_t * src2)
+{
+    /* 8 bytes at once: t = (src1 + src2 + 1) / 2, then dest = (dest + t + 1) / 2 */
+
+    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */
+
+    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */
+
+    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */
+
+    pxor_r2r (mm3, mm5);	/* mm5 = src1 ^ src2 */
+    pand_m2r (mask1, mm5);	/* clear bit 0 of each byte ahead of the 64-bit shift */
+    psrlq_i2r (1, mm5);		/* mm5 = (src1 ^ src2) >> 1, per byte */
+    por_r2r (mm4, mm6);		/* mm6 = src1 | src2 */
+    psubb_r2r (mm5, mm6);	/* mm6 = t, the rounded src average */
+    movq_r2r (mm6, mm5);	/* mm5 = copy of t */
+
+    pxor_r2r (mm1, mm5);	/* mm5 = t ^ dest */
+    pand_m2r (mask1, mm5);	/* clear bit 0 of each byte ahead of the 64-bit shift */
+    psrlq_i2r (1, mm5);		/* mm5 = (t ^ dest) >> 1, per byte */
+    por_r2r (mm2, mm6);		/* mm6 = t | dest */
+    psubb_r2r (mm5, mm6);	/* mm6 = rounded average of t and dest */
+    movq_r2m (mm6, *dest);	/* store 8 result bytes in dest */
+}
+
+static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2,
+				     const uint8_t * src3,
+				     const uint8_t * src4)
+{
+    /* 8 bytes at once: dest = (src1 + src2 + src3 + src4 + 2) / 4; relies on mm0 == 0 (see mmx_zero_reg) */
+
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+
+    punpcklbw_r2r (mm0, mm1);	/* low 4 src1 bytes interleaved with mm0 -> 16-bit words */
+    punpckhbw_r2r (mm0, mm2);	/* high 4 src1 bytes interleaved with mm0 -> 16-bit words */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows (16-bit lanes) */
+    paddw_r2r (mm4, mm2);	/* add highs (16-bit lanes) */
+
+    /* now have src1+src2 partial sums in mm1 (low half) and mm2 (high half) */
+
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
+
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
+
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
+
+    /* now have the four-way sums in mm1 (low half) and mm2 (high half) */
+
+    paddw_m2r (round4, mm1);	/* +2 per word for rounding */
+    psraw_i2r (2, mm1);		/* /4 */
+    paddw_m2r (round4, mm2);	/* +2 per word for rounding */
+    psraw_i2r (2, mm2);		/* /4 */
+
+    packuswb_r2r (mm2, mm1);	/* pack the 8 words back to 8 bytes (w/ saturation) */
+    movq_r2m (mm1, *dest);	/* store result in dest */
+}
+
+static inline void mmx_interp_average_4_U8 (uint8_t * dest,
+					    const uint8_t * src1,
+					    const uint8_t * src2,
+					    const uint8_t * src3,
+					    const uint8_t * src4)
+{
+    /* 8 bytes at once: t = (src1 + src2 + src3 + src4 + 2) / 4, then dest = (dest + t + 1) / 2; relies on mm0 == 0 */
+
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
+
+    punpcklbw_r2r (mm0, mm1);	/* low 4 src1 bytes interleaved with mm0 -> 16-bit words */
+    punpckhbw_r2r (mm0, mm2);	/* high 4 src1 bytes interleaved with mm0 -> 16-bit words */
+
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows (16-bit lanes) */
+    paddw_r2r (mm4, mm2);	/* add highs (16-bit lanes) */
+
+    /* now have src1+src2 partial sums in mm1 (low half) and mm2 (high half) */
+
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
+
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
+
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
+
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
+
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
+
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
+
+    paddw_m2r (round4, mm1);	/* +2 per word for rounding */
+    psraw_i2r (2, mm1);		/* /4 */
+    paddw_m2r (round4, mm2);	/* +2 per word for rounding */
+    psraw_i2r (2, mm2);		/* /4 */
+
+    /* now have t as words in mm1 (low half) and mm2 (high half) */
+
+    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */
+
+    packuswb_r2r (mm2, mm1);	/* pack t back to 8 bytes (w/ saturation) */
+    movq_r2r (mm1,mm2);		/* mm2 = copy of t */
+
+    pxor_r2r (mm1, mm3);	/* mm3 = t ^ dest */
+    pand_m2r (mask1, mm3);	/* clear bit 0 of each byte ahead of the 64-bit shift */
+    psrlq_i2r (1, mm3);		/* mm3 = (t ^ dest) >> 1, per byte */
+    por_r2r (mm2, mm4);		/* mm4 = t | dest */
+    psubb_r2r (mm3, mm4);	/* mm4 = rounded average of t and dest */
+    movq_r2m (mm4, *dest);	/* store result in dest */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
+{
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_average_2_U8 (dest, dest, ref);	/* dest[0..7] = rounded average of dest and ref */
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, dest+8, ref+8);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step both pointers to the next row */
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_mmx (16, height, dest, ref, stride);	/* 16 bytes per row, averaged into dest */
+}
+
+static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_mmx (8, height, dest, ref, stride);	/* 8 bytes per row, averaged into dest */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
+{
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (* ref, mm1);	/* load 8 ref bytes */
+	movq_r2m (mm1,* dest);	/* store them unchanged at dest */
+
+	if (width == 16)
+	    {
+		movq_m2r (* (ref+8), mm1);	/* load ref bytes 8..15 */
+		movq_r2m (mm1,* (dest+8));	/* store them at dest bytes 8..15 */
+	    }
+
+	dest += stride;	/* step both pointers to the next row */
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_mmx (16, height, dest, ref, stride);	/* plain copy, 16 bytes per row */
+}
+
+static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_mmx (8, height, dest, ref, stride);	/* plain copy, 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+/* Half pixel interpolation in the x direction, result averaged into dest */
+static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_interp_average_2_U8 (dest, ref, ref+1);	/* taps: ref[x] and ref[x+1] */
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step both pointers to the next row */
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_x_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_x_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_average_2_U8 (dest, ref, ref+1);	/* dest = rounded average of ref[x] and ref[x+1] */
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref+9);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step both pointers to the next row */
+	ref += stride;
+    } while (--height);
+}
+
+static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_x_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_x_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
+{
+    const uint8_t * ref_next = ref + stride;	/* the row below ref */
+
+    mmx_zero_reg ();	/* the 4-tap helper unpacks against mm0 */
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);	/* 2x2 taps, then averaged with dest */
+
+	if (width == 16)
+	    mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
+				     ref_next+8, ref_next+9);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step all three pointers to the next row */
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_avg_xy_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_xy_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
+{
+    const uint8_t * ref_next = ref + stride;	/* the row below ref */
+
+    mmx_zero_reg ();	/* the 4-tap helper unpacks against mm0 */
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);	/* dest = rounded mean of the 2x2 taps */
+
+	if (width == 16)
+	    mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step all three pointers to the next row */
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_put_xy_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_xy_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{
+    const uint8_t * ref_next = ref + stride;	/* the row below ref */
+
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_interp_average_2_U8 (dest, ref, ref_next);	/* taps: same column in this row and the next, then averaged with dest */
+
+	if (width == 16)
+	    mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step all three pointers to the next row */
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_avg_y_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_avg_y_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+/*-----------------------------------------------------------------------*/
+
+static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
+{
+    const uint8_t * ref_next = ref + stride;	/* the row below ref */
+
+    mmx_zero_reg ();
+
+    do {	/* one row per pass; the body runs before height is tested */
+	mmx_average_2_U8 (dest, ref, ref_next);	/* dest = rounded average of this row and the next */
+
+	if (width == 16)
+	    mmx_average_2_U8 (dest+8, ref+8, ref_next+8);	/* same for bytes 8..15 */
+
+	dest += stride;	/* step all three pointers to the next row */
+	ref += stride;
+	ref_next += stride;
+    } while (--height);
+}
+
+static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
+{
+    MC_put_y_mmx (16, height, dest, ref, stride);	/* 16 bytes per row */
+}
+
+static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
+{
+    MC_put_y_mmx (8, height, dest, ref, stride);	/* 8 bytes per row */
+}
+
+
+MPEG2_MC_EXTERN (mmx)
+
+
+
+
+
+
+
+/* CPU_MMXEXT/CPU_3DNOW adaptation layer: pick the byte-average instruction from `cpu`, a name that must be in scope where the macro is expanded */
+
+#define pavg_r2r(src,dest)		\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_r2r (src, dest);		\
+    else				\
+	pavgusb_r2r (src, dest);	\
+} while (0)
+
+#define pavg_m2r(src,dest)	/* memory-source form of pavg_r2r; also reads `cpu` from the expansion site */	\
+do {					\
+    if (cpu == CPU_MMXEXT)		\
+	pavgb_m2r (src, dest);		\
+    else				\
+	pavgusb_m2r (src, dest);	\
+} while (0)
+
+
+/* CPU_MMXEXT code */
+
+
+static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* load 8 ref bytes */
+	movq_r2m (mm0, *dest);	/* store them unchanged */
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* load ref bytes 0..7 */
+	movq_m2r (*(ref+8), mm1);	/* load ref bytes 8..15 */
+	ref += stride;
+	movq_r2m (mm0, *dest);	/* store both halves unchanged */
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* load 8 ref bytes */
+	pavg_m2r (*dest, mm0);	/* mm0 = byte average of ref and dest (instruction chosen by cpu) */
+	ref += stride;
+	movq_r2m (mm0, *dest);	/* write the average back to dest */
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* load ref bytes 0..7 */
+	movq_m2r (*(ref+8), mm1);	/* load ref bytes 8..15 */
+	pavg_m2r (*dest, mm0);	/* average each half with the matching dest bytes */
+	pavg_m2r (*(dest+8), mm1);
+	movq_r2m (mm0, *dest);	/* write the averages back to dest */
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* first tap: 8 bytes at ref */
+	pavg_m2r (*(ref+offset), mm0);	/* average with the second tap at ref+offset (callers pass 1 or stride) */
+	ref += stride;
+	movq_r2m (mm0, *dest);	/* store the two-tap average */
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* first tap, bytes 0..7 */
+	movq_m2r (*(ref+8), mm1);	/* first tap, bytes 8..15 */
+	pavg_m2r (*(ref+offset), mm0);	/* average with the second tap at +offset (callers pass 1 or stride) */
+	pavg_m2r (*(ref+offset+8), mm1);
+	movq_r2m (mm0, *dest);	/* store the two-tap averages */
+	ref += stride;
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* first tap: 8 bytes at ref */
+	pavg_m2r (*(ref+offset), mm0);	/* average with the second tap at ref+offset */
+	pavg_m2r (*dest, mm0);	/* then average that result with dest */
+	ref += stride;
+	movq_r2m (mm0, *dest);	/* write back to dest */
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* first tap, bytes 0..7 */
+	movq_m2r (*(ref+8), mm1);	/* first tap, bytes 8..15 */
+	pavg_m2r (*(ref+offset), mm0);	/* average with the second tap at +offset */
+	pavg_m2r (*(ref+offset+8), mm1);
+	pavg_m2r (*dest, mm0);	/* then average each half with dest */
+	pavg_m2r (*(dest+8), mm1);
+	ref += stride;
+	movq_r2m (mm0, *dest);	/* write back to dest */
+	movq_r2m (mm1, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static mmx_t mask_one = {0x0101010101010101LL};	/* 1 in every byte: keeps only bit 0 of the correction term in the *4_* routines */
+
+static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{
+    movq_m2r (*ref, mm0);	/* prologue: first row, bytes x */
+    movq_m2r (*(ref+1), mm1);	/* first row, bytes x+1 */
+    movq_r2r (mm0, mm7);
+    pxor_r2r (mm1, mm7);	/* mm7 = xor of the first row's horizontal pair */
+    pavg_r2r (mm1, mm0);	/* mm0 = horizontal average of the first row */
+    ref += stride;
+
+    do {	/* each pass reads one new row, so height+1 rows of ref are read in total */
+	movq_m2r (*ref, mm2);	/* current row, bytes x */
+	movq_r2r (mm0, mm5);	/* mm5 = previous row's horizontal average */
+
+	movq_m2r (*(ref+1), mm3);	/* current row, bytes x+1 */
+	movq_r2r (mm2, mm6);
+
+	pxor_r2r (mm3, mm6);	/* mm6 = xor of the current row's horizontal pair */
+	pavg_r2r (mm3, mm2);	/* mm2 = horizontal average of the current row */
+
+	por_r2r (mm6, mm7);	/* mm7 = bits where either row's pair differed */
+	pxor_r2r (mm2, mm5);	/* mm5 = previous average ^ current average */
+
+	pand_r2r (mm5, mm7);
+	pavg_r2r (mm2, mm0);	/* mm0 = average of the two row averages */
+
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte: the correction term */
+
+	psubusb_r2r (mm7, mm0);	/* subtract it (unsigned saturating); presumably undoes the extra round-up of cascaded averages */
+
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+
+	movq_r2r (mm6, mm7);	/* carry this row's xor into the next pass */
+	movq_r2r (mm2, mm0);	/* carry this row's average into the next pass */
+    } while (--height);
+}
+
+static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* a = this row, bytes x */
+	movq_m2r (*(ref+stride+1), mm1);	/* d = next row, bytes x+1 */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);	/* b = this row, bytes x+1 */
+	pxor_r2r (mm1, mm7);	/* mm7 = a ^ d */
+	movq_m2r (*(ref+stride), mm3);	/* c = next row, bytes x */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* mm6 = b ^ c */
+	pavg_r2r (mm1, mm0);	/* mm0 = avg (a, d) */
+	pavg_r2r (mm3, mm2);	/* mm2 = avg (b, c) */
+	por_r2r (mm6, mm7);	/* mm7 = (a^d) | (b^c) */
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);	/* mm6 = avg (a,d) ^ avg (b,c) */
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte: the correction term */
+	pavg_r2r (mm2, mm0);	/* mm0 = average of the two diagonal averages */
+	psubusb_r2r (mm7, mm0);	/* subtract the correction (unsigned saturating) */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* a = this row, bytes x */
+	movq_m2r (*(ref+stride+1), mm1);	/* d = next row, bytes x+1 */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);	/* b = this row, bytes x+1 */
+	pxor_r2r (mm1, mm7);	/* mm7 = a ^ d */
+	movq_m2r (*(ref+stride), mm3);	/* c = next row, bytes x */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* mm6 = b ^ c */
+	pavg_r2r (mm1, mm0);	/* mm0 = avg (a, d) */
+	pavg_r2r (mm3, mm2);	/* mm2 = avg (b, c) */
+	por_r2r (mm6, mm7);	/* mm7 = (a^d) | (b^c) */
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);	/* mm6 = avg (a,d) ^ avg (b,c) */
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte: the correction term */
+	pavg_r2r (mm2, mm0);	/* mm0 = average of the two diagonal averages */
+	psubusb_r2r (mm7, mm0);	/* subtract the correction (unsigned saturating) */
+	movq_m2r (*dest, mm1);	/* load the existing dest bytes */
+	pavg_r2r (mm1, mm0);	/* average the interpolated value with dest */
+	ref += stride;
+	movq_r2m (mm0, *dest);
+	dest += stride;
+    } while (--height);
+}
+
+static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
+{
+    do {	/* one row per pass; the body runs before height is tested */
+	movq_m2r (*ref, mm0);	/* a = this row, bytes x */
+	movq_m2r (*(ref+stride+1), mm1);	/* d = next row, bytes x+1 */
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+1), mm2);	/* b = this row, bytes x+1 */
+	pxor_r2r (mm1, mm7);	/* mm7 = a ^ d */
+	movq_m2r (*(ref+stride), mm3);	/* c = next row, bytes x */
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);	/* mm6 = b ^ c */
+	pavg_r2r (mm1, mm0);	/* mm0 = avg (a, d) */
+	pavg_r2r (mm3, mm2);	/* mm2 = avg (b, c) */
+	por_r2r (mm6, mm7);	/* mm7 = (a^d) | (b^c) */
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);	/* mm6 = avg (a,d) ^ avg (b,c) */
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);	/* keep bit 0 of each byte: the correction term */
+	pavg_r2r (mm2, mm0);	/* mm0 = average of the two diagonal averages */
+	psubusb_r2r (mm7, mm0);	/* subtract the correction (unsigned saturating) */
+	movq_m2r (*dest, mm1);	/* load the existing dest bytes */
+	pavg_r2r (mm1, mm0);	/* average the interpolated value with dest */
+	movq_r2m (mm0, *dest);
+
+	movq_m2r (*(ref+8), mm0);	/* same sequence for bytes 8..15 */
+	movq_m2r (*(ref+stride+9), mm1);
+	movq_r2r (mm0, mm7);
+	movq_m2r (*(ref+9), mm2);
+	pxor_r2r (mm1, mm7);
+	movq_m2r (*(ref+stride+8), mm3);
+	movq_r2r (mm2, mm6);
+	pxor_r2r (mm3, mm6);
+	pavg_r2r (mm1, mm0);
+	pavg_r2r (mm3, mm2);
+	por_r2r (mm6, mm7);
+	movq_r2r (mm0, mm6);
+	pxor_r2r (mm2, mm6);
+	pand_r2r (mm6, mm7);
+	pand_m2r (mask_one, mm7);
+	pavg_r2r (mm2, mm0);
+	psubusb_r2r (mm7, mm0);
+	movq_m2r (*(dest+8), mm1);
+	pavg_r2r (mm1, mm0);
+	ref += stride;
+	movq_r2m (mm0, *(dest+8));
+	dest += stride;
+    } while (--height);
+}
+
+static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);	/* cpu == CPU_MMXEXT: pavg_* expands to pavgb */
+}
+
+static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);	/* cpu == CPU_MMXEXT: pavg_* expands to pavgb */
+}
+
+static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_put1_16 (height, dest, ref, stride);	/* plain copy; no cpu argument because no averaging is done */
+}
+
+static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put1_8 (height, dest, ref, stride);	/* plain copy; no cpu argument because no averaging is done */
+}
+
+static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
+{
+    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);	/* four-tap (2x2) interpolation averaged into dest, using pavgb */
+}
+
+static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);	/* four-tap (2x2) interpolation averaged into dest, using pavgb */
+}
+
+static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
+{
+    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);	/* four-tap (2x2) interpolation stored to dest, using pavgb */
+}
+
+static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);	/* four-tap (2x2) interpolation stored to dest, using pavgb */
+}
+
+
+MPEG2_MC_EXTERN (mmxext)
+
+
+
+static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);	/* cpu == CPU_3DNOW: pavg_* expands to pavgusb */
+}
+
+static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);	/* cpu == CPU_3DNOW: pavg_* expands to pavgusb */
+}
+
+static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put1_16 (height, dest, ref, stride);	/* plain copy; same helper call as the mmxext variant */
+}
+
+static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_put1_8 (height, dest, ref, stride);	/* plain copy; same helper call as the mmxext variant */
+}
+
+static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);	/* offset 1: second tap is the next byte in the row */
+}
+
+static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
+{
+    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);	/* offset == stride: second tap is the same column, next row */
+}
+
+static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);	/* four-tap (2x2) interpolation averaged into dest, using pavgusb */
+}
+
+static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);	/* four-tap (2x2) interpolation averaged into dest, using pavgusb */
+}
+
+static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
+{
+    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);	/* four-tap (2x2) interpolation stored to dest, using pavgusb */
+}
+
+static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
+{
+    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);	/* four-tap (2x2) interpolation stored to dest, using pavgusb */
+}
+
+
+MPEG2_MC_EXTERN (3dnow)
+
+#endif
diff --git a/mpeg2.h b/mpeg2.h
new file mode 100644
index 0000000..3888715
--- /dev/null
+++ b/mpeg2.h
@@ -0,0 +1,206 @@
+/*
+ * mpeg2.h
+ * Copyright (C) 2000-2004 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBMPEG2_MPEG2_H
+#define LIBMPEG2_MPEG2_H
+
+#define MPEG2_VERSION(a,b,c) (((a)<<16)|((b)<<8)|(c))	/* a shifted to bit 16, b to bit 8, c unshifted */
+#define MPEG2_RELEASE MPEG2_VERSION (0, 5, 1)	/* 0.5.1, i.e. 0x000501 */
+
+#define SEQ_FLAG_MPEG2 1	/* SEQ_FLAG_*: distinct single bits (1, 2, 4, 8, 16) */
+#define SEQ_FLAG_CONSTRAINED_PARAMETERS 2
+#define SEQ_FLAG_PROGRESSIVE_SEQUENCE 4
+#define SEQ_FLAG_LOW_DELAY 8
+#define SEQ_FLAG_COLOUR_DESCRIPTION 16
+
+#define SEQ_MASK_VIDEO_FORMAT 0xe0	/* bits 5-7; every SEQ_VIDEO_FORMAT_* value below fits inside this mask */
+#define SEQ_VIDEO_FORMAT_COMPONENT 0
+#define SEQ_VIDEO_FORMAT_PAL 0x20
+#define SEQ_VIDEO_FORMAT_NTSC 0x40
+#define SEQ_VIDEO_FORMAT_SECAM 0x60
+#define SEQ_VIDEO_FORMAT_MAC 0x80
+#define SEQ_VIDEO_FORMAT_UNSPECIFIED 0xa0
+
+typedef struct mpeg2_sequence_s {
+    unsigned int width, height;
+    unsigned int chroma_width, chroma_height;
+    unsigned int byte_rate;
+    unsigned int vbv_buffer_size;
+    uint32_t flags;
+
+    unsigned int picture_width, picture_height;
+    unsigned int display_width, display_height;
+    unsigned int pixel_width, pixel_height;
+    unsigned int frame_period;
+
+    uint8_t profile_level_id;
+    uint8_t colour_primaries;
+    uint8_t transfer_characteristics;
+    uint8_t matrix_coefficients;
+} mpeg2_sequence_t;
+
+#define GOP_FLAG_DROP_FRAME 1
+#define GOP_FLAG_BROKEN_LINK 2
+#define GOP_FLAG_CLOSED_GOP 4
+
+typedef struct mpeg2_gop_s {
+    uint8_t hours;
+    uint8_t minutes;
+    uint8_t seconds;
+    uint8_t pictures;
+    uint32_t flags;
+} mpeg2_gop_t;
+
+#define PIC_MASK_CODING_TYPE 7	/* low 3 bits; the PIC_FLAG_CODING_TYPE_* codes (1-4) are values inside this mask, not single bits */
+#define PIC_FLAG_CODING_TYPE_I 1
+#define PIC_FLAG_CODING_TYPE_P 2
+#define PIC_FLAG_CODING_TYPE_B 3
+#define PIC_FLAG_CODING_TYPE_D 4
+
+#define PIC_FLAG_TOP_FIELD_FIRST 8	/* from here on: distinct single bits (8 .. 256) */
+#define PIC_FLAG_PROGRESSIVE_FRAME 16
+#define PIC_FLAG_COMPOSITE_DISPLAY 32
+#define PIC_FLAG_SKIP 64
+#define PIC_FLAG_TAGS 128
+#define PIC_FLAG_REPEAT_FIRST_FIELD 256
+#define PIC_MASK_COMPOSITE_DISPLAY 0xfffff000	/* upper 20 bits (bits 12-31) */
+
+typedef struct mpeg2_picture_s {
+    unsigned int temporal_reference;
+    unsigned int nb_fields;
+    uint32_t tag, tag2;
+    uint32_t flags;
+    struct {
+	int x, y;
+    } display_offset[3];
+} mpeg2_picture_t;
+
+typedef struct mpeg2_fbuf_s {
+    uint8_t * buf[3];
+    void * id;
+} mpeg2_fbuf_t;
+
+typedef struct mpeg2_info_s {
+    const mpeg2_sequence_t * sequence;
+    const mpeg2_gop_t * gop;
+    const mpeg2_picture_t * current_picture;
+    const mpeg2_picture_t * current_picture_2nd;
+    const mpeg2_fbuf_t * current_fbuf;
+    const mpeg2_picture_t * display_picture;
+    const mpeg2_picture_t * display_picture_2nd;
+    const mpeg2_fbuf_t * display_fbuf;
+    const mpeg2_fbuf_t * discard_fbuf;
+    const uint8_t * user_data;
+    unsigned int user_data_len;
+} mpeg2_info_t;
+
+typedef struct mpeg2dec_s mpeg2dec_t;
+typedef struct mpeg2_decoder_s mpeg2_decoder_t;
+
+typedef enum {
+    STATE_BUFFER = 0,
+    STATE_SEQUENCE = 1,
+    STATE_SEQUENCE_REPEATED = 2,
+    STATE_GOP = 3,
+    STATE_PICTURE = 4,
+    STATE_SLICE_1ST = 5,
+    STATE_PICTURE_2ND = 6,
+    STATE_SLICE = 7,
+    STATE_END = 8,
+    STATE_INVALID = 9,
+    STATE_INVALID_END = 10,
+    STATE_SEQUENCE_MODIFIED = 11
+} mpeg2_state_t;
+
+typedef struct mpeg2_convert_init_s {
+    unsigned int id_size;
+    unsigned int buf_size[3];
+    void (* start) (void * id, const mpeg2_fbuf_t * fbuf,
+		    const mpeg2_picture_t * picture, const mpeg2_gop_t * gop);
+    void (* copy) (void * id, uint8_t * const * src, unsigned int v_offset);
+} mpeg2_convert_init_t;
+typedef enum {
+    MPEG2_CONVERT_SET = 0,
+    MPEG2_CONVERT_STRIDE = 1,
+    MPEG2_CONVERT_START = 2
+} mpeg2_convert_stage_t;
+typedef int mpeg2_convert_t (int stage, void * id,
+			     const mpeg2_sequence_t * sequence, int stride,
+			     uint32_t accel, void * arg,
+			     mpeg2_convert_init_t * result);
+int mpeg2_convert (mpeg2dec_t * mpeg2dec, mpeg2_convert_t convert, void * arg);
+int mpeg2_stride (mpeg2dec_t * mpeg2dec, int stride);
+void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id);
+void mpeg2_custom_fbuf (mpeg2dec_t * mpeg2dec, int custom_fbuf);
+
+#define MPEG2_ACCEL_X86_MMX 1	/* x86 flags are distinct single bits: 1, 2, 4, 8, 16 */
+#define MPEG2_ACCEL_X86_3DNOW 2
+#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
+#define MPEG2_ACCEL_X86_SSE3 16
+#define MPEG2_ACCEL_PPC_ALTIVEC 1	/* numbering restarts per architecture, so these values overlap the x86 ones */
+#define MPEG2_ACCEL_ALPHA 1
+#define MPEG2_ACCEL_ALPHA_MVI 2
+#define MPEG2_ACCEL_SPARC_VIS 1
+#define MPEG2_ACCEL_SPARC_VIS2 2
+#define MPEG2_ACCEL_ARM 1
+#define MPEG2_ACCEL_DETECT 0x80000000	/* bit 31, clear of every per-architecture flag above */
+
+uint32_t mpeg2_accel (uint32_t accel);
+mpeg2dec_t * mpeg2_init (void);
+const mpeg2_info_t * mpeg2_info (mpeg2dec_t * mpeg2dec);
+void mpeg2_close (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_buffer (mpeg2dec_t * mpeg2dec, uint8_t * start, uint8_t * end);
+int mpeg2_getpos (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_parse (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_reset (mpeg2dec_t * mpeg2dec, int full_reset);
+void mpeg2_skip (mpeg2dec_t * mpeg2dec, int skip);
+void mpeg2_slice_region (mpeg2dec_t * mpeg2dec, int start, int end);
+
+void mpeg2_tag_picture (mpeg2dec_t * mpeg2dec, uint32_t tag, uint32_t tag2);
+
+/*
+void mpeg2_init_fbuf (mpeg2_decoder_t * decoder, uint8_t * current_fbuf[3],
+		      uint8_t * forward_fbuf[3], uint8_t * backward_fbuf[3]);
+void mpeg2_slice (mpeg2_decoder_t * decoder, int code, const uint8_t * buffer);
+*/
+int mpeg2_guess_aspect (const mpeg2_sequence_t * sequence,
+			unsigned int * pixel_width,
+			unsigned int * pixel_height);
+
+typedef enum {
+    MPEG2_ALLOC_MPEG2DEC = 0,
+    MPEG2_ALLOC_CHUNK = 1,
+    MPEG2_ALLOC_YUV = 2,
+    MPEG2_ALLOC_CONVERT_ID = 3,
+    MPEG2_ALLOC_CONVERTED = 4
+} mpeg2_alloc_t;
+
+void * mpeg2_malloc (unsigned size, mpeg2_alloc_t reason);	/* reason is one of the MPEG2_ALLOC_* values */
+void mpeg2_free (void * buf);
+void mpeg2_malloc_hooks (void * malloc (unsigned, mpeg2_alloc_t),	/* function-type parameter, i.e. a pointer to the allocation hook */
+			 int free (void *));	/* note: this hook is declared to return int, unlike mpeg2_free */
+
+#endif /* LIBMPEG2_MPEG2_H */
diff --git a/mpeg2_internal.h b/mpeg2_internal.h
new file mode 100644
index 0000000..39a6c1f
--- /dev/null
+++ b/mpeg2_internal.h
@@ -0,0 +1,324 @@
+/*
+ * mpeg2_internal.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBMPEG2_MPEG2_INTERNAL_H
+#define LIBMPEG2_MPEG2_INTERNAL_H
+
+#include "config.h"
+#include "attributes.h"
+
+#define STATE_INTERNAL_NORETURN ((mpeg2_state_t)-1)
+
+/* macroblock modes: bit flags, each a distinct power of two (bits 0-5) */
+#define MACROBLOCK_INTRA 1
+#define MACROBLOCK_PATTERN 2
+#define MACROBLOCK_MOTION_BACKWARD 4
+#define MACROBLOCK_MOTION_FORWARD 8
+#define MACROBLOCK_QUANT 16
+#define DCT_TYPE_INTERLACED 32
+/* motion_type; NOTE(review): MOTION_TYPE_SHIFT is presumably the bit offset of the motion type above the six flag bits - confirm in slicedecode.cpp */
+#define MOTION_TYPE_SHIFT 6
+#define MC_FIELD 1
+#define MC_FRAME 2	/* same numeric value as MC_16X8 */
+#define MC_16X8 2	/* same numeric value as MC_FRAME */
+#define MC_DMV 3
+
+/* picture structure: FRAME_PICTURE equals (TOP_FIELD | BOTTOM_FIELD) */
+#define TOP_FIELD 1
+#define BOTTOM_FIELD 2
+#define FRAME_PICTURE 3
+
+/* picture coding type */
+#define I_TYPE 1
+#define P_TYPE 2
+#define B_TYPE 3
+#define D_TYPE 4
+
+typedef void mpeg2_mc_fct (uint8_t *, const uint8_t *, int, int);
+
+typedef struct {		/* per-direction motion state: the decoder holds one as f_motion and one as b_motion */
+    uint8_t * ref[2][3];	/* NOTE(review): presumably [field parity][Y/Cb/Cr plane] reference pointers - confirm */
+    uint8_t ** ref2[2];
+    int pmv[2][2];		/* NOTE(review): presumably the motion vector predictors - confirm in slicedecode.cpp */
+    int f_code[2];
+} motion_t;
+
+typedef void motion_parser_t (mpeg2_decoder_t * decoder,
+			      motion_t * motion,
+			      mpeg2_mc_fct * const * table);
+
+struct mpeg2_decoder_s {	/* slice-decoder state; presumably the struct behind the mpeg2_decoder_t typedef (declared outside this header - confirm) */
+    /* first, state that carries information from one macroblock to the */
+    /* next inside a slice, and is never used outside of mpeg2_slice() */
+
+    /* bit parsing stuff */
+    uint32_t bitstream_buf;		/* current 32 bit working set */
+    int bitstream_bits;			/* used bits in working set */
+    const uint8_t * bitstream_ptr;	/* buffer with stream data */
+    const uint8_t * bit_ptr_end;
+
+    uint8_t * dest[3];
+
+    int offset;
+    int stride;
+    int uv_stride;
+    int slice_stride;
+    int slice_uv_stride;
+    int stride_frame;
+    unsigned int limit_x;
+    unsigned int limit_y_16;
+    unsigned int limit_y_8;
+    unsigned int limit_y;
+
+    /* Motion vectors */
+    /* The f_ and b_ correspond to the forward and backward motion */
+    /* predictors */
+    motion_t b_motion;
+    motion_t f_motion;
+    motion_parser_t * motion_parser[5];
+
+    /* predictor for DC coefficients in intra blocks */
+    int16_t dc_dct_pred[3];
+
+    /* DCT coefficients */
+    int16_t DCTblock[64] ATTR_ALIGN(64);
+
+    uint8_t * picture_dest[3];
+    void (* convert) (void * convert_id, uint8_t * const * src,
+		      unsigned int v_offset);
+    void * convert_id;
+
+    int dmv_offset;
+    unsigned int v_offset;
+
+    /* now non-slice-specific information */
+
+    /* sequence header stuff */
+    uint16_t * quantizer_matrix[4];
+    uint16_t (* chroma_quantizer[2])[64];
+    uint16_t quantizer_prescale[4][32][64];	/* 4 * 32 * 64 = 8192 entries */
+
+    /* The width and height of the picture snapped to macroblock units */
+    int width;
+    int height;
+    int vertical_position_extension;
+    int chroma_format;
+
+    /* picture header stuff */
+
+    /* what type of picture this is (I, P, B, D) */
+    int coding_type;
+
+    /* picture coding extension stuff */
+
+    /* quantization factor for intra dc coefficients */
+    int intra_dc_precision;
+    /* top/bottom/both fields */
+    int picture_structure;
+    /* bool to indicate all predictions are frame based */
+    int frame_pred_frame_dct;
+    /* bool to indicate whether intra blocks have motion vectors */
+    /* (for concealment) */
+    int concealment_motion_vectors;
+    /* bool to use different vlc tables */
+    int intra_vlc_format;
+    /* used for DMV MC */
+    int top_field_first;
+
+    /* stuff derived from bitstream */
+
+    /* pointer to the zigzag scan we're supposed to be using */
+    const uint8_t * scan;
+
+    int second_field;
+
+    int mpeg1;
+
+    /* XXX: q_scale_type is kept here; struct mpeg2dec_s carries a commented-out copy next to a "MOVED" note */
+    int8_t q_scale_type;
+
+    /* Ahab field, added to this libmpeg2-derived struct. NOTE(review): presumably set when slice data cannot be decoded - confirm in slicedecode.cpp */
+    bool invalid;
+};
+
+typedef struct {	/* wrapper holding a single mpeg2_fbuf_t; struct mpeg2dec_s keeps three of these in fbuf_alloc[] */
+    mpeg2_fbuf_t fbuf;
+} fbuf_alloc_t;
+
+struct mpeg2dec_s {	/* top-level decoder object; its first member is the embedded mpeg2_decoder_t */
+    mpeg2_decoder_t decoder;
+
+    mpeg2_info_t info;
+
+    uint32_t shift;
+    int is_display_initialized;
+    mpeg2_state_t (* action) (struct mpeg2dec_s * mpeg2dec);
+    mpeg2_state_t state;
+    uint32_t ext_state;
+
+    /* allocated in init - gcc has problems allocating such big structures */
+    uint8_t * chunk_buffer;
+    /* pointer to start of the current chunk */
+    uint8_t * chunk_start;
+    /* pointer to current position in chunk_buffer */
+    uint8_t * chunk_ptr;
+    /* last start code ? */
+    uint8_t code;
+
+    /* picture tags */
+    uint32_t tag_current, tag2_current, tag_previous, tag2_previous;
+    int num_tags;
+    int bytes_since_tag;
+
+    int first;
+    int alloc_index_user;
+    int alloc_index;
+    uint8_t first_decode_slice;
+    uint8_t nb_decode_slices;
+
+    unsigned int user_data_len;
+
+    mpeg2_sequence_t new_sequence;
+    mpeg2_sequence_t sequence;
+    mpeg2_gop_t new_gop;
+    mpeg2_gop_t gop;
+    mpeg2_picture_t new_picture;
+    mpeg2_picture_t pictures[4];
+    mpeg2_picture_t * picture;
+    /*const*/ mpeg2_fbuf_t * fbuf[3];	/* 0: current fbuf, 1-2: prediction fbufs */
+
+    fbuf_alloc_t fbuf_alloc[3];
+    int custom_fbuf;
+
+    uint8_t * yuv_buf[3][3];
+    int yuv_index;
+    mpeg2_convert_t * convert;
+    void * convert_arg;
+    unsigned int convert_id_size;
+    int convert_stride;
+    void (* convert_start) (void * id, const mpeg2_fbuf_t * fbuf,
+			    const mpeg2_picture_t * picture,
+			    const mpeg2_gop_t * gop);
+
+    uint8_t * buf_start;
+    uint8_t * buf_end;
+
+    int16_t display_offset_x, display_offset_y;
+
+    int copy_matrix;
+    int8_t scaled[4]; /* XXX: q_scale_type MOVED to struct mpeg2_decoder_s; the original declaration is kept below for reference */
+    //int8_t q_scale_type, scaled[4];
+    uint8_t quantizer_matrix[4][64];
+    uint8_t new_quantizer_matrix[4][64];
+};
+
+typedef struct {
+#ifdef ARCH_PPC
+    uint8_t regv[12*16];	/* 192 bytes, PPC builds only; NOTE(review): presumably a save area for twelve 16-byte vector registers - confirm */
+#endif
+    int dummy;			/* presumably keeps the struct non-empty when ARCH_PPC is not defined */
+} cpu_state_t;
+
+/* cpu_accel.c */
+uint32_t mpeg2_detect_accel (uint32_t accel);
+
+/* cpu_state.c */
+void mpeg2_cpu_state_init (uint32_t accel);
+
+/* decode.c */
+mpeg2_state_t mpeg2_seek_header (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_parse_header (mpeg2dec_t * mpeg2dec);
+
+/* header.c */
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec);
+void mpeg2_reset_info (mpeg2_info_t * info);
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_gop_finalize (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_picture_finalize (mpeg2dec_t * mpeg2dec, uint32_t accels);
+mpeg2_state_t mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec);
+mpeg2_state_t mpeg2_header_end (mpeg2dec_t * mpeg2dec);
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int b_type);
+
+/* idct.c */
+extern void mpeg2_idct_init (uint32_t accel);
+extern const uint8_t mpeg2_scan_norm[64];
+extern const uint8_t mpeg2_scan_alt[64];
+
+/* idct_mmx.c */
+void mpeg2_idct_copy_sse2 (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_sse2 (int last, int16_t * block,
+			  uint8_t * dest, int stride);
+void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmxext (int last, int16_t * block,
+			    uint8_t * dest, int stride);
+void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmx (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_mmx_init (void);
+
+/* idct_altivec.c */
+void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_altivec (int last, int16_t * block,
+			     uint8_t * dest, int stride);
+void mpeg2_idct_altivec_init (void);
+
+/* idct_alpha.c */
+void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mvi (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_alpha (int last, int16_t * block,
+			   uint8_t * dest, int stride);
+void mpeg2_idct_alpha_init (void);
+
+/* motion_comp.c */
+void mpeg2_mc_init (uint32_t accel);
+
+typedef struct {		/* motion-compensation function table, one instance per implementation (see the mpeg2_mc_* externs) */
+    mpeg2_mc_fct * put [8];	/* MPEG2_MC_EXTERN fills both arrays in the order o, x, y, xy for _16, then o, x, y, xy for _8 */
+    mpeg2_mc_fct * avg [8];
+} mpeg2_mc_t;
+/* MPEG2_MC_EXTERN(x): defines the global table mpeg2_mc_x from the sixteen MC_{put,avg}_{o,x,y,xy}_{16,8}_x functions, which must be declared before the macro is used. The expansion already ends in ';'. */
+#define MPEG2_MC_EXTERN(x) mpeg2_mc_t mpeg2_mc_##x = {			  \
+    {MC_put_o_16_##x, MC_put_x_16_##x, MC_put_y_16_##x, MC_put_xy_16_##x, \
+     MC_put_o_8_##x,  MC_put_x_8_##x,  MC_put_y_8_##x,  MC_put_xy_8_##x}, \
+    {MC_avg_o_16_##x, MC_avg_x_16_##x, MC_avg_y_16_##x, MC_avg_xy_16_##x, \
+     MC_avg_o_8_##x,  MC_avg_x_8_##x,  MC_avg_y_8_##x,  MC_avg_xy_8_##x}  \
+};
+
+extern mpeg2_mc_t mpeg2_mc_c;
+extern mpeg2_mc_t mpeg2_mc_mmx;
+extern mpeg2_mc_t mpeg2_mc_mmxext;
+extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_altivec;
+extern mpeg2_mc_t mpeg2_mc_alpha;
+extern mpeg2_mc_t mpeg2_mc_vis;
+extern mpeg2_mc_t mpeg2_mc_arm;
+
+#endif /* LIBMPEG2_MPEG2_INTERNAL_H */
diff --git a/mpegheader.cpp b/mpegheader.cpp
new file mode 100644
index 0000000..31795a0
--- /dev/null
+++ b/mpegheader.cpp
@@ -0,0 +1,83 @@
+#include <stdint.h>
+#include <stdio.h>
+
+#include "mpegheader.hpp"
+#include "exceptions.hpp"
+#include "bitreader.hpp"
+#include "file.hpp"
+
+/* Factory for header objects: reads one start code (24-bit prefix 0x000001 plus
+   an 8-bit code value) from hdr and returns a heap-allocated object of the
+   matching MPEGHeader subclass.  Throws NotMPEGES on a system start code. */
+MPEGHeader *MPEGHeader::make( BitReader &hdr, File *file )
+{
+  /* NOTE(review): the read happens inside ahabassert(); confirm the macro is
+     never compiled out, otherwise the prefix would not be consumed. */
+  ahabassert( hdr.readbits( 24 ) == 0x1 );
+
+  const uint8_t val = hdr.readbits( 8 );
+
+  /* slice_start_code: the code value is handed to Slice together with file */
+  if ( (val >= 0x01) && (val <= 0xAF) ) {
+    return new Slice( val, file );
+  }
+
+  /* system start codes: this is not a video elementary stream */
+  if ( val >= 0xB9 ) {
+    /* Message is newline-terminated so later stderr output starts on a fresh line. */
+    fprintf( stderr,
+	     "Saw system start code (0x%02x). Does not appear to be an elementary stream.\n",
+	     val );
+    throw NotMPEGES();
+  }
+
+  switch ( val ) {
+  case 0x00:
+    return new Picture( hdr );
+
+  case 0xB0:
+  case 0xB1:
+  case 0xB6:
+    /* reserved */
+    return new ReservedHeader( hdr );
+
+  case 0xB2:
+    /* user data */
+    return new UserData( hdr );
+
+  case 0xB3:
+    return new Sequence( hdr );
+
+  case 0xB4:
+    /* sequence error */
+    return new SequenceError( hdr );
+
+  case 0xB5: {
+    /* extension: the next 4 bits select the extension type; they are consumed
+       here, before the subclass constructor sees hdr */
+    const int extension_start_code_identifier = hdr.readbits( 4 );
+
+    switch ( extension_start_code_identifier ) {
+    case 1: return new SequenceExtension( hdr );
+    case 3: return new QuantMatrixExtension( hdr );
+    case 8: return new PictureCodingExtension( hdr );
+    default: return new OtherExtension( hdr );
+    }
+  }
+
+  case 0xB7:
+    /* sequence end */
+    return new SequenceEnd( hdr );
+
+  case 0xB8:
+    /* group start */
+    return new Group( hdr );
+
+  default:
+    /* every 8-bit value is handled above, so this should be unreachable */
+    throw InternalError();
+  }
+
+  ahabassert( 0 );
+  return NULL;
+}
diff --git a/mpegheader.hpp b/mpegheader.hpp
new file mode 100644
index 0000000..777cb58
--- /dev/null
+++ b/mpegheader.hpp
@@ -0,0 +1,360 @@
+#ifndef MPEGHEADER_HPP
+#define MPEGHEADER_HPP
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdio.h>
+
+#include "bitreader.hpp"
+#include "exceptions.hpp"
+#include "file.hpp"
+
+#include "libmpeg2.h"
+
+class ES;
+class BufferPool;
+
+/* Abstract base for every parsed header.  Records the header's file offset
+   and a forward link to the next header. */
+class MPEGHeader {
+private:
+  off_t location;   /* -1 until set_location() is called */
+  MPEGHeader *next; /* NULL until set_next() is called */
+
+protected:
+  /* Members start in the "unset" state even if a subclass constructor forgets init(). */
+  MPEGHeader( void ) : location( -1 ), next( NULL ) {}
+  void init( void ) { location = -1; next = NULL; }
+
+public:
+  static MPEGHeader *make( BitReader &hdr, File *file );
+
+  MPEGHeader *get_next( void ) { return next; }
+  void set_next( MPEGHeader *s_next ) { ahabassert( next == NULL ); next = s_next; }       /* only while unset */
+  void override_next( MPEGHeader *s_next ) { ahabassert( next != NULL ); next = s_next; } /* only once set */
+  off_t get_location( void ) { ahabassert( location != -1 ); return location; }
+  void set_location( off_t s_location ) { ahabassert( location == -1 ); location = s_location; }
+
+  virtual void print_info( void ) = 0;
+  virtual void link( void ) = 0;
+  virtual ~MPEGHeader( void ) {}
+};
+
+class SequenceExtension : public MPEGHeader	/* built by MPEGHeader::make for extension identifier 1 */
+{
+  friend class Sequence;	/* Sequence reads the private fields directly, e.g. progressive_sequence */
+
+private:
+  bool escape_bit;	/* NOTE(review): the fields below are filled by the constructor, which is defined outside this header */
+  uint8_t profile;
+  uint8_t level;
+  bool progressive_sequence;
+  uint8_t chroma_format;
+  uint8_t horizontal_size_extension, vertical_size_extension;
+  uint16_t bit_rate_extension;
+  uint8_t vbv_buffer_size_extension;
+  bool low_delay;
+  uint8_t frame_rate_extension_n;
+  uint8_t frame_rate_extension_d;
+
+public:
+  SequenceExtension( BitReader &hdr );
+  virtual void print_info( void ) { printf( "sequence extension\n" ); }  
+  virtual void link( void );
+  bool operator==(const SequenceExtension &o) const;
+};
+
+class Sequence : public MPEGHeader	/* built by MPEGHeader::make for start code 0xB3 */
+{
+private:
+  uint horizontal_size_value, vertical_size_value;
+  enum AspectRatio { SAR1x1, DAR4x3, DAR16x9, DAR221x100 };	/* NOTE(review): private, so code outside the class cannot name the enumerators returned by get_aspect() */
+  AspectRatio aspect;
+  uint8_t frame_rate_code;
+
+  uint8_t intra_quantiser_matrix[ 64 ];
+  uint8_t non_intra_quantiser_matrix[ 64 ];
+
+  uint32_t bit_rate_value;
+  uint16_t vbv_buffer_size_value;
+  bool constrained_parameters_flag;
+
+  SequenceExtension *extension;	/* get_extension() asserts this is non-NULL; where it is attached is not visible in this header */
+
+public:
+  virtual void print_info( void ) { printf( "sequence\n" ); }
+  virtual void link( void );
+
+  Sequence( BitReader &hdr );
+
+  uint get_horizontal_size( void );
+  uint get_vertical_size( void );
+  uint get_mb_width( void );
+  uint get_mb_height( void );
+  uint64_t get_frame_rate_numerator( void );
+  uint64_t get_frame_rate_denominator( void );
+  double get_frame_rate( void );
+
+  uint8_t *get_intra_quantiser_matrix( void ) {	/* pointer into this object's own 64-entry array */
+    return intra_quantiser_matrix;
+  }
+
+  uint8_t *get_non_intra_quantiser_matrix( void ) {	/* pointer into this object's own 64-entry array */
+    return non_intra_quantiser_matrix;
+  }
+
+  bool get_progressive_sequence( void ) { return get_extension()->progressive_sequence; }	/* asserts (via get_extension) that an extension is attached */
+
+  SequenceExtension *get_extension( void ) {
+    ahabassert( extension );
+    return extension;
+  }
+
+  AspectRatio get_aspect( void ) { return aspect; }
+  double get_sar( void );
+  void set_unknown_quantiser_flags( void );
+};
+
+class PictureCodingExtension : public MPEGHeader	/* built by MPEGHeader::make for extension identifier 8 */
+{
+  friend class Picture;	/* NOTE(review): presumably so Picture can read the private fields below - no accessors are declared here */
+
+private:
+  uint8_t f_code_fh, f_code_fv, f_code_bh, f_code_bv;
+  uint8_t intra_dc_precision;
+  uint8_t picture_structure;
+  bool top_field_first, repeat_first_field, progressive_frame;
+  bool frame_pred_frame_dct, concealment_motion_vectors,
+    q_scale_type, intra_vlc_format, alternate_scan,
+    chroma_420_type;
+
+public:
+  PictureCodingExtension( BitReader &hdr );
+  virtual void print_info( void ) { printf( "picture coding extension\n" ); }
+  virtual void link( void ) {}	/* nothing to link */
+};
+
+class Picture;
+
+class Slice : public MPEGHeader	/* built by MPEGHeader::make for start codes 0x01-0xAF */
+{
+private:
+  uint val;	/* the start code value passed to the constructor */
+  Slice *next_slice_in_row;
+  uint len;
+  Picture *picture;	/* set once through set_picture() */
+  bool incomplete;
+  File *file;
+
+public:
+  Slice( uint s_val, File *s_file );
+  virtual void link( void );
+
+  uint get_len( void ) { return len; }
+  uint get_val( void ) { return val; }
+  uint top_line( void ) { return (val - 1) * 16; }	/* first line of the 16-line band numbered val (val counts from 1) */
+  uint bot_line( void ) { return val * 16 - 1; }	/* last line of that band */
+  virtual void print_info( void );
+  Slice *get_next_in_row( void ) { return next_slice_in_row; }
+  bool get_incomplete( void ) { return incomplete; }
+
+  MapHandle *map_chunk( void );
+
+  void set_picture( Picture *s ) { ahabassert( picture == NULL ); picture = s; }	/* may only be called while unset */
+  Picture *get_picture( void ) { ahabassert( picture ); return picture; }
+
+  void decode( mpeg2_decoder_t * const decoder, const int code,
+	       const uint8_t * const buffer);
+};
+
+enum PictureType { I = 1, P, B }; /* numeric values 1, 2, 3 coincide with I_TYPE, P_TYPE, B_TYPE in mpeg2_internal.h */
+
+class FrameHandle;
+
+class Picture : public MPEGHeader	/* built by MPEGHeader::make for start code 0x00 */
+{
+private:
+  int coded_order, display_order;
+  PictureType type;
+
+  uint16_t temporal_reference;
+  uint16_t vbv_delay;
+
+  Sequence *sequence;	/* set once through set_sequence() */
+  PictureCodingExtension *extension;
+
+  bool unclean_last_anchor;
+  bool incomplete;
+  bool broken;
+  bool unknown_quantiser_matrix;
+  bool invalid;
+
+  double presentation_time;
+
+  uint8_t *intra_quantiser_matrix,
+    *non_intra_quantiser_matrix;
+
+  Slice **slicerow;	/* indexed by row in get_slicerow(); no bounds check there */
+
+  Picture *forward_reference, *backward_reference;
+
+  void setup_decoder( mpeg2_decoder_t *d,
+		      uint8_t *current_fbuf[3],
+		      uint8_t *forward_fbuf[3],
+		      uint8_t *backward_fbuf[3] );
+
+  static void motion_setup( mpeg2_decoder_t *d );
+
+  FrameHandle *fh;
+
+public:
+  int get_coded( void ) { return coded_order; }
+  void set_coded( int s_coded ) { coded_order = s_coded; }
+
+  int get_display( void ) { return display_order; }
+  void set_display( int s_display ) { display_order = s_display; }
+
+  bool get_unclean( void ) { return unclean_last_anchor; }
+  void set_unclean( bool s_unclean ) { unclean_last_anchor = s_unclean; }
+
+  bool get_broken( void ) { return broken; }
+  void set_broken( bool s ) { broken = s; }
+
+  void set_unknown_quantiser_matrix( bool s_unknown ) { unknown_quantiser_matrix = s_unknown; }
+  bool get_unknown_quantiser_matrix( void ) { return unknown_quantiser_matrix; }
+
+  bool get_incomplete( void ) { return incomplete; }
+
+  bool get_invalid( void ) { return invalid; }
+
+  /* true if any of the four problem flags is set */
+  bool problem( void ) { return (get_broken() || get_unknown_quantiser_matrix() || get_incomplete() || get_invalid()); }
+  double get_time( void ) { return presentation_time; }
+  void set_time( double s_time ) { presentation_time = s_time; }
+
+  void set_intra( uint8_t *s ) { intra_quantiser_matrix = s; }
+  void set_non_intra( uint8_t *s ) { non_intra_quantiser_matrix = s; }
+
+  uint num_fields( void );
+
+  PictureType get_type( void ) { return type; }
+  /* 'I', 'P' or 'B'; any other stored value yields 'X' instead of indexing past the literal */
+  char get_type_char( void ) { const int t = type; return ( (t >= I) && (t <= B) ) ? "XIPB"[t] : 'X'; }
+  void set_sequence( Sequence *s )
+  {
+    ahabassert( sequence == NULL );
+    sequence = s;
+  }
+
+  Sequence *get_sequence( void ) {
+    ahabassert( sequence );
+    return sequence;
+  }
+
+  PictureCodingExtension *get_extension( void ) {
+    ahabassert( extension );
+    return extension;
+  }
+
+  void set_forward( Picture *s ) {
+    ahabassert( forward_reference == NULL );
+    ahabassert( (type == P) || (type == B) );
+    forward_reference = s;
+  }
+
+  void set_backward( Picture *s ) {
+    ahabassert( backward_reference == NULL );
+    ahabassert( type == B );
+    backward_reference = s;
+  }
+
+  Picture *get_forward( void ) { return forward_reference; }
+  Picture *get_backward( void ) { return backward_reference; }
+
+  Slice *get_slicerow( uint row ) { return slicerow[ row ]; }
+
+  FrameHandle *get_framehandle( void ) { return fh; }
+
+  Picture( BitReader &hdr );
+  ~Picture();
+
+  void init_fh( BufferPool *pool );
+
+  virtual void print_info( void );
+  virtual void link( void );
+
+  void lock_and_decodeall();
+};
+
+class SequenceEnd : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for start code 0xB7; nothing is read from the BitReader. */
+  SequenceEnd( BitReader & ) { init(); }
+  virtual void print_info() { printf( "sequence end\n" ); }
+  virtual void link() {}
+};
+
+class OtherExtension : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for extension identifiers other than 1, 3 and 8; the BitReader is not read. */
+  OtherExtension( BitReader & ) { init(); }
+  virtual void print_info() { printf( "other extension\n" ); }
+  virtual void link() {}
+};
+
+class ReservedHeader : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for start codes 0xB0, 0xB1 and 0xB6; the BitReader is not read. */
+  ReservedHeader( BitReader & ) { init(); }
+  virtual void print_info() { printf( "reserved header\n" ); }
+  virtual void link() {}
+};
+
+class UserData : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for start code 0xB2; the BitReader is not read. */
+  UserData( BitReader & ) { init(); }
+  virtual void print_info() { printf( "user data\n" ); }
+  virtual void link() {}
+};
+
+class SequenceError : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for start code 0xB4; the BitReader is not read. */
+  SequenceError( BitReader & ) { init(); }
+  virtual void print_info() { printf( "sequence error\n" ); }
+  virtual void link() {}
+};
+
+class Group : public MPEGHeader {
+public:
+  /* Built by MPEGHeader::make for start code 0xB8; the BitReader is not read. */
+  Group( BitReader & ) { init(); }
+  virtual void print_info() { printf( "group\n" ); }
+  virtual void link() {}
+};
+
+class QuantMatrixExtension : public MPEGHeader
+{
+private:
+  uint load_intra_quantiser_matrix;
+  uint load_non_intra_quantiser_matrix;
+  uint8_t intra_quantiser_matrix[ 64 ];
+  uint8_t non_intra_quantiser_matrix[ 64 ];
+
+public:
+  QuantMatrixExtension( BitReader &hdr );
+  /* Each getter yields this object's own table, or NULL when the matching load flag is zero. */
+  uint8_t *get_intra_quantiser_matrix() {
+    if ( !load_intra_quantiser_matrix ) { return NULL; }
+    return intra_quantiser_matrix;
+  }
+  uint8_t *get_non_intra_quantiser_matrix() {
+    if ( !load_non_intra_quantiser_matrix ) { return NULL; }
+    return non_intra_quantiser_matrix;
+  }
+  virtual void print_info() { printf( "quant matrix extension\n" ); }
+  virtual void link() {}
+};
+
+#endif
diff --git a/mpegtables.hpp b/mpegtables.hpp
new file mode 100644
index 0000000..80f23cc
--- /dev/null
+++ b/mpegtables.hpp
@@ -0,0 +1,46 @@
+#ifndef MPEGTABLES_HPP
+#define MPEGTABLES_HPP
+
+static const uint8_t default_intra_quantiser_matrix[ 64 ] = {	/* written one anti-diagonal per row (1..8..1 values), i.e. presumably in zigzag scan order - confirm at the use site */
+  8,
+  16, 16,
+  19, 16, 19,
+  22, 22, 22, 22,
+  22, 22, 26, 24, 26,
+  27, 27, 27, 26, 26, 26,
+  26, 27, 27, 27, 29, 29, 29,
+  34, 34, 34, 29, 29, 29, 27, 27,
+  29, 29, 32, 32, 34, 34, 37,
+  38, 37, 35, 35, 34, 35,
+  38, 38, 40, 40, 40,
+  48, 48, 46, 46,
+  56, 56, 58,
+  69, 69,
+  83
+};
+
+/*
+static const uint8_t mpeg2_normal_scan[ 64 ] = {
+  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+*/
+/* Active table: each entry j of the commented-out table above remapped to (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2). NOTE(review): presumably the coefficient order the MMX IDCT expects - confirm in idct_mmx.cpp. */
+static const uint8_t mpeg2_normal_scan[ 64 ] ATTR_ALIGN(16) = {
+  0, 4, 8, 16, 12, 1, 5, 9, 20, 24, 32, 28, 17, 13, 2, 6, 10, 21,
+  25, 36, 40, 48, 44, 33, 29, 18, 14, 3, 7, 11, 22, 26, 37, 41, 52,
+  56, 60, 49, 45, 34, 30, 19, 15, 23, 27, 38, 42, 53, 57, 61, 50, 46,
+  35, 31, 39, 43, 54, 58, 62, 51, 47, 55, 59, 63
+};
+/* Parallel 9-entry tables: numerator[i] / denominator[i] is a frame rate (index 1 is 24000/1001; index 0 is 0/1). NOTE(review): presumably indexed by Sequence::frame_rate_code - confirm the bound is checked at the use site. */
+static const uint64_t sequence_numerators[] = {
+  0, 24000, 24, 25, 30000, 30, 50, 60000, 60
+};
+
+static const uint64_t sequence_denominators[] = {
+  1,  1001,  1,  1,  1001,  1,  1,  1001,  1
+};
+
+#endif
diff --git a/mutexobj.hpp b/mutexobj.hpp
new file mode 100644
index 0000000..56a6700
--- /dev/null
+++ b/mutexobj.hpp
@@ -0,0 +1,21 @@
+#ifndef MUTEXOBJ_HPP
+#define MUTEXOBJ_HPP
+
+#include <pthread.h>
+#include "exceptions.hpp"
+
+/* Scoped lock: acquires the mutex on construction and releases it on destruction. */
+class MutexLock {
+private:
+  pthread_mutex_t *mutex;
+  MutexLock( const MutexLock & );            /* not copyable: a copy would unlock the mutex twice */
+  MutexLock &operator=( const MutexLock & ); /* (declared private and never defined) */
+
+public:
+  /* s_mutex must stay valid for the lifetime of this object. */
+  MutexLock( pthread_mutex_t *s_mutex ) : mutex( s_mutex ) { unixassert( pthread_mutex_lock( mutex ) ); }
+  /* NOTE(review): if unixassert can throw, this destructor may throw during stack unwinding - confirm. */
+  ~MutexLock() { unixassert( pthread_mutex_unlock( mutex ) ); }
+};
+
+#endif
diff --git a/ogl.cpp b/ogl.cpp
new file mode 100644
index 0000000..ee0afee
--- /dev/null
+++ b/ogl.cpp
@@ -0,0 +1,433 @@
+#include "ogl.hpp"
+
+#include <X11/Xlib.h>
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include <GL/glx.h>
+#include <GL/glu.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "ahab_fragment_program.hpp"
+#include "exceptions.hpp"
+#include "colorimetry.hpp"
+
+#include "displayopq.hpp"
+
+const int opqueue_len = 6;
+/* pthread start routine: ogl is the OpenGLDisplay whose loop() runs on the new thread. */
+static void *thread_helper( void *ogl )
+{
+  OpenGLDisplay * const self = static_cast<OpenGLDisplay *>( ogl );
+  ahabassert( self );
+  self->loop();
+  return NULL;
+}
+
+/* Opens the X display, derives the window size from the movie sample aspect ratio and the
+   screen pixel aspect ratio, then starts the display thread.  Throws DisplayError on X failure. */
+OpenGLDisplay::OpenGLDisplay( char *display_name,
+			      double movie_sar,
+			      uint s_framewidth, uint s_frameheight,
+			      uint s_dispwidth, uint s_dispheight )
+  : opq( opqueue_len )
+{
+  state.framewidth = s_framewidth;
+  state.frameheight = s_frameheight;
+  state.dispwidth = s_dispwidth;
+  state.dispheight = s_dispheight;
+
+  if ( 0 == XInitThreads() ) {
+    fprintf( stderr, "XInitThreads() failed.\n" );
+    throw DisplayError();
+  }
+
+  state.display = XOpenDisplay( display_name );
+  if ( state.display == NULL ) {
+    fprintf( stderr, "Couldn't open display.\n" );
+    throw DisplayError();
+  }
+
+  const int screen = DefaultScreen( state.display );
+  const int px_w = DisplayWidth( state.display, screen );
+  const int px_h = DisplayHeight( state.display, screen );
+  const int mm_w = DisplayWidthMM( state.display, screen );
+  const int mm_h = DisplayHeightMM( state.display, screen );
+
+  /* Screen pixel aspect ratio = (px per mm vertically) / (px per mm horizontally).
+     Some X servers report a physical size of 0 mm; fall back to square pixels then. */
+  double display_sar = 1.0;
+  if ( (mm_w > 0) && (mm_h > 0) && (px_w > 0) && (px_h > 0) ) {
+    display_sar = ((double)px_h / (double)mm_h) / ((double)px_w / (double)mm_w);
+  }
+  state.sar = movie_sar / display_sar;
+
+  if ( state.sar > 1 ) {
+    state.width = lrint( (double)state.dispwidth * state.sar );
+    state.height = state.dispheight;
+  } else {
+    state.width = state.dispwidth;
+    state.height = lrint( (double)state.dispheight / state.sar );
+  }
+
+  fprintf( stderr, "Display is %dx%d with pixel AR %.3f:1, display AR %.3f:1.\n",
+	   px_w, px_h, display_sar, (double)mm_w / (double)mm_h );
+
+  fprintf( stderr, "MPEG-2 sequence is %dx%d with sample AR %.3f:1, display AR %.3f:1.\n",
+	   state.dispwidth, state.dispheight,
+	   movie_sar, movie_sar * state.dispwidth / (double)state.dispheight );
+
+  fprintf( stderr, "Video SAR in display pixel units = %.3f:1. Display size = %dx%d.\n",
+	   state.sar, state.width, state.height );
+
+  /* Checked like pthread_join() in the destructor, which must not join a thread that never started. */
+  unixassert( pthread_create( &thread_handle, NULL, thread_helper, this ) );
+}
+
+/* Creates the GLX context, makes it current on state.window, then sets up the plane textures and the fragment program. */
+void OpenGLDisplay::init_context( void ) {
+  int attributes[] = { GLX_RGBA,
+		       GLX_DOUBLEBUFFER, /* like GLX_RGBA, a boolean attribute that takes no value in glXChooseVisual */
+		       GLX_RED_SIZE, 8,
+		       GLX_GREEN_SIZE, 8,
+		       GLX_BLUE_SIZE, 8,
+		       None };
+
+  XVisualInfo *visual = glXChooseVisual( state.display, 0, attributes );
+  if ( visual == NULL ) {
+    fprintf( stderr, "Could not open glX visual.\n" );
+    throw DisplayError();
+  }
+
+  state.context = glXCreateContext( state.display, visual, NULL, True );
+  XFree( visual ); /* released before the error check so the failure path does not leak it */
+  if ( state.context == NULL ) {
+    fprintf( stderr, "No glX context.\n" );
+    throw DisplayError();
+  }
+
+  if ( !glXMakeCurrent( state.display, state.window, state.context ) ) {
+    fprintf( stderr, "Could not reactivate OpenGL.\n" );
+    throw DisplayError();
+  }
+
+  GLcheck( "glXMakeCurrent" );
+
+  /* initialize textures: one at frame size, two at half the frame size in each dimension */
+  init_tex( GL_TEXTURE0, GL_LUMINANCE8, &state.Y_tex,
+	    state.framewidth, state.frameheight, GL_LINEAR );
+  init_tex( GL_TEXTURE1, GL_LUMINANCE8, &state.Cb_tex,
+	    state.framewidth/2, state.frameheight/2, GL_LINEAR );
+  init_tex( GL_TEXTURE2, GL_LUMINANCE8, &state.Cr_tex,
+	    state.framewidth/2, state.frameheight/2, GL_LINEAR );
+
+  /* load the shader */
+  GLint errorloc;
+  glEnable( GL_FRAGMENT_PROGRAM_ARB );
+  glGenProgramsARB( 1, &shader );
+  glBindProgramARB( GL_FRAGMENT_PROGRAM_ARB, shader );
+  glProgramStringARB( GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+		      strlen( ahab_fragment_program ),
+		      ahab_fragment_program );
+  glGetIntegerv( GL_PROGRAM_ERROR_POSITION_ARB, &errorloc );
+  if ( errorloc != -1 ) {
+    fprintf( stderr, "Error in fragment shader at position %d.\n", errorloc );
+    fprintf( stderr, "Error string: %s\n",
+	     glGetString( GL_PROGRAM_ERROR_STRING_ARB ) );
+    throw DisplayError();
+  }
+
+  GLcheck( "glProgramString" );
+
+  /* guess colors: frames of at most 480 lines get smpte170m, taller ones itu709 */
+  if ( state.frameheight <= 480 ) {
+    smpte170m.execute( state );
+  } else {
+    itu709.execute( state );
+  }
+}
+
+void OpcodeState::reset_viewport( void )
+{ /* Re-derives viewport and projection from the current width/height, then clears the colour buffer. */
+  glXSwapBuffers( display, window );
+  glFinish();
+  OpenGLDisplay::GLcheck( "reset_viewport: glFinish" );
+  XSync( display, False );
+  glLoadIdentity(); /* acts on whichever matrix mode is current on entry */
+  glViewport( 0, 0, width, height );
+  glMatrixMode( GL_PROJECTION );
+  glLoadIdentity();
+  glOrtho( 0, width, height, 0, -1, 1 ); /* bottom = height, top = 0: y grows downward from the top edge */
+  glMatrixMode( GL_MODELVIEW );
+  glLoadIdentity();
+  glPixelStorei( GL_UNPACK_ALIGNMENT, 1 ); /* rows of uploaded pixel data are byte-aligned */
+  OpenGLDisplay::GLcheck( "glPixelStorei" );
+  glWindowPos2d( 0, 0 );
+  glClear( GL_COLOR_BUFFER_BIT );
+  OpenGLDisplay::GLcheck( "reset_viewport" );
+}
+
+void OpcodeState::window_setup( void )
+{
+  /* width x height child of the root window at (0,0), titled "Ahab", listening for exposure and key presses */
+  const long wanted_events = KeyPressMask | ExposureMask;
+  window = XCreateSimpleWindow( display, DefaultRootWindow( display ), 0, 0, width, height, 0, 0, 0 );
+  XSetStandardProperties( display, window, "Ahab", "Ahab", None, NULL, 0, NULL );
+  XSelectInput( display, window, wanted_events );
+}
+
+OpenGLDisplay::~OpenGLDisplay()
+{
+  opq.flush();
+  /* Queue a ShutDown operation for the display thread, then wait for that thread to finish. */
+  DisplayOperation *shutdown = new ShutDown();
+  opq.enqueue( shutdown );
+
+  unixassert( pthread_join( thread_handle, NULL ) );
+
+  delete shutdown; /* Doesn't get deleted by loop because opcode execute() exits first. */
+  /* NOTE(review): these GL calls run on the destroying thread; confirm a GL context is current here (the visible code only calls glXMakeCurrent in init_context()/unfullscreen()). */
+  glDeleteProgramsARB( 1, &shader );
+  glDeleteTextures( 1, &state.Y_tex );
+  glDeleteTextures( 1, &state.Cb_tex );
+  glDeleteTextures( 1, &state.Cr_tex );
+  glXDestroyContext( state.display, state.context );
+  XDestroyWindow( state.display, state.window );
+  XCloseDisplay( state.display );
+}
+
+void OpenGLDisplay::init_tex( GLenum tnum, GLint internalformat, GLuint *tex,
+			      uint width, uint height, GLint interp )
+{
+  /* Creates an empty width x height rectangle texture on unit tnum; interp is used for both filters, edges are clamped. */
+  const GLenum target = GL_TEXTURE_RECTANGLE_ARB;
+  const GLenum pname[ 4 ] = { GL_TEXTURE_MIN_FILTER, GL_TEXTURE_MAG_FILTER, GL_TEXTURE_WRAP_S, GL_TEXTURE_WRAP_T };
+  const GLint value[ 4 ] = { interp, interp, GL_CLAMP_TO_EDGE, GL_CLAMP_TO_EDGE };
+
+  glActiveTexture( tnum );
+  glEnable( target );
+  glGenTextures( 1, tex );
+  glBindTexture( target, *tex );
+  glTexImage2D( target, 0, internalformat, width, height, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL );
+  for ( int i = 0; i < 4; i++ ) { glTexParameteri( target, pname[ i ], value[ i ] ); }
+  glTexEnvi( GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE );
+  GLcheck( "init_tex" );
+}
+
+/* Draws the three bound textures as one quad covering width x height, then swaps buffers. */
+void OpcodeState::paint( void )
+{
+  const double ff = 1.0/128; /* Mesa fudge factor */
+  const double xoffset = 0.25; /* MPEG-2 style 4:2:0 subsampling */
+
+  /* texture-coordinate extents: unit 0 spans dispwidth x dispheight, units 1 and 2 half of that */
+  const double luma_x0 = ff, luma_x1 = dispwidth+ff;
+  const double luma_y0 = ff, luma_y1 = dispheight+ff;
+  const double chroma_x0 = xoffset+ff, chroma_x1 = dispwidth/2 + xoffset + ff;
+  const double chroma_y0 = ff, chroma_y1 = dispheight/2 + ff;
+
+  glPushMatrix();
+  glLoadIdentity();
+  glTranslatef( 0, 0, 0 );
+  glBegin( GL_POLYGON );
+  /* corners in the order (0,0), (width,0), (width,height), (0,height) */
+  glMultiTexCoord2d( GL_TEXTURE0, luma_x0, luma_y0 );
+  glMultiTexCoord2d( GL_TEXTURE1, chroma_x0, chroma_y0 );
+  glMultiTexCoord2d( GL_TEXTURE2, chroma_x0, chroma_y0 );
+  glVertex2s( 0, 0 );
+  glMultiTexCoord2d( GL_TEXTURE0, luma_x1, luma_y0 );
+  glMultiTexCoord2d( GL_TEXTURE1, chroma_x1, chroma_y0 );
+  glMultiTexCoord2d( GL_TEXTURE2, chroma_x1, chroma_y0 );
+  glVertex2s( width, 0 );
+  glMultiTexCoord2d( GL_TEXTURE0, luma_x1, luma_y1 );
+  glMultiTexCoord2d( GL_TEXTURE1, chroma_x1, chroma_y1 );
+  glMultiTexCoord2d( GL_TEXTURE2, chroma_x1, chroma_y1 );
+  glVertex2s( width, height);
+  glMultiTexCoord2d( GL_TEXTURE0, luma_x0, luma_y1 );
+  glMultiTexCoord2d( GL_TEXTURE1, chroma_x0, chroma_y1 );
+  glMultiTexCoord2d( GL_TEXTURE2, chroma_x0, chroma_y1 );
+  glVertex2s( 0, height);
+  glEnd();
+  glPopMatrix();
+  glXSwapBuffers( display, window );
+  OpenGLDisplay::GLcheck( "glXSwapBuffers" );
+}
+
+typedef struct	/* NOTE(review): presumably the layout of the Motif window-manager hints property; not referenced in the visible part of this file */
+{
+    long flags;	/* presumably a mask of MWM_HINTS_* bits such as MWM_HINTS_DECORATIONS - confirm */
+    long functions;
+    long decorations;
+    long input_mode;
+    long state;
+} MotifWmHints;
+
+#define MWM_HINTS_DECORATIONS   (1L << 1)
+
+void OpcodeState::dofullscreen( void )
+{
+  /* ask the window manager for fullscreen state (EWMH _NET_WM_STATE) */
+  XEvent xev;
+  Atom wm_state = XInternAtom( display, "_NET_WM_STATE", False);
+  Atom fullscreen = XInternAtom( display, "_NET_WM_STATE_FULLSCREEN", False);
+
+  memset(&xev, 0, sizeof(xev));
+  xev.type = ClientMessage;
+  xev.xclient.window = window;
+  xev.xclient.message_type = wm_state;
+  xev.xclient.format = 32;
+  xev.xclient.data.l[0] = 1; /* _NET_WM_STATE_ADD */
+  xev.xclient.data.l[1] = fullscreen;
+  xev.xclient.data.l[2] = 0;
+
+  XSendEvent( display, DefaultRootWindow( display ), False,
+	      SubstructureNotifyMask | SubstructureRedirectMask, &xev);
+
+  /* hide cursor: install an 8x8 all-zero bitmap as the window's cursor */
+  Cursor thecursor;
+  Pixmap thepixmap;
+  char no_data[] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+  XColor bg = XColor(); /* zeroed: XCreatePixmapCursor reads its RGB fields */
+
+  thepixmap = XCreateBitmapFromData( display, window, no_data, 8, 8 );
+  thecursor = XCreatePixmapCursor( display, thepixmap, thepixmap, &bg, &bg, 0, 0 );
+  XDefineCursor( display, window, thecursor );
+  XFreeCursor( display, thecursor );
+  XFreePixmap( display, thepixmap );
+
+  /* adjust width and height of displayed image: fit the screen width first */
+  width = DisplayWidth( display, DefaultScreen( display ) );
+  height = lrint( (double)dispheight * (double)width / ((double)dispwidth * sar) );
+
+  if ( (signed)height > DisplayHeight( display, DefaultScreen( display ) ) ) {
+    height = DisplayHeight( display, DefaultScreen( display ) ); /* too tall: fit the height instead */
+    width = lrint( (double)dispwidth * (double) sar * (double)height / (double)dispheight );
+  }
+
+  reset_viewport();
+
+  OpenGLDisplay::GLcheck( "fullscreen" );
+
+  paint();
+}
+
+void OpcodeState::unfullscreen( void )
+{
+  XDestroyWindow( display, window );
+
+  /* adjust width and height of displayed image */
+  if ( sar > 1 ) {
+    width = lrint( (double)dispwidth * sar );
+    height = dispheight;
+  } else {
+    width = dispwidth;
+    height = lrint( (double)dispheight / sar );
+  }
+
+  window_setup();
+
+  if ( !glXMakeCurrent( display, window, context ) ) {
+    fprintf( stderr, "Could not reactivate OpenGL.\n" );
+    throw DisplayError();
+  }
+  
+  OpenGLDisplay::GLcheck( "glXMakeCurrent" );
+
+  reset_viewport();
+
+  XMapRaised( display, window );
+
+  paint();
+}
+
+void OpenGLDisplay::makeevent( void )
+{
+  XEvent event;
+  memset( &event, 0, sizeof( event ) );
+  event.type = Expose;
+  if ( 0 == XSendEvent( state.display, state.window, False, ExposureMask, &event ) ) {
+    throw DisplayError();
+  };
+  XFlush( state.display );
+}
+
+char OpenGLDisplay::getevent( bool block )
+{
+  XEvent               myevent;
+  XExposeEvent        *myexpose = (XExposeEvent *)&myevent;
+  XKeyEvent           *mykey    = (XKeyEvent *)&myevent;
+
+  if ( block || XPending( state.display ) ) {
+    XNextEvent( state.display, &myevent );
+
+    if ( myevent.type == Expose && myexpose->count == 0 ) {
+      return '@';
+    } else if ( myevent.type == KeyPress ) {
+      char key;
+      KeySym keysym;
+      if ( XLookupString( mykey, &key, 1, &keysym, NULL ) == 1 ) {
+	return key;
+      }
+    }
+  }
+
+  return 0;
+}
+
+void OpcodeState::load_matrix_coefficients( double green[ 3 ],
+					    double blue[ 3 ],
+					    double red[ 3 ] )
+{
+  glProgramLocalParameter4dARB( GL_FRAGMENT_PROGRAM_ARB, 0, /* local parameter 0 */
+				green[ 0 ], green[ 1 ], green[ 2 ], 0 );
+  glProgramLocalParameter4dARB( GL_FRAGMENT_PROGRAM_ARB, 1, /* local parameter 1 */
+				blue[ 0 ], blue[ 1 ], blue[ 2 ], 0 );
+  glProgramLocalParameter4dARB( GL_FRAGMENT_PROGRAM_ARB, 2, /* local parameter 2 */
+				red[ 0 ], red[ 1 ], red[ 2 ], 0 );
+  OpenGLDisplay::GLcheck( "glProgramLocalParameter4dARB" );
+}
+
+void OpcodeState::load_tex( GLenum tnum, GLuint tex,
+			     uint width, uint height, uint8_t *data )
+{
+  glActiveTexture( tnum );
+  OpenGLDisplay::GLcheck( "glActiveTexture" );
+  glBindTexture( GL_TEXTURE_RECTANGLE_ARB, tex );
+  OpenGLDisplay::GLcheck( "glBindTexture" );
+  glTexSubImage2D( GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, width, height,
+		   GL_LUMINANCE, GL_UNSIGNED_BYTE, data );
+  OpenGLDisplay::GLcheck( "glTexSubImage2D" );
+}
+
+void OpcodeState::draw( uint8_t *ycbcr )
+{
+  load_tex( GL_TEXTURE0, Y_tex, framewidth, frameheight, ycbcr );
+  load_tex( GL_TEXTURE1, Cb_tex, framewidth/2, frameheight/2,
+	    ycbcr + framewidth * frameheight );
+  load_tex( GL_TEXTURE2, Cr_tex, framewidth/2, frameheight/2,
+	    ycbcr + framewidth * frameheight + framewidth * frameheight / 4);
+  
+  paint();
+}
+
+void OpenGLDisplay::loop( void )
+{
+  state.window_setup();
+  XMapRaised( state.display, state.window );
+  XEvent event;
+  XNextEvent( state.display, &event );
+
+  init_context();
+  state.reset_viewport();
+
+  while ( 1 ) {
+    DisplayOperation *op = opq.dequeue( true );
+    op->execute( state );
+    delete op;
+  }
+}
diff --git a/ogl.hpp b/ogl.hpp
new file mode 100644
index 0000000..f57e2b9
--- /dev/null
+++ b/ogl.hpp
@@ -0,0 +1,78 @@
+#ifndef OGL_HPP
+#define OGL_HPP
+
+#include <X11/Xlib.h>
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include <GL/glx.h>
+#include <GL/glu.h>
+#include <stdint.h>
+
+#include "displayopq.hpp"
+#include "displayop.hpp"
+
+class OpcodeState {
+private:
+  static void load_tex( GLenum tnum, GLuint tex,
+			uint width, uint height, uint8_t *data );
+
+public:
+  Display *display;
+  Window window;
+  GLXContext context;
+  uint width, height; /* window size on screen */
+  uint framewidth, frameheight; /* luma matrix dimensions */
+  uint dispwidth, dispheight; /* MPEG-2 intended display size */
+  double sar;
+
+  void draw( uint8_t *ycbcr );
+  void paint( void );
+  void window_setup( void );
+  void reset_viewport( void );
+
+  void dofullscreen( void );
+  void unfullscreen( void );
+
+  void load_matrix_coefficients( double green[ 3 ],
+				 double blue[ 3 ],
+				 double red[ 3 ] );
+  GLuint Y_tex, Cb_tex, Cr_tex;
+};
+
+class OpenGLDisplay {
+ private:
+  OpcodeState state;
+
+  GLuint shader;
+
+  void init_context( void );
+  static void init_tex( GLenum tnum, GLint internalformat, GLuint *tex,
+			uint width, uint height, GLint interp );
+
+
+  pthread_t thread_handle;
+  OperationQueue<DisplayOperation> opq;
+
+ public:
+  OpenGLDisplay( char *display_name, double movie_sar,
+		 uint s_framewidth, uint s_frameheight,
+		 uint s_dispwidth, uint s_dispheight );
+  ~OpenGLDisplay();
+  char getevent( bool block );
+  void makeevent( void );
+
+  void loop( void );
+
+  OperationQueue<DisplayOperation> *get_queue() { return &opq; }
+
+  static void GLcheck( const char *where ) {
+    GLenum GLerror;
+    
+    if ( (GLerror = glGetError()) != GL_NO_ERROR ) {
+      fprintf( stderr, "GL error (%x) at (%s) (%s).\n", GLerror, where,
+	       gluErrorString( GLerror ) );
+    }
+  }
+};
+
+#endif
diff --git a/opq.cpp b/opq.cpp
new file mode 100644
index 0000000..4d0c3c8
--- /dev/null
+++ b/opq.cpp
@@ -0,0 +1,246 @@
+#include <pthread.h>
+
+#include "opq.hpp"
+#include "exceptions.hpp"
+#include "mutexobj.hpp"
+
+#include <typeinfo>
+
+template <class T>
+OperationQueue<T>::OperationQueue( int s_max_size )
+  : count( 0 ),
+    max_size( s_max_size ),
+    head( NULL ),
+    tail( NULL ),
+    output( NULL )
+{
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+  unixassert( pthread_cond_init( &write_activity, NULL ) );
+  unixassert( pthread_cond_init( &read_activity, NULL ) );
+}
+
+template <class T>
+OperationQueue<T>::~OperationQueue()
+{
+  {
+    MutexLock x( &mutex );
+
+    QueueElement<T> *ptr = head;
+
+    while ( ptr ) {
+      T *op = ptr->element;
+      QueueElement <T> *next = ptr->next;
+      delete op;
+      delete ptr;
+      ptr = next;
+    }
+
+    unixassert( pthread_cond_destroy( &read_activity ) );
+    unixassert( pthread_cond_destroy( &write_activity ) );
+  }
+
+  unixassert( pthread_mutex_destroy( &mutex ) );
+}
+
+template <class T>
+void OperationQueue<T>::enqueue( T *h )
+{
+  MutexLock x( &mutex );
+
+  if ( output ) {
+    ahabassert( !head );
+    ahabassert( !tail );
+    return output->enqueue( h );
+  }
+
+  while ( max_size && (count >= max_size) ) {
+    unixassert( pthread_cond_wait( &read_activity, &mutex ) );
+  }
+
+  QueueElement<T> *op = new QueueElement<T>( h );
+  op->prev = NULL;
+  op->next = head;
+
+  if ( head ) {
+    head->prev = op;
+    head = op;
+  } else {
+    head = tail = op;
+  }
+
+  count++;
+
+  unixassert( pthread_cond_signal( &write_activity ) );
+}
+
+template <class T>
+void OperationQueue<T>::leapfrog_enqueue( T *h,
+					  T *leapfrog_type )
+{
+  MutexLock x( &mutex );
+  /* Link h in after the element chosen by the scan below; forward if chained. */
+  if ( output ) {
+    ahabassert( !head );
+    ahabassert( !tail );
+    return output->leapfrog_enqueue( h, leapfrog_type );
+  }
+  /* block while the queue is at capacity (max_size == 0 disables the limit) */
+  while ( max_size && (count >= max_size) ) {
+    unixassert( pthread_cond_wait( &read_activity, &mutex ) );
+  }
+
+  QueueElement<T> *op = new QueueElement<T>( h );
+  QueueElement<T> *ptr = tail;
+  /* NOTE(review): typeid() of two T* is always equal and leapfrog_type is unused; confirm intent. */
+  while ( (ptr != NULL) && ( typeid( ptr->element ) !=
+			     typeid( h ) ) ) {
+    ptr = ptr->prev; /* step toward the head (was tail->prev, which never advanced) */
+  }
+  /* splice op in on the tail side of ptr, or start a new list when nothing matched */
+  if ( ptr ) {
+    op->next = ptr->next;
+    ptr->next = op;
+    op->prev = ptr;
+    if ( op->next ) {
+      op->next->prev = op;
+    } else {
+      tail = op;
+    }
+  } else {
+    head = tail = op;
+    op->prev = op->next = NULL;
+  }
+
+  count++;
+
+  unixassert( pthread_cond_signal( &write_activity ) );
+}
+
+template <class T>
+T *OperationQueue<T>::dequeue( bool wait )
+{
+  QueueElement<T> *ret_elem;
+  T *ret;
+  MutexLock x( &mutex );
+
+  if ( (!wait) && (count == 0 ) ) {
+    return NULL;
+  }
+
+  while ( count == 0 ) {
+    unixassert( pthread_cond_wait( &write_activity, &mutex ) );      
+  }
+
+  ret_elem = tail;
+  tail = tail->prev;
+  if ( tail ) {
+    tail->next = NULL;
+  } else {
+    assert( count == 1 );
+    head = NULL;
+  }
+
+  ret = ret_elem->element;
+  delete ret_elem;
+  count--;
+
+  unixassert( pthread_cond_signal( &read_activity ) );
+
+  return ret;
+}
+
+template <class T>
+void OperationQueue<T>::flush( void )
+{
+  MutexLock x( &mutex );
+  /* Delete every queued element (or forward the flush to the chained queue). */
+  if ( output ) {
+    ahabassert( !head );
+    ahabassert( !tail );
+    return output->flush();
+  }
+
+  QueueElement<T> *ptr = head;
+  while ( ptr ) {
+    T *op = ptr->element;
+    QueueElement <T> *next = ptr->next;
+    delete op;
+    delete ptr;
+    ptr = next;
+  }
+
+  head = tail = NULL;
+  count = 0;
+  unixassert( pthread_cond_broadcast( &read_activity ) ); /* space freed: wake blocked enqueuers */
+}
+
+template <class T>
+void OperationQueue<T>::flush_type( T *h )
+{
+  MutexLock x( &mutex );
+  /* Delete the queued elements whose type matches *h (forwarded if chained). */
+  if ( output ) {
+    ahabassert( !head );
+    ahabassert( !tail );
+    return output->flush_type( h );
+  }
+
+  QueueElement<T> *ptr = head;
+
+  while ( ptr ) {
+    T *op = ptr->element;
+    /* compare the pointees: typeid on the T* pointers themselves is always equal */
+    bool deleting = ( typeid( *op ) == typeid( *h ) );
+
+    QueueElement <T> *next = ptr->next;
+
+    if ( deleting ) {
+      if ( ptr->prev ) {
+	ptr->prev->next = ptr->next;
+      } else {
+	ahabassert( ptr == head );
+	head = ptr->next;
+      }
+
+      if ( ptr->next ) {
+	ptr->next->prev = ptr->prev;
+      } else {
+	ahabassert( ptr == tail );
+	tail = ptr->prev;
+      }
+
+      delete op;
+      delete ptr;
+      count--;
+      unixassert( pthread_cond_signal( &read_activity ) ); /* a slot was freed, as in dequeue() */
+    }
+
+    ptr = next;
+  }
+}
+
+template <class T>
+void OperationQueue<T>::hookup( OperationQueue<T> *s_output )
+{
+  MutexLock x( &mutex );
+
+  ahabassert( !output );
+
+  output = s_output;
+
+  /* move anything already in our queue */
+  QueueElement<T> *ptr = head;
+
+  while ( ptr ) {
+    T *op = ptr->element;
+    QueueElement <T> *next = ptr->next;
+
+    output->enqueue( op ); /* this can block for a long time */
+
+    delete ptr;
+    ptr = next;
+  }
+
+  head = tail = NULL;
+  count = 0;
+}
diff --git a/opq.hpp b/opq.hpp
new file mode 100644
index 0000000..b776e53
--- /dev/null
+++ b/opq.hpp
@@ -0,0 +1,50 @@
+#ifndef OPQ_HPP
+#define OPQ_HPP
+
+#include <pthread.h>
+#include <sys/types.h>
+
+#include "mutexobj.hpp"
+
+template <class T>
+class QueueElement
+{
+public:
+  QueueElement<T> *prev, *next; /* list neighbours; null until linked in */
+  T *element;                   /* payload pointer, stored exactly as given */
+
+  QueueElement( T *s ) : prev( 0 ), next( 0 ), element( s ) {}
+};
+
+template <class T>
+class OperationQueue
+{
+private:
+  uint num_ops;
+
+  pthread_mutex_t mutex;
+  pthread_cond_t  read_activity;
+  pthread_cond_t  write_activity;
+
+  int count, max_size;
+  QueueElement<T> *head, *tail;
+
+  OperationQueue<T> *output;
+
+public:
+  OperationQueue( int s_max_size );
+  ~OperationQueue();
+  void enqueue( T *h );
+  void leapfrog_enqueue( T *h, T *leapfrog_type );
+
+  T *dequeue( bool wait );
+
+  void flush_type( T *h );
+  void flush( void );  
+
+  int get_count( void ) { MutexLock x( &mutex ); return count; }
+
+  void hookup( OperationQueue<T> *s_output );
+};
+
+#endif
diff --git a/picture.cpp b/picture.cpp
new file mode 100644
index 0000000..47917c2
--- /dev/null
+++ b/picture.cpp
@@ -0,0 +1,333 @@
+#include <string.h>
+#include <typeinfo>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "libmpeg2.h"
+
+#include "mpegheader.hpp"
+#include "framebuffer.hpp"
+
+extern const uint8_t mpeg2_scan_norm[ 64 ]; /* These are the MMX versions */
+extern const uint8_t mpeg2_scan_alt[ 64 ];
+
+Picture::Picture( BitReader &hdr ) {
+  init();
+  sequence = NULL;
+  extension = NULL;
+  slicerow = NULL;
+  coded_order = display_order = -1;
+  set_unclean( false );
+  set_broken( false );
+  set_unknown_quantiser_matrix( false );
+  incomplete = false;
+  forward_reference = backward_reference = NULL;
+  fh = NULL;
+
+  hdr.reset();
+  ahabassert( hdr.readbits( 32 ) == 0x00000100 );
+
+  temporal_reference = hdr.readbits( 10 );
+  uint8_t picture_coding_type = hdr.readbits( 3 );
+  switch ( picture_coding_type ) {
+  case 1: type = I; break;
+  case 2: type = P; break;
+  case 3: type = B; break;
+  default: throw MPEGInvalid(); break;
+  }
+
+  vbv_delay = hdr.readbits( 16 );
+}
+
+Picture::~Picture()
+{
+  if ( slicerow ) {
+    delete[] slicerow;
+  }
+  if ( fh ) {
+    delete fh;
+  }
+}
+
+static const int non_linear_scale [] = {
+  0,  1,  2,  3,  4,  5,   6,   7,
+  8, 10, 12, 14, 16, 18,  20,  22,
+  24, 28, 32, 36, 40, 44,  48,  52,
+  56, 64, 72, 80, 88, 96, 104, 112
+};
+
+void Picture::link( void )
+{
+  /* We should handle this more gracefully if a stream is truncated
+     after picture but before picture coding extension XXX */
+
+  /* Find my extension */
+  PictureCodingExtension *pe = dynamic_cast<PictureCodingExtension *>( get_next() );
+  if ( pe == NULL ) {
+    fprintf( stderr, "Picture coding extension not found at %lld.\n", get_location() );
+    throw MPEGInvalid();
+  }
+  extension = pe;
+
+  /* Allocate space for slicerows */
+  uint mb_height = get_sequence()->get_mb_height();
+  slicerow = new Slice *[ mb_height ];
+
+  for ( uint i = 0; i < mb_height; i++ ) {
+    slicerow[ i ] = NULL;
+  }
+
+  /* Collect all the slicerows */
+  MPEGHeader *hdr = extension;
+  while ( hdr && ( typeid( *hdr ) != typeid( Picture ) ) ) {
+    if ( typeid( *hdr ) == typeid( Slice ) ) {
+      Slice *ts = static_cast<Slice *>( hdr );
+      ts->set_picture( this );
+      uint val = ts->get_val();
+      mpegassert( (val > 0) && (val <= mb_height) );
+      int loc = val - 1;
+      if ( slicerow[ loc ] == NULL ) {
+	slicerow[ loc ] = ts;
+      }
+    }
+
+    hdr = hdr->get_next();
+  }
+
+  for ( uint i = 0; i < mb_height; i++ ) {
+    if ( slicerow[ i ] == NULL ) {
+      incomplete = true;
+      break;
+    }
+  }
+}
+
+uint Picture::num_fields( void )
+{
+  PictureCodingExtension *ext = get_extension();
+
+  uint fields = 0;
+
+  /* Calculate how many fields picture is */
+  if ( ext->picture_structure != 3 ) {
+    fields = 1;
+  } else {
+    if ( get_sequence()->get_progressive_sequence() ) {
+      if ( ext->repeat_first_field == 0 ) fields = 2;
+      if ( (ext->repeat_first_field == 1) && (ext->top_field_first == 0) ) fields = 4;
+      if ( (ext->repeat_first_field == 1) && (ext->top_field_first == 1) ) fields = 6;
+    } else { /* interlaced sequence */
+      if ( ext->progressive_frame == 0 ) fields = 2;
+      if ( (ext->progressive_frame == 1) && (ext->repeat_first_field == 0) ) fields = 2;
+      if ( (ext->progressive_frame == 1) && (ext->repeat_first_field == 1) ) fields = 3;
+    }
+  }
+
+  mpegassert( fields != 0 );
+  return fields;
+}
+
+void Picture::print_info( void ) {
+  Picture *forw = get_forward();
+  Picture *bakw = get_backward();
+  
+  char forstr[ 64 ], bakstr[ 64 ];
+
+  if ( forw ) {
+    snprintf( forstr, 64, "%d", forw->get_display() );
+  } else {
+    strcpy( forstr, "XXX" );
+  }
+
+  if ( bakw ) {
+    snprintf( bakstr, 64, "%d", bakw->get_display() );
+  } else {
+    strcpy( bakstr, "XXX" );
+  }
+
+  char dependencies[64];
+  switch ( type ) {
+  case I: strcpy( dependencies, "" ); break;
+  case P: snprintf( dependencies, 64, "(%s=>) ", forstr ); break;
+  case B: snprintf( dependencies, 64, "(%s=>,=>%s) ", forstr, bakstr ); break;
+  }
+
+  PictureCodingExtension *e = get_extension();
+  printf( "PICTURE[%c] %d %sis coded %d %s %s %s %s\n",
+	  get_type_char(), get_display(), dependencies, get_coded(), get_unclean() ? "(unflushed anchor)" : "",
+	  get_incomplete() ? "(missing slices)" : "",
+	  get_broken() ? "(broken dependencies)" : "",
+	  get_unknown_quantiser_matrix() ? "(unknown quantiser matrix)" : "" );
+  printf( "params: intra_dc_precision=%d, picture_structure=%d, frame_pred_frame_dct=%d, concealment_motion_vectors=%d, q_scale_type=%d, intra_vlc_format=%d, alternate_scan=%d, top_field_first=%d\n", e->intra_dc_precision, e->picture_structure, e->frame_pred_frame_dct, e->concealment_motion_vectors, e->q_scale_type, e->intra_vlc_format, e->alternate_scan, e->top_field_first );
+  printf( "intra_quantiser_matrix[ 64 ] = { " );
+  for ( int i = 0; i < 64; i++ ) {
+    printf( "%d, ", intra_quantiser_matrix[ i ] );
+  }
+  printf( "};\n non_intra_quantiser_matrix[ 64 ] = { " );
+  for ( int i = 0; i < 64; i++ ) {
+    printf( "%d, ", non_intra_quantiser_matrix[ i ] );
+  }
+  printf( "};\n" );
+}
+
+void Picture::setup_decoder( mpeg2_decoder_t *d, uint8_t *current_fbuf[3],
+			     uint8_t *forward_fbuf[3],
+			     uint8_t *backward_fbuf[3] )
+{
+  d->picture_structure = get_extension()->picture_structure;
+  d->stride_frame = 16 * get_sequence()->get_mb_width();
+  d->width = 16 * get_sequence()->get_mb_width();
+  d->height = 16 * get_sequence()->get_mb_height();
+  d->coding_type = type;
+  d->vertical_position_extension = 0;
+  d->chroma_format = 0; /* 4:2:0 */
+  d->concealment_motion_vectors = get_extension()->concealment_motion_vectors;
+  d->scan = get_extension()->alternate_scan ? mpeg2_scan_alt : mpeg2_scan_norm;
+  d->intra_dc_precision = 7 - get_extension()->intra_dc_precision;
+  d->frame_pred_frame_dct = get_extension()->frame_pred_frame_dct;
+  d->q_scale_type = get_extension()->q_scale_type;
+  d->intra_vlc_format = get_extension()->intra_vlc_format;
+  d->top_field_first = get_extension()->top_field_first;
+  d->convert = NULL;
+  d->convert_id = NULL;
+  d->mpeg1 = 0;
+  
+  d->f_motion.f_code[ 0 ] = get_extension()->f_code_fh - 1;
+  d->f_motion.f_code[ 1 ] = get_extension()->f_code_fv - 1;
+  d->b_motion.f_code[ 0 ] = get_extension()->f_code_bh - 1;
+  d->b_motion.f_code[ 1 ] = get_extension()->f_code_bv - 1;
+
+  /* Calculate prescaled quantisers */
+  for ( uint i = 0; i < 32; i++ ) {
+    int k = get_extension()->q_scale_type ? non_linear_scale[ i ] : (i << 1);
+    for ( uint j = 0; j < 64; j++ ) {
+      d->quantizer_prescale[ 0 ][ i ][ j ] =
+        k * intra_quantiser_matrix[ j ];
+      d->quantizer_prescale[ 1 ][ i ][ j ] =
+        k * non_intra_quantiser_matrix[ j ];
+    }
+  }
+
+  d->chroma_quantizer[ 0 ] = d->quantizer_prescale[ 0 ];
+  d->chroma_quantizer[ 1 ] = d->quantizer_prescale[ 1 ];
+
+  int stride, height;
+  
+  stride = d->stride_frame;
+  height = d->height;
+  
+  d->picture_dest[0] = current_fbuf[0];
+  d->picture_dest[1] = current_fbuf[1];
+  d->picture_dest[2] = current_fbuf[2];
+  
+  d->f_motion.ref[0][0] = forward_fbuf[0];
+  d->f_motion.ref[0][1] = forward_fbuf[1];
+  d->f_motion.ref[0][2] = forward_fbuf[2];
+  
+  d->b_motion.ref[0][0] = backward_fbuf[0];
+  d->b_motion.ref[0][1] = backward_fbuf[1];
+  d->b_motion.ref[0][2] = backward_fbuf[2];
+  
+  d->stride = stride;
+  d->uv_stride = stride >> 1;
+  d->slice_stride = 16 * stride;
+  d->slice_uv_stride =
+    d->slice_stride >> (2 - d->chroma_format);
+  d->limit_x = 2 * d->width - 32;
+  d->limit_y_16 = 2 * height - 32;
+  d->limit_y_8 = 2 * height - 16;
+  d->limit_y = height - 16;
+
+  d->invalid = false;
+
+  memset( d->DCTblock, 0, 64 * sizeof( int16_t ) );
+
+  motion_setup( d );
+}
+
+void debug( void ) {}
+
+void Picture::lock_and_decodeall( void )
+{
+  fh->increment_lockcount();
+
+  if ( fh->get_frame()
+       && fh->get_frame()->get_state() == RENDERED ) {
+    return;
+  }
+
+  fh->decrement_lockcount();
+
+  Frame *cur, *fwd, *back;
+
+  /* Lock and decode pre-requisites */
+  if ( forward_reference ) {
+    forward_reference->lock_and_decodeall();
+  }
+
+  if ( backward_reference ) {
+    backward_reference->lock_and_decodeall();
+  }
+
+  /* Lock myself */
+  fh->increment_lockcount();
+
+  cur = fwd = back = fh->get_frame();
+
+  if ( forward_reference ) fwd = forward_reference->get_framehandle()->get_frame();
+  if ( backward_reference ) back = backward_reference->get_framehandle()->get_frame();
+
+  uint8_t *curf[3] = { cur->get_y(), cur->get_cb(), cur->get_cr() };
+  uint8_t *fwdf[3] = { fwd->get_y(), fwd->get_cb(), fwd->get_cr() };
+  uint8_t *backf[3] = { back->get_y(), back->get_cb(), back->get_cr() };
+
+  uint height = 16 * get_sequence()->get_mb_height();
+  uint width = 16 * get_sequence()->get_mb_width();
+
+  if ( problem() ) {
+    memset( cur->get_y(), 128, height * width );
+    memset( cur->get_cb(), 128, height * width / 4 );
+    memset( cur->get_cr(), 128, height * width / 4 );
+  }
+
+  uint rows = get_sequence()->get_mb_height();
+
+  mpeg2_decoder_t d;
+  setup_decoder( &d, curf, fwdf, backf );
+
+  for ( uint row = 0; row < rows; row++ ) {
+    Slice *s = get_slicerow( row );
+    while ( s != NULL ) {
+      MapHandle *chunk = s->map_chunk();
+
+      d.bitstream_buf = 0;
+      d.bitstream_bits = 0;
+      d.bitstream_ptr = chunk->get_buf() + 4;
+      d.bit_ptr_end = chunk->get_buf() + chunk->get_len();
+
+      s->decode( &d, s->get_val(), chunk->get_buf() + 4 );
+
+      if ( d.invalid ) {
+	invalid = true;
+	d.invalid = false;
+      }
+
+      delete chunk;
+
+      s = s->get_next_in_row();
+    }
+  }
+
+  fh->get_frame()->set_rendered();
+
+  if ( forward_reference ) forward_reference->get_framehandle()->decrement_lockcount();  
+  if ( backward_reference ) backward_reference->get_framehandle()->decrement_lockcount();
+
+  /* leave myself locked */
+}
+
+void Picture::init_fh( BufferPool *pool )
+{
+  fh = pool->make_handle( this );
+}
+
diff --git a/sequence.cpp b/sequence.cpp
new file mode 100644
index 0000000..4350c27
--- /dev/null
+++ b/sequence.cpp
@@ -0,0 +1,183 @@
+#include <typeinfo>
+
+#include "mpegheader.hpp"
+#include "mpegtables.hpp"
+
+Sequence::Sequence( BitReader &hdr )
+{
+  init();
+  /* Parse the sequence header fields from hdr, starting at its start code. */
+  hdr.reset();
+  ahabassert ( hdr.readbits( 32 ) == 0x000001b3 );
+
+  horizontal_size_value = hdr.readbits( 12 );
+  vertical_size_value = hdr.readbits( 12 );
+
+  uint atmp = hdr.readbits( 4 ); /* aspect code; anything outside 1..4 is rejected */
+  switch ( atmp ) {
+  case 1: aspect = SAR1x1; break;
+  case 2: aspect = DAR4x3; break;
+  case 3: aspect = DAR16x9; break;
+  case 4: aspect = DAR221x100; break;
+  default: throw MPEGInvalid(); break;
+  }
+
+  frame_rate_code = hdr.readbits( 4 );
+  mpegassert( (frame_rate_code != 0) && (frame_rate_code <= 8) );
+
+  bit_rate_value = hdr.readbits( 18 );
+  mpegassert( hdr.readbits( 1 ) == 1 ); /* single bit that must be set */
+  vbv_buffer_size_value = hdr.readbits( 10 );
+  constrained_parameters_flag = hdr.readbits( 1 );
+
+  mpegassert( constrained_parameters_flag == 0 );
+  /* each matrix entry i is stored at index mpeg2_normal_scan[ i ] and must be non-zero */
+  uint load_intra_quantiser_matrix = hdr.readbits( 1 );
+  for ( int i = 0; i < 64; i++ ) {
+    intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] =
+      load_intra_quantiser_matrix
+      ? hdr.readbits( 8 )
+      : default_intra_quantiser_matrix[ i ];
+    mpegassert( intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] != 0 );
+  }
+
+  uint load_non_intra_quantiser_matrix = hdr.readbits( 1 );
+  for ( int i = 0; i < 64; i++ ) {
+    non_intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] =
+      load_non_intra_quantiser_matrix ? hdr.readbits( 8 ) : 16; /* 16 when not transmitted */
+    mpegassert( non_intra_quantiser_matrix[ mpeg2_normal_scan[ i ] ] != 0 );
+  }
+}
+
+uint Sequence::get_horizontal_size( void )
+{
+  return (get_extension()->horizontal_size_extension << 12)
+    | horizontal_size_value;
+}
+
+uint Sequence::get_vertical_size( void )
+{
+  return (get_extension()->vertical_size_extension << 12)
+    | vertical_size_value;
+}
+
+uint Sequence::get_mb_width( void )
+{
+  return (get_horizontal_size() + 15) / 16;
+}
+
+uint Sequence::get_mb_height( void )
+{
+  if ( get_extension()->progressive_sequence ) {
+    return (get_vertical_size() + 15)/16;
+  } else {
+    return 2*((get_vertical_size() + 31)/32);
+  }
+  /* We don't support field pictures. */
+}
+
+uint64_t Sequence::get_frame_rate_numerator( void )
+{
+  return sequence_numerators[ frame_rate_code ]
+    * ( get_extension()->frame_rate_extension_n + 1 );
+}
+
+uint64_t Sequence::get_frame_rate_denominator( void )
+{
+  return sequence_denominators[ frame_rate_code ]
+    * ( get_extension()->frame_rate_extension_d + 1 );
+}
+
+double Sequence::get_frame_rate( void )
+{
+  return (double)get_frame_rate_numerator() / (double)get_frame_rate_denominator();
+}
+
+void Sequence::set_unknown_quantiser_flags( void )
+{
+  MPEGHeader *hdr = get_next();
+
+  while ( hdr && ( typeid( *hdr ) != typeid( Sequence ) ) ) {
+    if ( typeid( *hdr ) == typeid( Picture ) ) {
+      Picture *pic = static_cast<Picture *>( hdr );
+      pic->set_unknown_quantiser_matrix( true );
+    }
+
+    hdr = hdr->get_next();
+  }
+}
+
+void Sequence::link( void )
+{
+  /* We should handle this more gracefully if a stream is truncated
+     after sequence header but before sequence extension XXX */
+
+  /* Find my extension */
+  SequenceExtension *se = dynamic_cast<SequenceExtension *>( get_next() );
+  if ( se == NULL ) {
+    fprintf( stderr, "Sequence extension not found at %lld.\n", get_location() );
+    throw MPEGInvalid();
+  }
+  extension = se;
+
+  if ( get_vertical_size() > 2800 ) {
+    throw ConformanceLimitExceeded();
+  }
+
+  /* Set quantization matrices of pictures until next sequence */
+  uint8_t *current_intra = intra_quantiser_matrix;
+  uint8_t *current_non_intra = non_intra_quantiser_matrix;
+
+  MPEGHeader *hdr = extension;
+  while ( hdr && ( typeid( *hdr ) != typeid( Sequence ) ) ) {
+    if ( typeid( *hdr ) == typeid( QuantMatrixExtension ) ) {
+      QuantMatrixExtension *tq =
+	static_cast<QuantMatrixExtension *>( hdr );
+      uint8_t *new_intra = tq->get_intra_quantiser_matrix();
+      uint8_t *new_non_intra = tq->get_non_intra_quantiser_matrix();
+
+      if ( new_intra ) current_intra = new_intra;
+      if ( new_non_intra ) current_non_intra = new_non_intra;
+    } else if ( typeid( *hdr ) == typeid( Picture ) ) {
+      Picture *tp = static_cast<Picture *>( hdr );
+      tp->set_sequence( this );
+      tp->set_intra( current_intra );
+      tp->set_non_intra( current_non_intra );
+    }
+
+    hdr = hdr->get_next();
+  }
+
+  /* Make sure sequence parameters don't change */
+  if ( hdr ) {
+    Sequence *ts = dynamic_cast<Sequence *>( hdr );
+    ahabassert( ts );
+    mpegassert( horizontal_size_value == ts->horizontal_size_value );
+    mpegassert( vertical_size_value == ts->vertical_size_value );
+    //    mpegassert( aspect == ts->aspect );
+    mpegassert( frame_rate_code == ts->frame_rate_code );
+    //    mpegassert( bit_rate_value == ts->bit_rate_value );
+    mpegassert( vbv_buffer_size_value == ts->vbv_buffer_size_value );
+    mpegassert( constrained_parameters_flag == ts->constrained_parameters_flag );
+
+    if ( (aspect != ts->aspect) || (bit_rate_value != ts->bit_rate_value) ) {
+      fprintf( stderr, "Warning, sequence illegally changes parameters (aspect %d=>%d, bit_rate_value %d=>%d).\n",
+	       aspect, ts->aspect, bit_rate_value, ts->bit_rate_value);
+    }
+  }
+}
+
+double Sequence::get_sar( void )
+{
+  double width = get_horizontal_size(); /* XXX should be display size if present*/
+  double height = get_vertical_size();
+
+  switch ( aspect ) {
+  case SAR1x1: return 1;
+  case DAR4x3: return (height/3.0) / (width/4.0);
+  case DAR16x9: return (height/9.0) / (width/16.0);
+  case DAR221x100: return (height/100.0) / (width/221.0);
+  }
+
+  throw AhabException();
+}
diff --git a/slice.cpp b/slice.cpp
new file mode 100644
index 0000000..34f0c3c
--- /dev/null
+++ b/slice.cpp
@@ -0,0 +1,43 @@
+#include <errno.h>
+#include <typeinfo>
+#include <stdio.h>
+
+#include "mpegheader.hpp"
+#include "file.hpp"
+
+Slice::Slice( uint s_val, File *s_file ) {
+  init();
+
+  val = s_val;
+  file = s_file;
+
+  next_slice_in_row = NULL;
+  picture = NULL;
+  len = 0;
+  incomplete = false;
+}
+
+void Slice::link( void )
+{
+  MPEGHeader *hdr = get_next();
+  if ( hdr == NULL ) {
+    incomplete = true;
+    return;
+  }
+
+  len = hdr->get_location() - get_location();
+  if ( typeid( *hdr ) == typeid( Slice ) ) {
+    Slice *ts = static_cast<Slice *>( hdr );
+    if ( ts->get_val() == get_val() ) {
+      next_slice_in_row = ts;
+    }
+  }
+}
+
+void Slice::print_info( void ) {
+  printf( "s(%u,len=%u%s)", val, len, incomplete ? " [incomplete]" : "" );
+}
+
+MapHandle *Slice::map_chunk( void ) {
+  return file->map( get_location(), len );
+}
diff --git a/slicedecode.cpp b/slicedecode.cpp
new file mode 100644
index 0000000..712ad98
--- /dev/null
+++ b/slicedecode.cpp
@@ -0,0 +1,1697 @@
+/*
+ * slice.c
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2003      Peter Gubanov <peter@elecard.net.ru>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+
+#include <inttypes.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+#include "mpegheader.hpp"
+
+#include <stdio.h>
+
+extern mpeg2_mc_t mpeg2_mc;
+
+#include "vlc.h"
+#include "mmx.h"
+/* Decode the macroblock type code for decoder->coding_type and return the resulting mode flags. */
+static inline int get_macroblock_modes (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int macroblock_modes;
+    const MBtab * tab;
+    /* No NEEDBITS in this function: presumably the caller refills first -- TODO confirm. */
+    switch (decoder->coding_type) {
+    case I_TYPE:
+	/* MB_I is indexed with UBITS (bit_buf, 1); the entry supplies code length and flags. */
+	tab = MB_I + UBITS (bit_buf, 1);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+	/* One extra DCT type bit, only for frame pictures without frame_pred_frame_dct. */
+	if ((! (decoder->frame_pred_frame_dct)) &&
+	    (decoder->picture_structure == FRAME_PICTURE)) {
+	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+	    DUMPBITS (bit_buf, bits, 1);
+	}
+
+	return macroblock_modes;
+
+    case P_TYPE:
+	/* MB_P is indexed with 5 bits; every return below also ORs in MACROBLOCK_MOTION_FORWARD. */
+	tab = MB_P + UBITS (bit_buf, 5);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (decoder->picture_structure != FRAME_PICTURE) {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;	/* 2-bit motion type */
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	} else if (decoder->frame_pred_frame_dct) {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
+		macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;	/* no bits read: MC_FRAME is implied */
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;	/* 1-bit DCT type */
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes | MACROBLOCK_MOTION_FORWARD;
+	}
+
+    case B_TYPE:
+	/* MB_B is indexed with 6 bits; unlike P_TYPE, no flag is forced into the result. */
+	tab = MB_B + UBITS (bit_buf, 6);
+	DUMPBITS (bit_buf, bits, tab->len);
+	macroblock_modes = tab->modes;
+
+	if (decoder->picture_structure != FRAME_PICTURE) {
+	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
+		macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+		DUMPBITS (bit_buf, bits, 2);
+	    }
+	    return macroblock_modes;
+	} else if (decoder->frame_pred_frame_dct) {
+	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
+	    macroblock_modes |= MC_FRAME << MOTION_TYPE_SHIFT;
+	    return macroblock_modes;
+	} else {
+	    if (macroblock_modes & MACROBLOCK_INTRA)
+		goto intra;	/* intra: skip the motion type, read only the DCT type bit */
+	    macroblock_modes |= UBITS (bit_buf, 2) << MOTION_TYPE_SHIFT;
+	    DUMPBITS (bit_buf, bits, 2);
+	    if (macroblock_modes & (MACROBLOCK_INTRA | MACROBLOCK_PATTERN)) {
+	    intra:
+		macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
+		DUMPBITS (bit_buf, bits, 1);
+	    }
+	    return macroblock_modes;
+	}
+
+    case D_TYPE:
+	/* A single bit is consumed and the macroblock is reported as intra. */
+	DUMPBITS (bit_buf, bits, 1);
+	return MACROBLOCK_INTRA;
+
+    default:
+	return 0;	/* unrecognised coding type: no flags, no bits consumed */
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Read a 5-bit quantizer scale code from the bitstream and repoint the four
+ * decoder->quantizer_matrix slots at the tables stored for that code:
+ * slots 0 and 1 from quantizer_prescale, slots 2 and 3 from chroma_quantizer. */
+static inline void get_quantizer_scale (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+    /* Fetch the code with UBITS, then drop the same five bits. */
+    const int scale_code = UBITS (bit_buf, 5);
+    DUMPBITS (bit_buf, bits, 5);
+
+    /* Slots 0 and 1: quantizer_prescale. */
+    decoder->quantizer_matrix[0] = decoder->quantizer_prescale[0][scale_code];
+    decoder->quantizer_matrix[1] = decoder->quantizer_prescale[1][scale_code];
+
+    /* Slots 2 and 3: chroma_quantizer. */
+    decoder->quantizer_matrix[2] = decoder->chroma_quantizer[0][scale_code];
+    decoder->quantizer_matrix[3] = decoder->chroma_quantizer[1][scale_code];
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one signed motion vector delta for the given f_code and return it. */
+static inline int get_motion_delta (mpeg2_decoder_t * const decoder,
+				    const int f_code)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+    int delta;
+    int sign;
+    const MVtab * tab;
+    /* Three cases by magnitude of bit_buf: zero delta, MV_4 lookup, MV_10 lookup. */
+    if (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 1);	/* top bit set: delta is 0, one bit consumed */
+	return 0;
+    } else if (bit_buf >= 0x0c000000) {
+	/* MV_4 path: no refill; the code, sign bit and f_code residual are counted in bits at once. */
+	tab = MV_4 + UBITS (bit_buf, 4);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + f_code + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);	/* presumably 0 or -1 -- TODO confirm against SBITS */
+	bit_buf <<= 1;
+
+	if (f_code)
+	    delta += UBITS (bit_buf, f_code);
+	bit_buf <<= f_code;
+	/* (delta ^ sign) - sign leaves delta unchanged for sign 0 and negates it for sign -1. */
+	return (delta ^ sign) - sign;
+
+    } else {
+	/* MV_10 path: only the code and sign bit are counted here; the residual follows a refill. */
+	tab = MV_10 + UBITS (bit_buf, 10);
+	delta = (tab->delta << f_code) + 1;
+	bits += tab->len + 1;
+	bit_buf <<= tab->len;
+
+	sign = SBITS (bit_buf, 1);
+	bit_buf <<= 1;
+
+	if (f_code) {
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    delta += UBITS (bit_buf, f_code);
+	    DUMPBITS (bit_buf, bits, f_code);
+	}
+
+	return (delta ^ sign) - sign;
+
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Wrap vector into the signed (5 + f_code)-bit range by sign-extending its low 5 + f_code bits. */
+static inline int bound_motion_vector (const int vector, const int f_code)
+{   /* Shift left as uint32_t: doing so on a negative or overflowing int32_t is undefined before C++20. */
+    return (int32_t) ((uint32_t) vector << (27 - f_code)) >> (27 - f_code);
+}
+/* Decode one dmv value: DMV_2 is indexed with UBITS (bit_buf, 2) and the entry
+ * found there gives both the number of bits to drop and the value to return. */
+static inline int get_dmv (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+    const DMVtab * const entry = &DMV_2[UBITS (bit_buf, 2)];
+
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->dmv;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode the coded block pattern. After a refill, the magnitude of bit_buf
+ * selects either the CBP_7 table (index biased by 16) or the CBP_9 table;
+ * the chosen entry gives the code length to drop and the pattern to return. */
+static inline int get_coded_block_pattern (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+    const CBPtab * entry;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+    /* Values below 0x20000000 use the 9-bit lookup, */
+    /* everything else the 7-bit one. */
+    if (bit_buf < 0x20000000)
+	entry = CBP_9 + UBITS (bit_buf, 9);
+    else
+	entry = CBP_7 + (UBITS (bit_buf, 7) - 16);
+
+    /* Both tables are consumed the same way. */
+    DUMPBITS (bit_buf, bits, entry->len);
+    return entry->cbp;
+
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one luma DC differential and return it shifted left by decoder->intra_dc_precision. */
+static inline int get_luma_dc_dct_diff (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+    /* Short codes use the DC_lum_5 table; bit_buf values from 0xf8000000 up use DC_long. */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_lum_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* code and size-bit value accounted for together, no refill */
+	    bit_buf <<= tab->len;
+	    dc_diff =
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff << decoder->intra_dc_precision;
+	} else {
+	    DUMPBITS (bit_buf, bits, 3);	/* size 0: a fixed 3 bits are dropped and the diff is 0 */
+	    return 0;
+	}
+    } else {
+	tab = DC_long + (UBITS (bit_buf, 9) - 0x1e0);	/* 9-bit lookup, index biased by 0x1e0 */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len);
+	NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);	/* refill before reading the value bits */
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff << decoder->intra_dc_precision;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one chroma DC differential and return it shifted left by decoder->intra_dc_precision. */
+static inline int get_chroma_dc_dct_diff (mpeg2_decoder_t * const decoder)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
+    int size;
+    int dc_diff;
+    /* Short codes use the DC_chrom_5 table; bit_buf values from 0xf8000000 up use DC_long. */
+    if (bit_buf < 0xf8000000) {
+	tab = DC_chrom_5 + UBITS (bit_buf, 5);
+	size = tab->size;
+	if (size) {
+	    bits += tab->len + size;	/* code and size-bit value accounted for together, no refill */
+	    bit_buf <<= tab->len;
+	    dc_diff =
+		UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	    bit_buf <<= size;
+	    return dc_diff << decoder->intra_dc_precision;
+	} else {
+	    DUMPBITS (bit_buf, bits, 2);	/* size 0: a fixed 2 bits are dropped and the diff is 0 */
+	    return 0;
+	}
+    } else {
+	tab = DC_long + (UBITS (bit_buf, 10) - 0x3e0);	/* 10-bit lookup, index biased by 0x3e0 */
+	size = tab->size;
+	DUMPBITS (bit_buf, bits, tab->len + 1);	/* one bit more than the table length is dropped here */
+	NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	dc_diff = UBITS (bit_buf, size) - UBITS (SBITS (~bit_buf, 1), size);
+	DUMPBITS (bit_buf, bits, size);
+	return dc_diff << decoder->intra_dc_precision;
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Shift val left by 4 in place; if it then no longer fits int16_t, replace it with (SBITS (val, 1) ^ 2047) << 4. */
+#define SATURATE(val)				\
+do {						\
+    val <<= 4;					\
+    if (unlikely (val != (int16_t) val))	\
+	val = (SBITS (val, 1) ^ 2047) << 4;	\
+} while (0)
+/* Decode the coefficients of one intra block with the B14 tables, dequantizing into decoder->DCTblock. */
+static void get_intra_block_B14 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* dest[0] is read below, so the caller must have stored the DC term already. */
+    i = 0;
+    mismatch = ~dest[0];
+    /* Work on local copies of the bitstream state; they are stored back on normal exit only. */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an index of 64 or more after adding the run ends the block. */
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* Shared by every table: dequantize, apply the sign bit, saturate, store at scan[i]. */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = (tab->level * quant_matrix[j]) >> 4;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; an entry that pushes i to 64 or more is handled as the escape code. */
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+	    /* The run is the 6 bits after the first 6; a 12-bit signed level follows. */
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64) {
+	      throw MPEGInvalid();	/* throws before the local bit state is written back */
+	      break;	/* illegal, check needed to avoid buffer overflow */
+	    }
+
+	    j = scan[i];
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	//	throw MPEGInvalid();
+	//	fprintf( stderr, "Suspected invalid block.\n" );
+	decoder->invalid = true;	/* a long code gave an index of 64 or more: flag it and stop */
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* fold bit 4 of the accumulated XOR into the last coefficient */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* Decode the coefficients of one intra block with the B15 tables, dequantizing into decoder->DCTblock. */
+static void get_intra_block_B15 (mpeg2_decoder_t * const decoder,
+				 const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* dest[0] is read below, so the caller must have stored the DC term already. */
+    i = 0;
+    mismatch = ~dest[0];
+    /* Work on local copies of the bitstream state; they are stored back at the end. */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+    while (1) {
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup covers the short codes, the end of block and the escape code. */
+	    tab = DCT_B15_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64) {
+		/* Shared by every table: dequantize, apply the sign bit, saturate, store at scan[i]. */
+	    normal_code:
+		j = scan[i];
+		bit_buf <<= tab->len;
+		bits += tab->len + 1;
+		val = (tab->level * quant_matrix[j]) >> 4;
+
+		/* if (bitstream_get (1)) val = -val; */
+		val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+				
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		bit_buf <<= 1;
+		NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+		continue;
+
+	    } else {
+
+		/* end of block. I commented out this code because if we */
+		/* do not exit here we will still exit at the later test :) */
+
+		/* if (i >= 128) break;	*/	/* end of block */
+
+		/* escape code */
+
+		i += UBITS (bit_buf << 6, 6) - 64;
+		if (i >= 64) {
+		  //		  throw MPEGInvalid();
+		  /* This seems to show up in all kinds of bitstreams -- KJW */
+		  //		  fprintf( stderr, "Invalid macroblock.\n" );
+		  break;	/* illegal, check against buffer overflow */
+		}
+		/* Genuine escape: a 12-bit signed level follows the 6-bit code and 6-bit run. */
+		j = scan[i];
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+		val = (SBITS (bit_buf, 12) * quant_matrix[j]) / 16;
+
+		SATURATE (val);
+		dest[j] = val;
+		mismatch ^= val;
+
+		DUMPBITS (bit_buf, bits, 12);
+		NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+		continue;
+
+	    }
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B15_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	//	throw MPEGInvalid();
+	//	fprintf( stderr, "Suspected invalid block.\n" );
+	decoder->invalid = true;	/* a long code gave an index of 64 or more: flag it and stop */
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* fold bit 4 of the accumulated XOR into the last coefficient */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* Decode one non-intra block into decoder->DCTblock and return the final value of the index i. */
+static int get_non_intra_block (mpeg2_decoder_t * const decoder,
+				const uint16_t * const quant_matrix)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    int mismatch;
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* i starts at -1 here (the intra readers start at 0). */
+    i = -1;
+    mismatch = -1;
+    /* Work on local copies of the bitstream state; they are stored back at the end. */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    if (bit_buf >= 0x28000000) {
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);	/* first code only: the DC_5 table, not AC_5 */
+	goto entry_1;
+    } else
+	goto entry_2;
+    /* Both gotos above jump into the loop body, so the loop test at the top is skipped once. */
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* Shared by every table: dequantize (2 * level + 1), apply the sign bit, saturate, store. */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; an entry that pushes i to 64 or more is handled as the escape code. */
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64) {
+	      //	      throw MPEGInvalid();
+	      //	      fprintf( stderr, "Suspected invalid block.\n" );
+	      decoder->invalid = true;
+	      break;	/* illegal, check needed to avoid buffer overflow */
+	    }
+
+	    j = scan[i];
+	    /* The 12-bit signed level is turned into 2 * (level + sign) + 1 before scaling. */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    val = 2 * (SBITS (bit_buf, 12) + SBITS (bit_buf, 1)) + 1;
+	    val = (val * quant_matrix[j]) / 32;
+
+	    SATURATE (val);
+	    dest[j] = val;
+	    mismatch ^= val;
+
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	//	fprintf( stderr, "Suspected invalid block.\n" );
+	decoder->invalid = true;	/* a long code gave an index of 64 or more: flag it and stop */
+	//	throw MPEGInvalid();
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    dest[63] ^= mismatch & 16;	/* fold bit 4 of the accumulated XOR into the last coefficient */
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;	/* slice_non_intra_DCT hands this to mpeg2_idct_add as its first argument */
+}
+/* Decode one MPEG-1 intra block into decoder->DCTblock using quantizer_matrix[0]; throws MPEGInvalid on bad codes. */
+static void get_mpeg1_intra_block (mpeg2_decoder_t * const decoder)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[0];
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* No mismatch accumulator in this variant; values are forced odd instead (see below). */
+    i = 0;
+    /* Work on local copies of the bitstream state; they are stored back on normal exit only. */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+	    /* 5-bit lookup; an index of 64 or more after adding the run ends the block. */
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* Shared by every table: dequantize, force odd, apply the sign bit, saturate, store. */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = (tab->level * quant_matrix[j]) >> 4;
+
+	    /* oddification */
+	    val = (val - 1) | 1;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; an entry that pushes i to 64 or more is handled as the escape code. */
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64) {
+	      throw MPEGInvalid();	/* throws before the local bit state is written back */
+	      break;	/* illegal, check needed to avoid buffer overflow */
+	    }
+
+	    j = scan[i];
+	    /* Level is 8 signed bits; when its low 7 bits are zero, 8 more bits extend it. */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = (val * quant_matrix[j]) / 16;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	throw MPEGInvalid();	/* a long code gave an index of 64 or more */
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+}
+/* Decode one MPEG-1 non-intra block into decoder->DCTblock using quantizer_matrix[1]; returns the final index i. */
+static int get_mpeg1_non_intra_block (mpeg2_decoder_t * const decoder)
+{
+    int i;
+    int j;
+    int val;
+    const uint8_t * const scan = decoder->scan;
+    const uint16_t * const quant_matrix = decoder->quantizer_matrix[1];
+    const DCTtab * tab;
+    uint32_t bit_buf;
+    int bits;
+    const uint8_t * bit_ptr;
+    int16_t * const dest = decoder->DCTblock;
+    /* i starts at -1 here (the intra readers start at 0). */
+    i = -1;
+    /* Work on local copies of the bitstream state; they are stored back on normal exit only. */
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    if (bit_buf >= 0x28000000) {
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);	/* first code only: the DC_5 table, not AC_5 */
+	goto entry_1;
+    } else
+	goto entry_2;
+    /* Both gotos above jump into the loop body, so the loop test at the top is skipped once. */
+    while (1) {
+	if (bit_buf >= 0x28000000) {
+
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
+
+	entry_1:
+	    i += tab->run;
+	    if (i >= 64)
+		break;	/* end of block */
+	    /* Shared by every table: dequantize (2 * level + 1), force odd, apply the sign bit, store. */
+	normal_code:
+	    j = scan[i];
+	    bit_buf <<= tab->len;
+	    bits += tab->len + 1;
+	    val = ((2 * tab->level + 1) * quant_matrix[j]) >> 5;
+
+	    /* oddification */
+	    val = (val - 1) | 1;
+
+	    /* if (bitstream_get (1)) val = -val; */
+	    val = (val ^ SBITS (bit_buf, 1)) - SBITS (bit_buf, 1);
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    bit_buf <<= 1;
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	}
+
+    entry_2:
+	if (bit_buf >= 0x04000000) {
+	    /* 8-bit lookup; an entry that pushes i to 64 or more is handled as the escape code. */
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
+
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+
+	    /* escape code */
+
+	    i += UBITS (bit_buf << 6, 6) - 64;
+	    if (i >= 64) {
+	      throw MPEGInvalid();	/* throws before the local bit state is written back */
+	      break;	/* illegal, check needed to avoid buffer overflow */
+	    }
+
+	    j = scan[i];
+	    /* Level is 8 signed bits; when its low 7 bits are zero, 8 more bits extend it. */
+	    DUMPBITS (bit_buf, bits, 12);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    val = SBITS (bit_buf, 8);
+	    if (! (val & 0x7f)) {
+		DUMPBITS (bit_buf, bits, 8);
+		val = UBITS (bit_buf, 8) + 2 * val;
+	    }
+	    val = 2 * (val + SBITS (val, 1)) + 1;
+	    val = (val * quant_matrix[j]) / 32;
+
+	    /* oddification */
+	    val = (val + ~SBITS (val, 1)) | 1;
+
+	    SATURATE (val);
+	    dest[j] = val;
+
+	    DUMPBITS (bit_buf, bits, 8);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	    continue;
+
+	} else if (bit_buf >= 0x02000000) {
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00800000) {
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else if (bit_buf >= 0x00200000) {
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	} else {
+	    tab = DCT_16 + UBITS (bit_buf, 16);
+	    bit_buf <<= 16;
+	    GETWORD (bit_buf, bits + 16, bit_ptr);
+	    i += tab->run;
+	    if (i < 64)
+		goto normal_code;
+	}
+	throw MPEGInvalid();	/* a long code gave an index of 64 or more */
+	break;	/* illegal, check needed to avoid buffer overflow */
+    }
+    DUMPBITS (bit_buf, bits, tab->len);	/* dump end of block code */
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;	/* slice_non_intra_DCT hands this to mpeg2_idct_add as its first argument */
+}
+/* Decode one intra block of component cc and pass decoder->DCTblock to mpeg2_idct_copy for dest.
+ * The DC term accumulates in dc_dct_pred[cc]; the rest comes from the MPEG-1, B15 or B14 reader. */
+static inline void slice_intra_DCT (mpeg2_decoder_t * const decoder,
+				    const int cc,
+				    uint8_t * const dest, const int stride)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    /* Component 0 uses the luma DC reader, every other component the chroma one. */
+    const int dc_diff = (cc != 0) ? get_chroma_dc_dct_diff (decoder)
+				  : get_luma_dc_dct_diff (decoder);
+    decoder->dc_dct_pred[cc] += dc_diff;
+    decoder->DCTblock[0] = decoder->dc_dct_pred[cc];
+
+    if (! decoder->mpeg1) {
+	if (decoder->intra_vlc_format)
+	    get_intra_block_B15 (decoder, decoder->quantizer_matrix[cc ? 2 : 0]);
+	else
+	    get_intra_block_B14 (decoder, decoder->quantizer_matrix[cc ? 2 : 0]);
+    } else if (decoder->coding_type != D_TYPE) {
+	get_mpeg1_intra_block (decoder);
+    }
+    mpeg2_idct_copy (decoder->DCTblock, dest, stride);
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one non-intra block of component cc and hand decoder->DCTblock,
+ * together with the reader's return value, to mpeg2_idct_add for dest. */
+static inline void slice_non_intra_DCT (mpeg2_decoder_t * const decoder,
+					const int cc,
+					uint8_t * const dest, const int stride)
+{
+    /* MPEG-1 streams use their own reader; otherwise matrix slot 1 (cc == 0) or 3 is used. */
+    const int last = decoder->mpeg1
+	? get_mpeg1_non_intra_block (decoder)
+	: get_non_intra_block (decoder, decoder->quantizer_matrix[cc ? 3 : 1]);
+
+    /* The reader's return value is the first argument of the IDCT call. */
+    mpeg2_idct_add (last, decoder->DCTblock, dest, stride);
+}
+/* Predict a size-row luma block and its size/2-row chroma blocks from ref; clamps and overwrites motion_x/motion_y; needs pos_x, pos_y, xy_half, offset, decoder in scope. */
+#define MOTION_420(table,ref,motion_x,motion_y,size,y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = 2 * decoder->v_offset + motion_y + 2 * y;			      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y_ ## size)) {			      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y_ ## size;	      \
+	motion_y = pos_y - 2 * decoder->v_offset - 2 * y;		      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    table[xy_half] (decoder->dest[0] + y * decoder->stride + decoder->offset, \
+		    ref[0] + (pos_x >> 1) + (pos_y >> 1) * decoder->stride,   \
+		    decoder->stride, size);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      ((((decoder->v_offset + motion_y) >> 1) + y/2) *		      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + y/2 * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      decoder->uv_stride, size/2);			      \
+    table[4+xy_half] (decoder->dest[2] + y/2 * decoder->uv_stride +	      \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      decoder->uv_stride, size/2)
+/* Predict alternate rows (doubled strides; 8 luma, 4 chroma rows) into rows offset by dest_field; op is spliced after pos_y/motion_y and src_field added to pick the source row. */
+#define MOTION_FIELD_420(table,ref,motion_x,motion_y,dest_field,op,src_field) \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    table[xy_half] (decoder->dest[0] + dest_field * decoder->stride +	      \
+		    decoder->offset,					      \
+		    (ref[0] + (pos_x >> 1) +				      \
+		     ((pos_y op) + src_field) * decoder->stride),	      \
+		    2 * decoder->stride, 8);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + (motion_y op) + src_field) *	      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[1] + offset,		      \
+		      2 * decoder->uv_stride, 4);			      \
+    table[4+xy_half] (decoder->dest[2] + dest_field * decoder->uv_stride +    \
+		      (decoder->offset >> 1), ref[2] + offset,		      \
+		      2 * decoder->uv_stride, 4)
+/* Like the field macro, but runs each plane twice: once from the base row and once from one stride further down, both with doubled stride. */
+#define MOTION_DMV_420(table,ref,motion_x,motion_y)			      \
+    pos_x = 2 * decoder->offset + motion_x;				      \
+    pos_y = decoder->v_offset + motion_y;				      \
+    if (unlikely (pos_x > decoder->limit_x)) {				      \
+	pos_x = ((int)pos_x < 0) ? 0 : decoder->limit_x;		      \
+	motion_x = pos_x - 2 * decoder->offset;				      \
+    }									      \
+    if (unlikely (pos_y > decoder->limit_y)) {				      \
+	pos_y = ((int)pos_y < 0) ? 0 : decoder->limit_y;		      \
+	motion_y = pos_y - decoder->v_offset;				      \
+    }									      \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);				      \
+    offset = (pos_x >> 1) + (pos_y & ~1) * decoder->stride;		      \
+    table[xy_half] (decoder->dest[0] + decoder->offset,			      \
+		    ref[0] + offset, 2 * decoder->stride, 8);		      \
+    table[xy_half] (decoder->dest[0] + decoder->stride + decoder->offset,     \
+		    ref[0] + decoder->stride + offset,			      \
+		    2 * decoder->stride, 8);				      \
+    motion_x /= 2;	motion_y /= 2;					      \
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			      \
+    offset = (((decoder->offset + motion_x) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + (motion_y & ~1)) *		      \
+	       decoder->uv_stride));					      \
+    table[4+xy_half] (decoder->dest[1] + (decoder->offset >> 1),	      \
+		      ref[1] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[1] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[1] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4);			      \
+    table[4+xy_half] (decoder->dest[2] + (decoder->offset >> 1),	      \
+		      ref[2] + offset, 2 * decoder->uv_stride, 4);	      \
+    table[4+xy_half] (decoder->dest[2] + decoder->uv_stride +		      \
+		      (decoder->offset >> 1),				      \
+		      ref[2] + decoder->uv_stride + offset,		      \
+		      2 * decoder->uv_stride, 4)
+/* Zero-vector case: table[0] on 16 luma rows and table[4] on 8 rows of each chroma plane, reading ref at (offset, v_offset). */
+#define MOTION_ZERO_420(table,ref)					      \
+    table[0] (decoder->dest[0] + decoder->offset,			      \
+	      (ref[0] + decoder->offset +				      \
+	       decoder->v_offset * decoder->stride), decoder->stride, 16);    \
+    offset = ((decoder->offset >> 1) +					      \
+	      (decoder->v_offset >> 1) * decoder->uv_stride);		      \
+    table[4] (decoder->dest[1] + (decoder->offset >> 1),		      \
+	      ref[1] + offset, decoder->uv_stride, 8);			      \
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),		      \
+	      ref[2] + offset, decoder->uv_stride, 8)
+/* File-scope aliases for the decoder's bitstream fields, used by the NEEDBITS/UBITS/DUMPBITS calls in the MOTION_FUNCTIONS bodies. */
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+#define MOTION_FUNCTIONS(FORMAT,MOTION,MOTION_FIELD,MOTION_DMV,MOTION_ZERO)   \
+									      \
+static void motion_fr_frame_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,		      \
+						     motion->f_code[1]);      \
+    motion_y = bound_motion_vector (motion_y, motion->f_code[1]);	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y;			      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+									      \
+static void motion_fr_field_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				      motion_t * const motion,		      \
+				      mpeg2_mc_fct * const * const table)     \
+{									      \
+    int motion_x, motion_y, field;					      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[0][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[0][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field); \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    field = UBITS (bit_buf, 1);						      \
+    DUMPBITS (bit_buf, bits, 1);					      \
+									      \
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion_x;					      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    motion_y = ((motion->pmv[1][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion_y << 1;					      \
+									      \
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field); \
+}									      \
+									      \
+static void motion_fr_dmv_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				    motion_t * const motion,		      \
+				    mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;		      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,		      \
+						     motion->f_code[0]);      \
+    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);	      \
+    motion->pmv[1][0] = motion->pmv[0][0] = motion_x;			      \
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);					      \
+    dmv_x = get_dmv (decoder);						      \
+									      \
+    motion_y = ((motion->pmv[0][1] >> 1) +				      \
+		get_motion_delta (decoder, motion->f_code[1]));		      \
+    /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */	      \
+    motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;		      \
+    dmv_y = get_dmv (decoder);						      \
+									      \
+    m = decoder->top_field_first ? 1 : 3;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0); \
+									      \
+    m = decoder->top_field_first ? 3 : 1;				      \
+    other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;		      \
+    other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;	      \
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);\
+									      \
+    MOTION_DMV (mpeg2_mc.avg, motion->ref[0], motion_x, motion_y);	      \
+}									      \
+									      \
+static void motion_reuse_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				   motion_t * const motion,		      \
+				   mpeg2_mc_fct * const * const table)	      \
+{									      \
+    int motion_x, motion_y;						      \
+    unsigned int pos_x, pos_y, xy_half, offset;				      \
+									      \
+    motion_x = motion->pmv[0][0];					      \
+    motion_y = motion->pmv[0][1];					      \
+									      \
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);		      \
+}									      \
+									      \
+static void motion_zero_##FORMAT (mpeg2_decoder_t * const decoder,	      \
+				  motion_t * const motion,		      \
+				  mpeg2_mc_fct * const * const table)	      \
+{									      \
+    unsigned int offset;						      \
+									      \
+    motion->pmv[0][0] = motion->pmv[0][1] = 0;				      \
+    motion->pmv[1][0] = motion->pmv[1][1] = 0;				      \
+									      \
+    MOTION_ZERO (table, motion->ref[0]);				      \
+}									      \
+									      \
+
+MOTION_FUNCTIONS (420, MOTION_420, MOTION_FIELD_420, MOTION_DMV_420,
+		  MOTION_ZERO_420)
+
+/* Same vector parsing as motion_fr_frame_* (forward predictors only) but without the MOTION call, followed by one marker bit.  Slice::decode uses it for intra macroblocks when picture_structure is FRAME_PICTURE. */
+static void motion_fr_conceal (mpeg2_decoder_t * const decoder)
+{
+    motion_t * const fwd = &(decoder->f_motion);
+    int vec;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    vec = fwd->pmv[0][0] + get_motion_delta (decoder, fwd->f_code[0]);
+    vec = bound_motion_vector (vec, fwd->f_code[0]);
+    fwd->pmv[0][0] = vec;	/* x component, kept in both predictor rows */
+    fwd->pmv[1][0] = vec;
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    vec = fwd->pmv[0][1] + get_motion_delta (decoder, fwd->f_code[1]);
+    vec = bound_motion_vector (vec, fwd->f_code[1]);
+    fwd->pmv[0][1] = vec;	/* y component, kept in both predictor rows */
+    fwd->pmv[1][1] = vec;
+
+    DUMPBITS (bit_buf, bits, 1); /* drop the marker_bit after the vector */
+}
+/* Counterpart of motion_fr_conceal that Slice::decode uses when picture_structure is not FRAME_PICTURE: a field_select bit precedes the vector; no motion compensation is performed. */
+static void motion_fi_conceal (mpeg2_decoder_t * const decoder)
+{
+    motion_t * const fwd = &(decoder->f_motion);
+    int vec;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    DUMPBITS (bit_buf, bits, 1); /* field_select is read past, not used */
+    vec = fwd->pmv[0][0] + get_motion_delta (decoder, fwd->f_code[0]);
+    vec = bound_motion_vector (vec, fwd->f_code[0]);
+    fwd->pmv[0][0] = vec;	/* x component, kept in both predictor rows */
+    fwd->pmv[1][0] = vec;
+
+    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    vec = fwd->pmv[0][1] + get_motion_delta (decoder, fwd->f_code[1]);
+    vec = bound_motion_vector (vec, fwd->f_code[1]);
+    fwd->pmv[0][1] = vec;	/* y component, kept in both predictor rows */
+    fwd->pmv[1][1] = vec;
+
+    DUMPBITS (bit_buf, bits, 1); /* drop the marker_bit after the vector */
+}
+/* The motion parsers end here; drop the bitstream shorthands that were defined for them. */
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+/* Call `routine` once per direction flag set in `direction`: forward uses mpeg2_mc.put; backward uses mpeg2_mc.avg when the forward flag is also set, otherwise mpeg2_mc.put.  Needs `decoder` in scope. */
+#define MOTION_CALL(routine,direction)				\
+do {								\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD)		\
+	routine (decoder, &(decoder->f_motion), mpeg2_mc.put);	\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD)		\
+	routine (decoder, &(decoder->b_motion),			\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?	\
+		  mpeg2_mc.avg : mpeg2_mc.put));		\
+} while (0)
+/* Step decoder->offset by 16.  When it reaches decoder->width: optionally call decoder->convert, move dest[] down one slice row, add 16 to v_offset, and RETURN FROM THE ENCLOSING FUNCTION (after emms()) once v_offset exceeds limit_y; otherwise restart at offset 0. */
+#define NEXT_MACROBLOCK							\
+do {									\
+    decoder->offset += 16;						\
+    if (decoder->offset == decoder->width) {				\
+	do { /* just so we can use the break statement */		\
+	    if (decoder->convert) {					\
+		decoder->convert (decoder->convert_id, decoder->dest,	\
+				  decoder->v_offset);			\
+		if (decoder->coding_type == B_TYPE)			\
+		    break;	/* converter + B_TYPE: dest[] stays put */ \
+	    }								\
+	    decoder->dest[0] += decoder->slice_stride;			\
+	    decoder->dest[1] += decoder->slice_uv_stride;		\
+	    decoder->dest[2] += decoder->slice_uv_stride;		\
+	} while (0);							\
+	decoder->v_offset += 16;					\
+	if (decoder->v_offset > decoder->limit_y) {			\
+	    emms();                                                     \
+	    return;	/* leaves the function that expanded this macro */ \
+	}								\
+	decoder->offset = 0;						\
+    }									\
+} while (0)
+/* Fill d->motion_parser with the *_420 parsers generated by MOTION_FUNCTIONS.  Slots 0 and 4 are the ones Slice::decode invokes directly when a macroblock address increment is non-zero. */
+void Picture::motion_setup( mpeg2_decoder_t *d )
+{
+  d->motion_parser[0] = &motion_zero_420;		/* zero vector */
+  d->motion_parser[MC_FIELD] = &motion_fr_field_420;	/* two field vectors */
+  d->motion_parser[MC_FRAME] = &motion_fr_frame_420;	/* one frame vector */
+  d->motion_parser[MC_DMV] = &motion_fr_dmv_420;	/* vector + dmv values */
+  d->motion_parser[4] = &motion_reuse_420;		/* replay stored vector */
+}
+/* Set `decoder` up for one slice.  `code` is the 1-based slice row (three more high bits are read when vertical_position_extension is set).  Resets the predictors, positions dest[] and v_offset, reads the quantizer scale, skips extra slice data and decodes the first macroblock address increment.  Returns 1 if the resulting v_offset exceeds limit_y, else 0; throws MPEGInvalid when the increment cannot be decoded. */
+static inline int slice_init (mpeg2_decoder_t * const decoder, int code)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int offset;
+    const MBAtab * mba;
+    /* Reset the three DC predictors to 16384 and zero all motion predictors. */
+    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+	decoder->dc_dct_pred[2] = 16384;
+
+    decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+    decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+    decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+    decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
+    /* Optional three extra bits extend the row number above bit 6. */
+    if (decoder->vertical_position_extension) {
+	code += UBITS (bit_buf, 3) << 7;
+	DUMPBITS (bit_buf, bits, 3);
+    }
+    decoder->v_offset = (code - 1) * 16;
+    offset = 0;	/* stays 0 when a converter is set and coding_type is B_TYPE */
+    if (!(decoder->convert) || decoder->coding_type != B_TYPE)
+	offset = (code - 1) * decoder->slice_stride;
+
+    decoder->dest[0] = decoder->picture_dest[0] + offset;
+    offset >>= (2 - decoder->chroma_format);
+    decoder->dest[1] = decoder->picture_dest[1] + offset;
+    decoder->dest[2] = decoder->picture_dest[2] + offset;
+
+    get_quantizer_scale (decoder);
+
+    /* ignore intra_slice and all the extra data */
+    while (bit_buf & 0x80000000) {
+	DUMPBITS (bit_buf, bits, 9);
+	NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+    }
+
+    /* decode initial macroblock address increment */
+    offset = 0;
+    while (1) {
+	if (bit_buf >= 0x08000000) {
+	    mba = MBA_5 + (UBITS (bit_buf, 6) - 2);
+	    break;
+	} else if (bit_buf >= 0x01800000) {
+	    mba = MBA_11 + (UBITS (bit_buf, 12) - 24);
+	    break;
+	} else switch (UBITS (bit_buf, 12)) {
+	case 8:		/* macroblock_escape */
+	    offset += 33;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    continue;
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    bit_buf &= 0xfffff;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	    continue;
+	default:	/* error */
+	  fprintf( stderr, "Couldn't decode initial macroblock address increment.\n" );
+	  throw MPEGInvalid();
+	  return 1;	/* not reached: the throw above always leaves */
+	}
+    }
+    DUMPBITS (bit_buf, bits, mba->len + 1);
+    decoder->offset = (offset + mba->mba) << 4;
+    /* A start column at or beyond the picture width wraps onto the following rows. */
+    while (decoder->offset - decoder->width >= 0) {
+	decoder->offset -= decoder->width;
+	if (!(decoder->convert) || decoder->coding_type != B_TYPE) {
+	    decoder->dest[0] += decoder->slice_stride;
+	    decoder->dest[1] += decoder->slice_uv_stride;
+	    decoder->dest[2] += decoder->slice_uv_stride;
+	}
+	decoder->v_offset += 16;
+    }
+    if (decoder->v_offset > decoder->limit_y)
+	return 1;
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+/* Decode one slice into decoder->dest[].  `buffer` is the slice's bitstream (handed to bitstream_init) and `code` is forwarded to slice_init.  Loops over macroblocks until NEXT_MACROBLOCK passes limit_y or no further macroblock address increment can be decoded.  Throws MPEGInvalid on a motion type outside the parser table. */
+void Slice::decode (mpeg2_decoder_t * const decoder, const int code,
+		    const uint8_t * const buffer)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+
+    bitstream_init (decoder, buffer);
+
+    if (slice_init (decoder, code))
+	return;
+    /* From here on emms() must run on every exit.  The explicit calls below only cover the return paths, so this guard also covers exceptions (MPEGInvalid below, or anything thrown by a helper). */
+    struct MMXCleanup { ~MMXCleanup() { emms(); } } mmx_cleanup;
+    while (1) {
+	int macroblock_modes;
+	int mba_inc;
+	const MBAtab * mba;
+
+	NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+
+	macroblock_modes = get_macroblock_modes (decoder);
+
+	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
+	if (macroblock_modes & MACROBLOCK_QUANT)
+	    get_quantizer_scale (decoder);
+
+	if (macroblock_modes & MACROBLOCK_INTRA) {
+	    /* Intra macroblock: parse concealment vectors if enabled (otherwise zero the motion predictors), then decode every block with slice_intra_DCT. */
+	    int DCT_offset, DCT_stride;
+	    int offset;
+	    uint8_t * dest_y;
+
+	    if (decoder->concealment_motion_vectors) {
+		if (decoder->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (decoder);
+		else
+		    motion_fi_conceal (decoder);
+	    } else {
+		decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+		decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+		decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+		decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
+	    }
+	    /* DCT_TYPE_INTERLACED: the lower pair of blocks starts one row down with a doubled stride; otherwise it starts eight rows down with the plain stride. */
+	    if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		DCT_offset = decoder->stride;
+		DCT_stride = decoder->stride * 2;
+	    } else {
+		DCT_offset = decoder->stride * 8;
+		DCT_stride = decoder->stride;
+	    }
+
+	    offset = decoder->offset;
+	    dest_y = decoder->dest[0] + offset;
+	    slice_intra_DCT (decoder, 0, dest_y, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + 8, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset + 8, DCT_stride);
+	    if (likely (decoder->chroma_format == 0)) {
+		slice_intra_DCT (decoder, 1, decoder->dest[1] + (offset >> 1),
+				 decoder->uv_stride);
+		slice_intra_DCT (decoder, 2, decoder->dest[2] + (offset >> 1),
+				 decoder->uv_stride);
+		if (decoder->coding_type == D_TYPE) {
+		    NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+		    DUMPBITS (bit_buf, bits, 1);
+		}
+	    } else if (likely (decoder->chroma_format == 1)) {
+		uint8_t * dest_u = decoder->dest[1] + (offset >> 1);
+		uint8_t * dest_v = decoder->dest[2] + (offset >> 1);
+		DCT_stride >>= 1;
+		DCT_offset >>= 1;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+	    } else {
+		uint8_t * dest_u = decoder->dest[1] + offset;
+		uint8_t * dest_v = decoder->dest[2] + offset;
+		slice_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + 8, DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + 8, DCT_stride);
+		slice_intra_DCT (decoder, 1, dest_u + DCT_offset + 8,
+				 DCT_stride);
+		slice_intra_DCT (decoder, 2, dest_v + DCT_offset + 8,
+				 DCT_stride);
+	    }
+	} else {
+	    /* Non-intra macroblock: run the parser selected by the motion-type bits, then decode the blocks flagged in coded_block_pattern with slice_non_intra_DCT. */
+	    motion_parser_t * parser;
+
+	    if (   ((macroblock_modes >> MOTION_TYPE_SHIFT) < 0)
+                || ((macroblock_modes >> MOTION_TYPE_SHIFT) >=
+                    (int)(sizeof(decoder->motion_parser)
+                          / sizeof(decoder->motion_parser[0])))
+	       ) {
+	      throw MPEGInvalid(); /* motion type outside the parser table */
+	    }
+
+	    parser =
+		decoder->motion_parser[macroblock_modes >> MOTION_TYPE_SHIFT];
+	    MOTION_CALL (parser, macroblock_modes);
+
+	    if (macroblock_modes & MACROBLOCK_PATTERN) {
+		int coded_block_pattern;
+		int DCT_offset, DCT_stride;
+
+		if (macroblock_modes & DCT_TYPE_INTERLACED) {
+		    DCT_offset = decoder->stride;
+		    DCT_stride = decoder->stride * 2;
+		} else {
+		    DCT_offset = decoder->stride * 8;
+		    DCT_stride = decoder->stride;
+		}
+
+		coded_block_pattern = get_coded_block_pattern (decoder);
+
+		if (likely (decoder->chroma_format == 0)) {
+		    int offset = decoder->offset;
+		    uint8_t * dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     decoder->uv_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     decoder->uv_stride);
+		} else if (likely (decoder->chroma_format == 1)) {
+		    int offset;
+		    uint8_t * dest_y;
+
+		    coded_block_pattern |= bit_buf & (3 << 30);
+		    DUMPBITS (bit_buf, bits, 2);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+
+		    DCT_stride >>= 1;
+		    DCT_offset = (DCT_offset + offset) >> 1;
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + (offset >> 1),
+					     DCT_stride);
+		    if (coded_block_pattern & (2 << 30))
+			slice_non_intra_DCT (decoder, 1,
+					     decoder->dest[1] + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (1 << 30))
+			slice_non_intra_DCT (decoder, 2,
+					     decoder->dest[2] + DCT_offset,
+					     DCT_stride);
+		} else {
+		    int offset;
+		    uint8_t * dest_y, * dest_u, * dest_v;
+
+		    coded_block_pattern |= bit_buf & (63 << 26);
+		    DUMPBITS (bit_buf, bits, 6);
+
+		    offset = decoder->offset;
+		    dest_y = decoder->dest[0] + offset;
+		    dest_u = decoder->dest[1] + offset;
+		    dest_v = decoder->dest[2] + offset;
+
+		    if (coded_block_pattern & 1)
+			slice_non_intra_DCT (decoder, 0, dest_y, DCT_stride);
+		    if (coded_block_pattern & 2)
+			slice_non_intra_DCT (decoder, 0, dest_y + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & 4)
+			slice_non_intra_DCT (decoder, 0, dest_y + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & 8)
+			slice_non_intra_DCT (decoder, 0,
+					     dest_y + DCT_offset + 8,
+					     DCT_stride);
+
+		    if (coded_block_pattern & 16)
+			slice_non_intra_DCT (decoder, 1, dest_u, DCT_stride);
+		    if (coded_block_pattern & 32)
+			slice_non_intra_DCT (decoder, 2, dest_v, DCT_stride);
+		    if (coded_block_pattern & (32 << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (16 << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + DCT_offset,
+					     DCT_stride);
+		    if (coded_block_pattern & (8 << 26))
+			slice_non_intra_DCT (decoder, 1, dest_u + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (4 << 26))
+			slice_non_intra_DCT (decoder, 2, dest_v + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (2 << 26))
+			slice_non_intra_DCT (decoder, 1,
+					     dest_u + DCT_offset + 8,
+					     DCT_stride);
+		    if (coded_block_pattern & (1 << 26))
+			slice_non_intra_DCT (decoder, 2,
+					     dest_v + DCT_offset + 8,
+					     DCT_stride);
+		}
+	    }
+
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 16384;
+	}
+	/* NEXT_MACROBLOCK returns from this function once v_offset passes limit_y. */
+	NEXT_MACROBLOCK;
+	/* Decode the macroblock address increment that leads to the next coded macroblock. */
+	NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+	mba_inc = 0;
+	while (1) {
+	    if (bit_buf >= 0x10000000) {
+		mba = MBA_5 + (UBITS (bit_buf, 5) - 2);
+		break;
+	    } else if (bit_buf >= 0x03000000) {
+		mba = MBA_11 + (UBITS (bit_buf, 11) - 24);
+		break;
+	    } else switch (UBITS (bit_buf, 11)) {
+	    case 8:		/* macroblock_escape */
+		mba_inc += 33;
+		/* pass through */
+	    case 15:	/* macroblock_stuffing (MPEG1 only) */
+		DUMPBITS (bit_buf, bits, 11);
+		NEEDBITS (bit_buf, bits, bit_ptr, decoder->bit_ptr_end);
+		continue;
+	    default:	/* end of slice, or error */
+		emms();
+		return;
+	    }
+	}
+	DUMPBITS (bit_buf, bits, mba->len);
+	mba_inc += mba->mba;
+	/* mba_inc macroblocks carry no data of their own: P_TYPE runs parser 0 (zero vector) forward; other types run parser 4 (reuse) with the last macroblock_modes. */
+	if (mba_inc) {
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 16384;
+
+	    if (decoder->coding_type == P_TYPE) {
+		do {
+		    MOTION_CALL (decoder->motion_parser[0],
+				 MACROBLOCK_MOTION_FORWARD);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    } else {
+		do {
+		    MOTION_CALL (decoder->motion_parser[4], macroblock_modes);
+		    NEXT_MACROBLOCK;
+		} while (--mba_inc);
+	    }
+	}
+    }
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+    emms();
+}
diff --git a/startfinder.cpp b/startfinder.cpp
new file mode 100644
index 0000000..3e1a2a0
--- /dev/null
+++ b/startfinder.cpp
@@ -0,0 +1,118 @@
+#include <unistd.h>
+#include <errno.h>
+#include <typeinfo>
+
+#include "es.hpp"
+/* True when the three bytes at buf are 0x00 0x00 0x01.  buf[0..2] must be readable. */
+inline bool start_code( uint8_t *buf )
+{
+  if ( buf[ 0 ] != 0 ) return false;
+  if ( buf[ 1 ] != 0 ) return false;
+  return buf[ 2 ] == 1;
+}
+/* Scan the file from `start` in mapped windows of up to BLOCK + START_CODE_LENGTH bytes, calling (this->*todo) at every 00 00 01 start code and `progress` after every window.  Stops when todo returns false, at end of file, or when a window cannot advance.  Returns the file offset of the start code at which todo returned false, or -1 if it never did. */
+off_t ES::startfinder( off_t start,
+		       void (*progress)( off_t size, off_t location ),
+		       bool (ES::*todo)( uint8_t *buffer, off_t location,
+					 size_t len ) )
+{
+  ssize_t maxread = BLOCK + START_CODE_LENGTH;
+  off_t last_code = -1;
+  bool keepgoing = true;
+  off_t anchor = start;
+  off_t filesize = file->get_filesize();
+
+  while ( 1 ) {
+    int len = maxread;
+    if ( anchor + len > filesize ) {
+      len = filesize - anchor;
+    }
+
+    if ( len < START_CODE_LENGTH ) {
+      break;
+    }
+    /* By default consecutive windows overlap by START_CODE_LENGTH bytes. */
+    int advance = len - START_CODE_LENGTH;
+
+    MapHandle *chunk = file->map( anchor, len );
+    struct Unmapper { MapHandle *h; ~Unmapper() { delete h; } } unmapper = { chunk }; /* frees the mapping if todo() throws past us */
+    uint8_t *buf = chunk->get_buf();
+    /* Bulk of the window: more than LARGEST_HEADER bytes remain after i */
+    int i = 0;
+    while ( i < len - LARGEST_HEADER ) {
+      if ( start_code( buf + i ) ) {
+	keepgoing = (this->*todo)( buf + i, anchor + i, len - i );
+	if ( !keepgoing ) {
+	  last_code = anchor + i;
+	  break;
+	}
+      }
+      i++;
+    }
+    /* Tail of the window: todo() may throw NeedBits here, which is handled below. */
+    if ( keepgoing ) {
+      while ( i < len - START_CODE_LENGTH ) {
+	if ( start_code( buf + i ) ) {
+	  try {
+	    keepgoing = (this->*todo)( buf + i, anchor + i, len - i );
+	    if ( !keepgoing ) {
+	      last_code = anchor + i;
+	      break;
+	    }
+	  } catch ( const NeedBits & ) {
+	    if ( anchor + len == filesize ) {
+	      keepgoing = false;	/* window already ends at end of file: stop */
+	      break;
+	    } else {
+	      advance = i;		/* start the next window at this start code */
+	      break;
+	    }
+	  }
+	}
+	i++;
+      }
+    }
+
+    unmapper.h = NULL;	/* normal path: unmap explicitly, before progress() */
+    delete chunk;
+    progress( filesize, anchor + len );
+    if ( !keepgoing ) break;
+    if ( advance == 0 ) break;
+
+    anchor += advance;
+  }
+
+  return last_code;
+}
+/* startfinder `todo` callback: returns true (keep scanning) unless the first 32 bits at buf read as 0x000001b3.  `location` is unused. */
+bool ES::first_sequence( uint8_t *buf, off_t location, size_t len )
+{
+  return !( BitReader( buf, len ).readbits( 32 ) == 0x1b3 );
+}
+/* startfinder `todo` callback: build a header object from the bytes at buf, append it to the first_header/last_header list with its file offset, and remember the first Sequence in seq.  Returns false (stop scanning) only for a SequenceEnd header. */
+bool ES::add_header( uint8_t *buf, off_t location, size_t len )
+{
+  BitReader br( buf, len );
+  MPEGHeader *hdr = MPEGHeader::make( br, file );
+  if ( hdr == NULL ) {
+    /* No header object for this start code.  typeid( *hdr ) on a null
+       pointer would throw std::bad_typeid, so skip it and keep scanning. */
+    return true;
+  }
+
+  if ( first_header == NULL ) {
+    first_header = last_header = hdr;
+  } else {
+    last_header->set_next( hdr );
+  }
+  hdr->set_location( location );
+  last_header = hdr;
+
+  /* Only the first Sequence header encountered is kept in seq. */
+  if ( (seq == NULL) && (typeid( *hdr ) == typeid( Sequence )) ) {
+    seq = dynamic_cast<Sequence *>( hdr );
+    ahabassert( seq );
+  }
+
+  return typeid( *hdr ) != typeid( SequenceEnd );
+}
diff --git a/vlc.h b/vlc.h
new file mode 100644
index 0000000..90177a8
--- /dev/null
+++ b/vlc.h
@@ -0,0 +1,464 @@
+/*
+ * vlc.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef LIBMPEG2_VLC_H
+#define LIBMPEG2_VLC_H
+
+#include "exceptions.hpp"
+
+/* Combine the two bytes at bit_ptr[0] (high) and bit_ptr[1] (low) into a
+   16-bit value, OR it into bit_buf shifted left by shift, and advance
+   bit_ptr by two.  bit_buf and bit_ptr must be modifiable lvalues. */
+#define GETWORD(bit_buf,shift,bit_ptr)				\
+do {								\
+    bit_buf |= ((bit_ptr[0] << 8) | bit_ptr[1]) << (shift);	\
+    bit_ptr += 2;						\
+} while (0)
+
+/* Like GETWORD, but reads only bit_ptr[0]: it becomes the high byte and the
+   low byte is taken as zero.  bit_ptr is still advanced by two. */
+#define GETWORD1(bit_buf,shift,bit_ptr)				\
+do {								\
+    bit_buf |= ((bit_ptr[0] << 8)) << (shift);	\
+    bit_ptr += 2;						\
+} while (0)
+
+/* Like GETWORD, but reads no bytes at all: it ORs zero into bit_buf (leaving
+   it unchanged) and advances bit_ptr by two. */
+#define GETWORD0(bit_buf,shift,bit_ptr)				\
+do {								\
+    bit_buf |= (0) << (shift);	\
+    bit_ptr += 2;						\
+} while (0)
+
+/*
+ * Prime the decoder's bit reader at start: load start[0..3] into
+ * bitstream_buf with start[0] in the most significant byte, point
+ * bitstream_ptr just past those four bytes and set bitstream_bits to -16.
+ * The four bytes are read without any bounds check.
+ */
+static inline void bitstream_init (mpeg2_decoder_t * decoder,
+				   const uint8_t * start)
+{
+    /* Widen each byte to uint32_t before shifting; a plain start[0] << 24
+       is evaluated on a promoted int and would shift into its sign bit. */
+    decoder->bitstream_buf =
+	((uint32_t) start[0] << 24) | ((uint32_t) start[1] << 16) |
+	((uint32_t) start[2] << 8) | (uint32_t) start[3];
+    decoder->bitstream_ptr = start + 4;
+    decoder->bitstream_bits = -16;
+}
+
+/* Make sure that there are at least 16 valid bits in bit_buf.
+   Nothing happens unless bits > 0.  Otherwise 16 more bits are ORed into
+   bit_buf and bits is reduced by 16.  The bytes come from bit_ptr as long
+   as they lie below boundscheck:
+     - both bytes below boundscheck: GETWORD  (two real bytes)
+     - only bit_ptr[0] below it:     GETWORD1 (one real byte, low byte zero)
+     - neither:                      GETWORD0 (zeros)
+   bit_ptr is advanced by two in every case. */
+#define NEEDBITS(bit_buf,bits,bit_ptr,boundscheck)      \
+do {                                                    \
+    if (unlikely (bits > 0)) {                          \
+        if (likely(bit_ptr + 1 < boundscheck) ) {       \
+            GETWORD (bit_buf, bits, bit_ptr);           \
+        } else if ( bit_ptr < boundscheck ) {           \
+            GETWORD1 (bit_buf, bits, bit_ptr);          \
+        } else {                                        \
+            GETWORD0 (bit_buf, bits, bit_ptr);          \
+        }                                               \
+        bits -= 16;                                     \
+    }                                                   \
+} while (0)
+
+/*
+#define NEEDBITS(bit_buf,bits,bit_ptr,bc)	\
+do {                                            \
+    if (unlikely (bits > 0)) {                  \
+        GETWORD (bit_buf, bits, bit_ptr);       \
+        bits -= 16;                             \
+    }                                           \
+} while (0)
+*/
+
+/* Remove num valid bits from bit_buf: shift bit_buf left by num and add num
+   to the bits counter.  Both bit_buf and bits must be modifiable lvalues. */
+#define DUMPBITS(bit_buf,bits,num)	\
+do {					\
+    bit_buf <<= (num);			\
+    bits += (num);			\
+} while (0)
+
+/* Take num bits from the high part of bit_buf and zero extend them.
+   bit_buf is treated as a 32-bit unsigned value; num must be at least 1,
+   because num == 0 would shift by the full 32-bit width. */
+#define UBITS(bit_buf,num) (((uint32_t)(bit_buf)) >> (32 - (num)))
+
+/* Take num bits from the high part of bit_buf and sign extend them.
+   bit_buf is treated as a 32-bit signed value, so the result depends on the
+   compiler's right shift of negative values being arithmetic.  num must be
+   at least 1, because num == 0 would shift by the full 32-bit width. */
+#define SBITS(bit_buf,num) (((int32_t)(bit_buf)) >> (32 - (num)))
+
+/* Entry type of the MB_I, MB_P and MB_B tables in this header. */
+typedef struct {
+    uint8_t modes;	/* OR of MACROBLOCK_* flags, as populated by those tables */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} MBtab;
+
+/* Entry type of the MV_4 and MV_10 tables in this header. */
+typedef struct {
+    uint8_t delta;	/* unsigned value stored for the code; meaning set by the decoder */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} MVtab;
+
+/* Entry type of the DMV_2 table in this header. */
+typedef struct {
+    int8_t dmv;		/* signed value; DMV_2 stores only -1, 0 and 1 */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} DMVtab;
+
+/* Entry type of the CBP_7 and CBP_9 tables in this header. */
+typedef struct {
+    uint8_t cbp;	/* pattern value; the tables store values up to 0x3f */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} CBPtab;
+
+/* Entry type of the DC_lum_5, DC_chrom_5 and DC_long tables in this header. */
+typedef struct {
+    uint8_t size;	/* the tables store values from 0 to 11 here */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} DCtab;
+
+/* Entry type of the DCT_* tables in this header.  Initializers are written
+   in the order {run, level, len}. */
+typedef struct {
+    uint8_t run;	/* NOTE(review): some entries hold 65 or 129 with level 0; looks like a marker, confirm in the decoder */
+    uint8_t level;
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} DCTtab;
+
+/* Entry type of the MBA_5 and MBA_11 tables in this header. */
+typedef struct {
+    uint8_t mba;	/* the tables store values from 0 to 32 here */
+    uint8_t len;	/* presumably the code length in bits -- TODO confirm */
+} MBAtab;
+
+
+/* Short aliases for MACROBLOCK_* flags (defined outside this header), used
+   only to keep the MB_* initializers compact. */
+#define INTRA MACROBLOCK_INTRA
+#define QUANT MACROBLOCK_QUANT
+
+/* MB_I: 2 entries of MBtab {modes, len}.  Presumably the macroblock-type
+   codes for I pictures (inferred from the name -- confirm in the decoder). */
+static const MBtab MB_I [] = {
+    {INTRA|QUANT, 2}, {INTRA, 1}
+};
+
+/* Further short aliases for MACROBLOCK_* flags used by the MB_P table. */
+#define MC MACROBLOCK_MOTION_FORWARD
+#define CODED MACROBLOCK_PATTERN
+
+/* MB_P: 32 entries of MBtab {modes, len}.  Presumably the macroblock-type
+   codes for P pictures (inferred from the name -- confirm in the decoder).
+   Entries with a shorter len are repeated over consecutive slots. */
+static const MBtab MB_P [] = {
+    {INTRA|QUANT, 6}, {CODED|QUANT, 5}, {MC|CODED|QUANT, 5}, {INTRA,    5},
+    {MC,          3}, {MC,          3}, {MC,             3}, {MC,       3},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1},
+    {MC|CODED,    1}, {MC|CODED,    1}, {MC|CODED,       1}, {MC|CODED, 1}
+};
+
+/* Short aliases for the motion flags used by the MB_B table.  INTER is
+   parenthesized so that it expands as a single operand whatever operator
+   is written next to it. */
+#define FWD MACROBLOCK_MOTION_FORWARD
+#define BWD MACROBLOCK_MOTION_BACKWARD
+#define INTER (MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD)
+
+/* MB_B: 64 entries of MBtab {modes, len}.  Presumably the macroblock-type
+   codes for B pictures (inferred from the name -- confirm in the decoder).
+   Entries with a shorter len are repeated over consecutive slots. */
+static const MBtab MB_B [] = {
+    {0,                 6}, {INTRA|QUANT,       6},
+    {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
+    {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
+					{INTRA,       5}, {INTRA,       5},
+    {FWD,         4}, {FWD,         4}, {FWD,         4}, {FWD,         4},
+    {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4}, {FWD|CODED,   4},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD,         3}, {BWD,         3}, {BWD,         3}, {BWD,         3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3}, {BWD|CODED,   3},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER,       2}, {INTER,       2}, {INTER,       2}, {INTER,       2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2},
+    {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}, {INTER|CODED, 2}
+};
+
+/* Drop the table-local aliases so they do not leak into including files. */
+#undef INTRA
+#undef QUANT
+#undef MC
+#undef CODED
+#undef FWD
+#undef BWD
+#undef INTER
+
+
+/* MV_4: 8 entries of MVtab {delta, len}.  The name suggests a motion-vector
+   code table -- confirm the indexing in the decoder. */
+static const MVtab MV_4 [] = {
+    { 3, 6}, { 2, 4}, { 1, 3}, { 1, 3}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}
+};
+
+/* MV_10: 48 entries of MVtab {delta, len}.  The first 12 slots all hold
+   {0, 10}.  The name suggests the longer motion-vector codes -- confirm the
+   indexing in the decoder. */
+static const MVtab MV_10 [] = {
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10},
+    { 0,10}, { 0,10}, { 0,10}, { 0,10}, {15,10}, {14,10}, {13,10}, {12,10},
+    {11,10}, {10,10}, { 9, 9}, { 9, 9}, { 8, 9}, { 8, 9}, { 7, 9}, { 7, 9},
+    { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7}, { 6, 7},
+    { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7}, { 5, 7},
+    { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}, { 4, 7}
+};
+
+
+/* DMV_2: 4 entries of DMVtab {dmv, len}, holding the values 0, 0, 1 and -1. */
+static const DMVtab DMV_2 [] = {
+    { 0, 1}, { 0, 1}, { 1, 2}, {-1, 2}
+};
+
+
+/* CBP_7: 112 entries of CBPtab {cbp, len} with len from 3 to 7.  The name
+   suggests a coded-block-pattern table for the shorter codes -- confirm the
+   indexing in the decoder. */
+static const CBPtab CBP_7 [] = {
+    {0x11, 7}, {0x12, 7}, {0x14, 7}, {0x18, 7},
+    {0x21, 7}, {0x22, 7}, {0x24, 7}, {0x28, 7},
+    {0x3f, 6}, {0x3f, 6}, {0x30, 6}, {0x30, 6},
+    {0x09, 6}, {0x09, 6}, {0x06, 6}, {0x06, 6},
+    {0x1f, 5}, {0x1f, 5}, {0x1f, 5}, {0x1f, 5},
+    {0x10, 5}, {0x10, 5}, {0x10, 5}, {0x10, 5},
+    {0x2f, 5}, {0x2f, 5}, {0x2f, 5}, {0x2f, 5},
+    {0x20, 5}, {0x20, 5}, {0x20, 5}, {0x20, 5},
+    {0x07, 5}, {0x07, 5}, {0x07, 5}, {0x07, 5},
+    {0x0b, 5}, {0x0b, 5}, {0x0b, 5}, {0x0b, 5},
+    {0x0d, 5}, {0x0d, 5}, {0x0d, 5}, {0x0d, 5},
+    {0x0e, 5}, {0x0e, 5}, {0x0e, 5}, {0x0e, 5},
+    {0x05, 5}, {0x05, 5}, {0x05, 5}, {0x05, 5},
+    {0x0a, 5}, {0x0a, 5}, {0x0a, 5}, {0x0a, 5},
+    {0x03, 5}, {0x03, 5}, {0x03, 5}, {0x03, 5},
+    {0x0c, 5}, {0x0c, 5}, {0x0c, 5}, {0x0c, 5},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x01, 4}, {0x01, 4}, {0x01, 4}, {0x01, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
+    {0x02, 4}, {0x02, 4}, {0x02, 4}, {0x02, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x04, 4}, {0x04, 4}, {0x04, 4}, {0x04, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x08, 4}, {0x08, 4}, {0x08, 4}, {0x08, 4},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3},
+    {0x0f, 3}, {0x0f, 3}, {0x0f, 3}, {0x0f, 3}
+};
+
+/* CBP_9: 64 entries of CBPtab {cbp, len} with len 8 or 9.  The name
+   suggests a coded-block-pattern table for the longer codes -- confirm the
+   indexing in the decoder. */
+static const CBPtab CBP_9 [] = {
+    {0,    9}, {0x00, 9}, {0x39, 9}, {0x36, 9},
+    {0x37, 9}, {0x3b, 9}, {0x3d, 9}, {0x3e, 9},
+    {0x17, 8}, {0x17, 8}, {0x1b, 8}, {0x1b, 8},
+    {0x1d, 8}, {0x1d, 8}, {0x1e, 8}, {0x1e, 8},
+    {0x27, 8}, {0x27, 8}, {0x2b, 8}, {0x2b, 8},
+    {0x2d, 8}, {0x2d, 8}, {0x2e, 8}, {0x2e, 8},
+    {0x19, 8}, {0x19, 8}, {0x16, 8}, {0x16, 8},
+    {0x29, 8}, {0x29, 8}, {0x26, 8}, {0x26, 8},
+    {0x35, 8}, {0x35, 8}, {0x3a, 8}, {0x3a, 8},
+    {0x33, 8}, {0x33, 8}, {0x3c, 8}, {0x3c, 8},
+    {0x15, 8}, {0x15, 8}, {0x1a, 8}, {0x1a, 8},
+    {0x13, 8}, {0x13, 8}, {0x1c, 8}, {0x1c, 8},
+    {0x25, 8}, {0x25, 8}, {0x2a, 8}, {0x2a, 8},
+    {0x23, 8}, {0x23, 8}, {0x2c, 8}, {0x2c, 8},
+    {0x31, 8}, {0x31, 8}, {0x32, 8}, {0x32, 8},
+    {0x34, 8}, {0x34, 8}, {0x38, 8}, {0x38, 8}
+};
+
+
+/* DC_lum_5: 31 entries of DCtab {size, len}.  The name suggests the short
+   DC-size codes for luminance -- confirm in the decoder.  Note there are 31
+   entries, not 32, so index 31 is not a valid slot. */
+static const DCtab DC_lum_5 [] = {
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
+    {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}
+};
+
+/* DC_chrom_5: 31 entries of DCtab {size, len}.  The name suggests the short
+   DC-size codes for chrominance -- confirm in the decoder.  Note there are
+   31 entries, not 32, so index 31 is not a valid slot. */
+static const DCtab DC_chrom_5 [] = {
+    {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
+    {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
+    {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
+    {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}
+};
+
+/* DC_long: 32 entries of DCtab {size, len} with size from 6 to 11.  The
+   name suggests the longer DC-size codes -- confirm in the decoder. */
+static const DCtab DC_long [] = {
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
+    {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, { 7, 6}, { 7, 6},
+    {8, 7}, {8, 7}, {8, 7}, {8, 7}, {9, 8}, {9, 8}, {10, 9}, {11, 9}
+};
+
+
+/* DCT_16: 32 entries of DCTtab {run, level, len}.  Every entry has len 0,
+   and the first 16 all hold {129, 0, 0}.
+   NOTE(review): presumably the 16-bit DCT coefficient codes -- confirm. */
+static const DCTtab DCT_16 [] = {
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
+    {  2,18, 0}, {  2,17, 0}, {  2,16, 0}, {  2,15, 0},
+    {  7, 3, 0}, { 17, 2, 0}, { 16, 2, 0}, { 15, 2, 0},
+    { 14, 2, 0}, { 13, 2, 0}, { 12, 2, 0}, { 32, 1, 0},
+    { 31, 1, 0}, { 30, 1, 0}, { 29, 1, 0}, { 28, 1, 0}
+};
+
+/* DCT_15: 48 entries of DCTtab {run, level, len} with len 14 or 15.
+   NOTE(review): presumably the 14/15-bit DCT coefficient codes -- confirm. */
+static const DCTtab DCT_15 [] = {
+    {  1,40,15}, {  1,39,15}, {  1,38,15}, {  1,37,15},
+    {  1,36,15}, {  1,35,15}, {  1,34,15}, {  1,33,15},
+    {  1,32,15}, {  2,14,15}, {  2,13,15}, {  2,12,15},
+    {  2,11,15}, {  2,10,15}, {  2, 9,15}, {  2, 8,15},
+    {  1,31,14}, {  1,31,14}, {  1,30,14}, {  1,30,14},
+    {  1,29,14}, {  1,29,14}, {  1,28,14}, {  1,28,14},
+    {  1,27,14}, {  1,27,14}, {  1,26,14}, {  1,26,14},
+    {  1,25,14}, {  1,25,14}, {  1,24,14}, {  1,24,14},
+    {  1,23,14}, {  1,23,14}, {  1,22,14}, {  1,22,14},
+    {  1,21,14}, {  1,21,14}, {  1,20,14}, {  1,20,14},
+    {  1,19,14}, {  1,19,14}, {  1,18,14}, {  1,18,14},
+    {  1,17,14}, {  1,17,14}, {  1,16,14}, {  1,16,14}
+};
+
+/* DCT_13: 48 entries of DCTtab {run, level, len} with len 12 or 13.
+   NOTE(review): presumably the 12/13-bit DCT coefficient codes -- confirm. */
+static const DCTtab DCT_13 [] = {
+    { 11, 2,13}, { 10, 2,13}, {  6, 3,13}, {  4, 4,13},
+    {  3, 5,13}, {  2, 7,13}, {  2, 6,13}, {  1,15,13},
+    {  1,14,13}, {  1,13,13}, {  1,12,13}, { 27, 1,13},
+    { 26, 1,13}, { 25, 1,13}, { 24, 1,13}, { 23, 1,13},
+    {  1,11,12}, {  1,11,12}, {  9, 2,12}, {  9, 2,12},
+    {  5, 3,12}, {  5, 3,12}, {  1,10,12}, {  1,10,12},
+    {  3, 4,12}, {  3, 4,12}, {  8, 2,12}, {  8, 2,12},
+    { 22, 1,12}, { 22, 1,12}, { 21, 1,12}, { 21, 1,12},
+    {  1, 9,12}, {  1, 9,12}, { 20, 1,12}, { 20, 1,12},
+    { 19, 1,12}, { 19, 1,12}, {  2, 5,12}, {  2, 5,12},
+    {  4, 3,12}, {  4, 3,12}, {  1, 8,12}, {  1, 8,12},
+    {  7, 2,12}, {  7, 2,12}, { 18, 1,12}, { 18, 1,12}
+};
+
+/* DCT_B14_10: 8 entries of DCTtab {run, level, len}, all with len 10.
+   NOTE(review): "B14" presumably names a table of the MPEG-2 spec -- confirm. */
+static const DCTtab DCT_B14_10 [] = {
+    { 17, 1,10}, {  6, 2,10}, {  1, 7,10}, {  3, 3,10},
+    {  2, 4,10}, { 16, 1,10}, { 15, 1,10}, {  5, 2,10}
+};
+
+/* DCT_B14_8: 36 entries of DCTtab {run, level, len}.  The first four slots
+   hold {65, 0, 12}.
+   NOTE(review): run 65 with level 0 looks like a special marker -- confirm
+   how the decoder treats it. */
+static const DCTtab DCT_B14_8 [] = {
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
+    {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
+    {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6}, {  8, 1, 6},
+    {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6}, {  7, 1, 6},
+    {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6}, {  2, 2, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    { 14, 1, 8}, {  1, 6, 8}, { 13, 1, 8}, { 12, 1, 8},
+    {  4, 2, 8}, {  2, 3, 8}, {  1, 5, 8}, { 11, 1, 8}
+};
+
+/* DCT_B14AC_5: 27 entries of DCTtab {run, level, len}.  The first row has
+   three entries, not four, so the table is 27 long rather than 32.  It
+   differs from DCT_B14DC_5 in that eight slots hold {129, 0, 2}. */
+static const DCTtab DCT_B14AC_5 [] = {
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {129, 0, 2}, {129, 0, 2}, {129, 0, 2}, {129, 0, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}
+};
+
+/* DCT_B14DC_5: 27 entries of DCTtab {run, level, len}.  The first row has
+   three entries, not four, so the table is 27 long rather than 32.  Its
+   last 16 slots all hold {1, 1, 1}; it has no {129, 0, x} entries. */
+static const DCTtab DCT_B14DC_5 [] = {
+		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
+    {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1},
+    {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}
+};
+
+/* DCT_B15_10: 8 entries of DCTtab {run, level, len} with len 9 or 10.
+   NOTE(review): "B15" presumably names a table of the MPEG-2 spec -- confirm. */
+static const DCTtab DCT_B15_10 [] = {
+    {  6, 2, 9}, {  6, 2, 9}, { 15, 1, 9}, { 15, 1, 9},
+    {  3, 4,10}, { 17, 1,10}, { 16, 1, 9}, { 16, 1, 9}
+};
+
+/* DCT_B15_8: 252 entries of DCTtab {run, level, len}.  The first four slots
+   hold {65, 0, 12} and sixteen slots hold {129, 0, 4}.
+   NOTE(review): runs 65 and 129 with level 0 look like special markers --
+   confirm how the decoder treats them.  Note the table has 252 entries, not
+   256. */
+static const DCTtab DCT_B15_8 [] = {
+    { 65, 0,12}, { 65, 0,12}, { 65, 0,12}, { 65, 0,12},
+    {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
+    {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
+    {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6}, {  1, 7, 6},
+    {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6}, {  1, 6, 6},
+    {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6}, {  5, 1, 6},
+    {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6}, {  6, 1, 6},
+    {  2, 5, 8}, { 12, 1, 8}, {  1,11, 8}, {  1,10, 8},
+    { 14, 1, 8}, { 13, 1, 8}, {  4, 2, 8}, {  2, 4, 8},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5}, {  3, 1, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5}, {  2, 2, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5}, {  4, 1, 5},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {129, 0, 4}, {129, 0, 4}, {129, 0, 4}, {129, 0, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4}, {  1, 3, 4},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3}, {  1, 2, 3},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5}, {  1, 4, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5}, {  1, 5, 5},
+    { 10, 1, 7}, { 10, 1, 7}, {  2, 3, 7}, {  2, 3, 7},
+    { 11, 1, 7}, { 11, 1, 7}, {  1, 8, 7}, {  1, 8, 7},
+    {  1, 9, 7}, {  1, 9, 7}, {  1,12, 8}, {  1,13, 8},
+    {  3, 3, 8}, {  5, 2, 8}, {  1,14, 8}, {  1,15, 8}
+};
+
+
+/* MBA_5: 30 entries of MBAtab {mba, len}.  The first row has six entries,
+   not eight, so the table is 30 long rather than 32.  The name suggests
+   the short macroblock-address-increment codes -- confirm in the decoder. */
+static const MBAtab MBA_5 [] = {
+		    {6, 5}, {5, 5}, {4, 4}, {4, 4}, {3, 4}, {3, 4},
+    {2, 3}, {2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1},
+    {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}
+};
+
+/* MBA_11: 104 entries of MBAtab {mba, len} with mba from 32 down to 7.  The
+   name suggests the longer macroblock-address-increment codes -- confirm in
+   the decoder. */
+static const MBAtab MBA_11 [] = {
+    {32, 11}, {31, 11}, {30, 11}, {29, 11},
+    {28, 11}, {27, 11}, {26, 11}, {25, 11},
+    {24, 11}, {23, 11}, {22, 11}, {21, 11},
+    {20, 10}, {20, 10}, {19, 10}, {19, 10},
+    {18, 10}, {18, 10}, {17, 10}, {17, 10},
+    {16, 10}, {16, 10}, {15, 10}, {15, 10},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {14,  8}, {14,  8}, {14,  8}, {14,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {13,  8}, {13,  8}, {13,  8}, {13,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {12,  8}, {12,  8}, {12,  8}, {12,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {11,  8}, {11,  8}, {11,  8}, {11,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    {10,  8}, {10,  8}, {10,  8}, {10,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 9,  8}, { 9,  8}, { 9,  8}, { 9,  8},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 8,  7}, { 8,  7}, { 8,  7}, { 8,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7},
+    { 7,  7}, { 7,  7}, { 7,  7}, { 7,  7}
+};
+
+#endif /* LIBMPEG2_VLC_H */
diff --git a/xeventloop.cpp b/xeventloop.cpp
new file mode 100644
index 0000000..0f148d1
--- /dev/null
+++ b/xeventloop.cpp
@@ -0,0 +1,58 @@
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+
+#include "exceptions.hpp"
+#include "xeventloop.hpp"
+
+/* Start routine handed to pthread_create: interprets its argument as an
+   XEventLoop, asserts it is non-null, runs its loop() and returns NULL once
+   loop() comes back. */
+static void *thread_helper( void *xeventloop )
+{
+  XEventLoop *const self = static_cast<XEventLoop *>( xeventloop );
+  ahabassert( self );
+
+  self->loop();
+
+  return NULL;
+}
+
+/* Builds the event loop around s_display: initialises the mutex guarding
+   `live`, then starts the worker thread, which runs loop() on this object
+   via thread_helper.  Both pthread calls are checked with unixassert. */
+XEventLoop::XEventLoop( OpenGLDisplay *s_display )
+  : opq( 0 ),
+    display( s_display ),
+    live( true )
+{
+  unixassert( pthread_mutex_init( &mutex, NULL ) );
+
+  /* pthread_create reports failure through its return value.  If that is
+     ignored, thread_handle stays unset and the destructor would join it. */
+  unixassert( pthread_create( &thread_handle, NULL, thread_helper, this ) );
+}
+
+/* Worker-thread body.  Repeatedly asks the display for an event; after each
+   one it checks `live` under the mutex and returns once it has been cleared.
+   A non-zero key is wrapped in a newly allocated XKey and enqueued on opq.
+   If enqueue throws a UnixAssertError pointer, the loop returns. */
+void XEventLoop::loop( void )
+{
+  for ( ;; ) {
+    const int key = display->getevent( true );
+
+    /* Sample the flag under the lock, then act on the copy. */
+    bool still_live;
+    {
+      MutexLock guard( &mutex );
+      still_live = live;
+    }
+    if ( !still_live ) {
+      return;
+    }
+
+    if ( !key ) {
+      continue;
+    }
+
+    XKey *op = new XKey( key );
+    try {
+      opq.enqueue( op );
+    } catch ( UnixAssertError * ) {
+      return;
+    }
+  }
+}
+
+/* Shuts the worker thread down: clears `live` under the mutex, calls
+   display->makeevent() (presumably to wake the thread out of getevent() --
+   confirm in OpenGLDisplay), waits for the thread to finish and finally
+   releases the mutex created in the constructor. */
+XEventLoop::~XEventLoop()
+{
+  {
+    MutexLock x( &mutex );
+    live = false;
+  }
+
+  display->makeevent();
+
+  pthread_join( thread_handle, NULL );
+
+  /* The worker has exited, so nothing can hold the mutex any more.  The
+     result is deliberately not asserted: a destructor must not throw. */
+  pthread_mutex_destroy( &mutex );
+}
diff --git a/xeventloop.hpp b/xeventloop.hpp
new file mode 100644
index 0000000..d9c63b6
--- /dev/null
+++ b/xeventloop.hpp
@@ -0,0 +1,25 @@
+#ifndef XEVENTLOOP_HPP
+#define XEVENTLOOP_HPP
+
+#include "decoderop.hpp"
+#include "ogl.hpp"
+
+/* Runs a worker thread that pulls events from an OpenGLDisplay and turns
+   key events into operations on an internal queue, which callers reach
+   through get_queue().  The thread is started by the constructor and joined
+   by the destructor. */
+class XEventLoop {
+private:
+  OperationQueue<DecoderOperation> opq;  /* operations produced by loop() */
+  OpenGLDisplay *display;                /* not owned; supplied by the caller */
+
+  pthread_t thread_handle;               /* worker thread running loop() */
+
+  pthread_mutex_t mutex;                 /* guards `live` */
+  bool live;                             /* cleared by the destructor to stop loop() */
+
+  /* Not copyable: a copy would share thread_handle and mutex, and both
+     destructors would join the same thread.  Declared, never defined. */
+  XEventLoop( const XEventLoop & );
+  XEventLoop &operator=( const XEventLoop & );
+
+public:
+  XEventLoop( OpenGLDisplay *s_display );
+  ~XEventLoop();
+
+  /* Body of the worker thread; called from the thread start routine. */
+  void loop( void );
+
+  /* Returns a pointer to the internal queue; valid while this object lives. */
+  OperationQueue<DecoderOperation> *get_queue() { return &opq; }
+};
+
+#endif