rts/PrimOps.cmm

/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 1998-2011
 *
 * Out-of-line primitive operations
 *
 * This file contains the implementations of all the primitive
 * operations ("primops") which are not expanded inline.  See
 * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
 * this file contains code for most of those with the attribute
 * out_of_line=True.
 *
 * Entry convention: the entry convention for a primop is that all the
 * args are in Stg registers (R1, R2, etc.).  This is to make writing
 * the primops easier.  (see compiler/codeGen/CgCallConv.hs).
 *
 * Return convention: results from a primop are generally returned
 * using the ordinary unboxed tuple return convention.  The C-- parser
 * implements the RET_xxxx() macros to perform unboxed-tuple returns
 * based on the prevailing return convention.
 *
 * This file is written in a subset of C--, extended with various
 * features specific to GHC.  It is compiled by GHC directly.  For the
 * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
 *
 * ---------------------------------------------------------------------------*/

#include "Cmm.h"

#ifdef __PIC__
import pthread_mutex_lock;
import pthread_mutex_unlock;
#endif
import base_ControlziExceptionziBase_nestedAtomically_closure;
import EnterCriticalSection;
import LeaveCriticalSection;
import ghczmprim_GHCziTypes_False_closure;
#if defined(USE_MINIINTERPRETER) || !defined(mingw32_HOST_OS)
import sm_mutex;
#endif

/*-----------------------------------------------------------------------------
  Array Primitives

  Basically just new*Array - the others are all inline macros.

  The size arg is always passed in R1, and the result returned in R1.

  The slow entry point is for returning from a heap check, the saved
  size argument must be re-loaded from the stack.
  -------------------------------------------------------------------------- */

/* for objects that are *less* than the size of a word, make sure we
 * round up to the nearest word for the size of the array.
 */

stg_newByteArrayzh
{
    W_ words, payload_words, n, p;
    MAYBE_GC(NO_PTRS,stg_newByteArrayzh);
    n = R1;
    payload_words = ROUNDUP_BYTES_TO_WDS(n);
    words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
    ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
    StgArrWords_bytes(p) = n;
    RET_P(p);
}

#define BA_ALIGN 16
#define BA_MASK  (BA_ALIGN-1)

stg_newPinnedByteArrayzh
{
    W_ words, n, bytes, payload_words, p;

    MAYBE_GC(NO_PTRS,stg_newPinnedByteArrayzh);
    n = R1;
    bytes = n;
    /* payload_words is what we will tell the profiler we had to allocate */
    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
    /* When we actually allocate memory, we need to allow space for the
       header: */
    bytes = bytes + SIZEOF_StgArrWords;
    /* And we want to align to BA_ALIGN bytes, so we need to allow space
       to shift up to BA_ALIGN - 1 bytes: */
    bytes = bytes + BA_ALIGN - 1;
    /* Now we convert to a number of words: */
    words = ROUNDUP_BYTES_TO_WDS(bytes);

    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    /* Now we need to move p forward so that the payload is aligned
       to BA_ALIGN bytes: */
    p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);

    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
    StgArrWords_bytes(p) = n;
    RET_P(p);
}

stg_newAlignedPinnedByteArrayzh
{
    W_ words, n, bytes, payload_words, p, alignment;

    MAYBE_GC(NO_PTRS,stg_newAlignedPinnedByteArrayzh);
    n = R1;
    alignment = R2;

    /* we always supply at least word-aligned memory, so there's no
       need to allow extra space for alignment if the requirement is less
       than a word.  This also prevents mischief with alignment == 0. */
    if (alignment <= SIZEOF_W) { alignment = 1; }

    bytes = n;

    /* payload_words is what we will tell the profiler we had to allocate */
    payload_words = ROUNDUP_BYTES_TO_WDS(bytes);

    /* When we actually allocate memory, we need to allow space for the
       header: */
    bytes = bytes + SIZEOF_StgArrWords;
    /* And we want to align to <alignment> bytes, so we need to allow space
       to shift up to <alignment - 1> bytes: */
    bytes = bytes + alignment - 1;
    /* Now we convert to a number of words: */
    words = ROUNDUP_BYTES_TO_WDS(bytes);

    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    /* Now we need to move p forward so that the payload is aligned
       to <alignment> bytes. Note that we are assuming that
       <alignment> is a power of 2, which is technically not guaranteed */
    p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));

    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
    StgArrWords_bytes(p) = n;
    RET_P(p);
}

stg_newArrayzh
{
    W_ words, n, init, arr, p, size;
    /* Args: R1 = words, R2 = initialisation value */

    n = R1;
    MAYBE_GC(R2_PTR,stg_newArrayzh);

    // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
    // in the array, making sure we round up, and then rounding up to a whole
    // number of words.
    size = n + mutArrPtrsCardWords(n);
    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
    TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);

    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
    StgMutArrPtrs_ptrs(arr) = n;
    StgMutArrPtrs_size(arr) = size;

    // Initialise all elements of the the array with the value in R2
    init = R2;
    p = arr + SIZEOF_StgMutArrPtrs;
  for:
    if (p < arr + WDS(words)) {
	W_[p] = init;
	p = p + WDS(1);
	goto for;
    }
    // Initialise the mark bits with 0
  for2:
    if (p < arr + WDS(size)) {
	W_[p] = 0;
	p = p + WDS(1);
	goto for2;
    }

    RET_P(arr);
}

stg_unsafeThawArrayzh
{
  // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
  //
  // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN 
  // normally doesn't.  However, when we freeze a MUT_ARR_PTRS, we leave
  // it on the mutable list for the GC to remove (removing something from
  // the mutable list is not easy).
  // 
  // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
  // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
  // to indicate that it is still on the mutable list.
  //
  // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
  // either it is on a mut_list, or it isn't.  We adopt the convention that
  // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
  // and MUT_ARR_PTRS_FROZEN otherwise.  In fact it wouldn't matter if
  // we put it on the mutable list more than once, but it would get scavenged
  // multiple times during GC, which would be unnecessarily slow.
  //
  if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
	SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
	recordMutable(R1, R1);
	// must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
	RET_P(R1);
  } else {
	SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
	RET_P(R1);
  }
}

stg_newArrayArrayzh
{
    W_ words, n, arr, p, size;
    /* Args: R1 = words */

    n = R1;
    MAYBE_GC(NO_PTRS,stg_newArrayArrayzh);

    // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
    // in the array, making sure we round up, and then rounding up to a whole
    // number of words.
    size = n + mutArrPtrsCardWords(n);
    words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);

    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
    StgMutArrPtrs_ptrs(arr) = n;
    StgMutArrPtrs_size(arr) = size;

    // Initialise all elements of the array with a pointer to the new array
    p = arr + SIZEOF_StgMutArrPtrs;
  for:
    if (p < arr + WDS(words)) {
	W_[p] = arr;
	p = p + WDS(1);
	goto for;
    }
    // Initialise the mark bits with 0
  for2:
    if (p < arr + WDS(size)) {
	W_[p] = 0;
	p = p + WDS(1);
	goto for2;
    }

    RET_P(arr);
}


/* -----------------------------------------------------------------------------
   MutVar primitives
   -------------------------------------------------------------------------- */

stg_newMutVarzh
{
    W_ mv;
    /* Args: R1 = initialisation value */

    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh);

    mv = Hp - SIZEOF_StgMutVar + WDS(1);
    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
    StgMutVar_var(mv) = R1;
    
    RET_P(mv);
}

stg_casMutVarzh
 /* MutVar# s a -> a -> a -> State# s -> (# State#, Int#, a #) */
{
    W_ mv, old, new, h;

    mv  = R1;
    old = R2;
    new = R3;

    (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var,
                          old, new) [];
    if (h != old) {
        RET_NP(1,h);
    } else {
        if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
           foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
        }
        RET_NP(0,h);
    }
}


stg_atomicModifyMutVarzh
{
    W_ mv, f, z, x, y, r, h;
    /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */

    /* If x is the current contents of the MutVar#, then 
       We want to make the new contents point to

         (sel_0 (f x))
 
       and the return value is
	 
	 (sel_1 (f x))

        obviously we can share (f x).

         z = [stg_ap_2 f x]  (max (HS + 2) MIN_UPD_SIZE)
	 y = [stg_sel_0 z]   (max (HS + 1) MIN_UPD_SIZE)
         r = [stg_sel_1 z]   (max (HS + 1) MIN_UPD_SIZE)
    */

#if MIN_UPD_SIZE > 1
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
#else
#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
#endif

#if MIN_UPD_SIZE > 2
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
#else
#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
#endif

#define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)

   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, stg_atomicModifyMutVarzh);

   mv = R1;
   f = R2;

   TICK_ALLOC_THUNK_2();
   CCCS_ALLOC(THUNK_2_SIZE);
   z = Hp - THUNK_2_SIZE + WDS(1);
   SET_HDR(z, stg_ap_2_upd_info, CCCS);
   LDV_RECORD_CREATE(z);
   StgThunk_payload(z,0) = f;

   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   y = z - THUNK_1_SIZE;
   SET_HDR(y, stg_sel_0_upd_info, CCCS);
   LDV_RECORD_CREATE(y);
   StgThunk_payload(y,0) = z;

   TICK_ALLOC_THUNK_1();
   CCCS_ALLOC(THUNK_1_SIZE);
   r = y - THUNK_1_SIZE;
   SET_HDR(r, stg_sel_1_upd_info, CCCS);
   LDV_RECORD_CREATE(r);
   StgThunk_payload(r,0) = z;

 retry:
   x = StgMutVar_var(mv);
   StgThunk_payload(z,1) = x;
#ifdef THREADED_RTS
   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
   if (h != x) { goto retry; }
#else
   StgMutVar_var(mv) = y;
#endif

   if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
   }

   RET_P(r);
}

/* -----------------------------------------------------------------------------
   Weak Pointer Primitives
   -------------------------------------------------------------------------- */

STRING(stg_weak_msg,"New weak pointer at %p\n")

stg_mkWeakzh
{
  /* R1 = key
     R2 = value
     R3 = finalizer (or stg_NO_FINALIZER_closure)
  */
  W_ w;

  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh );

  w = Hp - SIZEOF_StgWeak + WDS(1);
  SET_HDR(w, stg_WEAK_info, CCCS);

  // We don't care about cfinalizer here.
  // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
  // something else?

  StgWeak_key(w)        = R1;
  StgWeak_value(w)      = R2;
  StgWeak_finalizer(w)  = R3;
  StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;

  ACQUIRE_LOCK(sm_mutex);
  StgWeak_link(w)	= W_[weak_ptr_list];
  W_[weak_ptr_list] 	= w;
  RELEASE_LOCK(sm_mutex);

  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);

  RET_P(w);
}

stg_mkWeakNoFinalizzerzh
{
  /* R1 = key
     R2 = value
   */
  R3 = stg_NO_FINALIZER_closure;

  jump stg_mkWeakzh;
}

stg_mkWeakForeignEnvzh
{
  /* R1 = key
     R2 = value
     R3 = finalizer
     R4 = pointer
     R5 = has environment (0 or 1)
     R6 = environment
  */
  W_ w, payload_words, words, p;

  W_ key, val, fptr, ptr, flag, eptr;

  key  = R1;
  val  = R2;
  fptr = R3;
  ptr  = R4;
  flag = R5;
  eptr = R6;

  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh );

  w = Hp - SIZEOF_StgWeak + WDS(1);
  SET_HDR(w, stg_WEAK_info, CCCS);

  payload_words = 4;
  words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
  ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];

  TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
  SET_HDR(p, stg_ARR_WORDS_info, CCCS);

  StgArrWords_bytes(p)     = WDS(payload_words);
  StgArrWords_payload(p,0) = fptr;
  StgArrWords_payload(p,1) = ptr;
  StgArrWords_payload(p,2) = eptr;
  StgArrWords_payload(p,3) = flag;

  // We don't care about the value here.
  // Should StgWeak_value(w) be stg_NO_FINALIZER_closure or something else?

  StgWeak_key(w)        = key;
  StgWeak_value(w)      = val;
  StgWeak_finalizer(w)  = stg_NO_FINALIZER_closure;
  StgWeak_cfinalizer(w) = p;

  ACQUIRE_LOCK(sm_mutex);
  StgWeak_link(w)   = W_[weak_ptr_list];
  W_[weak_ptr_list] = w;
  RELEASE_LOCK(sm_mutex);

  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);

  RET_P(w);
}

stg_finalizzeWeakzh
{
  /* R1 = weak ptr
   */
  W_ w, f, arr;

  w = R1;

  // already dead?
  if (GET_INFO(w) == stg_DEAD_WEAK_info) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  }

  // kill it
#ifdef PROFILING
  // @LDV profiling
  // A weak pointer is inherently used, so we do not need to call
  // LDV_recordDead_FILL_SLOP_DYNAMIC():
  //    LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
  // or, LDV_recordDead():
  //    LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
  // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as 
  // large as weak pointers, so there is no need to fill the slop, either.
  // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
#endif

  //
  // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
  //
  SET_INFO(w,stg_DEAD_WEAK_info);
  LDV_RECORD_CREATE(w);

  f   = StgWeak_finalizer(w);
  arr = StgWeak_cfinalizer(w);

  StgDeadWeak_link(w) = StgWeak_link(w);

  if (arr != stg_NO_FINALIZER_closure) {
    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
                              StgArrWords_payload(arr,1),
                              StgArrWords_payload(arr,2),
                              StgArrWords_payload(arr,3)) [];
  }

  /* return the finalizer */
  if (f == stg_NO_FINALIZER_closure) {
      RET_NP(0,stg_NO_FINALIZER_closure);
  } else {
      RET_NP(1,f);
  }
}

stg_deRefWeakzh
{
  /* R1 = weak ptr */
  W_ w, code, val;

  w = R1;
  if (GET_INFO(w) == stg_WEAK_info) {
    code = 1;
    val = StgWeak_value(w);
  } else {
    code = 0;
    val = w;
  }
  RET_NP(code,val);
}

/* -----------------------------------------------------------------------------
   Floating point operations.
   -------------------------------------------------------------------------- */

stg_decodeFloatzuIntzh
{ 
    W_ p;
    F_ arg;
    W_ mp_tmp1;
    W_ mp_tmp_w;

    STK_CHK_GEN( WDS(2), NO_PTRS, stg_decodeFloatzuIntzh );

    mp_tmp1  = Sp - WDS(1);
    mp_tmp_w = Sp - WDS(2);
    
    /* arguments: F1 = Float# */
    arg = F1;
    
    /* Perform the operation */
    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
    
    /* returns: (Int# (mantissa), Int# (exponent)) */
    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
}

stg_decodeDoublezu2Intzh
{ 
    D_ arg;
    W_ p;
    W_ mp_tmp1;
    W_ mp_tmp2;
    W_ mp_result1;
    W_ mp_result2;

    STK_CHK_GEN( WDS(4), NO_PTRS, stg_decodeDoublezu2Intzh );

    mp_tmp1    = Sp - WDS(1);
    mp_tmp2    = Sp - WDS(2);
    mp_result1 = Sp - WDS(3);
    mp_result2 = Sp - WDS(4);

    /* arguments: D1 = Double# */
    arg = D1;

    /* Perform the operation */
    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
                                    mp_result1 "ptr", mp_result2 "ptr",
                                    arg) [];

    /* returns:
       (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
}

/* -----------------------------------------------------------------------------
 * Concurrency primitives
 * -------------------------------------------------------------------------- */

stg_forkzh
{
  /* args: R1 = closure to spark */

  MAYBE_GC(R1_PTR, stg_forkzh);

  W_ closure;
  W_ threadid;
  closure = R1;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
				RtsFlags_GcFlags_initialStkSize(RtsFlags), 
				closure "ptr") [];

  /* start blocked if the current thread is blocked */
  StgTSO_flags(threadid) = %lobits16(
     TO_W_(StgTSO_flags(threadid)) | 
     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));

  foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];

  // context switch soon, but not immediately: we don't want every
  // forkIO to force a context-switch.
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}

stg_forkOnzh
{
  /* args: R1 = cpu, R2 = closure to spark */

  MAYBE_GC(R2_PTR, stg_forkOnzh);

  W_ cpu;
  W_ closure;
  W_ threadid;
  cpu = R1;
  closure = R2;

  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
				RtsFlags_GcFlags_initialStkSize(RtsFlags), 
				closure "ptr") [];

  /* start blocked if the current thread is blocked */
  StgTSO_flags(threadid) = %lobits16(
     TO_W_(StgTSO_flags(threadid)) | 
     TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));

  foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];

  // context switch soon, but not immediately: we don't want every
  // forkIO to force a context-switch.
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  
  RET_P(threadid);
}

stg_yieldzh
{
  // when we yield to the scheduler, we have to tell it to put the
  // current thread to the back of the queue by setting the
  // context_switch flag.  If we don't do this, it will run the same
  // thread again.
  Capability_context_switch(MyCapability()) = 1 :: CInt;
  jump stg_yield_noregs;
}

stg_myThreadIdzh
{
  /* no args. */
  RET_P(CurrentTSO);
}

stg_labelThreadzh
{
  /* args: 
	R1 = ThreadId#
	R2 = Addr# */
#if defined(DEBUG) || defined(TRACING) || defined(DTRACE)
  foreign "C" labelThread(MyCapability() "ptr", R1 "ptr", R2 "ptr") [];
#endif
  jump %ENTRY_CODE(Sp(0));
}

stg_isCurrentThreadBoundzh
{
  /* no args */
  W_ r;
  (r) = foreign "C" isThreadBound(CurrentTSO) [];
  RET_N(r);
}

stg_threadStatuszh
{
    /* args: R1 :: ThreadId# */
    W_ tso;
    W_ why_blocked;
    W_ what_next;
    W_ ret, cap, locked;

    tso = R1;

    what_next   = TO_W_(StgTSO_what_next(tso));
    why_blocked = TO_W_(StgTSO_why_blocked(tso));
    // Note: these two reads are not atomic, so they might end up
    // being inconsistent.  It doesn't matter, since we
    // only return one or the other.  If we wanted to return the
    // contents of block_info too, then we'd have to do some synchronisation.

    if (what_next == ThreadComplete) {
        ret = 16;  // NB. magic, matches up with GHC.Conc.threadStatus
    } else {
        if (what_next == ThreadKilled) {
            ret = 17;
        } else {
            ret = why_blocked;
        }
    }

    cap = TO_W_(Capability_no(StgTSO_cap(tso)));

    if ((TO_W_(StgTSO_flags(tso)) & TSO_LOCKED) != 0) {
        locked = 1;
    } else {
        locked = 0;
    }

    RET_NNN(ret,cap,locked);
}

/* -----------------------------------------------------------------------------
 * TVar primitives
 * -------------------------------------------------------------------------- */

#define SP_OFF 0

// Catch retry frame ------------------------------------------------------------

INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  W_ unused3, P_ unused4, P_ unused5)
{
   W_ r, frame, trec, outer;

   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
   outer  = StgTRecHeader_enclosing_trec(trec);
   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
   if (r != 0) {
     /* Succeeded (either first branch or second branch) */
     StgTSO_trec(CurrentTSO) = outer;
     Sp = Sp + SIZEOF_StgCatchRetryFrame;
     jump %ENTRY_CODE(Sp(SP_OFF));
   } else {
     /* Did not commit: re-execute */
     W_ new_trec;
     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
     StgTSO_trec(CurrentTSO) = new_trec;
     if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
       R1 = StgCatchRetryFrame_alt_code(frame);
     } else {
       R1 = StgCatchRetryFrame_first_code(frame);
     }
     jump stg_ap_v_fast;
   }
}


// Atomically frame ------------------------------------------------------------

INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ code, P_ next_invariant_to_check, P_ result)
{
  W_ frame, trec, valid, next_invariant, q, outer;

  frame  = Sp;
  trec   = StgTSO_trec(CurrentTSO);
  result = R1;
  outer  = StgTRecHeader_enclosing_trec(trec);

  if (outer == NO_TREC) {
    /* First time back at the atomically frame -- pick up invariants */
    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
    StgAtomicallyFrame_result(frame) = result;

  } else {
    /* Second/subsequent time back at the atomically frame -- abort the
     * tx that's checking the invariant and move on to the next one */
    StgTSO_trec(CurrentTSO) = outer;
    q = StgAtomicallyFrame_next_invariant_to_check(frame);
    StgInvariantCheckQueue_my_execution(q) = trec;
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    /* Don't free trec -- it's linked from q and will be stashed in the
     * invariant if we eventually commit. */
    q = StgInvariantCheckQueue_next_queue_entry(q);
    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
    trec = outer;
  }

  q = StgAtomicallyFrame_next_invariant_to_check(frame);

  if (q != END_INVARIANT_CHECK_QUEUE) {
    /* We can't commit yet: another invariant to check */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;

    next_invariant = StgInvariantCheckQueue_invariant(q);
    R1 = StgAtomicInvariant_code(next_invariant);
    jump stg_ap_v_fast;

  } else {

    /* We've got no more invariants to check, try to commit */
    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
    if (valid != 0) {
      /* Transaction was valid: commit succeeded */
      StgTSO_trec(CurrentTSO) = NO_TREC;
      R1 = StgAtomicallyFrame_result(frame);
      Sp = Sp + SIZEOF_StgAtomicallyFrame;
      jump %ENTRY_CODE(Sp(SP_OFF));
    } else {
      /* Transaction was not valid: try again */
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
      R1 = StgAtomicallyFrame_code(frame);
      jump stg_ap_v_fast;
    }
  }
}

INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ code, P_ next_invariant_to_check, P_ result)
{
  W_ frame, trec, valid;

  frame = Sp;

  /* The TSO is currently waiting: should we stop waiting? */
  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
  if (valid != 0) {
    /* Previous attempt is still valid: no point trying again yet */
    jump stg_block_noregs;
  } else {
    /* Previous attempt is no longer valid: try again */
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    StgHeader_info(frame) = stg_atomically_frame_info;
    R1 = StgAtomicallyFrame_code(frame);
    jump stg_ap_v_fast;
  }
}

// STM catch frame --------------------------------------------------------------

#define SP_OFF 0

/* Catch frames are very similar to update frames, but when entering
 * one we just pop the frame off the stack and perform the correct
 * kind of return to the activation record underneath us on the stack.
 */

INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
#if defined(PROFILING)
  W_ unused1, W_ unused2,
#endif
  P_ unused3, P_ unused4)
   {
      W_ r, frame, trec, outer;
      frame = Sp;
      trec = StgTSO_trec(CurrentTSO);
      outer  = StgTRecHeader_enclosing_trec(trec);
      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
      if (r != 0) {
        /* Commit succeeded */
        StgTSO_trec(CurrentTSO) = outer;
        Sp = Sp + SIZEOF_StgCatchSTMFrame;
        jump %ENTRY_CODE(Sp(SP_OFF));
      } else {
        /* Commit failed */
        W_ new_trec;
        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
        StgTSO_trec(CurrentTSO) = new_trec;
        R1 = StgCatchSTMFrame_code(frame);
        jump stg_ap_v_fast;
      }
   }


// Primop definition ------------------------------------------------------------

stg_atomicallyzh
{
  W_ frame;
  W_ old_trec;
  W_ new_trec;
  
  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR, stg_atomicallyzh); 

  /* Args: R1 = m :: STM a */
  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, stg_atomicallyzh);

  old_trec = StgTSO_trec(CurrentTSO);

  /* Nested transactions are not allowed; raise an exception */
  if (old_trec != NO_TREC) {
     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
     jump stg_raisezh;
  }

  /* Set up the atomically frame */
  Sp = Sp - SIZEOF_StgAtomicallyFrame;
  frame = Sp;

  SET_HDR(frame,stg_atomically_frame_info, CCCS);
  StgAtomicallyFrame_code(frame) = R1;
  StgAtomicallyFrame_result(frame) = NO_TREC;
  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;

  /* Start the memory transcation */
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}

// A closure representing "atomically x".  This is used when a thread
// inside a transaction receives an asynchronous exception; see #5866.
// It is somewhat similar to the stg_raise closure.
//
INFO_TABLE(stg_atomically,1,0,THUNK_1_0,"atomically","atomically")
{
  R1 = StgThunk_payload(R1,0);
  jump stg_atomicallyzh;
}


stg_catchSTMzh
{
  W_ frame;
  
  /* Args: R1 :: STM a */
  /* Args: R2 :: Exception -> STM a */
  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, stg_catchSTMzh);

  /* Set up the catch frame */
  Sp = Sp - SIZEOF_StgCatchSTMFrame;
  frame = Sp;

  SET_HDR(frame, stg_catch_stm_frame_info, CCCS);
  StgCatchSTMFrame_handler(frame) = R2;
  StgCatchSTMFrame_code(frame) = R1;

  /* Start a nested transaction to run the body of the try block in */
  W_ cur_trec;  
  W_ new_trec;
  cur_trec = StgTSO_trec(CurrentTSO);
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr");
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}


stg_catchRetryzh
{
  W_ frame;
  W_ new_trec;
  W_ trec;

  // stmStartTransaction may allocate
  MAYBE_GC (R1_PTR & R2_PTR, stg_catchRetryzh); 

  /* Args: R1 :: STM a */
  /* Args: R2 :: STM a */
  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, stg_catchRetryzh);

  /* Start a nested transaction within which to run the first code */
  trec = StgTSO_trec(CurrentTSO);
  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
  StgTSO_trec(CurrentTSO) = new_trec;

  /* Set up the catch-retry frame */
  Sp = Sp - SIZEOF_StgCatchRetryFrame;
  frame = Sp;
  
  SET_HDR(frame, stg_catch_retry_frame_info, CCCS);
  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
  StgCatchRetryFrame_first_code(frame) = R1;
  StgCatchRetryFrame_alt_code(frame) = R2;

  /* Apply R1 to the realworld token */
  jump stg_ap_v_fast;
}


stg_retryzh
{
  W_ frame_type;
  W_ frame;
  W_ trec;
  W_ outer;
  W_ r;

  MAYBE_GC (NO_PTRS, stg_retryzh); // STM operations may allocate

  // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
retry_pop_stack:
  SAVE_THREAD_STATE();
  (frame_type) = foreign "C" findRetryFrameHelper(MyCapability(), CurrentTSO "ptr") [];
  LOAD_THREAD_STATE();
  frame = Sp;
  trec = StgTSO_trec(CurrentTSO);
  outer  = StgTRecHeader_enclosing_trec(trec);

  if (frame_type == CATCH_RETRY_FRAME) {
    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
    ASSERT(outer != NO_TREC);
    // Abort the transaction attempting the current branch
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
      // Retry in the first branch: try the alternative
      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
      StgTSO_trec(CurrentTSO) = trec;
      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
      R1 = StgCatchRetryFrame_alt_code(frame);
      jump stg_ap_v_fast;
    } else {
      // Retry in the alternative code: propagate the retry
      StgTSO_trec(CurrentTSO) = outer;
      Sp = Sp + SIZEOF_StgCatchRetryFrame;
      goto retry_pop_stack;
    }
  }

  // We've reached the ATOMICALLY_FRAME: attempt to wait 
  ASSERT(frame_type == ATOMICALLY_FRAME);
  if (outer != NO_TREC) {
    // We called retry while checking invariants, so abort the current
    // invariant check (merging its TVar accesses into the parents read
    // set so we'll wait on them)
    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
    trec = outer;
    StgTSO_trec(CurrentTSO) = trec;
    outer  = StgTRecHeader_enclosing_trec(trec);
  }
  ASSERT(outer == NO_TREC);

  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
  if (r != 0) {
    // Transaction was valid: stmWait put us on the TVars' queues, we now block
    StgHeader_info(frame) = stg_atomically_waiting_frame_info;
    Sp = frame;
    // Fix up the stack in the unregisterised case: the return convention is different.
    R3 = trec; // passing to stmWaitUnblock()
    jump stg_block_stmwait;
  } else {
    // Transaction was not valid: retry immediately
    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
    StgTSO_trec(CurrentTSO) = trec;
    R1 = StgAtomicallyFrame_code(frame);
    Sp = frame;
    jump stg_ap_v_fast;
  }
}


stg_checkzh
{
  W_ trec, closure;

  /* Args: R1 = invariant closure */
  MAYBE_GC (R1_PTR, stg_checkzh); 

  trec = StgTSO_trec(CurrentTSO);
  closure = R1;
  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr", 
                                     trec "ptr",
                                     closure "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}


stg_newTVarzh
{
  W_ tv;
  W_ new_value;

  /* Args: R1 = initialisation value */

  MAYBE_GC (R1_PTR, stg_newTVarzh); 
  new_value = R1;
  ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
  RET_P(tv);
}


stg_readTVarzh
{
  W_ trec;
  W_ tvar;
  W_ result;

  /* Args: R1 = TVar closure */

  MAYBE_GC (R1_PTR, stg_readTVarzh); // Call to stmReadTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];

  RET_P(result);
}

stg_readTVarIOzh
{
    W_ result;

again:
    result = StgTVar_current_value(R1);
    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
        goto again;
    }
    RET_P(result);
}

stg_writeTVarzh
{
  W_ trec;
  W_ tvar;
  W_ new_value;
  
  /* Args: R1 = TVar closure */
  /*       R2 = New value    */

  MAYBE_GC (R1_PTR & R2_PTR, stg_writeTVarzh); // Call to stmWriteTVar may allocate
  trec = StgTSO_trec(CurrentTSO);
  tvar = R1;
  new_value = R2;
  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];

  jump %ENTRY_CODE(Sp(0));
}


/* -----------------------------------------------------------------------------
 * MVar primitives
 *
 * take & putMVar work as follows.  Firstly, an important invariant:
 *
 *    If the MVar is full, then the blocking queue contains only
 *    threads blocked on putMVar, and if the MVar is empty then the
 *    blocking queue contains only threads blocked on takeMVar.
 *
 * takeMvar:
 *    MVar empty : then add ourselves to the blocking queue
 *    MVar full  : remove the value from the MVar, and
 *                 blocking queue empty     : return
 *                 blocking queue non-empty : perform the first blocked putMVar
 *                                            from the queue, and wake up the
 *                                            thread (MVar is now full again)
 *
 * putMVar is just the dual of the above algorithm.
 *
 * How do we "perform a putMVar"?  Well, we have to fiddle around with
 * the stack of the thread waiting to do the putMVar.  See
 * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
 * the stack layout, and the PerformPut and PerformTake macros below.
 *
 * It is important that a blocked take or put is woken up with the
 * take/put already performed, because otherwise there would be a
 * small window of vulnerability where the thread could receive an
 * exception and never perform its take or put, and we'd end up with a
 * deadlock.
 *
 * -------------------------------------------------------------------------- */

stg_isEmptyMVarzh
{
    /* args: R1 = MVar closure */

    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
	RET_N(1);
    } else {
	RET_N(0);
    }
}

stg_newMVarzh
{
    /* args: none */
    W_ mvar;

    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh );
  
    mvar = Hp - SIZEOF_StgMVar + WDS(1);
    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
        // MVARs start dirty: generation 0 has no mutable list
    StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
    StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
    RET_P(mvar);
}


#define PerformTake(stack, value)               \
    W_ sp;                                      \
    sp = StgStack_sp(stack);                    \
    W_[sp + WDS(1)] = value;                    \
    W_[sp + WDS(0)] = stg_gc_unpt_r1_info;

#define PerformPut(stack,lval)                  \
    W_ sp;                                      \
    sp = StgStack_sp(stack) + WDS(3);           \
    StgStack_sp(stack) = sp;                    \
    lval = W_[sp - WDS(1)];

stg_takeMVarzh
{
    W_ mvar, val, info, tso, q;

    /* args: R1 = MVar closure */
    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif
        
    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* If the MVar is empty, put ourselves on its blocking queue,
     * and wait until we're woken up.
     */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
        
        // Note [mvar-heap-check] We want to do the heap check in the
        // branch here, to avoid the conditional in the common case.
        // However, we've already locked the MVar above, so we better
        // be careful to unlock it again if the the heap check fails.
        // Unfortunately we don't have an easy way to inject any code
        // into the heap check generated by the code generator, so we
        // have to do it in stg_gc_gen (see HeapStackCheck.cmm).
        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR, stg_takeMVarzh);
        TICK_ALLOC_PRIM(SIZEOF_StgMVarTSOQueue, 0, 0);
        CCCS_ALLOC(SIZEOF_StgMVarTSOQueue);

        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);

        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
        StgMVarTSOQueue_tso(q)  = CurrentTSO;

	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
	    StgMVar_head(mvar) = q;
	} else {
            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
            foreign "C" recordClosureMutated(MyCapability() "ptr",
                                             StgMVar_tail(mvar)) [];
	}
	StgTSO__link(CurrentTSO)       = q;
	StgTSO_block_info(CurrentTSO)  = mvar;
	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
	StgMVar_tail(mvar)             = q;
	
        R1 = mvar;
	jump stg_block_takemvar;
    }
    
    /* we got the value... */
    val = StgMVar_value(mvar);
    
    q = StgMVar_head(mvar);
loop:
    if (q == stg_END_TSO_QUEUE_closure) {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
        RET_P(val);
    }
    if (StgHeader_info(q) == stg_IND_info ||
        StgHeader_info(q) == stg_MSG_NULL_info) {
        q = StgInd_indirectee(q);
        goto loop;
    }
    
    // There are putMVar(s) waiting... wake up the first thread on the queue
    
    tso = StgMVarTSOQueue_tso(q);
    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
    }

    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
    ASSERT(StgTSO_block_info(tso) == mvar);

    // actually perform the putMVar for the thread that we just woke up
    W_ stack;
    stack = StgTSO_stackobj(tso);
    PerformPut(stack, StgMVar_value(mvar));

    // indicate that the MVar operation has now completed.
    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
    
    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.

    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
    
    unlockClosure(mvar, stg_MVAR_DIRTY_info);
    RET_P(val);
}


stg_tryTakeMVarzh
{
    W_ mvar, val, info, tso, q;

    /* args: R1 = MVar closure */
    mvar = R1;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif
        
    /* If the MVar is empty, put ourselves on its blocking queue,
     * and wait until we're woken up.
     */
    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
        unlockClosure(mvar, info);
#endif
	/* HACK: we need a pointer to pass back, 
	 * so we abuse NO_FINALIZER_closure
	 */
	RET_NP(0, stg_NO_FINALIZER_closure);
    }
    
    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
    }

    /* we got the value... */
    val = StgMVar_value(mvar);
    
    q = StgMVar_head(mvar);
loop:
    if (q == stg_END_TSO_QUEUE_closure) {
        /* No further putMVars, MVar is now empty */
        StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
        unlockClosure(mvar, stg_MVAR_DIRTY_info);
        RET_NP(1, val);
    }
    if (StgHeader_info(q) == stg_IND_info ||
        StgHeader_info(q) == stg_MSG_NULL_info) {
        q = StgInd_indirectee(q);
        goto loop;
    }
    
    // There are putMVar(s) waiting... wake up the first thread on the queue
    
    tso = StgMVarTSOQueue_tso(q);
    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
    }

    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
    ASSERT(StgTSO_block_info(tso) == mvar);

    // actually perform the putMVar for the thread that we just woke up
    W_ stack;
    stack = StgTSO_stackobj(tso);
    PerformPut(stack, StgMVar_value(mvar));

    // indicate that the MVar operation has now completed.
    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
    
    // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.

    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
    
    unlockClosure(mvar, stg_MVAR_DIRTY_info);
    RET_NP(1,val);
}


stg_putMVarzh
{
    W_ mvar, val, info, tso, q;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;
    val  = R2;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {

        // see Note [mvar-heap-check] above
        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR & R2_PTR, stg_putMVarzh);
        TICK_ALLOC_PRIM(SIZEOF_StgMVarTSOQueue, 0, 0);
        CCCS_ALLOC(SIZEOF_StgMVarTSOQueue);

        q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);

        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
        StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
        StgMVarTSOQueue_tso(q)  = CurrentTSO;

	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
	    StgMVar_head(mvar) = q;
	} else {
            StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
            foreign "C" recordClosureMutated(MyCapability() "ptr",
                                             StgMVar_tail(mvar)) [];
	}
	StgTSO__link(CurrentTSO)       = q;
	StgTSO_block_info(CurrentTSO)  = mvar;
	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
	StgMVar_tail(mvar)             = q;

        R1 = mvar;
        R2 = val;
	jump stg_block_putmvar;
    }
  
    q = StgMVar_head(mvar);
loop:
    if (q == stg_END_TSO_QUEUE_closure) {
	/* No further takes, the MVar is now full. */
	StgMVar_value(mvar) = val;
    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
	jump %ENTRY_CODE(Sp(0));
    }
    if (StgHeader_info(q) == stg_IND_info ||
        StgHeader_info(q) == stg_MSG_NULL_info) {
        q = StgInd_indirectee(q);
        goto loop;
    }

    // There are takeMVar(s) waiting: wake up the first one
    
    tso = StgMVarTSOQueue_tso(q);
    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
    }

    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
    ASSERT(StgTSO_block_info(tso) == mvar);

    // actually perform the takeMVar
    W_ stack;
    stack = StgTSO_stackobj(tso);
    PerformTake(stack, val);

    // indicate that the MVar operation has now completed.
    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;

    if (TO_W_(StgStack_dirty(stack)) == 0) {
        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
    }
    
    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];

    unlockClosure(mvar, stg_MVAR_DIRTY_info);
    jump %ENTRY_CODE(Sp(0));
}


stg_tryPutMVarzh
{
    W_ mvar, val, info, tso, q;

    /* args: R1 = MVar, R2 = value */
    mvar = R1;
    val  = R2;

#if defined(THREADED_RTS)
    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
#else
    info = GET_INFO(mvar);
#endif

    if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
#if defined(THREADED_RTS)
    	unlockClosure(mvar, info);
#endif
	RET_N(0);
    }
  
    if (info == stg_MVAR_CLEAN_info) {
        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
    }

    q = StgMVar_head(mvar);
loop:
    if (q == stg_END_TSO_QUEUE_closure) {
	/* No further takes, the MVar is now full. */
	StgMVar_value(mvar) = val;
    	unlockClosure(mvar, stg_MVAR_DIRTY_info);
        RET_N(1);
    }
    if (StgHeader_info(q) == stg_IND_info ||
        StgHeader_info(q) == stg_MSG_NULL_info) {
        q = StgInd_indirectee(q);
        goto loop;
    }

    // There are takeMVar(s) waiting: wake up the first one
    
    tso = StgMVarTSOQueue_tso(q);
    StgMVar_head(mvar) = StgMVarTSOQueue_link(q);
    if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
        StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
    }

    ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16);
    ASSERT(StgTSO_block_info(tso) == mvar);

    // actually perform the takeMVar
    W_ stack;
    stack = StgTSO_stackobj(tso);
    PerformTake(stack, val);

    // indicate that the MVar operation has now completed.
    StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
    
    if (TO_W_(StgStack_dirty(stack)) == 0) {
        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
    }
    
    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];

    unlockClosure(mvar, stg_MVAR_DIRTY_info);
    RET_N(1);
}


/* -----------------------------------------------------------------------------
   Stable pointer primitives
   -------------------------------------------------------------------------  */

stg_makeStableNamezh
{
    W_ index, sn_obj;

    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, stg_makeStableNamezh );
  
    (index) = foreign "C" lookupStableName(R1 "ptr") [];

    /* Is there already a StableName for this heap object?
     *  stable_ptr_table is a pointer to an array of snEntry structs.
     */
    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
	sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
	SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
	StgStableName_sn(sn_obj) = index;
	snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
    } else {
	sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
    }
    
    RET_P(sn_obj);
}


stg_makeStablePtrzh
{
    /* Args: R1 = a */
    W_ sp;
    MAYBE_GC(R1_PTR, stg_makeStablePtrzh);
    ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
    RET_N(sp);
}

stg_deRefStablePtrzh
{
    /* Args: R1 = the stable ptr */
    W_ r, sp;
    sp = R1;
    r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
    RET_P(r);
}

/* -----------------------------------------------------------------------------
   Bytecode object primitives
   -------------------------------------------------------------------------  */

stg_newBCOzh
{
    /* R1 = instrs
       R2 = literals
       R3 = ptrs
       R4 = arity
       R5 = bitmap array
    */
    W_ bco, bitmap_arr, bytes, words;
    
    bitmap_arr = R5;

    words = BYTES_TO_WDS(SIZEOF_StgBCO) + BYTE_ARR_WDS(bitmap_arr);
    bytes = WDS(words);

    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );

    bco = Hp - bytes + WDS(1);
    SET_HDR(bco, stg_BCO_info, CCCS);
    
    StgBCO_instrs(bco)     = R1;
    StgBCO_literals(bco)   = R2;
    StgBCO_ptrs(bco)       = R3;
    StgBCO_arity(bco)      = HALF_W_(R4);
    StgBCO_size(bco)       = HALF_W_(words);
    
    // Copy the arity/bitmap info into the BCO
    W_ i;
    i = 0;
for:
    if (i < BYTE_ARR_WDS(bitmap_arr)) {
	StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
	i = i + 1;
	goto for;
    }
    
    RET_P(bco);
}


stg_mkApUpd0zh
{
    // R1 = the BCO# for the AP
    //	
    W_ ap;

    // This function is *only* used to wrap zero-arity BCOs in an
    // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
    // saturated and always points directly to a FUN or BCO.
    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
	   StgBCO_arity(R1) == HALF_W_(0));

    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, stg_mkApUpd0zh);
    TICK_ALLOC_UP_THK(0, 0);
    CCCS_ALLOC(SIZEOF_StgAP);

    ap = Hp - SIZEOF_StgAP + WDS(1);
    SET_HDR(ap, stg_AP_info, CCCS);
    
    StgAP_n_args(ap) = HALF_W_(0);
    StgAP_fun(ap) = R1;
    
    RET_P(ap);
}

stg_unpackClosurezh
{
/* args: R1 = closure to analyze */
// TODO: Consider the absence of ptrs or nonptrs as a special case ?

    W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
    info  = %GET_STD_INFO(UNTAG(R1));

    // Some closures have non-standard layout, so we omit those here.
    W_ type;
    type = TO_W_(%INFO_TYPE(info));
    switch [0 .. N_CLOSURE_TYPES] type {
    case THUNK_SELECTOR : {
        ptrs = 1;
        nptrs = 0;
        goto out;
    }
    case THUNK, THUNK_1_0, THUNK_0_1, THUNK_2_0, THUNK_1_1, 
         THUNK_0_2, THUNK_STATIC, AP, PAP, AP_STACK, BCO : {
        ptrs = 0;
        nptrs = 0;
        goto out;
    }
    default: {
        ptrs  = TO_W_(%INFO_PTRS(info)); 
        nptrs = TO_W_(%INFO_NPTRS(info));
        goto out;
    }}
out:

    W_ ptrs_arr_sz, ptrs_arr_cards, nptrs_arr_sz;
    nptrs_arr_sz = SIZEOF_StgArrWords   + WDS(nptrs);
    ptrs_arr_cards = mutArrPtrsCardWords(ptrs);
    ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards);

    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh);

    W_ clos;
    clos = UNTAG(R1);

    ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
    nptrs_arr = Hp - nptrs_arr_sz + WDS(1);

    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, CCCS);
    StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
    StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards;

    p = 0;
for:
    if(p < ptrs) {
    	 W_[ptrs_arr + SIZEOF_StgMutArrPtrs + WDS(p)] = StgClosure_payload(clos,p);
	 p = p + 1;
	 goto for;
    }
    /* We can leave the card table uninitialised, since the array is
       allocated in the nursery.  The GC will fill it in if/when the array
       is promoted. */
    
    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, CCCS);
    StgArrWords_bytes(nptrs_arr) = WDS(nptrs);
    p = 0;
for2:
    if(p < nptrs) {
    	 W_[BYTE_ARR_CTS(nptrs_arr) + WDS(p)] = StgClosure_payload(clos, p+ptrs);
    	 p = p + 1;
	 goto for2;
    }
    RET_NPP(info, ptrs_arr, nptrs_arr);
}

/* -----------------------------------------------------------------------------
   Thread I/O blocking primitives
   -------------------------------------------------------------------------- */

/* Add a thread to the end of the blocked queue. (C-- version of the C
 * macro in Schedule.h).
 */
#define APPEND_TO_BLOCKED_QUEUE(tso)			\
    ASSERT(StgTSO__link(tso) == END_TSO_QUEUE);		\
    if (W_[blocked_queue_hd] == END_TSO_QUEUE) {	\
      W_[blocked_queue_hd] = tso;			\
    } else {						\
      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
    }							\
    W_[blocked_queue_tl] = tso;

stg_waitReadzh
{
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitRead# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
    StgTSO_block_info(CurrentTSO) = R1;
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}

stg_waitWritezh
{
    /* args: R1 */
#ifdef THREADED_RTS
    foreign "C" barf("waitWrite# on threaded RTS") never returns;
#else

    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
    StgTSO_block_info(CurrentTSO) = R1;
    // No locking - we're not going to use this interface in the
    // threaded RTS anyway.
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_noregs;
#endif
}


STRING(stg_delayzh_malloc_str, "stg_delayzh")
stg_delayzh
{
#ifdef mingw32_HOST_OS
    W_ ares;
    CInt reqID;
#else
    W_ t, prev, target;
#endif

#ifdef THREADED_RTS
    foreign "C" barf("delay# on threaded RTS") never returns;
#else

    /* args: R1 (microsecond delay amount) */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;

#ifdef mingw32_HOST_OS

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
					    stg_delayzh_malloc_str);
    (reqID) = foreign "C" addDelayRequest(R1);
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;

    /* Having all async-blocked threads reside on the blocked_queue
     * simplifies matters, so change the status to OnDoProc put the
     * delayed thread on the blocked_queue.
     */
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async_void;

#else


    (target) = foreign "C" getDelayTarget(R1) [R1];

    StgTSO_block_info(CurrentTSO) = target;

    /* Insert the new thread in the sleeping queue. */
    prev = NULL;
    t = W_[sleeping_queue];
while:
    if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
	prev = t;
	t = StgTSO__link(t);
	goto while;
    }

    StgTSO__link(CurrentTSO) = t;
    if (prev == NULL) {
	W_[sleeping_queue] = CurrentTSO;
    } else {
        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
    }
    jump stg_block_noregs;
#endif
#endif /* !THREADED_RTS */
}


#ifdef mingw32_HOST_OS
STRING(stg_asyncReadzh_malloc_str, "stg_asyncReadzh")
stg_asyncReadzh
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncRead# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
					    stg_asyncReadzh_malloc_str)
			[R1,R2,R3,R4];
    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}

STRING(stg_asyncWritezh_malloc_str, "stg_asyncWritezh")
stg_asyncWritezh
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
#else

    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;

    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
					    stg_asyncWritezh_malloc_str)
			[R1,R2,R3,R4];
    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];

    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO)  = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}

STRING(stg_asyncDoProczh_malloc_str, "stg_asyncDoProczh")
stg_asyncDoProczh
{
    W_ ares;
    CInt reqID;

#ifdef THREADED_RTS
    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
#else

    /* args: R1 = proc, R2 = param */
    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;

    /* could probably allocate this on the heap instead */
    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
					    stg_asyncDoProczh_malloc_str) 
				[R1,R2];
    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
    StgAsyncIOResult_reqID(ares)   = reqID;
    StgAsyncIOResult_len(ares)     = 0;
    StgAsyncIOResult_errCode(ares) = 0;
    StgTSO_block_info(CurrentTSO) = ares;
    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
    jump stg_block_async;
#endif
}
#endif

/* -----------------------------------------------------------------------------
 * noDuplicate#
 *
 * noDuplicate# tries to ensure that none of the thunks under
 * evaluation by the current thread are also under evaluation by
 * another thread.  It relies on *both* threads doing noDuplicate#;
 * the second one will get blocked if they are duplicating some work.
 *
 * The idea is that noDuplicate# is used within unsafePerformIO to
 * ensure that the IO operation is performed at most once.
 * noDuplicate# calls threadPaused which acquires an exclusive lock on
 * all the thunks currently under evaluation by the current thread.
 *
 * Consider the following scenario.  There is a thunk A, whose
 * evaluation requires evaluating thunk B, where thunk B is an
 * unsafePerformIO.  Two threads, 1 and 2, bother enter A.  Thread 2
 * is pre-empted before it enters B, and claims A by blackholing it
 * (in threadPaused).  Thread 1 now enters B, and calls noDuplicate#.
 *
 *      thread 1                      thread 2
 *   +-----------+                 +---------------+
 *   |    -------+-----> A <-------+-------        |
 *   |  update   |   BLACKHOLE     | marked_update |
 *   +-----------+                 +---------------+
 *   |           |                 |               | 
 *        ...                             ...
 *   |           |                 +---------------+
 *   +-----------+
 *   |     ------+-----> B
 *   |  update   |   BLACKHOLE
 *   +-----------+
 *
 * At this point: A is a blackhole, owned by thread 2.  noDuplicate#
 * calls threadPaused, which walks up the stack and
 *  - claims B on behalf of thread 1
 *  - then it reaches the update frame for A, which it sees is already
 *    a BLACKHOLE and is therefore owned by another thread.  Since
 *    thread 1 is duplicating work, the computation up to the update
 *    frame for A is suspended, including thunk B.
 *  - thunk B, which is an unsafePerformIO, has now been reverted to
 *    an AP_STACK which could be duplicated - BAD!
 *  - The solution is as follows: before calling threadPaused, we
 *    leave a frame on the stack (stg_noDuplicate_info) that will call
 *    noDuplicate# again if the current computation is suspended and
 *    restarted.
 *
 * See the test program in concurrent/prog003 for a way to demonstrate
 * this.  It needs to be run with +RTS -N3 or greater, and the bug
 * only manifests occasionally (once very 10 runs or so).
 * -------------------------------------------------------------------------- */

INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
{
    Sp_adj(1);
    jump stg_noDuplicatezh;
}

stg_noDuplicatezh
{
    STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
    // leave noDuplicate frame in case the current
    // computation is suspended and restarted (see above).
    Sp_adj(-1);
    Sp(0) = stg_noDuplicate_info;

    SAVE_THREAD_STATE();
    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
    
    if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
        jump stg_threadFinished;
    } else {
        LOAD_THREAD_STATE();
        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
        // remove the stg_noDuplicate frame if it is still there.
        if (Sp(0) == stg_noDuplicate_info) {
            Sp_adj(1);
        }
        jump %ENTRY_CODE(Sp(0));
    }
}

/* -----------------------------------------------------------------------------
   Misc. primitives
   -------------------------------------------------------------------------- */

stg_getApStackValzh
{
   W_ ap_stack, offset, val, ok;

   /* args: R1 = AP_STACK, R2 = offset */
   ap_stack = R1;
   offset   = R2;

   if (%INFO_PTR(ap_stack) == stg_AP_STACK_info) {
        ok = 1;
        val = StgAP_STACK_payload(ap_stack,offset); 
   } else {
        ok = 0;
        val = R1;
   }
   RET_NP(ok,val);
}

// Write the cost center stack of the first argument on stderr; return
// the second.  Possibly only makes sense for already evaluated
// things?
stg_traceCcszh
{
    W_ ccs;

#ifdef PROFILING
    ccs = StgHeader_ccs(UNTAG(R1));
    foreign "C" fprintCCS_stderr(ccs "ptr") [R2];
#endif

    R1 = R2;
    ENTER();
}

stg_getSparkzh
{
   W_ spark;

#ifndef THREADED_RTS
   RET_NP(0,ghczmprim_GHCziTypes_False_closure);
#else
   (spark) = foreign "C" findSpark(MyCapability());
   if (spark != 0) {
      RET_NP(1,spark);
   } else {
      RET_NP(0,ghczmprim_GHCziTypes_False_closure);
   }
#endif
}

stg_numSparkszh
{
  W_ n;
#ifdef THREADED_RTS
  (n) = foreign "C" dequeElements(Capability_sparks(MyCapability()));
#else
  n = 0;
#endif
  RET_N(n);
}

stg_traceEventzh
{
   W_ msg;
   msg = R1;

#if defined(TRACING) || defined(DEBUG)

   foreign "C" traceUserMsg(MyCapability() "ptr", msg "ptr") [];

#elif defined(DTRACE)

   W_ enabled;

   // We should go through the macro HASKELLEVENT_USER_MSG_ENABLED from
   // RtsProbes.h, but that header file includes unistd.h, which doesn't
   // work in Cmm
#if !defined(solaris2_TARGET_OS)
   (enabled) = foreign "C" __dtrace_isenabled$HaskellEvent$user__msg$v1() [];
#else
   // Solaris' DTrace can't handle the
   //     __dtrace_isenabled$HaskellEvent$user__msg$v1
   // call above. This call is just for testing whether the user__msg
   // probe is enabled, and is here for just performance optimization.
   // Since preparation for the probe is not that complex I disable usage of
   // this test above for Solaris and enable the probe usage manually
   // here. Please note that this does not mean that the probe will be
   // used during the runtime! You still need to enable it by consumption
   // in your dtrace script as you do with any other probe.
   enabled = 1;
#endif
   if (enabled != 0) {
     foreign "C" dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr") [];
   }

#endif
   jump %ENTRY_CODE(Sp(0));
}