From 3b1a01da30301b3b0da959edb85324e4db221e7c Mon Sep 17 00:00:00 2001 From: jashook Date: Mon, 5 Jun 2017 14:48:51 -0700 Subject: [PATCH 01/12] [*64] Allow more fastTailCalls involving structs Before this change structs on Arm64 and Amd64 Unix could pessimize when we could fastTailCall if they were engregisterable and took more than one register. --- src/jit/compiler.h | 21 +- src/jit/lclvars.cpp | 67 ++++++- src/jit/lower.cpp | 1 - src/jit/morph.cpp | 185 +++++++++++++++--- .../DoNotFastTailCallWithStructs.cs | 81 ++++++++ .../DoNotFastTailCallWithStructs.csproj | 45 +++++ .../StackFixup.cs} | 3 + .../StackFixup.csproj} | 2 +- .../opt/FastTailCall/StructPassingSimple.cs | 38 ++++ .../FastTailCall/StructPassingSimple.csproj | 33 ++++ tests/src/JIT/opt/FastTailCall/x64UnixSb.cs | 86 ++++++++ .../src/JIT/opt/FastTailCall/x64UnixSb.csproj | 45 +++++ 12 files changed, 571 insertions(+), 36 deletions(-) create mode 100644 tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs create mode 100644 tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj rename tests/src/JIT/opt/{Tailcall/FastTailCallStackFixup.cs => FastTailCall/StackFixup.cs} (89%) rename tests/src/JIT/opt/{Tailcall/FastTailCallStackFixup.csproj => FastTailCall/StackFixup.csproj} (97%) create mode 100644 tests/src/JIT/opt/FastTailCall/StructPassingSimple.cs create mode 100644 tests/src/JIT/opt/FastTailCall/StructPassingSimple.csproj create mode 100644 tests/src/JIT/opt/FastTailCall/x64UnixSb.cs create mode 100644 tests/src/JIT/opt/FastTailCall/x64UnixSb.csproj diff --git a/src/jit/compiler.h b/src/jit/compiler.h index bc178ca602e9..671e6d8cfd48 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -8259,13 +8259,20 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool compPublishStubParam : 1; // EAX captured in prolog will be available through an instrinsic bool compRetBuffDefStack : 1; // The ret buff argument definitely points into the stack. - var_types compRetType; // Return type of the method as declared in IL - var_types compRetNativeType; // Normalized return type as per target arch ABI - unsigned compILargsCount; // Number of arguments (incl. implicit but not hidden) - unsigned compArgsCount; // Number of arguments (incl. implicit and hidden) - unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present); - int compTypeCtxtArg; // position of hidden param for type context for generic code (CORINFO_CALLCONV_PARAMTYPE) - unsigned compThisArg; // position of implicit this pointer param (not to be confused with lvaArg0Var) + var_types compRetType; // Return type of the method as declared in IL + var_types compRetNativeType; // Normalized return type as per target arch ABI + unsigned compILargsCount; // Number of arguments (incl. implicit but not hidden) + unsigned compArgsCount; // Number of arguments (incl. implicit and hidden) + +#ifdef FEATURE_MULTIREG_ARGS + unsigned compArgRegCount; // Number of incoming integer args + unsigned compFloatArgRegCount; // Number of incoming floating point args + unsigned compStackSize; // Incoming stack size +#endif // FEATURE_MULTIREG_ARGS + + unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present); + int compTypeCtxtArg; // position of hidden param for type context for generic code (CORINFO_CALLCONV_PARAMTYPE) + unsigned compThisArg; // position of implicit this pointer param (not to be confused with lvaArg0Var) unsigned compILlocalsCount; // Number of vars : args + locals (incl. implicit but not hidden) unsigned compLocalsCount; // Number of vars : args + locals (incl. implicit and hidden) unsigned compMaxStack; diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index 48a88ba6dc91..3916311d9fe1 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -108,12 +108,19 @@ void Compiler::lvaInitTypeRef() /* Set compArgsCount and compLocalsCount */ info.compArgsCount = info.compMethodInfo->args.numArgs; + unsigned argRegCount = 0; + + auto incrementArgCount = [this, &argRegCount]() + { + ++info.compArgsCount; + ++argRegCount; + }; // Is there a 'this' pointer if (!info.compIsStatic) { - info.compArgsCount++; + incrementArgCount(); } else { @@ -167,7 +174,7 @@ void Compiler::lvaInitTypeRef() if (hasRetBuffArg) { - info.compArgsCount++; + incrementArgCount(); } else { @@ -179,14 +186,14 @@ void Compiler::lvaInitTypeRef() if (info.compIsVarArgs) { - info.compArgsCount++; + incrementArgCount(); } // Is there an extra parameter used to pass instantiation info to // shared generic methods and shared generic struct instance methods? if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE) { - info.compArgsCount++; + incrementArgCount(); } else { @@ -235,6 +242,40 @@ void Compiler::lvaInitTypeRef() lvaInitArgs(&varDscInfo); + //------------------------------------------------------------------------- + // Calculate the argument register usage. + // + // This will later be used for fastTailCall determination + //------------------------------------------------------------------------- + + unsigned floatingRegCount = 0; + unsigned stackArgCount = 0; + unsigned stackSize = 0; + + auto incrementRegCount = [&floatingRegCount, &argRegCount](LclVarDsc* varDsc) + { + varDsc->IsFloatRegType() ? ++floatingRegCount : ++argRegCount; + }; + + for (LclVarDsc* varDsc, unsigned argNum = 0; argNum < info.compMethodInfo->args.numArgs; argNum++, varDsc++) + { + if (varDsc->lvRegNum != REG_STK) + { + incrementRegCount(varDsc); + +#ifdef (FEATURE_MULTIREG_ARGS && !WINDOWS_AMD64_ABI) + if (varDsc->lvOtherArgReg != REG_NA) + { + incrementRegCount(varDsc); + } +#endif // (FEATURE_MULTIREG_ARGS && !WINDOWS_AMD64_ABI) + } + else + { + stackSize += varDsc->lvSize(); + } + } + //------------------------------------------------------------------------- // Finally the local variables //------------------------------------------------------------------------- @@ -255,13 +296,25 @@ void Compiler::lvaInitTypeRef() varDsc->lvPinned = ((corInfoType & CORINFO_TYPE_MOD_PINNED) != 0); varDsc->lvOnFrame = true; // The final home for this local variable might be our local stack frame + CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig); + stackSize += roundUp(compGetTypeSize(strip(corInfoType), clsHnd), TARGET_POINTER_SIZE); + if (strip(corInfoType) == CORINFO_TYPE_CLASS) { - CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig); lvaSetClass(varNum, clsHnd); } } + //------------------------------------------------------------------------- + // Save the register usage information and stack size. + //------------------------------------------------------------------------- + + stackSize += stackArgCount * REGSIZE_BYTES; + + info.compArgRegCount = argRegCount; + info.compFloatArgRegCount = floatingRegCount; + info.compStackSize = stackSize; + if ( // If there already exist unsafe buffers, don't mark more structs as unsafe // as that will cause them to be placed along with the real unsafe buffers, // unnecessarily exposing them to overruns. This can affect GS tests which @@ -1253,6 +1306,10 @@ void Compiler::lvaInitVarDsc(LclVarDsc* varDsc, #ifdef DEBUG varDsc->lvStkOffs = BAD_STK_OFFS; #endif + +#ifdef FEATURE_MULTIREG_ARGS + varDsc->lvOtherArgReg = REG_NA; +#endif // FEATURE_MULTIREG_ARGS } /***************************************************************************** diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index a626ff98e7bf..27b84b6144d3 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -1701,7 +1701,6 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) fgArgTabEntryPtr argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode); assert(argTabEntry); unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount; - noway_assert(callerArgNum < comp->info.compArgsCount); unsigned callerArgLclNum = callerArgNum; LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum; diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 92091c97578e..56f3149f1d67 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6943,20 +6943,33 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // Note that callee being a vararg method is not a problem since we can account the params being passed. // Count of caller args including implicit and hidden (i.e. thisPtr, RetBuf, GenericContext, VarargCookie) - unsigned nCallerArgs = info.compArgsCount; + unsigned callerArgRegCount = info.compArgRegCount; + unsigned callerFloatArgRegCount = info.compFloatArgRegCount; + + // TODO ARM64, UNIX x64 + // + // Currently we can track the caller's inbound stack size; however, we cannot + // easily determine the caller's outbound stack size (the callee's inbound stack + // size). This information is computed in fgMorphArgs which currently is + // dependent on the canFastTailCall decision. + // + // Note that we can get around this by excluding all struct which cannot + // be engregistered. // Count the callee args including implicit and hidden. // Note that GenericContext and VarargCookie are added by importer while // importing the call to gtCallArgs list along with explicit user args. - unsigned nCalleeArgs = 0; + unsigned calleeArgRegCount = 0; + unsigned calleeFloatArgRegCount = 0; + if (callee->gtCallObjp) // thisPtr { - nCalleeArgs++; + ++calleeArgRegCount; } if (callee->HasRetBufArg()) // RetBuf { - nCalleeArgs++; + ++calleeArgRegCount; // If callee has RetBuf param, caller too must have it. // Otherwise go the slow route. @@ -6971,10 +6984,9 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // non-standard and secret params passed in registers (e.g. R10, R11) since // these won't contribute to out-going arg size. bool hasMultiByteArgs = false; + bool hasTwoSlotSizedStruct = false; for (GenTreePtr args = callee->gtCallArgs; (args != nullptr) && !hasMultiByteArgs; args = args->gtOp.gtOp2) { - nCalleeArgs++; - assert(args->OperIsList()); GenTreePtr argx = args->gtOp.gtOp1; @@ -7001,24 +7013,83 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) { #if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_) + // **NOTE** + // + // hasMultiByteArgs will determine if the struct can be passed + // in registers. If it cannot we will break the loop and not + // fastTailCall. unsigned typeSize = 0; hasMultiByteArgs = !VarTypeIsMultiByteAndCanEnreg(argx->TypeGet(), objClass, &typeSize, false); -#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) || defined(_TARGET_ARM64_) - // On System V/arm64 the args could be a 2 eightbyte struct that is passed in two registers. - // Account for the second eightbyte in the nCalleeArgs. - // https://github.com/dotnet/coreclr/issues/2666 - // TODO-CQ-Amd64-Unix/arm64: Structs of size between 9 to 16 bytes are conservatively estimated - // as two args, since they need two registers whereas nCallerArgs is - // counting such an arg as one. This would mean we will not be optimizing - // certain calls though technically possible. +#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING) + SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; + + assert(objClass != nullptr); + eeGetSystemVAmd64PassStructInRegisterDescriptor(objClass, &structDesc); - if (typeSize > TARGET_POINTER_SIZE) + // TODO. Here we have made the assumption that multibyte struct + // arguments will cause a no fastTailCall decision. + if (!structDesc.passedInRegisters) { - unsigned extraArgRegsToAdd = (typeSize / TARGET_POINTER_SIZE); - nCalleeArgs += extraArgRegsToAdd; + // TODO do not approx callee stack size. + noway_assert(hasMultiByteArgs); } -#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING || _TARGET_ARM64_ + else + { + if (structDesc.eightByteCount > 1) + { + hasTwoSlotSizedStruct = true; + } + + for (unsigned int i = 0; i < structDesc.eightByteCount; i++) + { + if (structDesc.IsIntegralSlot(i)) + { + ++calleeArgRegCount; + } + else if (structDesc.IsSseSlot(i)) + { + ++calleeFloatArgRegCount; + } + else + { + assert(false && "Invalid eightbyte classification type."); + break; + } + } + } + +#else // ARM64 + var_types hfaType = GetHfaType(argx); + bool isHfaArg = varTypeIsFloating(hfaType); + unsigned size = 1; + + if (isHfaArg) + { + size = GetHfaCount(argx); + } + else + { + // Structs are either passed in 1 or 2 (64-bit) slots + unsigned roundupSize = roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), + TARGET_POINTER_SIZE); + size = roundupSize / TARGET_POINTER_SIZE; + + if (size > 2) + { + // TODO do not approx callee stack size. + noway_assert(hasMultiByteArgs); + } + + else if (size == 2) + { + hasTwoSlotSizedStruct = true; + } + } + + calleeArgRegCount += size; + +#endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #else assert(!"Target platform ABI rules regarding passing struct type args in registers"); @@ -7030,6 +7101,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) hasMultiByteArgs = true; } } + else + { + varTypeIsFloating(argx) ? ++calleeFloatArgRegCount : ++calleeArgRegCount; + } } // Go the slow route, if it has multi-byte params @@ -7038,20 +7113,86 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) return false; } + const unsigned maxRegArgs = MAX_REG_ARG; + // If we reached here means that callee has only those argument types which can be passed in // a register and if passed on stack will occupy exactly one stack slot in out-going arg area. - // If we are passing args on stack for callee and it has more args passed on stack than - // caller, then fast tail call cannot be performed. + // If we are passing args on stack for the callee and it has more args passed on stack than + // the caller, then fast tail call cannot be performed. // // Note that the GC'ness of on stack args need not match since the arg setup area is marked // as non-interruptible for fast tail calls. - if ((nCalleeArgs > MAX_REG_ARG) && (nCallerArgs < nCalleeArgs)) + +#ifdef WINDOWS_AMD64_ABI + // x64 Windows: If we have more callee registers used than MAX_REG_ARG, then + // make sure the callee's incoming arguments is less than the caller's + if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && + ((callerArgRegCount + callerFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { + JITDUMP("Will not fastTailCall ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && (callerArgRegCount > calleeArgRegCount)"); + return false; + } + +#elif (_TARGET_AMD64_ && UNIX_AMD64_ABI) || _TARGET_ARM64_ + + // For *nix Amd64 and Arm64 check to see if all arguments for the callee + // and caller are passing in registers. If not make sure that the stack + // size for the callee is at less than or equal to the caller's. + // + // Also, in the case that we have to pass arguments on the stack make sure + // that we are not dealing with structs that are >8 bytes. + + bool hasStackArgs = false; + + unsigned calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; + unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; + + unsigned callerIntStackArgCount = callerArgRegCount > maxRegArgs ? callerArgRegCount - maxRegArgs : 0; + unsigned callerFloatStackArgCount = callerFloatArgRegCount > maxFloatRegArgs ? callerFloatArgRegCount - maxFloatRegArgs : 0; + + if (calleeFloatArgRegCount > maxFloatRegArgs || argRegCount > maxRegArgs) + { + hasStackArgs = true; + } + + // We have a >8 byte struct in the callee and arguments that have to go + // on the stack. Do not fastTailCall. + if (hasStackArgs && hasTwoSlotSizedStruct) + { + JITDUMP("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct"); + return false; + } + + const int maxFloatRegArgs = MAX_FLOAT_REG_ARG; + + auto calculateWorstCaseStackSize = [&maxRegArgs, &maxFloatRegArgs](int argRegCount, int floatArgRegCount) + { + const unsigned numSpilledFloatRegs = floatArgRegCount > maxFloatRegArgs ? floatArgRegCount - maxFloatRegArgs : 0; + const unsigned numSpilledIntRegs = argRegCount > maxRegArgs ? argRegCount - maxRegArgs : 0; + + const unsigned worstCaseStackSize = (numSpilledFloatRegs + numSpilledIntRegs) * TARGET_POINTER_SIZE; + + return worstCaseStackSize; + }; + + const unsigned worstCaseCallerStackSize = calculateWorstCaseStackSize(callerArgRegCount, callerFloatArgRegCount); + const unsigned worstCaseCalleeStackSize = calculateWorstCaseStackSize(calleeArgRegCount, calleeFloatArgRegCount); + + if (worstCaseCalleeStackSize > worstCaseCallerStackSize) + { + JITDUMP("Will not fastTailCall worstCaseCalleeStackSize > worstCaseCallerStackSize"); return false; } - return true; #else + + NYI("fastTailCall not supported on this Architecture."); + +#endif // WINDOWS_AMD64_ABI + + JITDUMP("Will fastTailCall"); + return true; +#else // FEATURE_FASTTAILCALL return false; #endif } diff --git a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs new file mode 100644 index 000000000000..5ddc0f60e7f6 --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs @@ -0,0 +1,81 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +// 10 byte struct +public struct A +{ + public int a; + public int b; + public short c; +} + +// 32 byte struct +public struct B +{ + public long a; + public long b; + public long c; + public long d; +} + +public class DoNotFastTailCallWithStructs +{ + static A a; + static B b; + + public static void Foo(B b, int count=1000) + { + if (count == 100) + { + return count; + } + + if (count-- % 2 == 0) + { + return Foo(a, count); + } + + else + { + return Foo(b, count); + } + } + + public static void Foo(A a, int count=1000) + { + if (count == 100) + { + return count; + } + + if (count-- % 2 == 0) + { + return Foo(a, count); + } + + else + { + return Foo(b, count); + } + } + + public static int Main() + { + a = new A(); + b = new B(); + + a.a = 100; + a.b = 1000; + a.c = 10000; + + b.a = 500; + b.b = 5000; + b.c = 50000; + b.d = 500000; + + return Foo(a); + } +} \ No newline at end of file diff --git a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj new file mode 100644 index 000000000000..1a8a6d7acfac --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj @@ -0,0 +1,45 @@ + + + + + Debug + AnyCPU + $(MSBuildProjectName) + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + + + False + + + + + None + True + True + True + True + True + True + $(DefineConstants);CORECLR + + + + + + + + + + + diff --git a/tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.cs b/tests/src/JIT/opt/FastTailCall/StackFixup.cs similarity index 89% rename from tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.cs rename to tests/src/JIT/opt/FastTailCall/StackFixup.cs index f4caf37fd279..eaf095032aad 100644 --- a/tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.cs +++ b/tests/src/JIT/opt/FastTailCall/StackFixup.cs @@ -1,3 +1,6 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. using System; public struct A diff --git a/tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.csproj b/tests/src/JIT/opt/FastTailCall/StackFixup.csproj similarity index 97% rename from tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.csproj rename to tests/src/JIT/opt/FastTailCall/StackFixup.csproj index b065595dd6b5..f7e3e9792b67 100644 --- a/tests/src/JIT/opt/Tailcall/FastTailCallStackFixup.csproj +++ b/tests/src/JIT/opt/FastTailCall/StackFixup.csproj @@ -34,7 +34,7 @@ $(DefineConstants);CORECLR - + diff --git a/tests/src/JIT/opt/FastTailCall/StructPassingSimple.cs b/tests/src/JIT/opt/FastTailCall/StructPassingSimple.cs new file mode 100644 index 000000000000..13ca0cbc4827 --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/StructPassingSimple.cs @@ -0,0 +1,38 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +using System; + +// 10 byte struct +public struct A +{ + public int a; + public int b; + public short c; +} + +class TailCallStructPassingSimple +{ + // Simple tail call candidate that would be ignored on Arm64 and amd64 Unix + // due to https://github.com/dotnet/coreclr/issues/2666 + public static int ImplicitTailCallTenByteStruct(A a, int count=1000) + { + if (count-- == 0) + { + return 100; + } + + return ImplicitTailCallTenByteStruct(a, count); + } + + public static int Main() + { + A temp = new A(); + temp.a = 50; + temp.b = 500; + temp.c = 62; + + int ret = ImplicitTailCallTenByteStruct(temp); + return ret; + } +} \ No newline at end of file diff --git a/tests/src/JIT/opt/FastTailCall/StructPassingSimple.csproj b/tests/src/JIT/opt/FastTailCall/StructPassingSimple.csproj new file mode 100644 index 000000000000..0cac87b23c77 --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/StructPassingSimple.csproj @@ -0,0 +1,33 @@ + + + + + Debug + AnyCPU + $(MSBuildProjectName) + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + + + + + + + + + False + + + + + + + + + + + + diff --git a/tests/src/JIT/opt/FastTailCall/x64UnixSb.cs b/tests/src/JIT/opt/FastTailCall/x64UnixSb.cs new file mode 100644 index 000000000000..9f38ac595f0b --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/x64UnixSb.cs @@ -0,0 +1,86 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +public class x64UnixSb +{ + public static int callee(int one, + int two, + int three, + int four, + int five, + int six, + int seven, + int eight) + { + int count = 0; + + // Make sure this is not inlined. + for (int i = 0; i < one; ++i) + { + if (i % 4 == 0) ++count; + } + + return count; + } + + // Eight incoming arguments, all passed in registers + // + // nCallerArgs: 8, stackSize: 0 bytes + public static int caller(float one, + float two, + float three, + float four, + float five, + float six, + float seven, + float eight) + { + if (one % 2 == 0) + { + // Eight outgoing arguments, six passed in registers, two on the stack. + // + // nCalleeArgs: 8, stackSize: 8 bytes + // + // This is a fastTailCall candidate that should not be fastTailCalled + // because the callee's stack size will be larger than the caller's + return callee((int) two, + (int) one, + (int) eight, + (int) five, + (int) four, + (int) seven, + (int) six, + (int) three); + } + else + { + // Eight outgoing arguments, six passed in registers, two on the stack. + // + // nCalleeArgs: 8, stackSize: 8 bytes + // + // This is a fastTailCall candidate that should not be fastTailCalled + // because the callee's stack size will be larger than the caller's + return callee((int) one, + (int) two, + (int) three, + (int) four, + (int) five, + (int) six, + (int) seven, + (int) eight); + } + + + } + + public static int Main() + { + // We have 8 floating args on unix. + int a = caller(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); + + return a; + } +} \ No newline at end of file diff --git a/tests/src/JIT/opt/FastTailCall/x64UnixSb.csproj b/tests/src/JIT/opt/FastTailCall/x64UnixSb.csproj new file mode 100644 index 000000000000..e087aeb603ae --- /dev/null +++ b/tests/src/JIT/opt/FastTailCall/x64UnixSb.csproj @@ -0,0 +1,45 @@ + + + + + Debug + AnyCPU + $(MSBuildProjectName) + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + + + False + + + + + None + True + True + True + True + True + True + $(DefineConstants);CORECLR + + + + + + + + + + + From 6b4e1b84e5281696a7844f16e37b829a5f0396f8 Mon Sep 17 00:00:00 2001 From: jashook Date: Mon, 26 Jun 2017 11:58:41 -0700 Subject: [PATCH 02/12] Exclude stackBound two slot size structs from fastTailCalls --- src/jit/morph.cpp | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 56f3149f1d67..28c2aee8dbba 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7147,10 +7147,14 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) unsigned calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; + unsigned calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; + unsigned callerIntStackArgCount = callerArgRegCount > maxRegArgs ? callerArgRegCount - maxRegArgs : 0; unsigned callerFloatStackArgCount = callerFloatArgRegCount > maxFloatRegArgs ? callerFloatArgRegCount - maxFloatRegArgs : 0; - if (calleeFloatArgRegCount > maxFloatRegArgs || argRegCount > maxRegArgs) + unsigned callerStackArgCount = callerIntStackArgCount + callerFloatStackArgCount; + + if (callerStackArgCount > 0 || calleeStackArgCount > 0) { hasStackArgs = true; } @@ -7163,24 +7167,9 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) return false; } - const int maxFloatRegArgs = MAX_FLOAT_REG_ARG; - - auto calculateWorstCaseStackSize = [&maxRegArgs, &maxFloatRegArgs](int argRegCount, int floatArgRegCount) - { - const unsigned numSpilledFloatRegs = floatArgRegCount > maxFloatRegArgs ? floatArgRegCount - maxFloatRegArgs : 0; - const unsigned numSpilledIntRegs = argRegCount > maxRegArgs ? argRegCount - maxRegArgs : 0; - - const unsigned worstCaseStackSize = (numSpilledFloatRegs + numSpilledIntRegs) * TARGET_POINTER_SIZE; - - return worstCaseStackSize; - }; - - const unsigned worstCaseCallerStackSize = calculateWorstCaseStackSize(callerArgRegCount, callerFloatArgRegCount); - const unsigned worstCaseCalleeStackSize = calculateWorstCaseStackSize(calleeArgRegCount, calleeFloatArgRegCount); - - if (worstCaseCalleeStackSize > worstCaseCallerStackSize) + if (calleeStackArgCount > calleeStackArgCount) { - JITDUMP("Will not fastTailCall worstCaseCalleeStackSize > worstCaseCallerStackSize"); + JITDUMP("Will not fastTailCall calleeStackArgCount > calleeStackArgCount"); return false; } From 0f345370ee569bcb5eb66af22c7ef2630d820255 Mon Sep 17 00:00:00 2001 From: jashook Date: Mon, 26 Jun 2017 14:34:55 -0700 Subject: [PATCH 03/12] Address feedback --- src/jit/lclvars.cpp | 78 ++++++++++++------ src/jit/morph.cpp | 14 ++-- .../DoNotFastTailCallWithStructs.cs | 81 ------------------- .../DoNotFastTailCallWithStructs.csproj | 45 ----------- 4 files changed, 60 insertions(+), 158 deletions(-) delete mode 100644 tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs delete mode 100644 tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index 3916311d9fe1..6a667a5dec14 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -108,19 +108,12 @@ void Compiler::lvaInitTypeRef() /* Set compArgsCount and compLocalsCount */ info.compArgsCount = info.compMethodInfo->args.numArgs; - unsigned argRegCount = 0; - - auto incrementArgCount = [this, &argRegCount]() - { - ++info.compArgsCount; - ++argRegCount; - }; // Is there a 'this' pointer if (!info.compIsStatic) { - incrementArgCount(); + info.compArgsCount++; } else { @@ -174,7 +167,7 @@ void Compiler::lvaInitTypeRef() if (hasRetBuffArg) { - incrementArgCount(); + info.compArgsCount++; } else { @@ -186,14 +179,14 @@ void Compiler::lvaInitTypeRef() if (info.compIsVarArgs) { - incrementArgCount(); + info.compArgsCount++; } // Is there an extra parameter used to pass instantiation info to // shared generic methods and shared generic struct instance methods? if (info.compMethodInfo->args.callConv & CORINFO_CALLCONV_PARAMTYPE) { - incrementArgCount(); + info.compArgsCount++; } else { @@ -242,12 +235,15 @@ void Compiler::lvaInitTypeRef() lvaInitArgs(&varDscInfo); +#if FEATURE_FASTTAILCALL + //------------------------------------------------------------------------- // Calculate the argument register usage. // // This will later be used for fastTailCall determination //------------------------------------------------------------------------- + unsigned argRegCount = 0; unsigned floatingRegCount = 0; unsigned stackArgCount = 0; unsigned stackSize = 0; @@ -257,25 +253,34 @@ void Compiler::lvaInitTypeRef() varDsc->IsFloatRegType() ? ++floatingRegCount : ++argRegCount; }; - for (LclVarDsc* varDsc, unsigned argNum = 0; argNum < info.compMethodInfo->args.numArgs; argNum++, varDsc++) + unsigned argNum; + LclVarDsc* curDsc; + + for (curDsc = lvaTable, argNum = 0; argNum < info.compMethodInfo->args.numArgs; argNum++, curDsc++) { - if (varDsc->lvRegNum != REG_STK) + if (curDsc->lvIsRegArg) { - incrementRegCount(varDsc); + incrementRegCount(curDsc); -#ifdef (FEATURE_MULTIREG_ARGS && !WINDOWS_AMD64_ABI) - if (varDsc->lvOtherArgReg != REG_NA) +#if defined(FEATURE_MULTIREG_ARGS) && defined(UNIX_AMD64_ABI) + if (curDsc->lvOtherArgReg != REG_NA) { - incrementRegCount(varDsc); + incrementRegCount(curDsc); } -#endif // (FEATURE_MULTIREG_ARGS && !WINDOWS_AMD64_ABI) +#endif // defined(FEATURE_MULTIREG_ARGS) && defined(UNIX_AMD64_ABI) + } + if (varTypeIsStruct(curDsc)) + { + stackSize += curDsc->lvSize(); } else { - stackSize += varDsc->lvSize(); + stackSize += TARGET_POINTER_SIZE; } } +#endif // FEATURE_FASTTAILCALL + //------------------------------------------------------------------------- // Finally the local variables //------------------------------------------------------------------------- @@ -288,23 +293,42 @@ void Compiler::lvaInitTypeRef() i++, varNum++, varDsc++, localsSig = info.compCompHnd->getArgNext(localsSig)) { CORINFO_CLASS_HANDLE typeHnd; - CorInfoTypeWithMod corInfoType = + CorInfoTypeWithMod corInfoTypeWithMod = info.compCompHnd->getArgType(&info.compMethodInfo->locals, localsSig, &typeHnd); + CorInfoType corInfoType = strip(corInfoTypeWithMod); - lvaInitVarDsc(varDsc, varNum, strip(corInfoType), typeHnd, localsSig, &info.compMethodInfo->locals); + lvaInitVarDsc(varDsc, varNum, corInfoType, typeHnd, localsSig, &info.compMethodInfo->locals); - varDsc->lvPinned = ((corInfoType & CORINFO_TYPE_MOD_PINNED) != 0); + varDsc->lvPinned = ((corInfoTypeWithMod & CORINFO_TYPE_MOD_PINNED) != 0); varDsc->lvOnFrame = true; // The final home for this local variable might be our local stack frame - CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig); - stackSize += roundUp(compGetTypeSize(strip(corInfoType), clsHnd), TARGET_POINTER_SIZE); - - if (strip(corInfoType) == CORINFO_TYPE_CLASS) + if (corInfoType == CORINFO_TYPE_CLASS) { + CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig); + +#if FEATURE_FASTTAILCALL + stackSize += roundUp(compGetTypeSize(corInfoType, clsHnd), TARGET_POINTER_SIZE); +#endif // FEATURE_FASTTAILCALL + lvaSetClass(varNum, clsHnd); } + +#if FEATURE_FASTTAILCALL + else if(corInfoType == CORINFO_TYPE_VALUECLASS) + { + CORINFO_CLASS_HANDLE typeHandle = varDsc->lvVerTypeInfo.GetClassHandle(); + stackSize += info.compCompHnd->getClassSize(typeHandle); + } + + else + { + stackSize += TARGET_POINTER_SIZE; // Use is one slot size. + } +#endif // FEATURE_FASTTAILCALL + } +#if FEATURE_FASTTAILCALL //------------------------------------------------------------------------- // Save the register usage information and stack size. //------------------------------------------------------------------------- @@ -315,6 +339,8 @@ void Compiler::lvaInitTypeRef() info.compFloatArgRegCount = floatingRegCount; info.compStackSize = stackSize; +#endif //FEATURE_FASTTAILCALL + if ( // If there already exist unsafe buffers, don't mark more structs as unsafe // as that will cause them to be placed along with the real unsafe buffers, // unnecessarily exposing them to overruns. This can affect GS tests which diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 28c2aee8dbba..9b1e73563e6d 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6946,7 +6946,8 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) unsigned callerArgRegCount = info.compArgRegCount; unsigned callerFloatArgRegCount = info.compFloatArgRegCount; - // TODO ARM64, UNIX x64 + // TODO-Linux-x86 + // TODO-ARM64 // // Currently we can track the caller's inbound stack size; however, we cannot // easily determine the caller's outbound stack size (the callee's inbound stack @@ -7059,7 +7060,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } } -#else // ARM64 +#elif defined(_TARGET_ARM64_) // ARM64 var_types hfaType = GetHfaType(argx); bool isHfaArg = varTypeIsFloating(hfaType); unsigned size = 1; @@ -7127,22 +7128,23 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // x64 Windows: If we have more callee registers used than MAX_REG_ARG, then // make sure the callee's incoming arguments is less than the caller's if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && - ((callerArgRegCount + callerFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) + ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { JITDUMP("Will not fastTailCall ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && (callerArgRegCount > calleeArgRegCount)"); return false; } -#elif (_TARGET_AMD64_ && UNIX_AMD64_ABI) || _TARGET_ARM64_ +#elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI) || defined(_TARGET_ARM64_)) // For *nix Amd64 and Arm64 check to see if all arguments for the callee - // and caller are passing in registers. If not make sure that the stack - // size for the callee is at less than or equal to the caller's. + // and caller are passing in registers. If not, ensure that the outgoing argument stack size + // requirement for the callee is less than or equal to the caller's entire stack frame usage. // // Also, in the case that we have to pass arguments on the stack make sure // that we are not dealing with structs that are >8 bytes. bool hasStackArgs = false; + unsigned maxFloatRegArgs = MAX_FLOAT_REG_ARG; unsigned calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; diff --git a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs deleted file mode 100644 index 5ddc0f60e7f6..000000000000 --- a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.cs +++ /dev/null @@ -1,81 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; - -// 10 byte struct -public struct A -{ - public int a; - public int b; - public short c; -} - -// 32 byte struct -public struct B -{ - public long a; - public long b; - public long c; - public long d; -} - -public class DoNotFastTailCallWithStructs -{ - static A a; - static B b; - - public static void Foo(B b, int count=1000) - { - if (count == 100) - { - return count; - } - - if (count-- % 2 == 0) - { - return Foo(a, count); - } - - else - { - return Foo(b, count); - } - } - - public static void Foo(A a, int count=1000) - { - if (count == 100) - { - return count; - } - - if (count-- % 2 == 0) - { - return Foo(a, count); - } - - else - { - return Foo(b, count); - } - } - - public static int Main() - { - a = new A(); - b = new B(); - - a.a = 100; - a.b = 1000; - a.c = 10000; - - b.a = 500; - b.b = 5000; - b.c = 50000; - b.d = 500000; - - return Foo(a); - } -} \ No newline at end of file diff --git a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj b/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj deleted file mode 100644 index 1a8a6d7acfac..000000000000 --- a/tests/src/JIT/opt/FastTailCall/DoNotFastTailCallWithStructs.csproj +++ /dev/null @@ -1,45 +0,0 @@ - - - - - Debug - AnyCPU - $(MSBuildProjectName) - 2.0 - {95DFC527-4DC1-495E-97D7-E94EE1F7140D} - Exe - {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} - ..\..\ - true - - - - - - - - - False - - - - - None - True - True - True - True - True - True - $(DefineConstants);CORECLR - - - - - - - - - - - From facad5279a7031e6eae793d3577a36dafd5220fc Mon Sep 17 00:00:00 2001 From: jashook Date: Mon, 26 Jun 2017 15:00:33 -0700 Subject: [PATCH 04/12] Fix a > a --- src/jit/morph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 9b1e73563e6d..3fcfb49a2552 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7130,7 +7130,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { - JITDUMP("Will not fastTailCall ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && (callerArgRegCount > calleeArgRegCount)"); + JITDUMP("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); return false; } @@ -7169,7 +7169,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) return false; } - if (calleeStackArgCount > calleeStackArgCount) + if (calleeStackArgCount > callerStackArgCount) { JITDUMP("Will not fastTailCall calleeStackArgCount > calleeStackArgCount"); return false; From faef47ce9dd56b8f1125ee00d53fad428d3580e3 Mon Sep 17 00:00:00 2001 From: jashook Date: Mon, 26 Jun 2017 15:03:18 -0700 Subject: [PATCH 05/12] Fix log message --- src/jit/morph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 3fcfb49a2552..95f7398b86d4 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7171,7 +7171,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) if (calleeStackArgCount > callerStackArgCount) { - JITDUMP("Will not fastTailCall calleeStackArgCount > calleeStackArgCount"); + JITDUMP("Will not fastTailCall calleeStackArgCount > callerStackArgCount"); return false; } From 55cc1b1ecf14fd5a3b0a789ede83bf8c5c786fa7 Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 09:14:04 -0700 Subject: [PATCH 06/12] Requiring nCalleeArgs==nCallerArgs if hasStackArgs LowerFastTailCall assumes nCallerArgs == nCalleeArgs. This is tracked with https://github.com/dotnet/coreclr/issues/12468. It should be fixed in a seperate PR. --- src/jit/compiler.h | 6 ++--- src/jit/lclvars.cpp | 26 ++++----------------- src/jit/lower.cpp | 1 + src/jit/morph.cpp | 57 ++++++++++++++++++++++++++++++++++----------- 4 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 671e6d8cfd48..c2cf084b6664 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -8264,11 +8264,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX unsigned compILargsCount; // Number of arguments (incl. implicit but not hidden) unsigned compArgsCount; // Number of arguments (incl. implicit and hidden) -#ifdef FEATURE_MULTIREG_ARGS +#if defined(FEATURE_MULTIREG_ARGS) && defined(FEATURE_FASTTAILCALL) unsigned compArgRegCount; // Number of incoming integer args unsigned compFloatArgRegCount; // Number of incoming floating point args - unsigned compStackSize; // Incoming stack size -#endif // FEATURE_MULTIREG_ARGS + size_t compStackSize; // Incoming stack size +#endif // defined(FEATURE_MULTIREG_ARGS) && defined(FEATURE_FASTTAILCALL) unsigned compRetBuffArg; // position of hidden return param var (0, 1) (BAD_VAR_NUM means not present); int compTypeCtxtArg; // position of hidden param for type context for generic code (CORINFO_CALLCONV_PARAMTYPE) diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index 6a667a5dec14..6136e77d44d6 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -246,7 +246,8 @@ void Compiler::lvaInitTypeRef() unsigned argRegCount = 0; unsigned floatingRegCount = 0; unsigned stackArgCount = 0; - unsigned stackSize = 0; + size_t stackSize = 0; + unsigned compArgCount = info.compArgsCount; auto incrementRegCount = [&floatingRegCount, &argRegCount](LclVarDsc* varDsc) { @@ -256,7 +257,7 @@ void Compiler::lvaInitTypeRef() unsigned argNum; LclVarDsc* curDsc; - for (curDsc = lvaTable, argNum = 0; argNum < info.compMethodInfo->args.numArgs; argNum++, curDsc++) + for (curDsc = lvaTable, argNum = 0; argNum < varDscInfo.varNum; argNum++, curDsc++) { if (curDsc->lvIsRegArg) { @@ -269,7 +270,7 @@ void Compiler::lvaInitTypeRef() } #endif // defined(FEATURE_MULTIREG_ARGS) && defined(UNIX_AMD64_ABI) } - if (varTypeIsStruct(curDsc)) + else if (varTypeIsStruct(curDsc)) { stackSize += curDsc->lvSize(); } @@ -305,27 +306,8 @@ void Compiler::lvaInitTypeRef() if (corInfoType == CORINFO_TYPE_CLASS) { CORINFO_CLASS_HANDLE clsHnd = info.compCompHnd->getArgClass(&info.compMethodInfo->locals, localsSig); - -#if FEATURE_FASTTAILCALL - stackSize += roundUp(compGetTypeSize(corInfoType, clsHnd), TARGET_POINTER_SIZE); -#endif // FEATURE_FASTTAILCALL - lvaSetClass(varNum, clsHnd); } - -#if FEATURE_FASTTAILCALL - else if(corInfoType == CORINFO_TYPE_VALUECLASS) - { - CORINFO_CLASS_HANDLE typeHandle = varDsc->lvVerTypeInfo.GetClassHandle(); - stackSize += info.compCompHnd->getClassSize(typeHandle); - } - - else - { - stackSize += TARGET_POINTER_SIZE; // Use is one slot size. - } -#endif // FEATURE_FASTTAILCALL - } #if FEATURE_FASTTAILCALL diff --git a/src/jit/lower.cpp b/src/jit/lower.cpp index 27b84b6144d3..a626ff98e7bf 100644 --- a/src/jit/lower.cpp +++ b/src/jit/lower.cpp @@ -1701,6 +1701,7 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) fgArgTabEntryPtr argTabEntry = comp->gtArgEntryByNode(call, putArgStkNode); assert(argTabEntry); unsigned callerArgNum = argTabEntry->argNum - calleeNonStandardArgCount; + noway_assert(callerArgNum < comp->info.compArgsCount); unsigned callerArgLclNum = callerArgNum; LclVarDsc* callerArgDsc = comp->lvaTable + callerArgLclNum; diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 95f7398b86d4..d5a54f13f8ff 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6934,6 +6934,14 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } #endif + FILE* file = fopen("/Users/jashoo/projects/dev/output.txt", "a"); + if (file != nullptr) + { + fprintf(file, "****** canFastTailCall: %s - (MethodHash=%08x)\n", info.compFullName, info.compMethodHash()); + } + + unsigned nCallerArgs = info.compArgsCount; + // Note on vararg methods: // If the caller is vararg method, we don't know the number of arguments passed by caller's caller. // But we can be sure that in-coming arg area of vararg caller would be sufficient to hold its @@ -6946,7 +6954,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) unsigned callerArgRegCount = info.compArgRegCount; unsigned callerFloatArgRegCount = info.compFloatArgRegCount; - // TODO-Linux-x86 + // TODO-Linux-x64 // TODO-ARM64 // // Currently we can track the caller's inbound stack size; however, we cannot @@ -6986,8 +6994,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // these won't contribute to out-going arg size. bool hasMultiByteArgs = false; bool hasTwoSlotSizedStruct = false; + unsigned nCalleeArgs = calleeArgRegCount; // Keep track of how many args we have. for (GenTreePtr args = callee->gtCallArgs; (args != nullptr) && !hasMultiByteArgs; args = args->gtOp.gtOp2) { + ++nCalleeArgs; assert(args->OperIsList()); GenTreePtr argx = args->gtOp.gtOp1; @@ -7014,8 +7024,6 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) { #if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_) - // **NOTE** - // // hasMultiByteArgs will determine if the struct can be passed // in registers. If it cannot we will break the loop and not // fastTailCall. @@ -7108,9 +7116,20 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } } + auto logToFile = [file] (const char* str) + { + if (file != nullptr) + { + fprintf(file, "%s\n", str); + fprintf(file, "----------------------------------------------------------------\n"); + fclose(file); + } + }; + // Go the slow route, if it has multi-byte params if (hasMultiByteArgs) { + logToFile("hasMultiByteArgs"); return false; } @@ -7130,7 +7149,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { - JITDUMP("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); + logToFile("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); return false; } @@ -7150,13 +7169,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; unsigned calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; + unsigned callerStackSize = info.compStackSize; + unsigned calleeStackSize = calleeStackArgCount * TARGET_POINTER_SIZE; - unsigned callerIntStackArgCount = callerArgRegCount > maxRegArgs ? callerArgRegCount - maxRegArgs : 0; - unsigned callerFloatStackArgCount = callerFloatArgRegCount > maxFloatRegArgs ? callerFloatArgRegCount - maxFloatRegArgs : 0; - - unsigned callerStackArgCount = callerIntStackArgCount + callerFloatStackArgCount; - - if (callerStackArgCount > 0 || calleeStackArgCount > 0) + if (callerStackSize > 0 || calleeStackSize > 0) { hasStackArgs = true; } @@ -7165,13 +7181,26 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // on the stack. Do not fastTailCall. if (hasStackArgs && hasTwoSlotSizedStruct) { - JITDUMP("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct"); + logToFile("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct"); + return false; + } + + // TODO-Linux-x64 + // TODO-ARM64 + // + // LowerFastTailCall current assumes nCalleeArgs == nCallerArgs. This is + // not true in many cases on x64 linux, remove this pessimization when + // LowerFastTailCall is fixed. See https://github.com/dotnet/coreclr/issues/12468 + // for more information. + if (hasStackArgs && (nCalleeArgs != nCallerArgs)) + { + logToFile("Will not fastTailCall hasStackArgs && (nCalleeArgs != nCallerArgs)"); return false; } - if (calleeStackArgCount > callerStackArgCount) + if (calleeStackSize > callerStackSize) { - JITDUMP("Will not fastTailCall calleeStackArgCount > callerStackArgCount"); + logToFile("Will not fastTailCall calleeStackArgCount > callerStackArgCount"); return false; } @@ -7181,7 +7210,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) #endif // WINDOWS_AMD64_ABI - JITDUMP("Will fastTailCall"); + logToFile("Will fastTailCall"); return true; #else // FEATURE_FASTTAILCALL return false; From 3e48cf4dc218b65e73b47ba1780b76cadfea7b0f Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 09:19:24 -0700 Subject: [PATCH 07/12] Remove debug code. --- src/jit/morph.cpp | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index d5a54f13f8ff..2c90fe8dc3d0 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6934,12 +6934,6 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } #endif - FILE* file = fopen("/Users/jashoo/projects/dev/output.txt", "a"); - if (file != nullptr) - { - fprintf(file, "****** canFastTailCall: %s - (MethodHash=%08x)\n", info.compFullName, info.compMethodHash()); - } - unsigned nCallerArgs = info.compArgsCount; // Note on vararg methods: @@ -7116,20 +7110,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) } } - auto logToFile = [file] (const char* str) - { - if (file != nullptr) - { - fprintf(file, "%s\n", str); - fprintf(file, "----------------------------------------------------------------\n"); - fclose(file); - } - }; - // Go the slow route, if it has multi-byte params if (hasMultiByteArgs) { - logToFile("hasMultiByteArgs"); + JITDUMP("hasMultiByteArgs"); return false; } @@ -7149,7 +7133,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { - logToFile("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); + JITDUMP("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); return false; } @@ -7181,7 +7165,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // on the stack. Do not fastTailCall. if (hasStackArgs && hasTwoSlotSizedStruct) { - logToFile("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct"); + JITDUMP("Will not fastTailCall hasStackArgs && hasTwoSlotSizedStruct"); return false; } @@ -7194,13 +7178,13 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // for more information. if (hasStackArgs && (nCalleeArgs != nCallerArgs)) { - logToFile("Will not fastTailCall hasStackArgs && (nCalleeArgs != nCallerArgs)"); + JITDUMP("Will not fastTailCall hasStackArgs && (nCalleeArgs != nCallerArgs)"); return false; } if (calleeStackSize > callerStackSize) { - logToFile("Will not fastTailCall calleeStackArgCount > callerStackArgCount"); + JITDUMP("Will not fastTailCall calleeStackArgCount > callerStackArgCount"); return false; } @@ -7210,7 +7194,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) #endif // WINDOWS_AMD64_ABI - logToFile("Will fastTailCall"); + JITDUMP("Will fastTailCall"); return true; #else // FEATURE_FASTTAILCALL return false; From 01435c4d622aefa59676d7dd8fc29e945dbc1586 Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 09:35:28 -0700 Subject: [PATCH 08/12] Fix windows build breaks --- src/jit/lclvars.cpp | 4 ++-- src/jit/morph.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/jit/lclvars.cpp b/src/jit/lclvars.cpp index 6136e77d44d6..323bef36d0b9 100644 --- a/src/jit/lclvars.cpp +++ b/src/jit/lclvars.cpp @@ -1315,9 +1315,9 @@ void Compiler::lvaInitVarDsc(LclVarDsc* varDsc, varDsc->lvStkOffs = BAD_STK_OFFS; #endif -#ifdef FEATURE_MULTIREG_ARGS +#if defined(FEATURE_MULTIREG_ARGS) && !defined(WINDOWS_AMD64_ABI) varDsc->lvOtherArgReg = REG_NA; -#endif // FEATURE_MULTIREG_ARGS +#endif // defined(FEATURE_MULTIREG_ARGS) && !defined(WINDOWS_AMD64_ABI) } /***************************************************************************** diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 2c90fe8dc3d0..37fb49f53d97 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7130,7 +7130,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) #ifdef WINDOWS_AMD64_ABI // x64 Windows: If we have more callee registers used than MAX_REG_ARG, then // make sure the callee's incoming arguments is less than the caller's - if ((((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && + if (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) { JITDUMP("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); From cfea02f357efbb54f0fb4a3f6758dcf39f08d83c Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 10:16:21 -0700 Subject: [PATCH 09/12] Fix windows warnings --- src/jit/morph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 37fb49f53d97..a02b37470097 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7074,7 +7074,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) else { // Structs are either passed in 1 or 2 (64-bit) slots - unsigned roundupSize = roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), + size_t roundupSize = roundUp(info.compCompHnd->getClassSize(argx->gtArgPlace.gtArgPlaceClsHnd), TARGET_POINTER_SIZE); size = roundupSize / TARGET_POINTER_SIZE; @@ -7153,7 +7153,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; unsigned calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; - unsigned callerStackSize = info.compStackSize; + size_t callerStackSize = info.compStackSize; unsigned calleeStackSize = calleeStackArgCount * TARGET_POINTER_SIZE; if (callerStackSize > 0 || calleeStackSize > 0) From f13b6cd4ba44255a8fb479d3af62b645ce210bc7 Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 10:18:06 -0700 Subject: [PATCH 10/12] Fix last windows warning --- src/jit/morph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index a02b37470097..0fc75c966ab1 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -7065,7 +7065,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) #elif defined(_TARGET_ARM64_) // ARM64 var_types hfaType = GetHfaType(argx); bool isHfaArg = varTypeIsFloating(hfaType); - unsigned size = 1; + size_t size = 1; if (isHfaArg) { From b1370ad50a5bd2d1ec68adc73d172d8e68a16439 Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 10:19:51 -0700 Subject: [PATCH 11/12] Fix hopefully the last windows warning --- src/jit/morph.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 0fc75c966ab1..3e83717fa686 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6945,8 +6945,8 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // Note that callee being a vararg method is not a problem since we can account the params being passed. // Count of caller args including implicit and hidden (i.e. thisPtr, RetBuf, GenericContext, VarargCookie) - unsigned callerArgRegCount = info.compArgRegCount; - unsigned callerFloatArgRegCount = info.compFloatArgRegCount; + size_t callerArgRegCount = info.compArgRegCount; + size_t callerFloatArgRegCount = info.compFloatArgRegCount; // TODO-Linux-x64 // TODO-ARM64 @@ -6962,8 +6962,8 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // Count the callee args including implicit and hidden. // Note that GenericContext and VarargCookie are added by importer while // importing the call to gtCallArgs list along with explicit user args. - unsigned calleeArgRegCount = 0; - unsigned calleeFloatArgRegCount = 0; + size_t calleeArgRegCount = 0; + size_t calleeFloatArgRegCount = 0; if (callee->gtCallObjp) // thisPtr { From 56d267ec254361859b4ecc28fbe6f1be4fdeb62e Mon Sep 17 00:00:00 2001 From: jashook Date: Tue, 27 Jun 2017 12:34:05 -0700 Subject: [PATCH 12/12] Make the correct fastTailCall decision on Windows Now shows no diff in fastTailCall decisions on Amd64 Windows. --- src/jit/morph.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/jit/morph.cpp b/src/jit/morph.cpp index 3e83717fa686..5dd29fe3711b 100644 --- a/src/jit/morph.cpp +++ b/src/jit/morph.cpp @@ -6988,7 +6988,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // these won't contribute to out-going arg size. bool hasMultiByteArgs = false; bool hasTwoSlotSizedStruct = false; - unsigned nCalleeArgs = calleeArgRegCount; // Keep track of how many args we have. + size_t nCalleeArgs = calleeArgRegCount; // Keep track of how many args we have. for (GenTreePtr args = callee->gtCallArgs; (args != nullptr) && !hasMultiByteArgs; args = args->gtOp.gtOp2) { ++nCalleeArgs; @@ -7092,6 +7092,10 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) calleeArgRegCount += size; +#elif defined(WINDOWS_AMD64_ABI) + + ++calleeArgRegCount; + #endif // FEATURE_UNIX_AMD64_STRUCT_PASSING #else @@ -7113,7 +7117,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // Go the slow route, if it has multi-byte params if (hasMultiByteArgs) { - JITDUMP("hasMultiByteArgs"); + JITDUMP("Will not fastTailCall hasMultiByteArgs"); return false; } @@ -7128,12 +7132,15 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // as non-interruptible for fast tail calls. #ifdef WINDOWS_AMD64_ABI + size_t calleeStackSlots = ((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) ? (calleeArgRegCount + calleeFloatArgRegCount) - maxRegArgs : 0; + size_t calleeStackSize = calleeStackSlots * TARGET_POINTER_SIZE; + size_t callerStackSize = info.compStackSize; + // x64 Windows: If we have more callee registers used than MAX_REG_ARG, then // make sure the callee's incoming arguments is less than the caller's - if (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && - ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))) + if ((calleeStackSlots > 0) && (calleeStackSize > callerStackSize)) { - JITDUMP("Will not fastTailCall (((calleeArgRegCount + calleeFloatArgRegCount) > maxRegArgs) && ((calleeArgRegCount + calleeFloatArgRegCount) > (callerArgRegCount + callerFloatArgRegCount))"); + JITDUMP("Will not fastTailCall (calleeStackSlots > 0) && (calleeStackSize > callerStackSize)"); return false; } @@ -7147,14 +7154,14 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee) // that we are not dealing with structs that are >8 bytes. bool hasStackArgs = false; - unsigned maxFloatRegArgs = MAX_FLOAT_REG_ARG; + size_t maxFloatRegArgs = MAX_FLOAT_REG_ARG; - unsigned calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; - unsigned calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; + size_t calleeIntStackArgCount = calleeArgRegCount > maxRegArgs ? calleeArgRegCount - maxRegArgs : 0; + size_t calleeFloatStackArgCount = calleeFloatArgRegCount > maxFloatRegArgs ? calleeFloatArgRegCount - maxFloatRegArgs : 0; - unsigned calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; + size_t calleeStackArgCount = calleeIntStackArgCount + calleeFloatStackArgCount; size_t callerStackSize = info.compStackSize; - unsigned calleeStackSize = calleeStackArgCount * TARGET_POINTER_SIZE; + size_t calleeStackSize = calleeStackArgCount * TARGET_POINTER_SIZE; if (callerStackSize > 0 || calleeStackSize > 0) {